Skip to content
Extraits de code Groupes Projets

Comparer les révisions

Les modifications sont affichées comme si la révision source était fusionnée avec la révision cible. En savoir plus sur la comparaison des révisions.

Source

Sélectionner le projet cible
No results found

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref33(Attrap_prefdpt):
    """Site settings for the Gironde prefecture (short code ``pref33``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.gironde.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de la Gironde'
    short_code = 'pref33'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): these assignments write into dictionaries reached through
    # the base class ``Attrap_prefdpt``, not through this subclass; they run
    # once, when this class body is executed.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Recueils{0,1} des [Aa]ctes [Aa]dministratifs de l\'année ([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['follow_link_on_unrecognised_date'] = False
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref34(Attrap_prefdpt):
    """Site settings for the Hérault prefecture (short code ``pref34``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.herault.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs'
    full_name = 'Préfecture de l\'Hérault'
    short_code = 'pref34'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse: the same year pattern is used for
    # both card kinds.
    # NOTE(review): the two assignments below write into dictionaries reached
    # through the base class ``Attrap_prefdpt``, not through this subclass.
    year_regex = '(?:(?:Recueil des actes administratifs)|(?:Année))[ -]([0-9]{4})'
    Attrap_prefdpt.white_card['regex']['year'] = year_regex
    Attrap_prefdpt.grey_card['regex']['year'] = year_regex
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref35(Attrap_prefdpt):
    """Site settings for the Ille-et-Vilaine prefecture (short code ``pref35``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.ille-et-vilaine.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture d\'Ille-et-Vilaine'
    short_code = 'pref35'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): the statements below modify objects reached through the
    # base class ``Attrap_prefdpt``, not through this subclass.
    year_regex = 'Recueil des actes administratifs ([0-9]{4})'
    Attrap_prefdpt.white_card['regex']['year'] = year_regex

    # Add a drop-down menu widget, matched with the same year pattern.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex=year_regex,
            css_path='div.fr-select-group select#Archives-des-RAA-liste-docs.fr-select',
            type='year'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref38(Attrap_prefdpt):
    """Site settings for the Isère prefecture (short code ``pref38``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.isere.gouv.fr'
    raa_page = f'{hostname}/Publications/RAA-Recueil-des-actes-administratifs'
    full_name = 'Préfecture de l\'Isère'
    short_code = 'pref38'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): the statements below modify objects reached through the
    # base class ``Attrap_prefdpt``, not through this subclass.
    year_regex = '(?:(?:[Rr]ecueils{0,1} des [Aa]ctes [Aa]dministratifs de la [Pp]réfecture de l\'Isère[ -]*)|(?:Année ))([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['year'] = year_regex
    Attrap_prefdpt.white_card['regex']['year'] = year_regex
    Attrap_prefdpt.white_card['exclude'] = ['Vous recherchez "Le Journal officiel de la République française" ?']

    # Add a drop-down menu widget.
    # The ordinal suffix is written as the optional group ``(?:er)?``: the
    # character class ``[er]{0,1}`` accepts a single "e" or "r" only, so a
    # label such as "1er janvier 2024" could never match.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='([0-9]{1,2}(?:er)? [a-zéû]* [0-9]{4})',
            css_path='select#-liste-docs',
            type='year-month-day'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref39(Attrap_prefdpt):
    """Site settings for the Jura prefecture (short code ``pref39``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = "https://www.jura.gouv.fr"
    raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
    full_name = "Préfecture du Jura"
    short_code = "pref39"
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): this assignment writes into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Année ([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref42(Attrap_prefdpt):
    """Site settings for the Loire prefecture (short code ``pref42``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.loire.gouv.fr'
    raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de la Loire'
    short_code = 'pref42'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): this assignment writes into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref44(Attrap_prefdpt):
    """Site settings for the Loire-Atlantique prefecture (short code ``pref44``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.loire-atlantique.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-RAA-en-Loire-Atlantique'
    full_name = 'Préfecture de la Loire-Atlantique'
    short_code = 'pref44'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): these assignments write into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref49(Attrap_prefdpt):
    """Site settings for the Maine-et-Loire prefecture (short code ``pref49``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.maine-et-loire.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de Maine-et-Loire'
    short_code = 'pref49'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse. The leading greedy ``.*`` makes the
    # captured group the last run of four digits in the text.
    # NOTE(review): this assignment writes into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.grey_card['regex']['year'] = '.*([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref50(Attrap_prefdpt):
    """Site settings for the Manche prefecture (short code ``pref50``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.manche.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de la Manche'
    short_code = 'pref50'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): this assignment writes into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref52(Attrap_prefdpt):
    """Site settings for the Haute-Marne prefecture (short code ``pref52``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.haute-marne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Haute-Marne'
    short_code = 'pref52'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): this assignment writes into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.white_card['regex']['year'] = 'Année ([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref54(Attrap_prefdpt):
    """Site settings for the Meurthe-et-Moselle prefecture (short code ``pref54``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.meurthe-et-moselle.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de Meurthe-et-Moselle'
    short_code = 'pref54'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): the statements below modify objects reached through the
    # base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'

    # Add a drop-down menu widget. The pattern captures what follows " du ":
    # digits, an optional "er"/"ER" suffix, a word, then digits.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='.* du ([0-9]*(?:er|ER)? [A-Za-zéÉûÛ]* [0-9]*)',
            css_path='select#Liste-liste-docs',
            type='year-month-day'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref55(Attrap_prefdpt):
    """Site settings for the Meuse prefecture (short code ``pref55``).

    The class defines no methods of its own: it only declares the values and
    the drop-down widget specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.meuse.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Meuse'
    short_code = 'pref55'
    timezone = 'Europe/Paris'

    # Configure the drop-down menu widget.
    # NOTE(review): the widget is appended to a list reached through the base
    # class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='RAA année ([0-9]{4})',
            css_path='select#Liste-des-recueils-liste-docs',
            type='year'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref59(Attrap_prefdpt):
    """Site settings for the Nord prefecture (short code ``pref59``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.nord.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord'
    full_name = 'Préfecture du Nord'
    short_code = 'pref59'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): these assignments write into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref61(Attrap_prefdpt):
    """Site settings for the Orne prefecture (short code ``pref61``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.orne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de l\'Orne'
    short_code = 'pref61'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse: both card kinds share the same
    # month pattern and the same ``add_year_to_months`` flag.
    # NOTE(review): these assignments write into dictionaries reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Le Recueil des actes administratifs ([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
    Attrap_prefdpt.white_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.white_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref62(Attrap_prefdpt):
    """Site settings for the Pas-de-Calais prefecture (short code ``pref62``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.pas-de-calais.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture du Pas-de-Calais'
    short_code = 'pref62'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): this assignment writes into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref63(Attrap_prefdpt):
    """Site settings for the Puy-de-Dôme prefecture (short code ``pref63``).

    The class defines no methods of its own: it only declares the values and
    widget patterns specific to this prefecture's website.
    """

    # Prefecture settings
    hostname = 'https://www.puy-de-dome.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-Puy-de-Dome'
    full_name = 'Préfecture du Puy-de-Dôme'
    short_code = 'pref63'
    timezone = 'Europe/Paris'

    # Settings for the widgets to analyse.
    # NOTE(review): this assignment writes into a dictionary reached through
    # the base class ``Attrap_prefdpt``, not through this subclass.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
......@@ -4,53 +4,58 @@ import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
from Attrap import Attrap
class RAAspotter_pref31(RAAspotter):
class Attrap_pref64(Attrap):
# Config
__HOST = 'https://www.haute-garonne.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs/Recueil-des-Actes-Administratifs-Haute-Garonne'
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
full_name = 'Préfecture de la Haute-Garonne'
short_code = 'pref31'
hostname = 'https://www.pyrenees-atlantiques.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Pyrénées-Atlantiques'
short_code = 'pref64'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref31')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
# On cherche les pages de chaque mois
page_content = self.get_page(self.__RAA_PAGE, 'get').content
month_pages = self.get_sub_pages(
# On récupère les pages d'années
year_pages = []
page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages(
page_content,
'.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.hostname,
False
)[::-1]
pages_to_parse = []
# On filtre les pages de mois pour limiter le nombre de requêtes
for month_page in month_pages:
guessed_date = RAAspotter.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})')
if guessed_date >= self.not_before.replace(day=1):
pages_to_parse.append(month_page['url'])
elements = []
# On parse les pages des mois qu'on veut analyser
for element in self.get_raa_with_pager(
pages_to_parse,
".fr-pagination__link.fr-pagination__link--next",
self.__HOST
):
elements.append(element)
year = Attrap.guess_date(year_page['name'], '.* ([0-9]{4})').year
if year < 9999 and year >= self.not_before.year:
year_pages.append(year_page['url'])
# Pour chaque page d'année, on récupère les pages de mois
month_pages = []
for year_page in year_pages:
page_content = self.get_page(year_page, 'get').content
for month_page in self.get_sub_pages(
page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.hostname,
False
):
if Attrap.guess_date(month_page['name'], '(.*)').replace(day=1) >= self.not_before.replace(day=1):
month_pages.append(month_page['url'])
# On récupère les RAA en suivant la navigation de chaque page de mois
elements = self.get_raa_with_pager(
month_pages[::-1],
'a.fr-pagination__link--next.fr-pagination__link--lg-label',
self.hostname
)[::-1]
self.parse_raa(elements, keywords.split(','))
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
......@@ -62,14 +67,14 @@ class RAAspotter_pref31(RAAspotter):
for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.get_text().strip().capitalize()
name = a.get_text().strip()
date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
raa = RAAspotter.RAA(url, date, name)
raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa)
return elements
import os
import datetime
import re
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
from Attrap import Attrap
class RAAspotter_pref42(RAAspotter):
class Attrap_pref65(Attrap):
# Config
__HOST = 'https://www.loire.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture de de la Loire'
short_code = 'pref42'
hostname = 'https://www.hautes-pyrenees.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-d-actes-administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Hautes-Pyrénées'
short_code = 'pref65'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref42')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
year_pages_to_parse = []
# On détermine quelles pages d'année parser
pages_to_parse = []
year_pages = self.get_sub_pages_with_pager(
self.__RAA_PAGE,
self.raa_page,
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
self.__HOST
self.hostname
)
for year_page in year_pages:
year = 9999
try:
year = int(re.search('([0-9]{4})', year_page['name'], re.IGNORECASE).group(1))
if year is None:
year = 9999
except Exception as exc:
logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
year = 9999
if year >= self.not_before.year:
year_pages_to_parse.append(year_page['url'])
if Attrap.guess_date(year_page['name'].strip(), '.*([0-9]{4})').year >= self.not_before.year:
pages_to_parse.append(year_page['url'])
elements = []
# Pour chaque année, on parse les RAA
for year_page in year_pages_to_parse:
page_content = self.get_page(year_page, 'get').content
for element in self.get_raa_elements(page_content)[::-1]:
for raa_page in pages_to_parse:
page_content = self.get_page(raa_page, 'get').content
for element in self.get_raa_elements(page_content):
elements.append(element)
# On parse les RAA
self.parse_raa(elements, keywords.split(','))
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
......@@ -65,11 +49,11 @@ class RAAspotter_pref42(RAAspotter):
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque balise a
for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
......@@ -77,6 +61,6 @@ class RAAspotter_pref42(RAAspotter):
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = RAAspotter.RAA(url, date, name)
raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa)
return elements
......@@ -6,35 +6,38 @@ import logging
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
from Attrap import Attrap
logger = logging.getLogger(__name__)
class RAAspotter_pref66(RAAspotter):
class Attrap_pref66(Attrap):
# Config
__HOST = 'https://www.pyrenees-orientales.gouv.fr'
__RAA_PAGE = {
'2024': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2024',
'2023': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2023',
'2022': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2022',
'2021': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2021',
'2020': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2020',
'2019': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2019'
}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
hostname = 'https://www.pyrenees-orientales.gouv.fr'
raa_page = f'{hostname}/Publications/Le-recueil-des-actes-administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Pyrénées-Orientales'
short_code = 'pref66'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref66')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
# On détermine quelles pages d'année parser
year_pages = []
page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages(
page_content,
'.fr-table table tr td h3 a.fr-link',
self.hostname,
False
):
year = Attrap.guess_date(year_page['name'].strip(), '.* ([0-9]{4})').year
if year < 9999 and year >= self.not_before.year:
year_pages.append([year_page['url'], year])
elements = []
......@@ -47,26 +50,18 @@ class RAAspotter_pref66(RAAspotter):
# n'est pas exhaustif. On doit donc parser toutes les sous-pages de
# 2024 puisqu'on ne peut se fier au tableau récapitulatif.
# Grrr.
if self.not_before.year <= 2024:
for element in self.get_raa_elements_since_2024(self.__RAA_PAGE['2024']):
elements.append(element)
if self.not_before.year <= 2023:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2023']):
elements.append(element)
if self.not_before.year <= 2022:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2022']):
elements.append(element)
if self.not_before.year <= 2021:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2021']):
elements.append(element)
if self.not_before.year <= 2020:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2020']):
elements.append(element)
if self.not_before.year <= 2019:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2019']):
elements.append(element)
self.parse_raa(elements, keywords.split(','))
for year_page in year_pages:
url = year_page[0]
year = year_page[1]
if year >= 2024:
for element in self.get_raa_elements_since_2024(url):
elements.append(element)
else:
for element in self.get_raa_elements_before_2024(url):
elements.append(element)
self.parse_raa(elements, keywords)
self.mailer()
# On parse un lien d'avant 2024
......@@ -93,7 +88,7 @@ class RAAspotter_pref66(RAAspotter):
if date >= self.not_before:
url = ''
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
......@@ -104,7 +99,7 @@ class RAAspotter_pref66(RAAspotter):
else:
name = a.get_text().replace('Télécharger ', '').strip()
elements.append(RAAspotter.RAA(url, date, name))
elements.append(Attrap.RAA(url, date, name))
return elements
# On parse les RAA depuis 2024
......@@ -114,7 +109,7 @@ class RAAspotter_pref66(RAAspotter):
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
self.__HOST
self.hostname
)[::-1]
pages_to_parse = []
......@@ -125,7 +120,7 @@ class RAAspotter_pref66(RAAspotter):
logger.warning(f"Attention, le lien vers {page['url']} n'est pas bon !")
else:
if page['url'].startswith('/'):
url = f"{self.__HOST}{page['url']}"
url = f"{self.hostname}{page['url']}"
else:
url = page['url']
......@@ -133,5 +128,5 @@ class RAAspotter_pref66(RAAspotter):
name = page['name'].replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
elements.append(RAAspotter.RAA(url, date, name))
elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
return elements
......@@ -4,57 +4,45 @@ import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
from Attrap import Attrap
class RAAspotter_pref69(RAAspotter):
class Attrap_pref69(Attrap):
# Config
__HOST = 'https://www.rhone.gouv.fr'
__RAA_PAGE = {
'2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024',
'2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023',
'2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022',
'2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021',
'2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020',
'2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019'
}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
hostname = 'https://www.rhone.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Rhône'
short_code = 'pref69'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(20)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref69')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
# On détermine quelles pages d'année parser
year_pages = []
page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages(
page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.hostname,
False
):
year = Attrap.guess_date(year_page['name'].strip(), '.* ([0-9]{4})').year
if year < 9999 and year >= self.not_before.year:
year_pages.append(year_page['url'])
sub_pages_to_parse = []
for raa_page in pages_to_parse:
for raa_page in year_pages:
sub_pages = self.get_sub_pages_with_pager(
raa_page,
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link--next',
None,
self.__HOST)[::-1]
self.hostname)[::-1]
for sub_page in sub_pages:
sub_pages_to_parse.append(sub_page['url'])
......@@ -64,7 +52,7 @@ class RAAspotter_pref69(RAAspotter):
for element in self.get_raa_elements(page_content)[::-1]:
elements.append(element)
self.parse_raa(elements, keywords.split(','))
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
......@@ -76,7 +64,7 @@ class RAAspotter_pref69(RAAspotter):
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
......@@ -84,6 +72,6 @@ class RAAspotter_pref69(RAAspotter):
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = RAAspotter.RAA(url, date, name)
raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa)
return elements