diff --git a/Attrap_pref80.py b/Attrap_pref80.py index 96198be4923db9973341b5300eb79215f6301edd..b10fb341b40948252ff9c09f0e40969d835ab1da 100644 --- a/Attrap_pref80.py +++ b/Attrap_pref80.py @@ -14,14 +14,7 @@ class Attrap_pref80(Attrap): # Config hostname = 'https://www.somme.gouv.fr' - raa_page = { - '2024': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2024', - '2023': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2023', - '2022': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2022', - '2021': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2021', - '2020': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2020', - '2019': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2019' - } + raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme' user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' full_name = 'Préfecture de la Somme' short_code = 'pref80' @@ -33,18 +26,19 @@ class Attrap_pref80(Attrap): def get_raa(self, keywords): year_pages_to_parse = [] - if self.not_before.year <= 2024: - year_pages_to_parse.append(self.raa_page['2024']) - if self.not_before.year <= 2023: - year_pages_to_parse.append(self.raa_page['2023']) - if self.not_before.year <= 2022: - year_pages_to_parse.append(self.raa_page['2022']) - if self.not_before.year <= 2021: - year_pages_to_parse.append(self.raa_page['2021']) - if self.not_before.year <= 2020: - year_pages_to_parse.append(self.raa_page['2020']) - if self.not_before.year <= 2019: - year_pages_to_parse.append(self.raa_page['2019']) + + # On détermine quelles pages d'année parser + page_content = self.get_page(self.raa_page, 'get').content + year_pages = self.get_sub_pages( + page_content, + 'div.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.hostname, + False + ) + for year_page in year_pages: + year_date = Attrap.guess_date(year_page['name'].strip(), '.*([0-9]{4})').replace(day=1, month=1) + if year_date.year >= self.not_before.year: + year_pages_to_parse.append(year_page['url']) # Pour chaque page Année, on récupère la liste des RAA elements = [] @@ -63,7 +57,7 @@ class Attrap_pref80(Attrap): # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le # parse - for a in soup.select('div.fr-text--lead.fr-my-3w p a.fr-link'): + for a in soup.select('div.fr-text--lead p a.fr-link'): if a.get('href') and a['href'].endswith('.pdf'): if a['href'].startswith('/'): url = f"{self.hostname}{a['href']}"