From f37ec7a667ece7d8f6ffe5e04e23c2570c1b8926 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Thu, 2 Jan 2025 15:23:25 +0100 Subject: [PATCH] =?UTF-8?q?pref80:=20d=C3=A9tecte=20l'URL=20de=20l'ann?= =?UTF-8?q?=C3=A9e=20voulue=20automatiquement?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Attrap_pref80.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/Attrap_pref80.py b/Attrap_pref80.py index 96198be..b10fb34 100644 --- a/Attrap_pref80.py +++ b/Attrap_pref80.py @@ -14,14 +14,7 @@ class Attrap_pref80(Attrap): # Config hostname = 'https://www.somme.gouv.fr' - raa_page = { - '2024': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2024', - '2023': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2023', - '2022': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2022', - '2021': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2021', - '2020': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2020', - '2019': f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2019' - } + raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme' user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' full_name = 'Préfecture de la Somme' short_code = 'pref80' @@ -33,18 +26,19 @@ class Attrap_pref80(Attrap): def get_raa(self, keywords): year_pages_to_parse = [] - if self.not_before.year <= 2024: - year_pages_to_parse.append(self.raa_page['2024']) - if self.not_before.year <= 2023: - year_pages_to_parse.append(self.raa_page['2023']) - if self.not_before.year <= 2022: - year_pages_to_parse.append(self.raa_page['2022']) - if self.not_before.year <= 2021: - year_pages_to_parse.append(self.raa_page['2021']) - if self.not_before.year <= 2020: - year_pages_to_parse.append(self.raa_page['2020']) - if self.not_before.year <= 2019: - year_pages_to_parse.append(self.raa_page['2019']) + + # On détermine quelles pages d'année parser + page_content = self.get_page(self.raa_page, 'get').content + year_pages = self.get_sub_pages( + page_content, + 'div.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.hostname, + False + ) + for year_page in year_pages: + year_date = Attrap.guess_date(year_page['name'].strip(), '.*([0-9]{4})').replace(day=1, month=1) + if year_date.year >= self.not_before.year: + year_pages_to_parse.append(year_page['url']) # Pour chaque page Année, on récupère la liste des RAA elements = [] @@ -63,7 +57,7 @@ class Attrap_pref80(Attrap): # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le # parse - for a in soup.select('div.fr-text--lead.fr-my-3w p a.fr-link'): + for a in soup.select('div.fr-text--lead p a.fr-link'): if a.get('href') and a['href'].endswith('.pdf'): if a['href'].startswith('/'): url = f"{self.hostname}{a['href']}" -- GitLab