diff --git a/Attrap_pref75.py b/Attrap_pref75.py index 58e2fc6640bc80a5743699a948956c10d1ab19e3..82b4ef51174bd841199c34c594491869d32f20b4 100644 --- a/Attrap_pref75.py +++ b/Attrap_pref75.py @@ -14,16 +14,7 @@ class Attrap_pref75(Attrap): # Config hostname = 'https://www.prefectures-regions.gouv.fr' - raa_page = { - '2025': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2025', - '2024': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2024', - '2023': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2023', - '2022': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2022', - '2021': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2021', - '2020': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2020', - '2019': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2019', - '2018': f'{hostname}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/Raa-du-departement-de-Paris-2018' - } + raa_page = f'{hostname}/ile-de-france/tags/view/Ile-de-France/Documents+et+publications/Recueil+des+actes+administratifs' user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' full_name = 'Préfecture de Paris' short_code = 'pref75' @@ -36,22 +27,18 @@ class Attrap_pref75(Attrap): def get_raa(self, keywords): year_pages_to_parse = [] - # Les RAA de Paris sont éparpillés sur des sous-pages par mois. - # Donc on parse la page principale à la recherche des sous-pages. - if self.not_before.year <= 2025: - year_pages_to_parse.append(self.raa_page['2025']) - if self.not_before.year <= 2024: - year_pages_to_parse.append(self.raa_page['2024']) - if self.not_before.year <= 2023: - year_pages_to_parse.append(self.raa_page['2023']) - if self.not_before.year <= 2022: - year_pages_to_parse.append(self.raa_page['2022']) - if self.not_before.year <= 2021: - year_pages_to_parse.append(self.raa_page['2021']) - if self.not_before.year <= 2020: - year_pages_to_parse.append(self.raa_page['2020']) - if self.not_before.year <= 2019: - year_pages_to_parse.append(self.raa_page['2019']) + # On détermine quelles pages d'année parser + page_content = self.get_page(self.raa_page, 'get').content + year_pages = self.get_sub_pages( + page_content, + 'article.news-list-item header h2.news-list-title a', + self.hostname, + False + ) + for year_page in year_pages: + year_date = Attrap.guess_date(year_page['name'].strip(), '(?:.*Paris.*)([0-9]{4})').replace(day=1, month=1) + if year_date.year >= self.not_before.year and year_date.year < 9999: + year_pages_to_parse.append(year_page['url']) pages_to_parse = [] for year_page in year_pages_to_parse: