diff --git a/Attrap_pref976.py b/Attrap_pref976.py index acfd184b5b39fa5e11645badff789427fa2c702c..f195d4b5359d52c1cec7ab4a3dbd2ad04292ee99 100644 --- a/Attrap_pref976.py +++ b/Attrap_pref976.py @@ -11,15 +11,7 @@ class Attrap_pref976(Attrap): # Config hostname = 'https://www.mayotte.gouv.fr' - raa_page = { - 'default': f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A', - '2024': f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2024', - '2023': f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2023', - '2022': f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2022', - '2021': f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2021', - '2020': f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2020', - '2019': f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2019' - } + raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-R.A.A' user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' full_name = 'Préfecture de Mayotte' short_code = 'pref976' @@ -30,24 +22,23 @@ class Attrap_pref976(Attrap): self.set_sleep_time(30) def get_raa(self, keywords): - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.raa_page['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.raa_page['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.raa_page['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.raa_page['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.raa_page['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.raa_page['2019']) - - sub_pages_to_parse = [self.raa_page['default']] + year_pages_to_parse = [] + + # On récupère les pages d'années + page_content = self.get_page(self.raa_page, 'get').content + for card in self.get_sub_pages( + page_content, + 'div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.hostname, + False + ): + if Attrap.guess_date(card['name'], '([0-9]{4})').year >= self.not_before.year: + year_pages_to_parse.append(card['url']) + + pages_to_parse = [self.raa_page] # Pour chaque année, on cherche les sous-pages de mois - for raa_page in pages_to_parse: + for raa_page in year_pages_to_parse: page_content = self.get_page(raa_page, 'get').content month_pages = self.get_sub_pages( page_content, @@ -59,7 +50,7 @@ class Attrap_pref976(Attrap): # On regarde aussi si sur la page de l'année il n'y aurait pas un # RAA mal catégorisé for page_to_parse in self.find_raa_card(raa_page): - sub_pages_to_parse.append(page_to_parse) + pages_to_parse.append(page_to_parse) # Pour chaque mois, on cherche les pages des RAA for month_page in month_pages: @@ -68,11 +59,11 @@ class Attrap_pref976(Attrap): month_page['url'], year ): - sub_pages_to_parse.append(page_to_parse) + pages_to_parse.append(page_to_parse) # On parse les pages contenant des RAA elements = [] - for page in sub_pages_to_parse: + for page in pages_to_parse: page_content = self.get_page(page, 'get').content for element in self.get_raa_elements(page_content): elements.append(element)