diff --git a/Attrap_pref38.py b/Attrap_pref38.py index 4699659ee1c90f7a871063fb0ec01d643099ca06..5285bc8e970b0ed3d4556bea2a3ef738b5748078 100644 --- a/Attrap_pref38.py +++ b/Attrap_pref38.py @@ -14,14 +14,10 @@ class Attrap_pref38(Attrap): # Config __HOST = 'https://www.isere.gouv.fr' - __RAA_PAGE = { - '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024', - '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023', - '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022', - '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021', - '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020', - '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019' - } + __RAA_PAGE = [ + f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs', + f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives' + ] __USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' full_name = 'Préfecture de l\'Isère' short_code = 'pref38' @@ -32,18 +28,33 @@ class Attrap_pref38(Attrap): def get_raa(self, keywords): pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) + + # On cherche les pages d'années. Elles sont mélangées dans des blocs de cartes grises et des blocs blancs avec pager + for page in self.__RAA_PAGE: + page_content = self.get_page(page, 'get').content + # On parse les cartes grises + cards = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + for card in cards: + year = Attrap.guess_date(card['name'].strip(), '.*([0-9]{4})').year + if year >= self.not_before.year and year < 9999: + pages_to_parse.append(card['url']) + # On parse les blocs blancs + blocks = self.get_sub_pages_with_pager( + page, + 'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', + 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label', + None, + self.__HOST + ) + for block in blocks: + year = Attrap.guess_date(block['name'].strip(), '.*([0-9]{4})').year + if year >= self.not_before.year and year < 9999: + pages_to_parse.append(block['url']) elements = [] for raa_page in pages_to_parse: