From 4d9c51ace77f31d3b59ce983928f5b53cb9514ac Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Mon, 21 Oct 2024 12:19:41 +0200 Subject: [PATCH] =?UTF-8?q?pref38:=20d=C3=A9tecte=20l'URL=20de=20l'ann?= =?UTF-8?q?=C3=A9e=20voulue=20automatiquement?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Attrap_pref38.py | 51 +++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/Attrap_pref38.py b/Attrap_pref38.py index 4699659..5285bc8 100644 --- a/Attrap_pref38.py +++ b/Attrap_pref38.py @@ -14,14 +14,10 @@ class Attrap_pref38(Attrap): # Config __HOST = 'https://www.isere.gouv.fr' - __RAA_PAGE = { - '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024', - '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023', - '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022', - '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021', - '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020', - '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019' - } + __RAA_PAGE = [ + f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs', + f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives' + ] __USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' full_name = 'Préfecture de l\'Isère' short_code = 'pref38' @@ -32,18 +28,33 @@ class Attrap_pref38(Attrap): def get_raa(self, keywords): pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) + + # On cherche les pages d'années. Elles sont mélangées dans des blocs de cartes grises et des blocs blancs avec pager + for page in self.__RAA_PAGE: + page_content = self.get_page(page, 'get').content + # On parse les cartes grises + cards = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + for card in cards: + year = Attrap.guess_date(card['name'].strip(), '.*([0-9]{4})').year + if year >= self.not_before.year and year < 9999: + pages_to_parse.append(card['url']) + # On parse les blocs blancs + blocks = self.get_sub_pages_with_pager( + page, + 'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', + 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label', + None, + self.__HOST + ) + for block in blocks: + year = Attrap.guess_date(block['name'].strip(), '.*([0-9]{4})').year + if year >= self.not_before.year and year < 9999: + pages_to_parse.append(block['url']) elements = [] for raa_page in pages_to_parse: -- GitLab