From 0ced41ed9afcaa7a529bbc1c283bac01c6a5f3e5 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Fri, 21 Jun 2024 23:28:11 +0200
Subject: [PATCH] pref81: simplify page parsing

---
 Attrap_pref81.py | 103 +++++++++++++++++++++--------------------------
 1 file changed, 45 insertions(+), 58 deletions(-)

diff --git a/Attrap_pref81.py b/Attrap_pref81.py
index 6d6943b..65dab05 100644
--- a/Attrap_pref81.py
+++ b/Attrap_pref81.py
@@ -11,15 +11,7 @@ class Attrap_pref81(Attrap):
 
     # Config
     __HOST = 'https://www.tarn.gouv.fr'
-    __RAA_PAGE = {
-        'default': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA',
-        '2024': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2024',
-        '2023': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2023',
-        '2022': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2022',
-        '2021': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2021',
-        '2020': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2020',
-        '2019': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2019',
-    }
+    __RAA_PAGE = f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA'
     __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
     full_name = 'Préfecture du Tarn'
     short_code = 'pref81'
@@ -29,79 +21,74 @@ class Attrap_pref81(Attrap):
         self.enable_tor(10)
 
     def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            pages_to_parse.append(self.__RAA_PAGE['2024'])
-        if self.not_before.year <= 2023:
-            pages_to_parse.append(self.__RAA_PAGE['2023'])
-        if self.not_before.year <= 2022:
-            pages_to_parse.append(self.__RAA_PAGE['2022'])
-        if self.not_before.year <= 2021:
-            pages_to_parse.append(self.__RAA_PAGE['2021'])
-        if self.not_before.year <= 2020:
-            pages_to_parse.append(self.__RAA_PAGE['2020'])
-        if self.not_before.year <= 2019:
-            pages_to_parse.append(self.__RAA_PAGE['2019'])
-
-        sub_pages_to_parse = [self.__RAA_PAGE['default']]
-
+        year_pages_to_parse = []
+
+        # Determine which year pages to parse
+        page_content = self.get_page(self.__RAA_PAGE, 'get').content
+        year_pages = self.get_sub_pages(
+            page_content,
+            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+            self.__HOST,
+            False
+        )
+        for year_page in year_pages:
+            if int(year_page['name'].replace('Année ', '').strip()) >= self.not_before.year:
+                year_pages_to_parse.append(year_page['url'])
+
+        month_pages_to_parse = []
         # For each year, look for the month sub-pages
-        for raa_page in pages_to_parse:
-            page_content = self.get_page(raa_page, 'get').content
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
             month_pages = self.get_sub_pages(
                 page_content,
                 '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
                 self.__HOST,
                 False
             )[::-1]
+            for month_page in month_pages:
+                # Filter out the months that fall outside the analysed period
+                guessed_date = Attrap.guess_date(month_page['name'], '(.*)')
+                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
+                    month_pages_to_parse.append(month_page['url'])
 
-            # Also check whether the year page contains a
-            # miscategorised RAA
-            for page_to_parse in self.find_raa_card(raa_page):
-                sub_pages_to_parse.append(page_to_parse)
+        pages_to_parse = []
+        # For each month page, look for the RAA pages
+        for month_page in month_pages_to_parse:
+            # TODO: still need to handle the case where a month page redirects to a RAA
+            # (this happens when the prefecture has published only one RAA during the month)
+            pages = self.get_sub_pages_with_pager(
+                month_page,
+                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+                'nav.fr-pagination ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
+                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
+                self.__HOST
+            )[::-1]
+            for page in pages:
+                guessed_date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
+                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
+                    pages_to_parse.append(page['url'])
 
-            # For each month, look for the RAA pages
-            for month_page in month_pages:
-                year = Attrap.guess_date(month_page['name'], '(.*)').year
-                for page_to_parse in self.find_raa_card(month_page['url'], year):
-                    sub_pages_to_parse.append(page_to_parse)
-                # Also add the month page to the pages to parse, in case
-                # it redirected to a RAA
-                sub_pages_to_parse.append(month_page['url'])
+        # Also add the root page, which can contain miscategorised RAAs
+        pages_to_parse.append(self.__RAA_PAGE)
 
-        # Parse the pages containing RAAs
         elements = []
-        for page in sub_pages_to_parse:
+        # Parse the pages containing RAAs
+        for page in pages_to_parse:
             page_content = self.get_page(page, 'get').content
             for element in self.get_raa_elements(page_content):
                 elements.append(element)
 
+        # Parse the RAAs
        self.parse_raa(elements, keywords)
         self.mailer()
 
-    def find_raa_card(self, page, year=None):
-        pages = []
-        card_pages = self.get_sub_pages_with_pager(
-            page,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
-            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
-            self.__HOST
-        )[::-1]
-        for card_page in card_pages:
-            # Filter out the RAA pages that fall outside the analysed period
-            guessed_date = datetime.datetime.strptime(card_page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
-            if guessed_date >= self.not_before:
-                pages.append(card_page['url'])
-        return pages
-
     def get_raa_elements(self, page_content):
         elements = []
 
         # Load the parser
         soup = BeautifulSoup(page_content, 'html.parser')
 
         # Fetch every <a> tag
-        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
+        for a in soup.select('div.fr-grid-row div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
             if a.get('href') and a['href'].endswith('.pdf'):
                 if a['href'].startswith('/'):
                     url = f"{self.__HOST}{a['href']}"
-- 
GitLab
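
Note for reviewers: the patch filters month and RAA pages at month granularity by normalising both sides of the comparison to the first day of the month, after parsing the publication date out of the card details ('Publié le DD/MM/YYYY'). Below is a minimal standalone sketch of that logic; parse_card_detail and month_in_scope are hypothetical helper names for illustration only, not part of Attrap.

    import datetime

    def parse_card_detail(detail):
        # Card details on the prefecture's site look like 'Publié le 21/06/2024';
        # strip the prefix and parse the remaining date, as the patch does.
        return datetime.datetime.strptime(detail.replace('Publié le ', '').strip(), '%d/%m/%Y')

    def month_in_scope(page_date, not_before):
        # Normalising both dates to the first day of their month keeps every page
        # of the cut-off month, even one published before the not_before day:
        # a monthly page can still list RAAs that fall inside the analysed period.
        return page_date.replace(day=1) >= not_before.replace(day=1)

    not_before = datetime.datetime(2024, 6, 21)
    print(month_in_scope(parse_card_detail('Publié le 03/06/2024'), not_before))  # True
    print(month_in_scope(parse_card_detail('Publié le 28/05/2024'), not_before))  # False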