From 2d409454fb208831fb2665bd64c03d83a616e8c8 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sun, 24 Mar 2024 01:32:06 +0100 Subject: [PATCH] =?UTF-8?q?pref976:=20recherche=20aussi=20les=20RAA=20mal?= =?UTF-8?q?=20cat=C3=A9goris=C3=A9s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter_pref976.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py index 28c86c1..d0cdc69 100644 --- a/RAAspotter_pref976.py +++ b/RAAspotter_pref976.py @@ -55,19 +55,15 @@ class RAAspotter_pref976(RAAspotter): self.__HOST, False )[::-1] + + # On regarde aussi si sur la page de l'année il n'y aurait pas un RAA mal catégorisé + for page_to_parse in self.find_raa_card(raa_page): + sub_pages_to_parse.append(page_to_parse) + # Pour chaque mois, on cherche les pages des RAA for month_page in month_pages: - sub_pages = self.get_sub_pages_with_pager( - month_page['url'], - 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', - 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', - self.__HOST - )[::-1] - for sub_page in sub_pages: - # On filtre les pages de RAA ne correspondant pas à la période analysée - guessed_date = RAAspotter.guess_date(sub_page['name'], 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)') - if guessed_date >= self.not_before: - sub_pages_to_parse.append(sub_page['url']) + for page_to_parse in self.find_raa_card(month_page['url']): + sub_pages_to_parse.append(page_to_parse) # On parse les pages contenant des RAA for page in sub_pages_to_parse: @@ -76,6 +72,21 @@ class RAAspotter_pref976(RAAspotter): self.parse_raa(raa_elements, keywords.split(',')) self.mailer() + def find_raa_card(self, page): + pages = [] + card_pages = self.get_sub_pages_with_pager( + page, + 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', + 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', + self.__HOST + )[::-1] + for card_page in card_pages: + # On filtre les pages de RAA ne correspondant pas à la période analysée + guessed_date = RAAspotter.guess_date(card_page['name'], 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)') + if guessed_date >= self.not_before: + pages.append(card_page['url']) + return pages + def get_raa_elements(self, page_content): elements = [] # On charge le parser -- GitLab