From 2d409454fb208831fb2665bd64c03d83a616e8c8 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sun, 24 Mar 2024 01:32:06 +0100
Subject: [PATCH] =?UTF-8?q?pref976:=20recherche=20aussi=20les=20RAA=20mal?=
 =?UTF-8?q?=20cat=C3=A9goris=C3=A9s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 RAAspotter_pref976.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py
index 28c86c1..d0cdc69 100644
--- a/RAAspotter_pref976.py
+++ b/RAAspotter_pref976.py
@@ -55,19 +55,15 @@ class RAAspotter_pref976(RAAspotter):
         self.__HOST,
         False
       )[::-1]
+      
+      # On regarde aussi si sur la page de l'année il n'y aurait pas un RAA mal catégorisé
+      for page_to_parse in self.find_raa_card(raa_page):
+        sub_pages_to_parse.append(page_to_parse)
+
       # Pour chaque mois, on cherche les pages des RAA
       for month_page in month_pages:
-        sub_pages = self.get_sub_pages_with_pager(
-          month_page['url'],
-          'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-          'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
-          self.__HOST
-        )[::-1]
-        for sub_page in sub_pages:
-          # On filtre les pages de RAA ne correspondant pas à la période analysée
-          guessed_date = RAAspotter.guess_date(sub_page['name'], 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
-          if guessed_date >= self.not_before:
-            sub_pages_to_parse.append(sub_page['url'])
+        for page_to_parse in self.find_raa_card(month_page['url']):
+          sub_pages_to_parse.append(page_to_parse)
 
     # On parse les pages contenant des RAA
     for page in sub_pages_to_parse:
@@ -76,6 +72,21 @@ class RAAspotter_pref976(RAAspotter):
       self.parse_raa(raa_elements, keywords.split(','))
     self.mailer()
 
+  def find_raa_card(self, page):
+    """Return the URLs of the cards collected from `page` whose guessed date is not before `self.not_before`."""
+    cards = self.get_sub_pages_with_pager(
+      page,
+      'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+      'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
+      self.__HOST
+    )[::-1]
+    date_regex = 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)'
+    # Keep only the RAA pages whose guessed date falls within the analysed period
+    return [
+      card['url'] for card in cards
+      if RAAspotter.guess_date(card['name'], date_regex) >= self.not_before
+    ]
+
   def get_raa_elements(self, page_content):
     elements = []
     # On charge le parser
-- 
GitLab