From 0ced41ed9afcaa7a529bbc1c283bac01c6a5f3e5 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Fri, 21 Jun 2024 23:28:11 +0200
Subject: [PATCH] pref81: simplification de l'analyse des pages

---
 Attrap_pref81.py | 103 +++++++++++++++++++++--------------------------
 1 file changed, 45 insertions(+), 58 deletions(-)

diff --git a/Attrap_pref81.py b/Attrap_pref81.py
index 6d6943b..65dab05 100644
--- a/Attrap_pref81.py
+++ b/Attrap_pref81.py
@@ -11,15 +11,7 @@ class Attrap_pref81(Attrap):
 
     # Config
     __HOST = 'https://www.tarn.gouv.fr'
-    __RAA_PAGE = {
-        'default': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA',
-        '2024': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2024',
-        '2023': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2023',
-        '2022': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2022',
-        '2021': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2021',
-        '2020': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2020',
-        '2019': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2019',
-    }
+    __RAA_PAGE = f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA'
     __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
     full_name = 'Préfecture du Tarn'
     short_code = 'pref81'
@@ -29,79 +21,74 @@ class Attrap_pref81(Attrap):
         self.enable_tor(10)
 
     def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            pages_to_parse.append(self.__RAA_PAGE['2024'])
-        if self.not_before.year <= 2023:
-            pages_to_parse.append(self.__RAA_PAGE['2023'])
-        if self.not_before.year <= 2022:
-            pages_to_parse.append(self.__RAA_PAGE['2022'])
-        if self.not_before.year <= 2021:
-            pages_to_parse.append(self.__RAA_PAGE['2021'])
-        if self.not_before.year <= 2020:
-            pages_to_parse.append(self.__RAA_PAGE['2020'])
-        if self.not_before.year <= 2019:
-            pages_to_parse.append(self.__RAA_PAGE['2019'])
-
-        sub_pages_to_parse = [self.__RAA_PAGE['default']]
-
+        year_pages_to_parse = []
+
+        # Select the year pages to parse: keep years >= not_before.year
+        page_content = self.get_page(self.__RAA_PAGE, 'get').content
+        year_pages = self.get_sub_pages(
+            page_content,
+            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+            self.__HOST,
+            False
+        )
+        for year_page in year_pages:
+            if int(year_page['name'].replace('Année ', '').strip()) >= self.not_before.year:
+                year_pages_to_parse.append(year_page['url'])
+
+        month_pages_to_parse = []
         # Pour chaque année, on cherche les sous-pages de mois
-        for raa_page in pages_to_parse:
-            page_content = self.get_page(raa_page, 'get').content
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
             month_pages = self.get_sub_pages(
                 page_content,
                 '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
                 self.__HOST,
                 False
             )[::-1]
+            for month_page in month_pages:
+                # Skip months before the analysed period (compared at month granularity)
+                guessed_date = Attrap.guess_date(month_page['name'], '(.*)')
+                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
+                    month_pages_to_parse.append(month_page['url'])
 
-            # On regarde aussi si sur la page de l'année il n'y aurait pas un
-            # RAA mal catégorisé
-            for page_to_parse in self.find_raa_card(raa_page):
-                sub_pages_to_parse.append(page_to_parse)
+        pages_to_parse = []
+        # For each month page, look for the RAA pages (following the pager)
+        for month_page in month_pages_to_parse:
+            # TODO: still need to handle a month page that redirects straight to an RAA
+            # (this happens when the prefecture published only one RAA during that month)
+            pages = self.get_sub_pages_with_pager(
+                month_page,
+                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+                'nav.fr-pagination ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
+                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
+                self.__HOST
+            )[::-1]
+            for page in pages:
+                guessed_date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
+                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
+                    pages_to_parse.append(page['url'])
 
-            # Pour chaque mois, on cherche les pages des RAA
-            for month_page in month_pages:
-                year = Attrap.guess_date(month_page['name'], '(.*)').year
-                for page_to_parse in self.find_raa_card(month_page['url'], year):
-                    sub_pages_to_parse.append(page_to_parse)
-                # On ajoute aussi la page des mois à parser au cas où il y ait
-                # eu une redirection vers un RAA
-                sub_pages_to_parse.append(month_page['url'])
+        # Also add the root page, which may contain miscategorized RAAs
+        pages_to_parse.append(self.__RAA_PAGE)
 
-        # On parse les pages contenant des RAA
         elements = []
-        for page in sub_pages_to_parse:
+        # Parse the pages containing RAAs and collect their elements
+        for page in pages_to_parse:
             page_content = self.get_page(page, 'get').content
             for element in self.get_raa_elements(page_content):
                 elements.append(element)
 
+        # Parse the RAAs
         self.parse_raa(elements, keywords)
         self.mailer()
 
-    def find_raa_card(self, page, year=None):
-        pages = []
-        card_pages = self.get_sub_pages_with_pager(
-            page,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
-            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
-            self.__HOST
-        )[::-1]
-        for card_page in card_pages:
-            # On filtre les pages de RAA ne correspondant pas à la période analysée
-            guessed_date = datetime.datetime.strptime(card_page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
-            if guessed_date >= self.not_before:
-                pages.append(card_page['url'])
-        return pages
-
     def get_raa_elements(self, page_content):
         elements = []
         # On charge le parser
         soup = BeautifulSoup(page_content, 'html.parser')
 
         # On récupère chaque balise a
-        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
+        for a in soup.select('div.fr-grid-row div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
             if a.get('href') and a['href'].endswith('.pdf'):
                 if a['href'].startswith('/'):
                     url = f"{self.__HOST}{a['href']}"
-- 
GitLab