From 4d9c51ace77f31d3b59ce983928f5b53cb9514ac Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Mon, 21 Oct 2024 12:19:41 +0200
Subject: [PATCH] =?UTF-8?q?pref38:=20d=C3=A9tecte=20l'URL=20de=20l'ann?=
 =?UTF-8?q?=C3=A9e=20voulue=20automatiquement?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Attrap_pref38.py | 51 +++++++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/Attrap_pref38.py b/Attrap_pref38.py
index 4699659..5285bc8 100644
--- a/Attrap_pref38.py
+++ b/Attrap_pref38.py
@@ -14,14 +14,10 @@ class Attrap_pref38(Attrap):
 
     # Config
     __HOST = 'https://www.isere.gouv.fr'
-    __RAA_PAGE = {
-        '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024',
-        '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023',
-        '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022',
-        '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021',
-        '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020',
-        '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019'
-    }
+    __RAA_PAGE = [
+        f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs',
+        f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives'
+    ]
     __USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
     full_name = 'Préfecture de l\'Isère'
     short_code = 'pref38'
@@ -32,18 +28,33 @@ class Attrap_pref38(Attrap):
 
     def get_raa(self, keywords):
         pages_to_parse = []
-        if self.not_before.year <= 2024:
-            pages_to_parse.append(self.__RAA_PAGE['2024'])
-        if self.not_before.year <= 2023:
-            pages_to_parse.append(self.__RAA_PAGE['2023'])
-        if self.not_before.year <= 2022:
-            pages_to_parse.append(self.__RAA_PAGE['2022'])
-        if self.not_before.year <= 2021:
-            pages_to_parse.append(self.__RAA_PAGE['2021'])
-        if self.not_before.year <= 2020:
-            pages_to_parse.append(self.__RAA_PAGE['2020'])
-        if self.not_before.year <= 2019:
-            pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        # On cherche les pages d'années. Elles sont mélangées dans des blocs de cartes grises et des blocs blancs avec pager
+        for page in self.__RAA_PAGE:
+            page_content = self.get_page(page, 'get').content
+            # On parse les cartes grises
+            cards = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+            for card in cards:
+                year = Attrap.guess_date(card['name'].strip(), '.*([0-9]{4})').year
+                if year >= self.not_before.year and year < 9999:
+                    pages_to_parse.append(card['url'])
+            # On parse les blocs blancs
+            blocks = self.get_sub_pages_with_pager(
+                page,
+                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+                'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
+                None,
+                self.__HOST
+            )
+            for block in blocks:
+                year = Attrap.guess_date(block['name'].strip(), '.*([0-9]{4})').year
+                if year >= self.not_before.year and year < 9999:
+                    pages_to_parse.append(block['url'])
 
         elements = []
         for raa_page in pages_to_parse:
-- 
GitLab