From 3bcaf7c74d4dcbbfd108b915e129f2af6d6fb008 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sun, 24 Mar 2024 00:31:47 +0100
Subject: [PATCH] RAAspotter: renvoie le nom des sous-pages

---
 RAAspotter.py         | 16 +++++++++++++---
 RAAspotter_pref59.py  |  2 +-
 RAAspotter_pref69.py  |  2 +-
 RAAspotter_pref976.py |  4 ++--
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/RAAspotter.py b/RAAspotter.py
index 1def628..40ad8db 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -135,7 +135,11 @@ class RAAspotter:
           for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf):
             sub_pages.append(sub_sub_page)
         else:
-          sub_pages.append(url)
+          sub_page = {
+            'url': url,
+            'name': a.get_text().strip()
+          }
+          sub_pages.append(sub_page)
     return sub_pages
 
   def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, host):
@@ -149,7 +153,11 @@ class RAAspotter:
     sub_pages = soup.select(sub_page_element)
     for sub_page in sub_pages:
       if sub_page.get('href'):
-        pages.append(f"{host}{sub_page['href']}")
+        page = {
+          'url': f"{host}{sub_page['href']}",
+          'name': sub_page.get_text().strip()
+        }
+        pages.append(page)
 
     # On recherche un pager, et si on le trouve on le suit
     pager = soup.select(pager_element)
@@ -170,7 +178,9 @@ class RAAspotter:
         elements.append(raa)
 
       # On regarde également s'il n'y aurait pas un pager
-      sub_pages = self.get_sub_pages(page_content, pager_element, host, True)
+      sub_pages = []
+      for sub_page in self.get_sub_pages(page_content, pager_element, host, True):
+        sub_pages.append(sub_page['url'])
       for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host):
         elements.append(sub_raa)
     return elements
diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py
index 5e6f8ad..e5078d8 100644
--- a/RAAspotter_pref59.py
+++ b/RAAspotter_pref59.py
@@ -51,7 +51,7 @@ class RAAspotter_pref59(RAAspotter):
       page_content = self.get_page(raa_page, 'get').content
       sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True)
       for sub_page in sub_pages[::-1]:
-        sub_page_content = self.get_page(sub_page, 'get').content
+        sub_page_content = self.get_page(sub_page['url'], 'get').content
         sub_raa_elements = self.get_raa_elements(sub_page_content)
         self.parse_raa(sub_raa_elements, keywords.split(','))
     self.mailer()
diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py
index 63ac570..be16a7d 100644
--- a/RAAspotter_pref69.py
+++ b/RAAspotter_pref69.py
@@ -51,7 +51,7 @@ class RAAspotter_pref69(RAAspotter):
                                                 "ul.fr-pagination__list li a.fr-pagination__link--next",
                                                 self.__HOST)[::-1]
       for sub_page in sub_pages:
-        sub_pages_to_parse.append(sub_page)
+        sub_pages_to_parse.append(sub_page['url'])
     
     elements = []
     for sub_page_to_parse in sub_pages_to_parse:
diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py
index 8778be9..5cb7885 100644
--- a/RAAspotter_pref976.py
+++ b/RAAspotter_pref976.py
@@ -58,13 +58,13 @@ class RAAspotter_pref976(RAAspotter):
       # Pour chaque mois, on cherche les pages des RAA
       for month_page in month_pages:
         sub_pages = self.get_sub_pages_with_pager(
-          month_page,
+          month_page['url'],
           'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
           'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
           self.__HOST
         )[::-1]
         for sub_page in sub_pages:
-          sub_pages_to_parse.append(sub_page)
+          sub_pages_to_parse.append(sub_page['url'])
 
     # On parse les pages contenant des RAA
     for page in sub_pages_to_parse:
-- 
GitLab