From 6368de4abd92cb4240b73f78e2960adc2bf9606c Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sat, 23 Mar 2024 23:03:59 +0100
Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajoute=20une=20option=20pour=20ne?=
 =?UTF-8?q?=20pas=20rechercher=20les=20sous-pages=20de=20mani=C3=A8re=20r?=
 =?UTF-8?q?=C3=A9cursive?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 RAAspotter.py        | 8 ++++----
 RAAspotter_pref59.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/RAAspotter.py b/RAAspotter.py
index 2d9c628..1def628 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -123,16 +123,16 @@ class RAAspotter:
     except:
       logger.debug('Impossible de changer d\'identité Tor')
 
-  def get_sub_pages(self, page_content, element, host):
+  def get_sub_pages(self, page_content, element, host, recursive_until_pdf):
     soup = BeautifulSoup(page_content, 'html.parser')
     sub_pages = []
     for a in soup.select(element):
       if a.get('href'):
         url = f"{host}{a['href']}"
         sub_page_content = self.get_page(url, 'get').content
-        if not self.has_pdf(sub_page_content):
+        if recursive_until_pdf and not self.has_pdf(sub_page_content):
           logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages')
-          for sub_sub_page in self.get_sub_pages(sub_page_content, element, host):
+          for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf):
             sub_pages.append(sub_sub_page)
         else:
           sub_pages.append(url)
@@ -170,7 +170,7 @@ class RAAspotter:
         elements.append(raa)
 
       # On regarde également s'il n'y aurait pas un pager
-      sub_pages = self.get_sub_pages(page_content, pager_element, host)
+      sub_pages = self.get_sub_pages(page_content, pager_element, host, True)
       for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host):
         elements.append(sub_raa)
     return elements
diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py
index 26004cd..5e6f8ad 100644
--- a/RAAspotter_pref59.py
+++ b/RAAspotter_pref59.py
@@ -49,7 +49,7 @@ class RAAspotter_pref59(RAAspotter):
 
     for raa_page in pages_to_parse:
       page_content = self.get_page(raa_page, 'get').content
-      sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST)
+      sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True)
       for sub_page in sub_pages[::-1]:
         sub_page_content = self.get_page(sub_page, 'get').content
         sub_raa_elements = self.get_raa_elements(sub_page_content)
-- 
GitLab