From 6368de4abd92cb4240b73f78e2960adc2bf9606c Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sat, 23 Mar 2024 23:03:59 +0100 Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajoute=20une=20option=20pour=20ne?= =?UTF-8?q?=20pas=20rechercher=20les=20sous-pages=20de=20mani=C3=A8re=20r?= =?UTF-8?q?=C3=A9cursive?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter.py | 8 ++++---- RAAspotter_pref59.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index 2d9c628..1def628 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -123,16 +123,16 @@ class RAAspotter: except: logger.debug('Impossible de changer d\'identité Tor') - def get_sub_pages(self, page_content, element, host): + def get_sub_pages(self, page_content, element, host, recursive_until_pdf): soup = BeautifulSoup(page_content, 'html.parser') sub_pages = [] for a in soup.select(element): if a.get('href'): url = f"{host}{a['href']}" sub_page_content = self.get_page(url, 'get').content - if not self.has_pdf(sub_page_content): + if recursive_until_pdf and not self.has_pdf(sub_page_content): logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages') - for sub_sub_page in self.get_sub_pages(sub_page_content, element, host): + for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf): sub_pages.append(sub_sub_page) else: sub_pages.append(url) @@ -170,7 +170,7 @@ class RAAspotter: elements.append(raa) # On regarde également s'il n'y aurait pas un pager - sub_pages = self.get_sub_pages(page_content, pager_element, host) + sub_pages = self.get_sub_pages(page_content, pager_element, host, True) for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host): elements.append(sub_raa) return elements diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py index 26004cd..5e6f8ad 100644 --- a/RAAspotter_pref59.py +++ b/RAAspotter_pref59.py @@ -49,7 +49,7 @@ class RAAspotter_pref59(RAAspotter): for raa_page in pages_to_parse: page_content = self.get_page(raa_page, 'get').content - sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST) + sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True) for sub_page in sub_pages[::-1]: sub_page_content = self.get_page(sub_page, 'get').content sub_raa_elements = self.get_raa_elements(sub_page_content) -- GitLab