From 2f7d3fa594a028e19d0882e5d7f093bce3db3b78 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sun, 24 Mar 2024 16:00:18 +0100 Subject: [PATCH] =?UTF-8?q?RAAspotter:=20r=C3=A9duit=20le=20nombre=20de=20?= =?UTF-8?q?requ=C3=AAtes=20lors=20de=20la=20r=C3=A9cup=C3=A9ration=20des?= =?UTF-8?q?=20sous-pages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index e8df3b6..37c24c5 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -129,11 +129,18 @@ class RAAspotter: for a in soup.select(element): if a.get('href'): url = f"{host}{a['href']}" - sub_page_content = self.get_page(url, 'get').content - if recursive_until_pdf and not self.has_pdf(sub_page_content): - logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages') - for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf): - sub_pages.append(sub_sub_page) + if recursive_until_pdf: + sub_page_content = self.get_page(url, 'get').content + if not self.has_pdf(sub_page_content): + logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages') + for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf): + sub_pages.append(sub_sub_page) + else: + sub_page = { + 'url': url, + 'name': a.get_text().strip() + } + sub_pages.append(sub_page) else: sub_page = { 'url': url, -- GitLab