From b2e0a18b39c65d2f8269d45229d8a9332c9a6a68 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Mon, 18 Mar 2024 22:50:42 +0100
Subject: [PATCH] =?UTF-8?q?RAAspotter:=20s'assure=20de=20la=20pr=C3=A9senc?=
 =?UTF-8?q?e=20d'un=20lien=20avant=20de=20chercher=20une=20sous-page?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 RAAspotter.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/RAAspotter.py b/RAAspotter.py
index fbd34c7..1e83b18 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -124,14 +124,15 @@ class RAAspotter:
         soup = BeautifulSoup(page_content, 'html.parser')
         sub_pages = []
         for a in soup.select(element):
-            url = f"{host}{a['href']}"
-            sub_page_content = self.get_page(url).content
-            if not self.has_pdf(sub_page_content):
-                logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages')
-                for sub_sub_page in self.get_sub_pages(sub_page_content, element, host):
-                    sub_pages.append(sub_sub_page)
-            else:
-                sub_pages.append(url)
+            if a.get('href'):
+                url = f"{host}{a['href']}"
+                sub_page_content = self.get_page(url).content
+                if not self.has_pdf(sub_page_content):
+                    logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages')
+                    for sub_sub_page in self.get_sub_pages(sub_page_content, element, host):
+                        sub_pages.append(sub_sub_page)
+                else:
+                    sub_pages.append(url)
         return sub_pages
 
     def get_raa_with_pager(self, pages_list, pager_element, host=""):
-- 
GitLab