diff --git a/RAAspotter.py b/RAAspotter.py index 2d9c6288a965830635693954e5d06b06814c2e75..1def628f3d2ed4e82b939ab7e1bd0a446e74a315 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -123,16 +123,16 @@ class RAAspotter: except: logger.debug('Impossible de changer d\'identité Tor') - def get_sub_pages(self, page_content, element, host): + def get_sub_pages(self, page_content, element, host, recursive_until_pdf): soup = BeautifulSoup(page_content, 'html.parser') sub_pages = [] for a in soup.select(element): if a.get('href'): url = f"{host}{a['href']}" sub_page_content = self.get_page(url, 'get').content - if not self.has_pdf(sub_page_content): + if recursive_until_pdf and not self.has_pdf(sub_page_content): logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages') - for sub_sub_page in self.get_sub_pages(sub_page_content, element, host): + for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf): sub_pages.append(sub_sub_page) else: sub_pages.append(url) @@ -170,7 +170,7 @@ class RAAspotter: elements.append(raa) # On regarde également s'il n'y aurait pas un pager - sub_pages = self.get_sub_pages(page_content, pager_element, host) + sub_pages = self.get_sub_pages(page_content, pager_element, host, True) for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host): elements.append(sub_raa) return elements diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py index 26004cd6e12719f5f3495747caaeac092ed06f28..5e6f8ad4a51abf857702015494bf634269f6061c 100644 --- a/RAAspotter_pref59.py +++ b/RAAspotter_pref59.py @@ -49,7 +49,7 @@ class RAAspotter_pref59(RAAspotter): for raa_page in pages_to_parse: page_content = self.get_page(raa_page, 'get').content - sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST) + sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True) for sub_page in sub_pages[::-1]: sub_page_content = self.get_page(sub_page, 'get').content sub_raa_elements = self.get_raa_elements(sub_page_content)