From 3bcaf7c74d4dcbbfd108b915e129f2af6d6fb008 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sun, 24 Mar 2024 00:31:47 +0100 Subject: [PATCH] RAAspotter: renvoie le nom des sous-pages --- RAAspotter.py | 16 +++++++++++++--- RAAspotter_pref59.py | 2 +- RAAspotter_pref69.py | 2 +- RAAspotter_pref976.py | 4 ++-- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index 1def628..40ad8db 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -135,7 +135,11 @@ class RAAspotter: for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf): sub_pages.append(sub_sub_page) else: - sub_pages.append(url) + sub_page = { + 'url': url, + 'name': a.get_text().strip() + } + sub_pages.append(sub_page) return sub_pages def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, host): @@ -149,7 +153,11 @@ class RAAspotter: sub_pages = soup.select(sub_page_element) for sub_page in sub_pages: if sub_page.get('href'): - pages.append(f"{host}{sub_page['href']}") + page = { + 'url': f"{host}{sub_page['href']}", + 'name': sub_page.get_text().strip() + } + pages.append(page) # On recherche un pager, et si on le trouve on le suit pager = soup.select(pager_element) @@ -170,7 +178,9 @@ class RAAspotter: elements.append(raa) # On regarde également s'il n'y aurait pas un pager - sub_pages = self.get_sub_pages(page_content, pager_element, host, True) + sub_pages = [] + for sub_page in self.get_sub_pages(page_content, pager_element, host, True): + sub_pages.append(sub_page['url']) for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host): elements.append(sub_raa) return elements diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py index 5e6f8ad..e5078d8 100644 --- a/RAAspotter_pref59.py +++ b/RAAspotter_pref59.py @@ -51,7 +51,7 @@ class RAAspotter_pref59(RAAspotter): page_content = self.get_page(raa_page, 'get').content sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True) for sub_page in sub_pages[::-1]: - sub_page_content = self.get_page(sub_page, 'get').content + sub_page_content = self.get_page(sub_page['url'], 'get').content sub_raa_elements = self.get_raa_elements(sub_page_content) self.parse_raa(sub_raa_elements, keywords.split(',')) self.mailer() diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py index 63ac570..be16a7d 100644 --- a/RAAspotter_pref69.py +++ b/RAAspotter_pref69.py @@ -51,7 +51,7 @@ class RAAspotter_pref69(RAAspotter): "ul.fr-pagination__list li a.fr-pagination__link--next", self.__HOST)[::-1] for sub_page in sub_pages: - sub_pages_to_parse.append(sub_page) + sub_pages_to_parse.append(sub_page['url']) elements = [] for sub_page_to_parse in sub_pages_to_parse: diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py index 8778be9..5cb7885 100644 --- a/RAAspotter_pref976.py +++ b/RAAspotter_pref976.py @@ -58,13 +58,13 @@ class RAAspotter_pref976(RAAspotter): # Pour chaque mois, on cherche les pages des RAA for month_page in month_pages: sub_pages = self.get_sub_pages_with_pager( - month_page, + month_page['url'], 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', self.__HOST )[::-1] for sub_page in sub_pages: - sub_pages_to_parse.append(sub_page) + sub_pages_to_parse.append(sub_page['url']) # On parse les pages contenant des RAA for page in sub_pages_to_parse: -- GitLab