diff --git a/RAAspotter.py b/RAAspotter.py index 1def628f3d2ed4e82b939ab7e1bd0a446e74a315..40ad8db9f34e6797fdd7f08bb73b664373aff805 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -135,7 +135,11 @@ class RAAspotter: for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf): sub_pages.append(sub_sub_page) else: - sub_pages.append(url) + sub_page = { + 'url': url, + 'name': a.get_text().strip() + } + sub_pages.append(sub_page) return sub_pages def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, host): @@ -149,7 +153,11 @@ class RAAspotter: sub_pages = soup.select(sub_page_element) for sub_page in sub_pages: if sub_page.get('href'): - pages.append(f"{host}{sub_page['href']}") + page = { + 'url': f"{host}{sub_page['href']}", + 'name': sub_page.get_text().strip() + } + pages.append(page) # On recherche un pager, et si on le trouve on le suit pager = soup.select(pager_element) @@ -170,7 +178,9 @@ class RAAspotter: elements.append(raa) # On regarde également s'il n'y aurait pas un pager - sub_pages = self.get_sub_pages(page_content, pager_element, host, True) + sub_pages = [] + for sub_page in self.get_sub_pages(page_content, pager_element, host, True): + sub_pages.append(sub_page['url']) for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host): elements.append(sub_raa) return elements diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py index 5e6f8ad4a51abf857702015494bf634269f6061c..e5078d8da090fdd07fe7f87bd69d9f2c9d5b1b36 100644 --- a/RAAspotter_pref59.py +++ b/RAAspotter_pref59.py @@ -51,7 +51,7 @@ class RAAspotter_pref59(RAAspotter): page_content = self.get_page(raa_page, 'get').content sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True) for sub_page in sub_pages[::-1]: - sub_page_content = self.get_page(sub_page, 'get').content + sub_page_content = self.get_page(sub_page['url'], 'get').content sub_raa_elements = self.get_raa_elements(sub_page_content) self.parse_raa(sub_raa_elements, keywords.split(',')) self.mailer() diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py index 63ac5703ae05e4c442e5848ce7bfa4ff4bacf226..be16a7dce5f897dc5b7759cabde36610bc6dc6aa 100644 --- a/RAAspotter_pref69.py +++ b/RAAspotter_pref69.py @@ -51,7 +51,7 @@ class RAAspotter_pref69(RAAspotter): "ul.fr-pagination__list li a.fr-pagination__link--next", self.__HOST)[::-1] for sub_page in sub_pages: - sub_pages_to_parse.append(sub_page) + sub_pages_to_parse.append(sub_page['url']) elements = [] for sub_page_to_parse in sub_pages_to_parse: diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py index 8778be9e071cf598983052999a7a35151ae5861c..5cb7885ae25a67e0dcb8e813f26f6a8211c7be20 100644 --- a/RAAspotter_pref976.py +++ b/RAAspotter_pref976.py @@ -58,13 +58,13 @@ class RAAspotter_pref976(RAAspotter): # Pour chaque mois, on cherche les pages des RAA for month_page in month_pages: sub_pages = self.get_sub_pages_with_pager( - month_page, + month_page['url'], 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', self.__HOST )[::-1] for sub_page in sub_pages: - sub_pages_to_parse.append(sub_page) + sub_pages_to_parse.append(sub_page['url']) # On parse les pages contenant des RAA for page in sub_pages_to_parse: