From 873680763a84e9d7afbd303b214933bfc33c23a9 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sat, 23 Mar 2024 23:33:50 +0100 Subject: [PATCH] =?UTF-8?q?pref976:=20optimise=20le=20nombre=20de=20requ?= =?UTF-8?q?=C3=AAte=20pour=20obtenir=20la=20liste=20des=20pages=20=C3=A0?= =?UTF-8?q?=20parser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter_pref976.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py index 1eb9278..8778be9 100644 --- a/RAAspotter_pref976.py +++ b/RAAspotter_pref976.py @@ -45,11 +45,28 @@ class RAAspotter_pref976(RAAspotter): pages_to_parse.append(self.__RAA_PAGE['2019']) sub_pages_to_parse = [self.__RAA_PAGE['default']] + + # Pour chaque année, on cherche les sous-pages de mois for raa_page in pages_to_parse: page_content = self.get_page(raa_page, 'get').content - sub_pages = self.get_sub_pages(page_content, ":is(.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a,div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link,ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next)", self.__HOST)[::-1] - for sub_page in sub_pages: - sub_pages_to_parse.append(sub_page) + month_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + # Pour chaque mois, on cherche les pages des RAA + for month_page in month_pages: + sub_pages = self.get_sub_pages_with_pager( + month_page, + 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', + 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', + self.__HOST + )[::-1] + for sub_page in sub_pages: + sub_pages_to_parse.append(sub_page) + + # On parse les pages contenant des RAA for page in sub_pages_to_parse: page_content = self.get_page(page, 'get').content raa_elements = self.get_raa_elements(page_content) -- GitLab