Skip to content
Extraits de code Groupes Projets
Valider 3bcaf7c7 rédigé par Bastien Le Querrec
Parcourir les fichiers

RAAspotter: renvoie le nom des sous-pages

parent 87368076
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -135,7 +135,11 @@ class RAAspotter:
for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf):
sub_pages.append(sub_sub_page)
else:
sub_pages.append(url)
sub_page = {
'url': url,
'name': a.get_text().strip()
}
sub_pages.append(sub_page)
return sub_pages
def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, host):
......@@ -149,7 +153,11 @@ class RAAspotter:
sub_pages = soup.select(sub_page_element)
for sub_page in sub_pages:
if sub_page.get('href'):
pages.append(f"{host}{sub_page['href']}")
page = {
'url': f"{host}{sub_page['href']}",
'name': sub_page.get_text().strip()
}
pages.append(page)
# On recherche un pager, et si on le trouve on le suit
pager = soup.select(pager_element)
......@@ -170,7 +178,9 @@ class RAAspotter:
elements.append(raa)
# On regarde également s'il n'y aurait pas un pager
sub_pages = self.get_sub_pages(page_content, pager_element, host, True)
sub_pages = []
for sub_page in self.get_sub_pages(page_content, pager_element, host, True):
sub_pages.append(sub_page['url'])
for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host):
elements.append(sub_raa)
return elements
......
......@@ -51,7 +51,7 @@ class RAAspotter_pref59(RAAspotter):
page_content = self.get_page(raa_page, 'get').content
sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True)
for sub_page in sub_pages[::-1]:
sub_page_content = self.get_page(sub_page, 'get').content
sub_page_content = self.get_page(sub_page['url'], 'get').content
sub_raa_elements = self.get_raa_elements(sub_page_content)
self.parse_raa(sub_raa_elements, keywords.split(','))
self.mailer()
......
......@@ -51,7 +51,7 @@ class RAAspotter_pref69(RAAspotter):
"ul.fr-pagination__list li a.fr-pagination__link--next",
self.__HOST)[::-1]
for sub_page in sub_pages:
sub_pages_to_parse.append(sub_page)
sub_pages_to_parse.append(sub_page['url'])
elements = []
for sub_page_to_parse in sub_pages_to_parse:
......
......@@ -58,13 +58,13 @@ class RAAspotter_pref976(RAAspotter):
# Pour chaque mois, on cherche les pages des RAA
for month_page in month_pages:
sub_pages = self.get_sub_pages_with_pager(
month_page,
month_page['url'],
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
self.__HOST
)[::-1]
for sub_page in sub_pages:
sub_pages_to_parse.append(sub_page)
sub_pages_to_parse.append(sub_page['url'])
# On parse les pages contenant des RAA
for page in sub_pages_to_parse:
......
0% Chargement en cours.
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter