Skip to content
Extraits de code Groupes Projets
Valider 6368de4a rédigé par Bastien Le Querrec
Parcourir les fichiers

RAAspotter: ajoute une option pour ne pas rechercher les sous-pages de manière récursive

parent d3068cf8
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
...@@ -123,16 +123,16 @@ class RAAspotter: ...@@ -123,16 +123,16 @@ class RAAspotter:
except: except:
logger.debug('Impossible de changer d\'identité Tor') logger.debug('Impossible de changer d\'identité Tor')
def get_sub_pages(self, page_content, element, host): def get_sub_pages(self, page_content, element, host, recursive_until_pdf):
soup = BeautifulSoup(page_content, 'html.parser') soup = BeautifulSoup(page_content, 'html.parser')
sub_pages = [] sub_pages = []
for a in soup.select(element): for a in soup.select(element):
if a.get('href'): if a.get('href'):
url = f"{host}{a['href']}" url = f"{host}{a['href']}"
sub_page_content = self.get_page(url, 'get').content sub_page_content = self.get_page(url, 'get').content
if not self.has_pdf(sub_page_content): if recursive_until_pdf and not self.has_pdf(sub_page_content):
logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages') logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages')
for sub_sub_page in self.get_sub_pages(sub_page_content, element, host): for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf):
sub_pages.append(sub_sub_page) sub_pages.append(sub_sub_page)
else: else:
sub_pages.append(url) sub_pages.append(url)
...@@ -170,7 +170,7 @@ class RAAspotter: ...@@ -170,7 +170,7 @@ class RAAspotter:
elements.append(raa) elements.append(raa)
# On regarde également s'il n'y aurait pas un pager # On regarde également s'il n'y aurait pas un pager
sub_pages = self.get_sub_pages(page_content, pager_element, host) sub_pages = self.get_sub_pages(page_content, pager_element, host, True)
for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host): for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host):
elements.append(sub_raa) elements.append(sub_raa)
return elements return elements
......
...@@ -49,7 +49,7 @@ class RAAspotter_pref59(RAAspotter): ...@@ -49,7 +49,7 @@ class RAAspotter_pref59(RAAspotter):
for raa_page in pages_to_parse: for raa_page in pages_to_parse:
page_content = self.get_page(raa_page, 'get').content page_content = self.get_page(raa_page, 'get').content
sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST) sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True)
for sub_page in sub_pages[::-1]: for sub_page in sub_pages[::-1]:
sub_page_content = self.get_page(sub_page, 'get').content sub_page_content = self.get_page(sub_page, 'get').content
sub_raa_elements = self.get_raa_elements(sub_page_content) sub_raa_elements = self.get_raa_elements(sub_page_content)
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter