diff --git a/RAAspotter.py b/RAAspotter.py
index 69d9314af26554dc0b78d4f58a05dc53d99ba39d..403aff754165f58f6eac84ee7f0c502d34687a8b 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -2,12 +2,15 @@ import os, re, ssl
 import subprocess
 import logging
 import requests
+import time
 
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions
 
+from bs4 import BeautifulSoup
+
 from pyvirtualdisplay import Display
 from pdfminer.high_level import extract_text
 
@@ -48,6 +51,7 @@ class RAAspotter:
         self.data_dir = data_dir
         self.found = False
         self.output_file_path = os.path.dirname(os.path.abspath(__file__))+'/output.log'
+        self.sleep_time = 0  # seconds to wait before each HTTP request
 
         self.update_user_agent(user_agent)
 
@@ -55,6 +59,34 @@ class RAAspotter:
         f.write('')
         f.close()
 
+    # Recursively follows the links matched by the CSS selector `element`,
+    # returning the URLs of the sub-pages that link to at least one PDF
+    def get_sub_pages(self, page_content, element, host=""):
+        soup = BeautifulSoup(page_content, 'html.parser')
+        sub_pages = []
+        for a in soup.select(element):
+            url = f"{host}{a['href']}"
+            sub_page_content = self.get_page(url).content
+            if not self.has_pdf(sub_page_content):
+                logger.info(f'{url} does not contain a PDF, fetching its sub-pages')
+                for sub_sub_page in self.get_sub_pages(sub_page_content, element, host):
+                    sub_pages.append(sub_sub_page)
+            else:
+                sub_pages.append(url)
+        return sub_pages
+
+    # Sets the delay applied before every HTTP request
+    def set_sleep_time(self, sleep_time):
+        self.sleep_time = sleep_time
+
+    # Returns True if the page links to at least one PDF file
+    def has_pdf(self, page_content):
+        soup = BeautifulSoup(page_content, 'html.parser')
+        for a in soup.find_all('a', href=True):
+            if a['href'].endswith('.pdf'):
+                return True
+        return False
+
     # Start the browser
     def get_session(self, url, wait_element=""):
         webdriver_options = webdriver.ChromeOptions()
@@ -103,6 +135,9 @@ class RAAspotter:
         f.close()
 
     def get_page(self, url):
+        # Throttle requests when a sleep time has been configured
+        if self.sleep_time > 0:
+            time.sleep(self.sleep_time)
         return self.session.get(url)
 
     def update_user_agent(self, user_agent):
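
For context, a minimal sketch of how the new pieces fit together: set_sleep_time() throttles every subsequent call to get_page(), and get_sub_pages() recurses through listing pages until it reaches pages that actually link to PDFs. The constructor arguments, URL, and CSS selector below are hypothetical placeholders (the diff only shows that __init__ uses data_dir and user_agent; it may take other arguments), and the sketch assumes self.session has been initialised by get_session() as in the existing code:

    # Hypothetical usage sketch, not part of the patch
    spotter = RAAspotter(data_dir='/tmp/raa', user_agent='Mozilla/5.0')  # hypothetical arguments
    spotter.set_sleep_time(5)  # wait 5 seconds before each HTTP request
    spotter.get_session('https://prefecture.example.fr/raa')  # hypothetical URL
    index = spotter.get_page('https://prefecture.example.fr/raa').content
    # 'a.liste-raa' is a hypothetical selector for the links to follow
    for url in spotter.get_sub_pages(index, 'a.liste-raa', host='https://prefecture.example.fr'):
        print(url)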