From 59778e0f31ab8098a8963aeb6dbc8db9de295762 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sun, 17 Mar 2024 11:33:22 +0100
Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajout=20d'une=20fonction=20pour?=
 =?UTF-8?q?=20retrouver=20des=20sous-pages=20sur=20la=20base=20d'une=20pro?=
 =?UTF-8?q?pri=C3=A9t=C3=A9=20CSS=20et=20d'une=20possibilit=C3=A9=20de=20r?=
 =?UTF-8?q?alentir=20les=20requ=C3=AAtes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 RAAspotter.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/RAAspotter.py b/RAAspotter.py
index 69d9314..403aff7 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -2,12 +2,15 @@ import os, re, ssl
 import subprocess
 import logging
 import requests
+import time
 
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions
 
+from bs4 import BeautifulSoup
+
 from pyvirtualdisplay import Display
 
 from pdfminer.high_level import extract_text
@@ -48,6 +51,7 @@ class RAAspotter:
     self.data_dir         = data_dir
     self.found            = False
     self.output_file_path = os.path.dirname(os.path.abspath(__file__))+'/output.log'
+    self.sleep_time       = 0
 
     self.update_user_agent(user_agent)
 
@@ -55,6 +59,31 @@ class RAAspotter:
     f.write('')
     f.close()
 
+  def get_sub_pages(self, page_content, element, host=""):
+    soup = BeautifulSoup(page_content, 'html.parser')
+    sub_pages = []
+    for a in soup.select(element):
+      url = f"{host}{a['href']}"
+      sub_page_content = self.get_page(url).content
+      if not self.has_pdf(sub_page_content):
+        logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages')
+        for sub_sub_page in self.get_sub_pages(sub_page_content, element, host):
+          sub_pages.append(sub_sub_page)
+      else:
+        sub_pages.append(url)
+    return sub_pages
+
+  def set_sleep_time(self, sleep_time):
+    self.sleep_time = sleep_time
+
+  def has_pdf(self, page_content):
+    # True if any anchor on the page links to a PDF file.
+    soup = BeautifulSoup(page_content, 'html.parser')
+    for a in soup.find_all('a', href=True):
+      if a['href'].endswith('.pdf'):
+        return True
+    return False
+
   # On démarre le navigateur
   def get_session(self, url, wait_element=""):
     webdriver_options = webdriver.ChromeOptions()
@@ -103,6 +132,8 @@ class RAAspotter:
     f.close()
 
   def get_page(self, url):
+    if self.sleep_time > 0:
+      time.sleep(self.sleep_time)
     return self.session.get(url)
 
   def update_user_agent(self, user_agent):
-- 
GitLab