From 0919eef88efabbdf4b17ff74e90402891c633c1d Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Tue, 19 Mar 2024 20:05:44 +0100
Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajoute=20le=20support=20des=20req?=
 =?UTF-8?q?u=C3=AAtes=20POST?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 RAAspotter.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/RAAspotter.py b/RAAspotter.py
index 79f1845..97b74d5 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -127,7 +127,7 @@ class RAAspotter:
     for a in soup.select(element):
       if a.get('href'):
         url = f"{host}{a['href']}"
-        sub_page_content = self.get_page(url).content
+        sub_page_content = self.get_page(url, 'get').content
         if not self.has_pdf(sub_page_content):
           logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages')
           for sub_sub_page in self.get_sub_pages(sub_page_content, element, host):
@@ -140,7 +140,7 @@ class RAAspotter:
     elements = []
     # On parse chaque page passée en paramètre
     for page in pages_list:
-      page_content = self.get_page(page).content
+      page_content = self.get_page(page, 'get').content
 
       # Pour chaque page, on récupère les PDF
       for raa in self.get_raa_elements(page_content):
@@ -210,18 +210,23 @@ class RAAspotter:
     f.write(data+"\n")
     f.close()
 
-  def get_page(self, url):
+  def get_page(self, url, method, data={}):
     try:
       logger.debug(f'Chargement de la page {url}')
       if self.sleep_time > 0:
         time.sleep(self.sleep_time)
-      page = self.session.get(url)
+      
+      page = None
+      if method == 'get':
+        page = self.session.get(url)
+      if method == 'post':
+        page = self.session.post(url, data=data)
 
       if page.status_code == 429:
         logger.debug(f'Erreur 429 Too Many Requests reçue, temporisation...')
         self.tor_get_new_id()
         time.sleep(55)
-        return self.get_page(url)
+        return self.get_page(url, method, data)
       
       if self.tor_enabled:
         self.tor_requests+=1
@@ -233,7 +238,7 @@ class RAAspotter:
       logger.debug(f'Erreur de connexion, temporisation...')
       self.tor_get_new_id()
       time.sleep(55)
-      return self.get_page(url)
+      return self.get_page(url, method, data)
 
   def update_user_agent(self, user_agent):
     self.user_agent = user_agent
@@ -242,7 +247,7 @@ class RAAspotter:
   def download_file(self, raa):
     try:
       os.makedirs(os.path.dirname(f'{self.data_dir}{raa.get_sha256()}.pdf'), exist_ok=True)
-      file = self.get_page(raa.url)
+      file = self.get_page(raa.url, 'get')
       f = open(f'{self.data_dir}{raa.get_sha256()}.pdf','wb')
       f.write(file.content)
       f.close()
-- 
GitLab