From 02d0d660276e6140f7bd80749950fcc8c04a4127 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sun, 17 Mar 2024 12:42:37 +0100
Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajout=20de=20la=20possibilit?=
 =?UTF-8?q?=C3=A9=20de=20changer=20d'identit=C3=A9=20Tor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 RAAspotter.py    | 37 +++++++++++++++++++++++++++++++++----
 requirements.txt |  1 +
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/RAAspotter.py b/RAAspotter.py
index a8190fe..c0798fd 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -10,11 +10,12 @@ from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions
 
 from bs4 import BeautifulSoup
-
 from pyvirtualdisplay import Display
-
 from pdfminer.high_level import extract_text
 
+from stem import Signal
+from stem.control import Controller
+
 import hashlib
 import smtplib
 from email.message import EmailMessage
@@ -52,6 +53,9 @@ class RAAspotter:
     self.found            = False
     self.output_file_path = os.path.dirname(os.path.abspath(__file__))+'/output.log'
     self.sleep_time       = 0
+    self.tor_enabled      = False
+    self.tor_max_requests = 0
+    self.tor_requests     = 0
 
     self.update_user_agent(user_agent)
 
@@ -59,17 +63,34 @@ class RAAspotter:
     f.write('')
     f.close()
 
-  def enable_tor(self):
+  def enable_tor(self, max_requests=0):  # max_requests: request-count threshold after which get_page asks for a new Tor identity; 0 disables rotation
     proxies = {
       "http": f"socks5h://localhost:9050",
       "https": f"socks5h://localhost:9050",
     }
+    self.tor_enabled      = True  # makes get_page count the requests it sends
+    self.tor_max_requests = max_requests
+    self.tor_requests     = 0  # restart the request counter
     self.session.proxies.update(proxies)
 
   def disable_tor(self):
     proxies = {}
+    self.tor_enabled      = False  # get_page stops counting requests
+    self.tor_max_requests = 0
+    self.tor_requests     = 0  # NOTE(review): update({}) below adds and removes nothing, so proxies set by enable_tor appear to stay active — confirm
     self.session.proxies.update(proxies)
 
+  def tor_get_new_id(self):  # Send NEWNYM to the Tor control port (9051) and reset the request counter; failures are logged and swallowed
+    logger.info('Changement d\'identité Tor')
+    try:
+      with Controller.from_port(port = 9051) as controller:  # context manager closes the control connection, even on error
+        controller.authenticate()
+        controller.signal(Signal.NEWNYM)
+      time.sleep(3)  # presumably leaves Tor time to switch circuits before the next request
+      self.tor_requests = 0
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must still propagate
+      logger.debug('Impossible de changer d\'identité Tor')
+
   def get_sub_pages(self, page_content, element, host=""):
     soup = BeautifulSoup(page_content, 'html.parser')
     sub_pages = []
@@ -143,9 +164,17 @@ class RAAspotter:
     f.close()
 
   def get_page(self, url):
+    logger.debug(f'Chargement de la page {url}')
     if self.sleep_time > 0:
       time.sleep(self.sleep_time)
-    return self.session.get(url)
+    page = self.session.get(url)
+    # While Tor is enabled, count each request and request a new identity once the threshold is reached.
+    if self.tor_enabled:
+      self.tor_requests+=1
+      if self.tor_max_requests>0 and self.tor_requests>=self.tor_max_requests:  # '>=' so at most tor_max_requests requests share an identity; 0 disables rotation
+        self.tor_get_new_id()
+
+    return page
 
   def update_user_agent(self, user_agent):
     self.user_agent = user_agent
diff --git a/requirements.txt b/requirements.txt
index 9f7e72e..e206a7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ selenium
 pyvirtualdisplay
 pdfminer.six
 requests
+stem
-- 
GitLab