From 02d0d660276e6140f7bd80749950fcc8c04a4127 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sun, 17 Mar 2024 12:42:37 +0100 Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajout=20de=20la=20possibilit?= =?UTF-8?q?=C3=A9=20de=20changer=20d'identit=C3=A9=20Tor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter.py | 37 +++++++++++++++++++++++++++++++++---- requirements.txt | 1 + 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index a8190fe..c0798fd 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -10,11 +10,12 @@ from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions from bs4 import BeautifulSoup - from pyvirtualdisplay import Display - from pdfminer.high_level import extract_text +from stem import Signal +from stem.control import Controller + import hashlib import smtplib from email.message import EmailMessage @@ -52,6 +53,9 @@ class RAAspotter: self.found = False self.output_file_path = os.path.dirname(os.path.abspath(__file__))+'/output.log' self.sleep_time = 0 + self.tor_enabled = False + self.tor_max_requests = 0 + self.tor_requests = 0 self.update_user_agent(user_agent) @@ -59,17 +63,34 @@ class RAAspotter: f.write('') f.close() - def enable_tor(self): + def enable_tor(self, max_requests=0): proxies = { "http": f"socks5h://localhost:9050", "https": f"socks5h://localhost:9050", } + self.tor_enabled = True + self.tor_max_requests = max_requests + self.tor_requests = 0 self.session.proxies.update(proxies) def disable_tor(self): proxies = {} + self.tor_enabled = False + self.tor_max_requests = 0 + self.tor_requests = 0 self.session.proxies.update(proxies) + def tor_get_new_id(self): + logger.info('Changement d\'identité Tor') + try: + controller = Controller.from_port(port = 9051) + controller.authenticate() + controller.signal(Signal.NEWNYM) + time.sleep(3) + self.tor_requests = 0 + except: + logger.debug('Impossible de changer d\'identité Tor') + def get_sub_pages(self, page_content, element, host=""): soup = BeautifulSoup(page_content, 'html.parser') sub_pages = [] @@ -143,9 +164,17 @@ class RAAspotter: f.close() def get_page(self, url): + logger.debug(f'Chargement de la page {url}') if self.sleep_time > 0: time.sleep(self.sleep_time) - return self.session.get(url) + page = self.session.get(url) + + if self.tor_enabled: + self.tor_requests+=1 + if self.tor_max_requests>0 and self.tor_requests>self.tor_max_requests: + self.tor_get_new_id() + + return page def update_user_agent(self, user_agent): self.user_agent = user_agent diff --git a/requirements.txt b/requirements.txt index 9f7e72e..e206a7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ selenium pyvirtualdisplay pdfminer.six requests +stem -- GitLab