diff --git a/RAAspotter.py b/RAAspotter.py
index a8190feccd4a4530e11bef5e3f2342e0708f75b3..c0798fd211fdfe6135c46c7309eb3bc58d4d48ae 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -10,11 +10,12 @@
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions
 from bs4 import BeautifulSoup
-
 from pyvirtualdisplay import Display
-
 from pdfminer.high_level import extract_text
 
+from stem import Signal
+from stem.control import Controller
+
 import hashlib
 import smtplib
 from email.message import EmailMessage
@@ -52,6 +53,9 @@ class RAAspotter:
         self.found = False
         self.output_file_path = os.path.dirname(os.path.abspath(__file__))+'/output.log'
         self.sleep_time = 0
+        self.tor_enabled = False
+        self.tor_max_requests = 0
+        self.tor_requests = 0
 
         self.update_user_agent(user_agent)
 
@@ -59,17 +63,49 @@ class RAAspotter:
         f.write('')
         f.close()
 
-    def enable_tor(self):
+    def enable_tor(self, max_requests=0):
+        """Route the session through the SOCKS proxy on localhost:9050.
+
+        max_requests is the number of requests allowed per Tor identity;
+        0 (the default) never asks for a new identity.
+        """
         proxies = {
             "http": f"socks5h://localhost:9050",
             "https": f"socks5h://localhost:9050",
         }
+        self.tor_enabled = True
+        self.tor_max_requests = max_requests
+        self.tor_requests = 0
         self.session.proxies.update(proxies)
 
     def disable_tor(self):
-        proxies = {}
+        """Stop routing the session through Tor and reset the counters."""
+        self.tor_enabled = False
+        self.tor_max_requests = 0
+        self.tor_requests = 0
-        self.session.proxies.update(proxies)
+        # update({}) is a no-op and would leave the Tor proxy configured,
+        # so remove the entries that enable_tor installed.
+        for scheme in ('http', 'https'):
+            self.session.proxies.pop(scheme, None)
 
+    def tor_get_new_id(self):
+        """Ask the Tor control port (9051) for a new identity.
+
+        Best effort: a failure is only logged and the request counter is
+        left untouched, so the next counted request tries again.
+        """
+        logger.info('Changement d\'identité Tor')
+        try:
+            # The context manager closes the control socket, even on error.
+            with Controller.from_port(port=9051) as controller:
+                controller.authenticate()
+                controller.signal(Signal.NEWNYM)
+            # Presumably leaves Tor time to switch circuits - TODO confirm.
+            time.sleep(3)
+            self.tor_requests = 0
+        except Exception:
+            logger.debug('Impossible de changer d\'identité Tor', exc_info=True)
+
     def get_sub_pages(self, page_content, element, host=""):
         soup = BeautifulSoup(page_content, 'html.parser')
         sub_pages = []
@@ -143,9 +179,23 @@ class RAAspotter:
         f.close()
 
     def get_page(self, url):
+        """Fetch url with the session, counting it against the Tor quota.
+
+        Returns whatever self.session.get(url) returns.
+        """
+        logger.debug(f'Chargement de la page {url}')
         if self.sleep_time > 0:
             time.sleep(self.sleep_time)
-        return self.session.get(url)
+        page = self.session.get(url)
+
+        if self.tor_enabled:
+            self.tor_requests += 1
+            # >= so the identity is renewed once tor_max_requests pages
+            # have been fetched, not one request later.
+            if 0 < self.tor_max_requests <= self.tor_requests:
+                self.tor_get_new_id()
+
+        return page
 
     def update_user_agent(self, user_agent):
         self.user_agent = user_agent
diff --git a/requirements.txt b/requirements.txt
index 9f7e72e49d4f46f5fc2582627385805f267f854d..e206a7a93a6da943fd1d632d2d3cd806fca54b60 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ selenium
 pyvirtualdisplay
 pdfminer.six
 requests
+stem