From c250bd98d673fc3c9e441d1faf76eb9b748d3c41 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sat, 16 Mar 2024 19:25:18 +0100 Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajoute=20une=20fonction=20pour=20?= =?UTF-8?q?r=C3=A9cup=C3=A9rer=20une=20page=20sans=20Selenium?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter.py | 14 +++++++++++--- RAAspotter_ppparis.py | 3 +-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index e58e395..69d9314 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -41,7 +41,7 @@ class RAAspotter: self.sha256 = hashlib.sha256(self.filename.encode('utf-8')).hexdigest() return self.sha256 - def __init__(self, data_dir): + def __init__(self, data_dir, user_agent=''): logger.debug('Initialisation de RAAspotter') self.session = requests.Session() @@ -49,6 +49,8 @@ class RAAspotter: self.found = False self.output_file_path = os.path.dirname(os.path.abspath(__file__))+'/output.log' + self.update_user_agent(user_agent) + f = open(self.output_file_path,'w') f.write('') f.close() @@ -84,7 +86,6 @@ class RAAspotter: # On récupère les cookies du navigateur pour les réutiliser plus tard for cookie in browser.get_cookies(): self.session.cookies.set(cookie['name'], cookie['value']) - self.session.headers.update({'User-Agent': self.user_agent}) # On arrête le navigateur browser.quit() @@ -101,10 +102,17 @@ class RAAspotter: f.write(data+"\n") f.close() + def get_page(self, url): + return self.session.get(url) + + def update_user_agent(self, user_agent): + self.user_agent = user_agent + self.session.headers.update({'User-Agent': self.user_agent}) + def download_file(self, raa): try: os.makedirs(os.path.dirname(f'{self.data_dir}{raa.get_sha256()}.pdf'), exist_ok=True) - file = self.session.get(raa.url) + file = self.get_page(raa.url) f = open(f'{self.data_dir}{raa.get_sha256()}.pdf','wb') f.write(file.content) f.close() diff --git a/RAAspotter_ppparis.py b/RAAspotter_ppparis.py index 73cc411..e0fab5a 100644 --- a/RAAspotter_ppparis.py +++ b/RAAspotter_ppparis.py @@ -11,8 +11,7 @@ class RAAspotter_ppparis(RAAspotter): __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' def __init__(self, data_dir): - super().__init__(data_dir) - self.user_agent = self.__USER_AGENT + super().__init__(data_dir, self.__USER_AGENT) def get_raa(self, keywords): self.print_output('RAAspotter_ppparis') -- GitLab