From 0919eef88efabbdf4b17ff74e90402891c633c1d Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Tue, 19 Mar 2024 20:05:44 +0100 Subject: [PATCH] =?UTF-8?q?RAAspotter:=20ajoute=20le=20support=20des=20req?= =?UTF-8?q?u=C3=AAtes=20POST?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index 79f1845..97b74d5 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -127,7 +127,7 @@ class RAAspotter: for a in soup.select(element): if a.get('href'): url = f"{host}{a['href']}" - sub_page_content = self.get_page(url).content + sub_page_content = self.get_page(url, 'get').content if not self.has_pdf(sub_page_content): logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages') for sub_sub_page in self.get_sub_pages(sub_page_content, element, host): @@ -140,7 +140,7 @@ class RAAspotter: elements = [] # On parse chaque page passée en paramètre for page in pages_list: - page_content = self.get_page(page).content + page_content = self.get_page(page, 'get').content # Pour chaque page, on récupère les PDF for raa in self.get_raa_elements(page_content): @@ -210,18 +210,23 @@ class RAAspotter: f.write(data+"\n") f.close() - def get_page(self, url): + def get_page(self, url, method, data={}): try: logger.debug(f'Chargement de la page {url}') if self.sleep_time > 0: time.sleep(self.sleep_time) - page = self.session.get(url) + + page = None + if method == 'get': + page = self.session.get(url) + if method == 'post': + page = self.session.post(url, data=data) if page.status_code == 429: logger.debug(f'Erreur 429 Too Many Requests reçue, temporisation...') self.tor_get_new_id() time.sleep(55) - return self.get_page(url) + return self.get_page(url, method, data) if self.tor_enabled: self.tor_requests+=1 @@ -233,7 +238,7 @@ class RAAspotter: logger.debug(f'Erreur de connexion, temporisation...') self.tor_get_new_id() time.sleep(55) - return self.get_page(url) + return self.get_page(url, method, data) def update_user_agent(self, user_agent): self.user_agent = user_agent @@ -242,7 +247,7 @@ class RAAspotter: def download_file(self, raa): try: os.makedirs(os.path.dirname(f'{self.data_dir}{raa.get_sha256()}.pdf'), exist_ok=True) - file = self.get_page(raa.url) + file = self.get_page(raa.url, 'get') f = open(f'{self.data_dir}{raa.get_sha256()}.pdf','wb') f.write(file.content) f.close() -- GitLab