From 9eeed7fd6eb17ee8d8f0ce0016446e4a80b1fc2f Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Tue, 9 Apr 2024 00:48:12 +0200 Subject: [PATCH] =?UTF-8?q?RAAspotter:=20am=C3=A9lioration=20du=20stockage?= =?UTF-8?q?=20des=20donn=C3=A9es,=20conservation=20des=20propri=C3=A9t?= =?UTF-8?q?=C3=A9s=20des=20RAA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Première étape du support multi-recherches --- .gitlab-ci.yml | 6 +- RAAspotter.py | 160 +++++++++++++++++++++++++++--------------- RAAspotter_ppparis.py | 3 +- RAAspotter_pref04.py | 3 +- RAAspotter_pref05.py | 3 +- RAAspotter_pref06.py | 3 +- RAAspotter_pref09.py | 3 +- RAAspotter_pref13.py | 3 +- RAAspotter_pref31.py | 3 +- RAAspotter_pref34.py | 3 +- RAAspotter_pref35.py | 3 +- RAAspotter_pref38.py | 3 +- RAAspotter_pref42.py | 3 +- RAAspotter_pref59.py | 3 +- RAAspotter_pref62.py | 3 +- RAAspotter_pref64.py | 3 +- RAAspotter_pref65.py | 3 +- RAAspotter_pref66.py | 6 +- RAAspotter_pref69.py | 3 +- RAAspotter_pref80.py | 3 +- RAAspotter_pref81.py | 3 +- RAAspotter_pref83.py | 3 +- RAAspotter_pref87.py | 3 +- RAAspotter_pref976.py | 3 +- cli.py | 3 - 25 files changed, 132 insertions(+), 106 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 43a7140..bdb1946 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -88,10 +88,12 @@ pep8: - bin/ - lib/ - pyvenv.cfg - - data/${PREF}/*.txt + - data/${PREF}/raa/*.txt + - data/${PREF}/raa/*.json artifacts: paths: - - data/${PREF}/*.txt + - data/${PREF}/raa/*.txt + - data/${PREF}/raa/*.json - output_${PREF}.log expire_in: 2 days rules: diff --git a/RAAspotter.py b/RAAspotter.py index 1829441..4a75a3a 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -1,12 +1,13 @@ import os import re import ssl -import sys import subprocess +import shutil import logging import requests import time import datetime +import json from urllib.parse import quote from selenium import webdriver @@ -19,7 +20,10 @@ import dateparser from bs4 import BeautifulSoup from pyvirtualdisplay import Display + from pdfminer.high_level import extract_text +from pdfminer.pdfparser import PDFParser +from pdfminer.pdfdocument import PDFDocument from stem import Signal from stem.control import Controller @@ -39,10 +43,10 @@ class RAAspotter: date = datetime.datetime(1970, 1, 1) date_str = "" name = "" - filename = "" sha256 = "" + pdf_creation_date = None - def __init__(self, url, date, name, filename): + def __init__(self, url, date, name): if not url == "": self.url = url if not date == "": @@ -50,17 +54,77 @@ class RAAspotter: self.date_str = date.strftime("%d/%m/%Y") if not name == "": self.name = name - if not filename == "": - self.filename = filename def get_sha256(self): if (self.sha256 == ""): - self.sha256 = hashlib.sha256(self.filename.encode('utf-8')).hexdigest() + self.sha256 = hashlib.sha256(self.url.encode('utf-8')).hexdigest() return self.sha256 + def get_pdf_creation_date(self, data_dir): + raa_data_dir = f'{data_dir}/raa/' + + try: + p_pdf = open(f'{raa_data_dir}{self.get_sha256()}.pdf', 'rb') + pdf_parser = PDFParser(p_pdf) + pdf_creation_date_raw = PDFDocument(pdf_parser).info[0]['CreationDate'].decode('utf-8').replace('D:', '').replace('\'', '') + if pdf_creation_date_raw: + try: + self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S%z') + except ValueError as exc: + self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S') + except Exception as exc: + logger.warning(f'Impossible d\'extraire la date du PDF {self.get_sha256()}.pdf') + + def extract_content(self, data_dir): + raa_data_dir = f'{data_dir}/raa/' + + text = "" + try: + text = extract_text(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + except Exception as exc: + logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}') + + # Écrit le texte du PDF dans un fichier texte pour une analyse future + f = open(f'{raa_data_dir}{self.get_sha256()}.txt', 'w') + f.write(text) + f.close() + + # Supprime le PDF d'origine et la version OCRisée + os.remove(f'{raa_data_dir}{self.get_sha256()}.pdf') + os.remove(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + + def write_properties(self, data_dir): + raa_data_dir = f'{data_dir}/raa/' + + pdf_creation_date_json = None + if self.pdf_creation_date: + pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S") + + properties = { + 'name': self.name, + 'date': self.date_str, + 'url': quote(self.url, safe='/:'), + 'first_saw_on': datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S"), + 'pdf_creation_date': pdf_creation_date_json + } + f = open(f'{raa_data_dir}{self.get_sha256()}.json', 'w') + f.write(json.dumps(properties)) + f.close() + + def parse(self, data_dir, not_before, keywords): + self.get_pdf_creation_date(data_dir) + self.write_properties(data_dir) + self.extract_content(data_dir) + def __init__(self, data_dir, user_agent=''): logger.debug('Initialisation de RAAspotter') + # On crée le dossier de téléchargement + os.makedirs(data_dir, exist_ok=True) + + # pdfminer.six est un peu trop verbeux en mode debug, donc on relève son niveau de log + logging.getLogger("pdfminer").setLevel(logging.WARNING) + self.session = requests.Session() self.data_dir = data_dir self.found = False @@ -340,11 +404,11 @@ class RAAspotter: def download_file(self, raa): try: os.makedirs( - os.path.dirname(f'{self.data_dir}{raa.get_sha256()}.pdf'), + os.path.dirname(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'), exist_ok=True ) file = self.get_page(raa.url, 'get') - f = open(f'{self.data_dir}{raa.get_sha256()}.pdf', 'wb') + f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb') f.write(file.content) f.close() except (requests.exceptions.ConnectionError, @@ -354,48 +418,6 @@ class RAAspotter: except Exception as exc: logger.warning(f'ATTENTION: Impossible de télécharger le fichier {raa.url}: {exc}') - def parse_pdf(self, raa, keywords): - if not os.path.isfile(f'{self.data_dir}{raa.get_sha256()}.pdf'): - logger.warning(f'ATTENTION: le fichier {raa.get_sha256()}.pdf n\'existe pas') - else: - text = "" - try: - # pdfminer.six est un peu trop verbeux en mode debug, donc on relève son niveau de log - logging.getLogger("pdfminer").setLevel(logging.WARNING) - text = extract_text(f'{self.data_dir}{raa.get_sha256()}.pdf') - except Exception as exc: - logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {raa.get_sha256()}.pdf : {exc}') - - found = False - found_keywords = [] - for keyword in keywords: - if re.search(keyword, text, re.IGNORECASE | re.MULTILINE): - if not found: - url = quote(raa.url, safe='/:') - self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})') - self.print_output(f'URL : {url}') - found = True - self.found = True - self.print_output(f' Le terme \033[1m{keyword}\033[0m a été trouvé.') - found_keywords.append(keyword) - - # Écrit le texte du PDF dans un fichier texte pour une analyse - # future, puis supprime le PDF - f = open(f'{self.data_dir}{raa.get_sha256()}.txt', 'w') - f.write(text) - f.close() - os.remove(f'{self.data_dir}{raa.get_sha256()}.pdf') - if found: - self.print_output('') - url = quote(raa.url, safe='/:') - found_keywords_str = ', '.join( - [str(x) for x in found_keywords] - ) - self.mastodon_toot( - f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont ' - f'été trouvés : {found_keywords_str}.\n\nURL : {url}' - ) - def ocr(self, raa, retry_on_failure=True): cmd = [ 'ocrmypdf', @@ -404,8 +426,8 @@ class RAAspotter: '--redo-ocr', '--skip-big', '500', '--invalidate-digital-signatures', - f'{self.data_dir}{raa.get_sha256()}.pdf', - f'{self.data_dir}{raa.get_sha256()}.pdf' + f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', + f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf' ] logger.debug(f'Lancement de ocrmypdf: {cmd}') try: @@ -419,18 +441,46 @@ class RAAspotter: self.ocr(raa, False) elif (not exc.returncode == 6) and (not exc.returncode == 10) and (not exc.returncode == 4): logger.warning('ATTENTION : Impossible d\'OCRiser le document', exc.returncode, exc.output) + shutil.copy(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf') + + def search_keywords(self, raa, keywords): + text = open(f'{self.data_dir}/raa/{raa.get_sha256()}.txt').read() + + found = False + found_keywords = [] + for keyword in keywords: + if re.search(keyword, text, re.IGNORECASE | re.MULTILINE): + if not found: + url = quote(raa.url, safe='/:') + self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})') + self.print_output(f'URL : {url}') + found = True + self.found = True + self.print_output(f' Le terme \033[1m{keyword}\033[0m a été trouvé.') + found_keywords.append(keyword) + + if found: + self.print_output('') + url = quote(raa.url, safe='/:') + found_keywords_str = ', '.join( + [str(x) for x in found_keywords] + ) + self.mastodon_toot( + f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont ' + f'été trouvés : {found_keywords_str}.\n\nURL : {url}' + ) def parse_raa(self, elements, keywords): for raa in elements: # Si le fichier n'a pas déjà été parsé et qu'il est postérieur à la # date maximale d'analyse, on le télécharge et on le parse - if raa.date >= self.not_before and \ - not os.path.isfile(f'{self.data_dir}{raa.get_sha256()}.txt'): + if raa.date >= self.not_before and not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.txt'): url = quote(raa.url, safe='/:') logger.info(f'Nouveau fichier : {raa.name} ({raa.date_str}). URL : {url}') self.download_file(raa) self.ocr(raa, True) - self.parse_pdf(raa, keywords) + raa.parse(self.data_dir, self.not_before, keywords) + self.search_keywords(raa, keywords) def get_raa(self, page_content): logger.error('Cette fonction doit être surchargée') diff --git a/RAAspotter_ppparis.py b/RAAspotter_ppparis.py index 4a867a4..c7981d4 100644 --- a/RAAspotter_ppparis.py +++ b/RAAspotter_ppparis.py @@ -46,8 +46,7 @@ class RAAspotter_ppparis(RAAspotter): url = unquote(url) name = a.find('span').get_text() date = datetime.datetime.strptime(a.find('div', class_="field--type-datetime").get_text().strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref04.py b/RAAspotter_pref04.py index f79fa5e..2a16978 100644 --- a/RAAspotter_pref04.py +++ b/RAAspotter_pref04.py @@ -57,8 +57,7 @@ class RAAspotter_pref04(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref05.py b/RAAspotter_pref05.py index c29a2a7..1bfa6f8 100644 --- a/RAAspotter_pref05.py +++ b/RAAspotter_pref05.py @@ -97,8 +97,7 @@ class RAAspotter_pref05(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref06.py b/RAAspotter_pref06.py index 75f2d4e..0210da9 100644 --- a/RAAspotter_pref06.py +++ b/RAAspotter_pref06.py @@ -103,8 +103,7 @@ class RAAspotter_pref06(RAAspotter): url = unquote(url) name = a.get_text().strip() date = datetime.datetime.strptime(card.find('p', class_='fr-card__detail').get_text().replace('Publié le ', '').strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref09.py b/RAAspotter_pref09.py index 51c9331..20b395c 100644 --- a/RAAspotter_pref09.py +++ b/RAAspotter_pref09.py @@ -67,8 +67,7 @@ class RAAspotter_pref09(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref13.py b/RAAspotter_pref13.py index e66d4a6..17ca16e 100644 --- a/RAAspotter_pref13.py +++ b/RAAspotter_pref13.py @@ -55,8 +55,7 @@ class RAAspotter_pref13(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref31.py b/RAAspotter_pref31.py index cff6ade..7e3e72e 100644 --- a/RAAspotter_pref31.py +++ b/RAAspotter_pref31.py @@ -69,8 +69,7 @@ class RAAspotter_pref31(RAAspotter): url = unquote(url) name = a.get_text().strip().capitalize() date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref34.py b/RAAspotter_pref34.py index 06c1e1d..d905e4b 100644 --- a/RAAspotter_pref34.py +++ b/RAAspotter_pref34.py @@ -69,8 +69,7 @@ class RAAspotter_pref34(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref35.py b/RAAspotter_pref35.py index b9542b6..469215c 100644 --- a/RAAspotter_pref35.py +++ b/RAAspotter_pref35.py @@ -56,8 +56,7 @@ class RAAspotter_pref35(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref38.py b/RAAspotter_pref38.py index 58727dd..22e0f9f 100644 --- a/RAAspotter_pref38.py +++ b/RAAspotter_pref38.py @@ -96,8 +96,7 @@ class RAAspotter_pref38(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref42.py b/RAAspotter_pref42.py index 254f560..7e24629 100644 --- a/RAAspotter_pref42.py +++ b/RAAspotter_pref42.py @@ -76,8 +76,7 @@ class RAAspotter_pref42(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py index ebbe460..e911a7e 100644 --- a/RAAspotter_pref59.py +++ b/RAAspotter_pref59.py @@ -80,8 +80,7 @@ class RAAspotter_pref59(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref62.py b/RAAspotter_pref62.py index 75b909c..3f64ccc 100644 --- a/RAAspotter_pref62.py +++ b/RAAspotter_pref62.py @@ -93,8 +93,7 @@ class RAAspotter_pref62(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements[::-1] diff --git a/RAAspotter_pref64.py b/RAAspotter_pref64.py index 803b043..adacb30 100644 --- a/RAAspotter_pref64.py +++ b/RAAspotter_pref64.py @@ -99,8 +99,7 @@ class RAAspotter_pref64(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref65.py b/RAAspotter_pref65.py index 880c8a9..68278d2 100644 --- a/RAAspotter_pref65.py +++ b/RAAspotter_pref65.py @@ -69,8 +69,7 @@ class RAAspotter_pref65(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref66.py b/RAAspotter_pref66.py index a2f4277..6ffde9b 100644 --- a/RAAspotter_pref66.py +++ b/RAAspotter_pref66.py @@ -103,9 +103,8 @@ class RAAspotter_pref66(RAAspotter): name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() else: name = a.get_text().replace('Télécharger ', '').strip() - filename = url.split('/')[-1] - elements.append(RAAspotter.RAA(url, date, name, filename)) + elements.append(RAAspotter.RAA(url, date, name)) return elements # On parse les RAA depuis 2024 @@ -133,7 +132,6 @@ class RAAspotter_pref66(RAAspotter): url = unquote(url) name = page['name'].replace('Télécharger ', '').strip() date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - elements.append(RAAspotter.RAA(url, date, name, filename)) + elements.append(RAAspotter.RAA(url, date, name)) return elements diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py index 3c7c9bf..e28a831 100644 --- a/RAAspotter_pref69.py +++ b/RAAspotter_pref69.py @@ -83,8 +83,7 @@ class RAAspotter_pref69(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref80.py b/RAAspotter_pref80.py index 8118d0c..b616fd1 100644 --- a/RAAspotter_pref80.py +++ b/RAAspotter_pref80.py @@ -77,7 +77,6 @@ class RAAspotter_pref80(RAAspotter): # On enlève les espaces insécables, les double-espaces, et le texte « Télécharger » de certains liens name = a.get_text().replace('Télécharger ', '').strip().replace(u"\u00A0", ' ').replace(' ', ' ') if name and not name == '': - filename = url.split('/')[-1] # Certains RAA de la Somme ont une ligne avec les détails du fichier. Si cette ligne # est disponible, on la parse, sinon on devine la date à partir du nom date = None @@ -98,6 +97,6 @@ class RAAspotter_pref80(RAAspotter): if date.year == 9999: logger.warning(f'On ignore {name} (URL : {url})') else: - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements[::-1] diff --git a/RAAspotter_pref81.py b/RAAspotter_pref81.py index ca4052b..2f4bf7f 100644 --- a/RAAspotter_pref81.py +++ b/RAAspotter_pref81.py @@ -113,8 +113,7 @@ class RAAspotter_pref81(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref83.py b/RAAspotter_pref83.py index 8e52369..fe73b4f 100644 --- a/RAAspotter_pref83.py +++ b/RAAspotter_pref83.py @@ -88,8 +88,7 @@ class RAAspotter_pref83(RAAspotter): url = unquote(url) name = a.get_text().strip() date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref87.py b/RAAspotter_pref87.py index e9814d0..6436659 100644 --- a/RAAspotter_pref87.py +++ b/RAAspotter_pref87.py @@ -104,8 +104,7 @@ class RAAspotter_pref87(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py index ec78c39..5d17bb0 100644 --- a/RAAspotter_pref976.py +++ b/RAAspotter_pref976.py @@ -115,8 +115,7 @@ class RAAspotter_pref976(RAAspotter): url = unquote(url) name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) + raa = RAAspotter.RAA(url, date, name) elements.append(raa) return elements diff --git a/cli.py b/cli.py index 52af7b8..31e7f47 100755 --- a/cli.py +++ b/cli.py @@ -218,9 +218,6 @@ else: if __PREF_EMAIL_TO and not __PREF_EMAIL_TO == '': __EMAIL_TO = f'{__EMAIL_TO},{__PREF_EMAIL_TO}' -# On crée le dossier de téléchargement -os.makedirs(__DATA_DIR, exist_ok=True) - module = importlib.import_module(f'RAAspotter_{args.pref}') raa_spotter = getattr(module, f'RAAspotter_{args.pref}')(__DATA_DIR) -- GitLab