From 9eeed7fd6eb17ee8d8f0ce0016446e4a80b1fc2f Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Tue, 9 Apr 2024 00:48:12 +0200
Subject: [PATCH] =?UTF-8?q?RAAspotter:=20am=C3=A9lioration=20du=20stockage?=
 =?UTF-8?q?=20des=20donn=C3=A9es,=20conservation=20des=20propri=C3=A9t?=
 =?UTF-8?q?=C3=A9s=20des=20RAA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Première étape du support multi-recherches
---
 .gitlab-ci.yml        |   6 +-
 RAAspotter.py         | 160 +++++++++++++++++++++++++++---------------
 RAAspotter_ppparis.py |   3 +-
 RAAspotter_pref04.py  |   3 +-
 RAAspotter_pref05.py  |   3 +-
 RAAspotter_pref06.py  |   3 +-
 RAAspotter_pref09.py  |   3 +-
 RAAspotter_pref13.py  |   3 +-
 RAAspotter_pref31.py  |   3 +-
 RAAspotter_pref34.py  |   3 +-
 RAAspotter_pref35.py  |   3 +-
 RAAspotter_pref38.py  |   3 +-
 RAAspotter_pref42.py  |   3 +-
 RAAspotter_pref59.py  |   3 +-
 RAAspotter_pref62.py  |   3 +-
 RAAspotter_pref64.py  |   3 +-
 RAAspotter_pref65.py  |   3 +-
 RAAspotter_pref66.py  |   6 +-
 RAAspotter_pref69.py  |   3 +-
 RAAspotter_pref80.py  |   3 +-
 RAAspotter_pref81.py  |   3 +-
 RAAspotter_pref83.py  |   3 +-
 RAAspotter_pref87.py  |   3 +-
 RAAspotter_pref976.py |   3 +-
 cli.py                |   3 -
 25 files changed, 132 insertions(+), 106 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 43a7140..bdb1946 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -88,10 +88,12 @@ pep8:
     - bin/
     - lib/
     - pyvenv.cfg
-    - data/${PREF}/*.txt
+    - data/${PREF}/raa/*.txt
+    - data/${PREF}/raa/*.json
   artifacts:
     paths:
-    - data/${PREF}/*.txt
+    - data/${PREF}/raa/*.txt
+    - data/${PREF}/raa/*.json
     - output_${PREF}.log
     expire_in: 2 days
   rules:
diff --git a/RAAspotter.py b/RAAspotter.py
index 1829441..4a75a3a 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -1,12 +1,13 @@
 import os
 import re
 import ssl
-import sys
 import subprocess
+import shutil
 import logging
 import requests
 import time
 import datetime
+import json
 from urllib.parse import quote
 
 from selenium import webdriver
@@ -19,7 +20,10 @@ import dateparser
 
 from bs4 import BeautifulSoup
 from pyvirtualdisplay import Display
+
 from pdfminer.high_level import extract_text
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
 
 from stem import Signal
 from stem.control import Controller
@@ -39,10 +43,10 @@ class RAAspotter:
         date = datetime.datetime(1970, 1, 1)
         date_str = ""
         name = ""
-        filename = ""
         sha256 = ""
+        pdf_creation_date = None
 
-        def __init__(self, url, date, name, filename):
+        def __init__(self, url, date, name):
             if not url == "":
                 self.url = url
             if not date == "":
@@ -50,17 +54,77 @@ class RAAspotter:
                 self.date_str = date.strftime("%d/%m/%Y")
             if not name == "":
                 self.name = name
-            if not filename == "":
-                self.filename = filename
 
         def get_sha256(self):
             if (self.sha256 == ""):
-                self.sha256 = hashlib.sha256(self.filename.encode('utf-8')).hexdigest()
+                self.sha256 = hashlib.sha256(self.url.encode('utf-8')).hexdigest()
             return self.sha256
 
+        def get_pdf_creation_date(self, data_dir):
+            """Best effort: store the PDF's CreationDate metadata in self.pdf_creation_date (left unchanged on failure)."""
+            pdf_path = f'{data_dir}/raa/{self.get_sha256()}.pdf'
+            try:
+                # The context manager closes the file handle, which a bare open() call would leak
+                with open(pdf_path, 'rb') as p_pdf:
+                    pdf_creation_date_raw = PDFDocument(PDFParser(p_pdf)).info[0]['CreationDate'].decode('utf-8').replace('D:', '').replace('\'', '')
+                if pdf_creation_date_raw:
+                    try:
+                        self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S%z')
+                    except ValueError:
+                        self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S')
+            except Exception:
+                logger.warning(f'Impossible d\'extraire la date du PDF {self.get_sha256()}.pdf')
+
+        def extract_content(self, data_dir):
+            """Extract the text of the OCRised PDF into the sha256-named .txt file, then delete both PDF files."""
+            base_path = f'{data_dir}/raa/{self.get_sha256()}'
+            text = ""
+            try:
+                text = extract_text(f'{base_path}.ocr.pdf')
+            except Exception as exc:
+                logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}')
+
+            # Always write the text file (even empty): parse_raa() skips an RAA whose .txt already exists
+            with open(f'{base_path}.txt', 'w') as f:
+                f.write(text)
+
+            # Remove the original PDF and its OCRised copy; either may be missing if an earlier step failed
+            for suffix in ('.pdf', '.ocr.pdf'):
+                if os.path.isfile(f'{base_path}{suffix}'):
+                    os.remove(f'{base_path}{suffix}')
+
+        def write_properties(self, data_dir):
+            """Save this RAA's metadata (name, dates, URL) as JSON in the sha256-named .json file under data_dir/raa/."""
+            # Dates are serialised as text; a missing PDF creation date becomes JSON null
+            pdf_creation_date_json = None
+            if self.pdf_creation_date:
+                pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S")
+
+            properties = {
+                'name': self.name,
+                'date': self.date_str,
+                'url': quote(self.url, safe='/:'),
+                'first_saw_on': datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S"),
+                'pdf_creation_date': pdf_creation_date_json
+            }
+            # The context manager closes the file even if serialisation or the write fails
+            with open(f'{data_dir}/raa/{self.get_sha256()}.json', 'w') as f:
+                json.dump(properties, f)
+
+        def parse(self, data_dir, not_before, keywords):  # not_before and keywords are currently unused
+            self.get_pdf_creation_date(data_dir)  # must run first: reads the PDF that extract_content deletes
+            self.write_properties(data_dir)  # stores pdf_creation_date, so it runs once the date has been read
+            self.extract_content(data_dir)  # last: writes the .txt file and removes both PDF files
+
     def __init__(self, data_dir, user_agent=''):
         logger.debug('Initialisation de RAAspotter')
 
+        # On crée le dossier de téléchargement
+        os.makedirs(data_dir, exist_ok=True)
+
+        # pdfminer.six est un peu trop verbeux en mode debug, donc on relève son niveau de log
+        logging.getLogger("pdfminer").setLevel(logging.WARNING)
+
         self.session = requests.Session()
         self.data_dir = data_dir
         self.found = False
@@ -340,11 +404,11 @@ class RAAspotter:
     def download_file(self, raa):
         try:
             os.makedirs(
-                os.path.dirname(f'{self.data_dir}{raa.get_sha256()}.pdf'),
+                os.path.dirname(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'),
                 exist_ok=True
             )
             file = self.get_page(raa.url, 'get')
-            f = open(f'{self.data_dir}{raa.get_sha256()}.pdf', 'wb')
+            f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
             f.write(file.content)
             f.close()
         except (requests.exceptions.ConnectionError,
@@ -354,48 +418,6 @@ class RAAspotter:
         except Exception as exc:
             logger.warning(f'ATTENTION: Impossible de télécharger le fichier {raa.url}: {exc}')
 
-    def parse_pdf(self, raa, keywords):
-        if not os.path.isfile(f'{self.data_dir}{raa.get_sha256()}.pdf'):
-            logger.warning(f'ATTENTION: le fichier {raa.get_sha256()}.pdf n\'existe pas')
-        else:
-            text = ""
-            try:
-                # pdfminer.six est un peu trop verbeux en mode debug, donc on relève son niveau de log
-                logging.getLogger("pdfminer").setLevel(logging.WARNING)
-                text = extract_text(f'{self.data_dir}{raa.get_sha256()}.pdf')
-            except Exception as exc:
-                logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {raa.get_sha256()}.pdf : {exc}')
-
-            found = False
-            found_keywords = []
-            for keyword in keywords:
-                if re.search(keyword, text, re.IGNORECASE | re.MULTILINE):
-                    if not found:
-                        url = quote(raa.url, safe='/:')
-                        self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})')
-                        self.print_output(f'URL : {url}')
-                        found = True
-                        self.found = True
-                    self.print_output(f'    Le terme \033[1m{keyword}\033[0m a été trouvé.')
-                    found_keywords.append(keyword)
-
-            # Écrit le texte du PDF dans un fichier texte pour une analyse
-            # future, puis supprime le PDF
-            f = open(f'{self.data_dir}{raa.get_sha256()}.txt', 'w')
-            f.write(text)
-            f.close()
-            os.remove(f'{self.data_dir}{raa.get_sha256()}.pdf')
-            if found:
-                self.print_output('')
-                url = quote(raa.url, safe='/:')
-                found_keywords_str = ', '.join(
-                    [str(x) for x in found_keywords]
-                )
-                self.mastodon_toot(
-                    f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont '
-                    f'été trouvés : {found_keywords_str}.\n\nURL : {url}'
-                )
-
     def ocr(self, raa, retry_on_failure=True):
         cmd = [
             'ocrmypdf',
@@ -404,8 +426,8 @@ class RAAspotter:
             '--redo-ocr',
             '--skip-big', '500',
             '--invalidate-digital-signatures',
-            f'{self.data_dir}{raa.get_sha256()}.pdf',
-            f'{self.data_dir}{raa.get_sha256()}.pdf'
+            f'{self.data_dir}/raa/{raa.get_sha256()}.pdf',
+            f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf'
         ]
         logger.debug(f'Lancement de ocrmypdf: {cmd}')
         try:
@@ -419,18 +441,46 @@ class RAAspotter:
                 self.ocr(raa, False)
             elif (not exc.returncode == 6) and (not exc.returncode == 10) and (not exc.returncode == 4):
                 logger.warning('ATTENTION : Impossible d\'OCRiser le document', exc.returncode, exc.output)
+                shutil.copy(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf')
+
+    def search_keywords(self, raa, keywords):
+        """Search the extracted text of `raa` for each keyword (regex, case-insensitive); print and toot the matches."""
+        # Read through a context manager so the file handle is closed (a bare open().read() leaks it)
+        with open(f'{self.data_dir}/raa/{raa.get_sha256()}.txt') as f:
+            text = f.read()
+
+        url = quote(raa.url, safe='/:')
+        found_keywords = []
+        for keyword in keywords:
+            if re.search(keyword, text, re.IGNORECASE | re.MULTILINE):
+                if not found_keywords:
+                    # First match for this RAA: print its header once
+                    self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})')
+                    self.print_output(f'URL : {url}')
+                    self.found = True
+                self.print_output(f'    Le terme \033[1m{keyword}\033[0m a été trouvé.')
+                found_keywords.append(keyword)
+
+        # Send a single summary toot listing every matched keyword
+        if found_keywords:
+            self.print_output('')
+            found_keywords_str = ', '.join(str(x) for x in found_keywords)
+            self.mastodon_toot(
+                f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont '
+                f'été trouvés : {found_keywords_str}.\n\nURL : {url}'
+            )
 
     def parse_raa(self, elements, keywords):
         for raa in elements:
             # Si le fichier n'a pas déjà été parsé et qu'il est postérieur à la
             # date maximale d'analyse, on le télécharge et on le parse
-            if raa.date >= self.not_before and \
-               not os.path.isfile(f'{self.data_dir}{raa.get_sha256()}.txt'):
+            if raa.date >= self.not_before and not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.txt'):
                 url = quote(raa.url, safe='/:')
                 logger.info(f'Nouveau fichier : {raa.name} ({raa.date_str}). URL : {url}')
                 self.download_file(raa)
                 self.ocr(raa, True)
-                self.parse_pdf(raa, keywords)
+                raa.parse(self.data_dir, self.not_before, keywords)
+                self.search_keywords(raa, keywords)
 
     def get_raa(self, page_content):
         logger.error('Cette fonction doit être surchargée')
diff --git a/RAAspotter_ppparis.py b/RAAspotter_ppparis.py
index 4a867a4..c7981d4 100644
--- a/RAAspotter_ppparis.py
+++ b/RAAspotter_ppparis.py
@@ -46,8 +46,7 @@ class RAAspotter_ppparis(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').get_text()
                 date = datetime.datetime.strptime(a.find('div', class_="field--type-datetime").get_text().strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref04.py b/RAAspotter_pref04.py
index f79fa5e..2a16978 100644
--- a/RAAspotter_pref04.py
+++ b/RAAspotter_pref04.py
@@ -57,8 +57,7 @@ class RAAspotter_pref04(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref05.py b/RAAspotter_pref05.py
index c29a2a7..1bfa6f8 100644
--- a/RAAspotter_pref05.py
+++ b/RAAspotter_pref05.py
@@ -97,8 +97,7 @@ class RAAspotter_pref05(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref06.py b/RAAspotter_pref06.py
index 75f2d4e..0210da9 100644
--- a/RAAspotter_pref06.py
+++ b/RAAspotter_pref06.py
@@ -103,8 +103,7 @@ class RAAspotter_pref06(RAAspotter):
                 url = unquote(url)
                 name = a.get_text().strip()
                 date = datetime.datetime.strptime(card.find('p', class_='fr-card__detail').get_text().replace('Publié le ', '').strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref09.py b/RAAspotter_pref09.py
index 51c9331..20b395c 100644
--- a/RAAspotter_pref09.py
+++ b/RAAspotter_pref09.py
@@ -67,8 +67,7 @@ class RAAspotter_pref09(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref13.py b/RAAspotter_pref13.py
index e66d4a6..17ca16e 100644
--- a/RAAspotter_pref13.py
+++ b/RAAspotter_pref13.py
@@ -55,8 +55,7 @@ class RAAspotter_pref13(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref31.py b/RAAspotter_pref31.py
index cff6ade..7e3e72e 100644
--- a/RAAspotter_pref31.py
+++ b/RAAspotter_pref31.py
@@ -69,8 +69,7 @@ class RAAspotter_pref31(RAAspotter):
                 url = unquote(url)
                 name = a.get_text().strip().capitalize()
                 date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref34.py b/RAAspotter_pref34.py
index 06c1e1d..d905e4b 100644
--- a/RAAspotter_pref34.py
+++ b/RAAspotter_pref34.py
@@ -69,8 +69,7 @@ class RAAspotter_pref34(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref35.py b/RAAspotter_pref35.py
index b9542b6..469215c 100644
--- a/RAAspotter_pref35.py
+++ b/RAAspotter_pref35.py
@@ -56,8 +56,7 @@ class RAAspotter_pref35(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref38.py b/RAAspotter_pref38.py
index 58727dd..22e0f9f 100644
--- a/RAAspotter_pref38.py
+++ b/RAAspotter_pref38.py
@@ -96,8 +96,7 @@ class RAAspotter_pref38(RAAspotter):
                         url = unquote(url)
                         name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                         date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                        filename = url.split('/')[-1]
 
-                        raa = RAAspotter.RAA(url, date, name, filename)
+                        raa = RAAspotter.RAA(url, date, name)
                         elements.append(raa)
         return elements
diff --git a/RAAspotter_pref42.py b/RAAspotter_pref42.py
index 254f560..7e24629 100644
--- a/RAAspotter_pref42.py
+++ b/RAAspotter_pref42.py
@@ -76,8 +76,7 @@ class RAAspotter_pref42(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py
index ebbe460..e911a7e 100644
--- a/RAAspotter_pref59.py
+++ b/RAAspotter_pref59.py
@@ -80,8 +80,7 @@ class RAAspotter_pref59(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref62.py b/RAAspotter_pref62.py
index 75b909c..3f64ccc 100644
--- a/RAAspotter_pref62.py
+++ b/RAAspotter_pref62.py
@@ -93,8 +93,7 @@ class RAAspotter_pref62(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements[::-1]
diff --git a/RAAspotter_pref64.py b/RAAspotter_pref64.py
index 803b043..adacb30 100644
--- a/RAAspotter_pref64.py
+++ b/RAAspotter_pref64.py
@@ -99,8 +99,7 @@ class RAAspotter_pref64(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref65.py b/RAAspotter_pref65.py
index 880c8a9..68278d2 100644
--- a/RAAspotter_pref65.py
+++ b/RAAspotter_pref65.py
@@ -69,8 +69,7 @@ class RAAspotter_pref65(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref66.py b/RAAspotter_pref66.py
index a2f4277..6ffde9b 100644
--- a/RAAspotter_pref66.py
+++ b/RAAspotter_pref66.py
@@ -103,9 +103,8 @@ class RAAspotter_pref66(RAAspotter):
                         name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                     else:
                         name = a.get_text().replace('Télécharger ', '').strip()
-                    filename = url.split('/')[-1]
 
-                    elements.append(RAAspotter.RAA(url, date, name, filename))
+                    elements.append(RAAspotter.RAA(url, date, name))
         return elements
 
     # On parse les RAA depuis 2024
@@ -133,7 +132,6 @@ class RAAspotter_pref66(RAAspotter):
                 url = unquote(url)
                 name = page['name'].replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                elements.append(RAAspotter.RAA(url, date, name, filename))
+                elements.append(RAAspotter.RAA(url, date, name))
         return elements
diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py
index 3c7c9bf..e28a831 100644
--- a/RAAspotter_pref69.py
+++ b/RAAspotter_pref69.py
@@ -83,8 +83,7 @@ class RAAspotter_pref69(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref80.py b/RAAspotter_pref80.py
index 8118d0c..b616fd1 100644
--- a/RAAspotter_pref80.py
+++ b/RAAspotter_pref80.py
@@ -77,7 +77,6 @@ class RAAspotter_pref80(RAAspotter):
                 # On enlève les espaces insécables, les double-espaces, et le texte « Télécharger » de certains liens
                 name = a.get_text().replace('Télécharger ', '').strip().replace(u"\u00A0", ' ').replace('  ', ' ')
                 if name and not name == '':
-                    filename = url.split('/')[-1]
                     # Certains RAA de la Somme ont une ligne avec les détails du fichier. Si cette ligne
                     # est disponible, on la parse, sinon on devine la date à partir du nom
                     date = None
@@ -98,6 +97,6 @@ class RAAspotter_pref80(RAAspotter):
                     if date.year == 9999:
                         logger.warning(f'On ignore {name} (URL : {url})')
                     else:
-                        raa = RAAspotter.RAA(url, date, name, filename)
+                        raa = RAAspotter.RAA(url, date, name)
                         elements.append(raa)
         return elements[::-1]
diff --git a/RAAspotter_pref81.py b/RAAspotter_pref81.py
index ca4052b..2f4bf7f 100644
--- a/RAAspotter_pref81.py
+++ b/RAAspotter_pref81.py
@@ -113,8 +113,7 @@ class RAAspotter_pref81(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref83.py b/RAAspotter_pref83.py
index 8e52369..fe73b4f 100644
--- a/RAAspotter_pref83.py
+++ b/RAAspotter_pref83.py
@@ -88,8 +88,7 @@ class RAAspotter_pref83(RAAspotter):
                 url = unquote(url)
                 name = a.get_text().strip()
                 date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref87.py b/RAAspotter_pref87.py
index e9814d0..6436659 100644
--- a/RAAspotter_pref87.py
+++ b/RAAspotter_pref87.py
@@ -104,8 +104,7 @@ class RAAspotter_pref87(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py
index ec78c39..5d17bb0 100644
--- a/RAAspotter_pref976.py
+++ b/RAAspotter_pref976.py
@@ -115,8 +115,7 @@ class RAAspotter_pref976(RAAspotter):
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                filename = url.split('/')[-1]
 
-                raa = RAAspotter.RAA(url, date, name, filename)
+                raa = RAAspotter.RAA(url, date, name)
                 elements.append(raa)
         return elements
diff --git a/cli.py b/cli.py
index 52af7b8..31e7f47 100755
--- a/cli.py
+++ b/cli.py
@@ -218,9 +218,6 @@ else:
 if __PREF_EMAIL_TO and not __PREF_EMAIL_TO == '':
     __EMAIL_TO = f'{__EMAIL_TO},{__PREF_EMAIL_TO}'
 
-# On crée le dossier de téléchargement
-os.makedirs(__DATA_DIR, exist_ok=True)
-
 module = importlib.import_module(f'RAAspotter_{args.pref}')
 raa_spotter = getattr(module, f'RAAspotter_{args.pref}')(__DATA_DIR)
 
-- 
GitLab