From 0440409f4edce1702775fc3fc3ba1818f7d99eb7 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sat, 13 Apr 2024 17:05:24 +0200
Subject: [PATCH] RAAspotter: sauvegarde la date de modification des RAA

---
 RAAspotter.py | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/RAAspotter.py b/RAAspotter.py
index 98d3092..add05ee 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -45,6 +45,7 @@ class RAAspotter:
         name = ""
         sha256 = ""
         pdf_creation_date = None
+        pdf_modification_date = None
 
         def __init__(self, url, date, name):
             if not url == "":
@@ -60,20 +61,39 @@ class RAAspotter:
                 self.sha256 = hashlib.sha256(self.url.encode('utf-8')).hexdigest()
             return self.sha256
 
-        def get_pdf_creation_date(self, data_dir):
+        def get_pdf_dates(self, data_dir):
+            """Extract the creation and modification dates from the PDF metadata.
+
+            Reads `<data_dir>/raa/<sha256>.pdf` and stores its `CreationDate`
+            and `ModDate` Info entries, parsed as datetimes, in
+            `self.pdf_creation_date` and `self.pdf_modification_date`. An
+            attribute is left untouched when its entry is missing or malformed.
+            """
             raa_data_dir = f'{data_dir}/raa/'
 
             try:
-                p_pdf = open(f'{raa_data_dir}{self.get_sha256()}.pdf', 'rb')
-                pdf_parser = PDFParser(p_pdf)
-                pdf_creation_date_raw = PDFDocument(pdf_parser).info[0]['CreationDate'].decode('utf-8').replace('D:', '').replace('\'', '')
-                if pdf_creation_date_raw:
-                    try:
-                        self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S%z')
-                    except ValueError as exc:
-                        self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S')
+                # `with` closes the file handle once the metadata is read; the
+                # document is parsed a single time and shared by both dates.
+                with open(f'{raa_data_dir}{self.get_sha256()}.pdf', 'rb') as p_pdf:
+                    pdf_info = PDFDocument(PDFParser(p_pdf)).info
             except Exception as exc:
-                logger.warning(f'Impossible d\'extraire la date du PDF {self.get_sha256()}.pdf')
+                logger.warning(f'Impossible d\'ouvrir le PDF {self.get_sha256()}.pdf pour extraction de ses métadonnées : {exc}')
+                return
+
+            pdf_date_fields = (('CreationDate', 'pdf_creation_date'), ('ModDate', 'pdf_modification_date'))
+            for pdf_key, attribute in pdf_date_fields:
+                try:
+                    date_raw = pdf_info[0][pdf_key].decode('utf-8').replace('D:', '').replace('\'', '')
+                    if date_raw:
+                        try:
+                            parsed_date = datetime.datetime.strptime(date_raw, '%Y%m%d%H%M%S%z')
+                        except ValueError:
+                            # No UTC offset in the raw value: parse it as a naive datetime.
+                            parsed_date = datetime.datetime.strptime(date_raw, '%Y%m%d%H%M%S')
+                        setattr(self, attribute, parsed_date)
+                except Exception as exc:
+                    # Best effort: a missing or malformed entry must not abort processing.
+                    logger.debug(f'Impossible d\'extraire {pdf_key} du PDF {self.get_sha256()}.pdf : {exc}')
 
         def extract_content(self, data_dir):
             raa_data_dir = f'{data_dir}/raa/'
@@ -97,22 +117,27 @@ class RAAspotter:
             raa_data_dir = f'{data_dir}/raa/'
 
             pdf_creation_date_json = None
+            pdf_modification_date_json = None
+
             if self.pdf_creation_date:
                 pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S")
+            if self.pdf_modification_date:
+                pdf_modification_date_json = self.pdf_modification_date.strftime("%d/%m/%Y %H:%M:%S")
 
             properties = {
                 'name': self.name,
                 'date': self.date_str,
                 'url': quote(self.url, safe='/:'),
                 'first_saw_on': datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S"),
-                'pdf_creation_date': pdf_creation_date_json
+                'pdf_creation_date': pdf_creation_date_json,
+                'pdf_modification_date': pdf_modification_date_json
             }
             f = open(f'{raa_data_dir}{self.get_sha256()}.json', 'w')
             f.write(json.dumps(properties))
             f.close()
 
         def parse(self, data_dir, not_before, keywords):
-            self.get_pdf_creation_date(data_dir)
+            self.get_pdf_dates(data_dir)
             self.write_properties(data_dir)
             self.extract_content(data_dir)
 
-- 
GitLab