Skip to content
Extraits de code Groupes Projets
Valider 0440409f rédigé par Bastien Le Querrec
Parcourir les fichiers

RAAspotter: sauvegarde la date de modification des RAA

parent 9b23a392
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
......@@ -45,6 +45,7 @@ class RAAspotter:
name = ""
sha256 = ""
pdf_creation_date = None
pdf_modification_date = None
def __init__(self, url, date, name):
if not url == "":
......@@ -60,20 +61,39 @@ class RAAspotter:
self.sha256 = hashlib.sha256(self.url.encode('utf-8')).hexdigest()
return self.sha256
def get_pdf_dates(self, data_dir):
    """Read the creation and modification dates from the downloaded PDF.

    The PDF is read from ``{data_dir}/raa/{sha256}.pdf``. Parsed values are
    stored in ``self.pdf_creation_date`` and ``self.pdf_modification_date``.
    An attribute is left untouched when the matching metadata entry is
    missing, empty or cannot be parsed; this method never raises for a
    missing or malformed PDF, it only logs.

    Args:
        data_dir: base data directory containing the ``raa/`` sub-directory.
    """
    pdf_name = f'{self.get_sha256()}.pdf'
    pdf_path = f'{data_dir}/raa/{pdf_name}'

    def parse_pdf_date(raw_value):
        # The metadata value is decoded, then the "D:" prefix and the
        # apostrophes are removed so that a trailing UTC offset (if any)
        # can be read by strptime's %z directive.
        cleaned = raw_value.decode('utf-8').replace('D:', '').replace('\'', '')
        if not cleaned:
            return None
        try:
            return datetime.datetime.strptime(cleaned, '%Y%m%d%H%M%S%z')
        except ValueError:
            # No usable offset: fall back to a naive timestamp.
            return datetime.datetime.strptime(cleaned, '%Y%m%d%H%M%S')

    try:
        p_pdf = open(pdf_path, 'rb')
    except OSError as exc:
        logger.warning(f'Impossible d\'ouvrir le PDF {self.get_sha256()}.pdf pour extraction de ses métadonnées : {exc}')
        # Without a file there is nothing to parse; stop here instead of
        # emitting a second, misleading warning.
        return

    # The context manager guarantees the file handle is released.
    with p_pdf:
        try:
            # Parse the document once and reuse its info dictionary for
            # both dates.
            pdf_info = PDFDocument(PDFParser(p_pdf)).info[0]
        except Exception:
            logger.warning(f'Impossible d\'extraire la date du PDF {self.get_sha256()}.pdf')
            return

        try:
            creation_date = parse_pdf_date(pdf_info['CreationDate'])
            if creation_date is not None:
                self.pdf_creation_date = creation_date
        except Exception:
            logger.warning(f'Impossible d\'extraire la date du PDF {self.get_sha256()}.pdf')

        try:
            modification_date = parse_pdf_date(pdf_info['ModDate'])
            if modification_date is not None:
                self.pdf_modification_date = modification_date
        except Exception as exc:
            # The modification date is best-effort: report the failure at
            # debug level rather than swallowing it.
            logger.debug(f'Impossible d\'extraire la date de modification du PDF {pdf_name} : {exc}')
def extract_content(self, data_dir):
raa_data_dir = f'{data_dir}/raa/'
......@@ -97,22 +117,27 @@ class RAAspotter:
raa_data_dir = f'{data_dir}/raa/'
pdf_creation_date_json = None
pdf_modification_date_json = None
if self.pdf_creation_date:
pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S")
if self.pdf_modification_date:
pdf_modification_date_json = self.pdf_modification_date.strftime("%d/%m/%Y %H:%M:%S")
properties = {
'name': self.name,
'date': self.date_str,
'url': quote(self.url, safe='/:'),
'first_saw_on': datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S"),
'pdf_creation_date': pdf_creation_date_json
'pdf_creation_date': pdf_creation_date_json,
'pdf_modification_date': pdf_modification_date_json
}
f = open(f'{raa_data_dir}{self.get_sha256()}.json', 'w')
f.write(json.dumps(properties))
f.close()
def parse(self, data_dir, not_before, keywords):
self.get_pdf_creation_date(data_dir)
self.get_pdf_dates(data_dir)
self.write_properties(data_dir)
self.extract_content(data_dir)
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter