diff --git a/RAAspotter.py b/RAAspotter.py index 98d30928e45b78179397e8925a58c79878737180..add05eeaa90f54c0ec339f51759fb46d5107782c 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -45,6 +45,7 @@ class RAAspotter: name = "" sha256 = "" pdf_creation_date = None + pdf_modification_date = None def __init__(self, url, date, name): if not url == "": @@ -60,20 +61,39 @@ class RAAspotter: self.sha256 = hashlib.sha256(self.url.encode('utf-8')).hexdigest() return self.sha256 - def get_pdf_creation_date(self, data_dir): + def get_pdf_dates(self, data_dir): raa_data_dir = f'{data_dir}/raa/' + p_pdf = None + pdf_parser = None + pdf_creation_date_raw = None + pdf_modification_date_raw = None + try: p_pdf = open(f'{raa_data_dir}{self.get_sha256()}.pdf', 'rb') pdf_parser = PDFParser(p_pdf) + except Exception as exc: + logger.warning(f'Impossible d\'ouvrir le PDF {self.get_sha256()}.pdf pour extraction de ses métadonnées : {exc}') + + try: pdf_creation_date_raw = PDFDocument(pdf_parser).info[0]['CreationDate'].decode('utf-8').replace('D:', '').replace('\'', '') if pdf_creation_date_raw: try: self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S%z') - except ValueError as exc: + except ValueError: self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S') except Exception as exc: - logger.warning(f'Impossible d\'extraire la date du PDF {self.get_sha256()}.pdf') + pass + + try: + pdf_modification_date_raw = PDFDocument(pdf_parser).info[0]['ModDate'].decode('utf-8').replace('D:', '').replace('\'', '') + if pdf_modification_date_raw: + try: + self.pdf_modification_date = datetime.datetime.strptime(pdf_modification_date_raw, '%Y%m%d%H%M%S%z') + except ValueError: + self.pdf_modification_date = datetime.datetime.strptime(pdf_modification_date_raw, '%Y%m%d%H%M%S') + except Exception as exc: + pass def extract_content(self, data_dir): raa_data_dir = f'{data_dir}/raa/' @@ -97,22 +117,27 @@ class RAAspotter: raa_data_dir = f'{data_dir}/raa/' pdf_creation_date_json = None + pdf_modification_date_json = None + if self.pdf_creation_date: pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S") + if self.pdf_modification_date: + pdf_modification_date_json = self.pdf_modification_date.strftime("%d/%m/%Y %H:%M:%S") properties = { 'name': self.name, 'date': self.date_str, 'url': quote(self.url, safe='/:'), 'first_saw_on': datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S"), - 'pdf_creation_date': pdf_creation_date_json + 'pdf_creation_date': pdf_creation_date_json, + 'pdf_modification_date': pdf_modification_date_json } f = open(f'{raa_data_dir}{self.get_sha256()}.json', 'w') f.write(json.dumps(properties)) f.close() def parse(self, data_dir, not_before, keywords): - self.get_pdf_creation_date(data_dir) + self.get_pdf_dates(data_dir) self.write_properties(data_dir) self.extract_content(data_dir)