From 066854976cc7c6aa95b9d680c6394db4089ee6eb Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sat, 13 Apr 2024 20:04:57 +0200 Subject: [PATCH] RAAspotter: remplace pdfminer.six par pypdf --- RAAspotter.py | 51 ++++++++++++------------------------------------ requirements.txt | 2 +- 2 files changed, 13 insertions(+), 40 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index add05ee..d5e632e 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -21,9 +21,7 @@ import dateparser from bs4 import BeautifulSoup from pyvirtualdisplay import Display -from pdfminer.high_level import extract_text -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument +from pypdf import PdfReader from stem import Signal from stem.control import Controller @@ -64,45 +62,23 @@ class RAAspotter: def get_pdf_dates(self, data_dir): raa_data_dir = f'{data_dir}/raa/' - p_pdf = None - pdf_parser = None - pdf_creation_date_raw = None - pdf_modification_date_raw = None + reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.pdf') + pdf_metadata = reader.metadata + + if pdf_metadata.creation_date: + self.pdf_creation_date = pdf_metadata.creation_date - try: - p_pdf = open(f'{raa_data_dir}{self.get_sha256()}.pdf', 'rb') - pdf_parser = PDFParser(p_pdf) - except Exception as exc: - logger.warning(f'Impossible d\'ouvrir le PDF {self.get_sha256()}.pdf pour extraction de ses métadonnées : {exc}') - - try: - pdf_creation_date_raw = PDFDocument(pdf_parser).info[0]['CreationDate'].decode('utf-8').replace('D:', '').replace('\'', '') - if pdf_creation_date_raw: - try: - self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S%z') - except ValueError: - self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S') - except Exception as exc: - pass - - try: - pdf_modification_date_raw = PDFDocument(pdf_parser).info[0]['ModDate'].decode('utf-8').replace('D:', '').replace('\'', '') - if pdf_modification_date_raw: - try: - self.pdf_modification_date = datetime.datetime.strptime(pdf_modification_date_raw, '%Y%m%d%H%M%S%z') - except ValueError: - self.pdf_modification_date = datetime.datetime.strptime(pdf_modification_date_raw, '%Y%m%d%H%M%S') - except Exception as exc: - pass + if pdf_metadata.modification_date: + self.pdf_modification_date = pdf_metadata.modification_date def extract_content(self, data_dir): raa_data_dir = f'{data_dir}/raa/' text = "" - try: - text = extract_text(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') - except Exception as exc: - logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}') + + reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + for page in reader.pages: + text = text + "\n" + page.extract_text() # Écrit le texte du PDF dans un fichier texte pour une analyse future f = open(f'{raa_data_dir}{self.get_sha256()}.txt', 'w') @@ -147,9 +123,6 @@ class RAAspotter: # On crée le dossier de téléchargement os.makedirs(data_dir, exist_ok=True) - # pdfminer.six est un peu trop verbeux en mode debug, donc on relève son niveau de log - logging.getLogger("pdfminer").setLevel(logging.WARNING) - self.session = requests.Session() self.data_dir = data_dir self.found = False diff --git a/requirements.txt b/requirements.txt index 5761721..6d62f56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ beautifulsoup4>=4.12.3 dateparser>=1.2.0 Mastodon.py>=1.8.1 -pdfminer.six>=20231228 pycodestyle>=2.11.1 +pypdf>=4.2.0 PyVirtualDisplay>=3.0 requests>=2.31.0 selenium>=4.19.0 -- GitLab