diff --git a/RAAspotter.py b/RAAspotter.py index add05eeaa90f54c0ec339f51759fb46d5107782c..d5e632e8948b25b0871ccc1ded85d502758c1de6 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -21,9 +21,7 @@ import dateparser from bs4 import BeautifulSoup from pyvirtualdisplay import Display -from pdfminer.high_level import extract_text -from pdfminer.pdfparser import PDFParser -from pdfminer.pdfdocument import PDFDocument +from pypdf import PdfReader from stem import Signal from stem.control import Controller @@ -64,45 +62,23 @@ class RAAspotter: def get_pdf_dates(self, data_dir): raa_data_dir = f'{data_dir}/raa/' - p_pdf = None - pdf_parser = None - pdf_creation_date_raw = None - pdf_modification_date_raw = None + reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.pdf') + pdf_metadata = reader.metadata + + if pdf_metadata.creation_date: + self.pdf_creation_date = pdf_metadata.creation_date - try: - p_pdf = open(f'{raa_data_dir}{self.get_sha256()}.pdf', 'rb') - pdf_parser = PDFParser(p_pdf) - except Exception as exc: - logger.warning(f'Impossible d\'ouvrir le PDF {self.get_sha256()}.pdf pour extraction de ses métadonnées : {exc}') - - try: - pdf_creation_date_raw = PDFDocument(pdf_parser).info[0]['CreationDate'].decode('utf-8').replace('D:', '').replace('\'', '') - if pdf_creation_date_raw: - try: - self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S%z') - except ValueError: - self.pdf_creation_date = datetime.datetime.strptime(pdf_creation_date_raw, '%Y%m%d%H%M%S') - except Exception as exc: - pass - - try: - pdf_modification_date_raw = PDFDocument(pdf_parser).info[0]['ModDate'].decode('utf-8').replace('D:', '').replace('\'', '') - if pdf_modification_date_raw: - try: - self.pdf_modification_date = datetime.datetime.strptime(pdf_modification_date_raw, '%Y%m%d%H%M%S%z') - except ValueError: - self.pdf_modification_date = datetime.datetime.strptime(pdf_modification_date_raw, '%Y%m%d%H%M%S') - except Exception as exc: - pass + if pdf_metadata.modification_date: + self.pdf_modification_date = pdf_metadata.modification_date def extract_content(self, data_dir): raa_data_dir = f'{data_dir}/raa/' text = "" - try: - text = extract_text(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') - except Exception as exc: - logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}') + + reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + for page in reader.pages: + text = text + "\n" + page.extract_text() # Écrit le texte du PDF dans un fichier texte pour une analyse future f = open(f'{raa_data_dir}{self.get_sha256()}.txt', 'w') @@ -147,9 +123,6 @@ class RAAspotter: # On crée le dossier de téléchargement os.makedirs(data_dir, exist_ok=True) - # pdfminer.six est un peu trop verbeux en mode debug, donc on relève son niveau de log - logging.getLogger("pdfminer").setLevel(logging.WARNING) - self.session = requests.Session() self.data_dir = data_dir self.found = False diff --git a/requirements.txt b/requirements.txt index 57617213ae7068a24d011255a0031a16e94c47cb..6d62f561749cbb48be57693b5b2afea5079824c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ beautifulsoup4>=4.12.3 dateparser>=1.2.0 Mastodon.py>=1.8.1 -pdfminer.six>=20231228 pycodestyle>=2.11.1 +pypdf>=4.2.0 PyVirtualDisplay>=3.0 requests>=2.31.0 selenium>=4.19.0