From 4356e39446270e0d03e8f4263463cbf43ca54c77 Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Sun, 12 May 2024 00:43:40 +0200 Subject: [PATCH] Attrap: nettoie les textes extraits des PDF avant sauvegarde --- Attrap.py | 5 ++++- requirements.txt | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Attrap.py b/Attrap.py index 273de90..27bc2b7 100644 --- a/Attrap.py +++ b/Attrap.py @@ -34,6 +34,8 @@ import email from mastodon import Mastodon +import ftfy + logger = logging.getLogger(__name__) @@ -79,9 +81,10 @@ class Attrap: text = "" reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + ftfy_config = ftfy.TextFixerConfig(unescape_html=False, explain=False) for page in reader.pages: try: - text = text + "\n" + page.extract_text() + text = text + "\n" + ftfy.fix_text(page.extract_text(), config=ftfy_config) except Exception as exc: logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}') diff --git a/requirements.txt b/requirements.txt index 6d62f56..441591e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ beautifulsoup4>=4.12.3 dateparser>=1.2.0 +ftfy>=6.2.0 Mastodon.py>=1.8.1 pycodestyle>=2.11.1 pypdf>=4.2.0 -- GitLab