From 4356e39446270e0d03e8f4263463cbf43ca54c77 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sun, 12 May 2024 00:43:40 +0200
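Subject: [PATCH] Attrap: nettoie les textes extraits des PDF avant sauvegarde

Pass the text extracted from each page of the OCR'd PDF through
ftfy.fix_text() before appending it to the accumulated text. A single
ftfy.TextFixerConfig (unescape_html=False, explain=False) is built
before the page loop and reused for every page.

Add ftfy>=6.2.0 to requirements.txt.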
---
 Attrap.py        | 5 ++++-
 requirements.txt | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Attrap.py b/Attrap.py
index 273de90..27bc2b7 100644
--- a/Attrap.py
+++ b/Attrap.py
@@ -34,6 +34,8 @@ import email
 
 from mastodon import Mastodon
 
+import ftfy
+
 logger = logging.getLogger(__name__)
 
 
@@ -79,9 +81,10 @@ class Attrap:
             text = ""
 
             reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf')
+            ftfy_config = ftfy.TextFixerConfig(unescape_html=False, explain=False)
             for page in reader.pages:
                 try:
-                    text = text + "\n" + page.extract_text()
+                    text = text + "\n" + ftfy.fix_text(page.extract_text(), config=ftfy_config)
                 except Exception as exc:
                     logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}')
 
diff --git a/requirements.txt b/requirements.txt
index 6d62f56..441591e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4>=4.12.3
 dateparser>=1.2.0
+ftfy>=6.2.0
 Mastodon.py>=1.8.1
 pycodestyle>=2.11.1
 pypdf>=4.2.0
-- 
GitLab