diff --git a/Attrap.py b/Attrap.py index 273de9065f2f5b5744626a031d23bfddca42f300..27bc2b74f99fff68df8a9c1ca9b7ad3bdf75cdc3 100644 --- a/Attrap.py +++ b/Attrap.py @@ -34,6 +34,8 @@ import email from mastodon import Mastodon +import ftfy + logger = logging.getLogger(__name__) @@ -79,9 +81,10 @@ class Attrap: text = "" reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + ftfy_config = ftfy.TextFixerConfig(unescape_html=False, explain=False) for page in reader.pages: try: - text = text + "\n" + page.extract_text() + text = text + "\n" + ftfy.fix_text(page.extract_text(), config=ftfy_config) except Exception as exc: logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}') diff --git a/requirements.txt b/requirements.txt index 6d62f561749cbb48be57693b5b2afea5079824c0..441591e7b422a421e349f34589a6c2c5c7af9eef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ beautifulsoup4>=4.12.3 dateparser>=1.2.0 +ftfy>=6.2.0 Mastodon.py>=1.8.1 pycodestyle>=2.11.1 pypdf>=4.2.0