# Patch summary for RAAspotter.py.
#
# Stated intent (translated from the French comment added in flatten_pdf):
#   "OCRmyPDF cannot handle forms, so we remove them before OCR."
#
# What the hunks below change:
#   * Imports PdfWriter from pypdf and NameObject / NumberObject from
#     pypdf.generic, next to the existing PdfReader import.
#   * The code under the comment "Supprime le PDF d'origine et la version
#     OCRisée" ("Delete the original PDF and the OCR'd version") now also
#     removes '<sha256>.flat.pdf' in addition to '<sha256>.pdf' and
#     '<sha256>.ocr.pdf'.
#   * parse(data_dir, not_before, keywords) is renamed to
#     parse_metadata(data_dir); it still calls get_pdf_dates() and
#     write_properties() but no longer calls extract_content().
#   * The ocrmypdf command line takes '<sha256>.flat.pdf' as its input file
#     instead of '<sha256>.pdf'; the output stays '<sha256>.ocr.pdf'.
#   * New method flatten_pdf(raa): opens '<data_dir>/raa/<sha256>.pdf' with
#     PdfReader, and for every page that has an '/Annots' entry sets
#     '/Ff' = 1 on each annotation object, adds the page to a PdfWriter and
#     writes the result to '<data_dir>/raa/<sha256>.flat.pdf'.
#   * For a new file the call sequence becomes: download_file ->
#     raa.parse_metadata -> flatten_pdf -> ocr -> raa.extract_content ->
#     search_keywords (previously: download_file -> ocr -> raa.parse ->
#     search_keywords).
#
# NOTE(review): the line structure of this patch has been lost (everything is
#   on two physical lines), so it will not apply as-is; regenerate it from
#   the commit before use.
# NOTE(review): flatten_pdf only sets the field-flags entry '/Ff' to 1 on the
#   annotations (presumably the read-only bit); it does not delete them,
#   although the in-code comment says forms are removed. Confirm this is
#   enough for ocrmypdf. '/Ff' is also written to every annotation, not only
#   form widgets.
# NOTE(review): the fallback visible in the ocr hunk (apparently the failure
#   path, since it uses exc.returncode) still copies '<sha256>.pdf', not
#   '<sha256>.flat.pdf', to '<sha256>.ocr.pdf'. Confirm that is intended.
# NOTE(review): the same context line calls logger.warning() with two extra
#   positional arguments but no '%' placeholders in the message, which the
#   logging module reports as a formatting error when the record is emitted.
#   Pre-existing; not touched by this patch.
# NOTE(review): the added os.remove() of '<sha256>.flat.pdf' is unconditional
#   and raises FileNotFoundError if the file was never written. Verify that
#   flatten_pdf always runs before this cleanup.
diff --git a/RAAspotter.py b/RAAspotter.py index d5e632e8948b25b0871ccc1ded85d502758c1de6..376230f59609e11b6d029568f215f0e6029c8981 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -22,6 +22,8 @@ from bs4 import BeautifulSoup from pyvirtualdisplay import Display from pypdf import PdfReader +from pypdf import PdfWriter +from pypdf.generic import NameObject, NumberObject from stem import Signal from stem.control import Controller @@ -88,6 +90,7 @@ class RAAspotter: # Supprime le PDF d'origine et la version OCRisée os.remove(f'{raa_data_dir}{self.get_sha256()}.pdf') os.remove(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + os.remove(f'{raa_data_dir}{self.get_sha256()}.flat.pdf') def write_properties(self, data_dir): raa_data_dir = f'{data_dir}/raa/' @@ -112,10 +115,9 @@ class RAAspotter: f.write(json.dumps(properties)) f.close() - def parse(self, data_dir, not_before, keywords): + def parse_metadata(self, data_dir): self.get_pdf_dates(data_dir) self.write_properties(data_dir) - self.extract_content(data_dir) def __init__(self, data_dir, user_agent=''): logger.debug('Initialisation de RAAspotter') @@ -425,7 +427,7 @@ class RAAspotter: '--skip-big', '500', '--invalidate-digital-signatures', '--optimize', '0', - f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', + f'{self.data_dir}/raa/{raa.get_sha256()}.flat.pdf', f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf' ] logger.debug(f'Lancement de ocrmypdf: {cmd}') @@ -442,6 +444,21 @@ class RAAspotter: logger.warning('ATTENTION : Impossible d\'OCRiser le document', exc.returncode, exc.output) shutil.copy(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf') + def flatten_pdf(self, raa): + # OCRmyPDF ne sait pas gérer les formulaires, donc on les enlève avant OCRisation + reader = PdfReader(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf') + writer = PdfWriter() + + for page in reader.pages: + if page.get('/Annots'): + for annot in page.get('/Annots'): + writer_annot = annot.get_object() 
+ writer_annot.update({ + NameObject("/Ff"): NumberObject(1) + }) + writer.add_page(page) + writer.write(f'{self.data_dir}/raa/{raa.get_sha256()}.flat.pdf') + def search_keywords(self, raa, keywords): text = open(f'{self.data_dir}/raa/{raa.get_sha256()}.txt').read() @@ -477,8 +494,10 @@ class RAAspotter: url = quote(raa.url, safe='/:') logger.info(f'Nouveau fichier : {raa.name} ({raa.date_str}). URL : {url}') self.download_file(raa) + raa.parse_metadata(self.data_dir) + self.flatten_pdf(raa) self.ocr(raa, True) - raa.parse(self.data_dir, self.not_before, keywords) + raa.extract_content(self.data_dir) self.search_keywords(raa, keywords) def get_raa(self, page_content):