From 9a499b8094cf4a0867e3d36fe364151d686807ce Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Thu, 28 Mar 2024 01:12:24 +0100 Subject: [PATCH] mise en conformité PEP 8 du code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RAAspotter.py | 925 +++++++++++++++++++++++------------------- RAAspotter_ppparis.py | 100 +++-- RAAspotter_pref04.py | 116 +++--- RAAspotter_pref06.py | 198 +++++---- RAAspotter_pref13.py | 115 +++--- RAAspotter_pref34.py | 126 +++--- RAAspotter_pref35.py | 122 +++--- RAAspotter_pref38.py | 193 +++++---- RAAspotter_pref59.py | 140 ++++--- RAAspotter_pref62.py | 180 ++++---- RAAspotter_pref65.py | 127 +++--- RAAspotter_pref69.py | 173 ++++---- RAAspotter_pref83.py | 179 ++++---- RAAspotter_pref976.py | 237 ++++++----- cli.py | 217 +++++++--- 15 files changed, 1820 insertions(+), 1328 deletions(-) diff --git a/RAAspotter.py b/RAAspotter.py index 5eddeaf..3454dcd 100644 --- a/RAAspotter.py +++ b/RAAspotter.py @@ -1,4 +1,7 @@ -import os, re, ssl, sys +import os +import re +import ssl +import sys import subprocess import logging import requests @@ -28,421 +31,513 @@ from mastodon import Mastodon logger = logging.getLogger(__name__) + class RAAspotter: - class RAA: - url = "" - date = datetime.datetime(1970, 1, 1) - date_str = "" - name = "" - filename = "" - sha256 = "" - - def __init__(self, url, date, name, filename): - if not url == "": - self.url = url - if not date == "": - self.date = date - self.date_str = date.strftime("%d/%m/%Y") - if not name == "": - self.name = name - if not filename == "": - self.filename = filename - - def get_sha256(self): - if (self.sha256 == ""): - self.sha256 = hashlib.sha256(self.filename.encode('utf-8')).hexdigest() - return self.sha256 - - def __init__(self, data_dir, user_agent=''): - logger.debug('Initialisation de RAAspotter') - - self.session = requests.Session() - self.data_dir = data_dir - self.found = False - self.output_file_path = os.path.dirname(os.path.abspath(__file__))+f'/output_{self.short_code}.log' - self.sleep_time = 0 - self.tor_enabled = False - self.tor_max_requests = 0 - self.tor_requests = 0 - self.not_before = datetime.datetime(2024, 1, 1) - self.smtp_configured = False - self.mastodon = None - self.mastodon_prefix = '' - self.mastodon_suffix = '' - - self.update_user_agent(user_agent) - - f = open(self.output_file_path,'w') - f.write('') - f.close() - - def configure_mastodon(self, access_token, instance, mastodon_prefix, mastodon_suffix): - if access_token and access_token != "" and instance and instance != "": - self.mastodon = Mastodon( - access_token=access_token, - api_base_url=instance - ) - self.mastodon_prefix = mastodon_prefix - self.mastodon_suffix = mastodon_suffix - - def mastodon_toot(self, content): - if self.mastodon: - toot = content - if not self.mastodon_prefix == '': - toot = f"{self.mastodon_prefix}\n\n{toot}" - if not self.mastodon_suffix == '': - toot = f"{toot}\n\n{self.mastodon_suffix}" - self.mastodon.toot(toot) - - def enable_tor(self, max_requests=0): - proxies = { - "http": f"socks5h://127.0.0.1:9050", - "https": f"socks5h://127.0.0.1:9050", - } - self.tor_enabled = True - self.tor_max_requests = max_requests - self.tor_requests = 0 - self.session.proxies.update(proxies) - self.tor_get_new_id() - - def disable_tor(self): - proxies = {} - self.tor_enabled = False - self.tor_max_requests = 0 - self.tor_requests = 0 - self.session.proxies.update(proxies) - - def 
tor_get_new_id(self): - logger.info('Changement d\'identité Tor') - try: - controller = Controller.from_port(port = 9051) - controller.authenticate() - controller.signal(Signal.NEWNYM) - time.sleep(5) - self.tor_requests = 0 - except: - logger.debug('Impossible de changer d\'identité Tor') - - def get_sub_pages(self, page_content, element, host, recursive_until_pdf): - soup = BeautifulSoup(page_content, 'html.parser') - sub_pages = [] - for a in soup.select(element): - if a.get('href'): - url = f"{host}{a['href']}" - if recursive_until_pdf: - sub_page_content = self.get_page(url, 'get').content - if not self.has_pdf(sub_page_content): - logger.info(f'{url} ne contient pas de PDF, on récupère ses sous-pages') - for sub_sub_page in self.get_sub_pages(sub_page_content, element, host, recursive_until_pdf): - sub_pages.append(sub_sub_page) - else: - sub_page = { - 'url': url, - 'name': a.get_text().strip() - } - sub_pages.append(sub_page) - else: - sub_page = { - 'url': url, - 'name': a.get_text().strip() - } - sub_pages.append(sub_page) - return sub_pages - - def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, host): - pages = [] - page_content = self.get_page(page, 'get').content - - # On initialise le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On recherche les sous-pages - sub_pages = soup.select(sub_page_element) - for sub_page in sub_pages: - if sub_page.get('href'): - page = { - 'url': f"{host}{sub_page['href']}", - 'name': sub_page.get_text().strip() + class RAA: + url = "" + date = datetime.datetime(1970, 1, 1) + date_str = "" + name = "" + filename = "" + sha256 = "" + + def __init__(self, url, date, name, filename): + if not url == "": + self.url = url + if not date == "": + self.date = date + self.date_str = date.strftime("%d/%m/%Y") + if not name == "": + self.name = name + if not filename == "": + self.filename = filename + + def get_sha256(self): + if (self.sha256 == ""): + self.sha256 = hashlib.sha256( + self.filename.encode('utf-8') + ).hexdigest() + return self.sha256 + + def __init__(self, data_dir, user_agent=''): + logger.debug('Initialisation de RAAspotter') + + self.session = requests.Session() + self.data_dir = data_dir + self.found = False + self.output_file_path = os.path.dirname( + os.path.abspath(__file__) + )+f'/output_{self.short_code}.log' + self.sleep_time = 0 + self.tor_enabled = False + self.tor_max_requests = 0 + self.tor_requests = 0 + self.not_before = datetime.datetime(2024, 1, 1) + self.smtp_configured = False + self.mastodon = None + self.mastodon_prefix = '' + self.mastodon_suffix = '' + + self.update_user_agent(user_agent) + + f = open(self.output_file_path, 'w') + f.write('') + f.close() + + def configure_mastodon(self, access_token, instance, mastodon_prefix, + mastodon_suffix): + if access_token and access_token != "" and instance and instance != "": + self.mastodon = Mastodon( + access_token=access_token, + api_base_url=instance + ) + self.mastodon_prefix = mastodon_prefix + self.mastodon_suffix = mastodon_suffix + + def mastodon_toot(self, content): + if self.mastodon: + toot = content + if not self.mastodon_prefix == '': + toot = f"{self.mastodon_prefix}\n\n{toot}" + if not self.mastodon_suffix == '': + toot = f"{toot}\n\n{self.mastodon_suffix}" + self.mastodon.toot(toot) + + def enable_tor(self, max_requests=0): + proxies = { + "http": f"socks5h://127.0.0.1:9050", + "https": f"socks5h://127.0.0.1:9050", } - pages.append(page) - - # On recherche un pager, et si on le trouve on le suit - pager = 
soup.select(pager_element) - if pager and pager[0] and pager[0].get('href'): - for sub_page in self.get_sub_pages_with_pager(f"{host}{pager[0]['href']}", sub_page_element, pager_element, host): - pages.append(sub_page) - - return pages - - def get_raa_with_pager(self, pages_list, pager_element, host): - elements = [] - # On parse chaque page passée en paramètre - for page in pages_list: - page_content = self.get_page(page, 'get').content - - # Pour chaque page, on récupère les PDF - for raa in self.get_raa_elements(page_content): - elements.append(raa) - - # On regarde également s'il n'y aurait pas un pager - sub_pages = [] - for sub_page in self.get_sub_pages(page_content, pager_element, host, True): - sub_pages.append(sub_page['url']) - for sub_raa in self.get_raa_with_pager(sub_pages, pager_element, host): - elements.append(sub_raa) - return elements - - def set_sleep_time(self, sleep_time): - self.sleep_time = sleep_time - - def has_pdf(self, page_content): - elements = [] - soup = BeautifulSoup(page_content, 'html.parser') - for a in soup.find_all('a', href=True): - if a['href'].endswith('.pdf'): - return True - return False - - # On démarre le navigateur - def get_session(self, url, wait_element=""): - webdriver_options = webdriver.ChromeOptions() - webdriver_options.add_argument("--no-sandbox") - webdriver_options.add_argument("--disable-extensions") - webdriver_options.add_argument("--disable-gpu") - webdriver_options.add_argument("--disable-dev-shm-usage") - webdriver_options.add_argument("--use_subprocess") - webdriver_options.add_argument("--disable-blink-features=AutomationControlled") - - if not self.user_agent == "": - webdriver_options.add_argument(f"--user-agent={self.user_agent}") - - webdriver_options.add_argument("--headless") - webdriver_options.add_argument("--window-size=1024,768") - display = Display(visible=False, size=(1024, 768)) - display.start() - - browser = webdriver.Chrome(options=webdriver_options) - - # Téléchargement de l'URL - browser.get(url) - - if not wait_element == "": - # On attend que le navigateur ait passé les tests anti-robots et que le contenu s'affiche - WebDriverWait(browser, 120).until(expected_conditions.presence_of_element_located((By.ID, wait_element))) - page_content = browser.page_source - - # On récupère les cookies du navigateur pour les réutiliser plus tard - for cookie in browser.get_cookies(): - self.session.cookies.set(cookie['name'], cookie['value']) - - # On arrête le navigateur - browser.quit() - display.stop() - - return page_content - - def print_output(self, data): - print(data) - data = data.replace('\033[92m', '') - data = data.replace('\033[0m', '') - data = data.replace('\033[1m', '') - f = open(self.output_file_path,'a') - f.write(data+"\n") - f.close() - - def get_page(self, url, method, data={}): - try: - logger.debug(f'Chargement de la page {url}') - if self.sleep_time > 0: - time.sleep(self.sleep_time) - - page = None - if method == 'get': - page = self.session.get(url) - if method == 'post': - page = self.session.post(url, data=data) - - if page.status_code == 429: - logger.info(f'Erreur 429 Too Many Requests reçue, temporisation...') + self.tor_enabled = True + self.tor_max_requests = max_requests + self.tor_requests = 0 + self.session.proxies.update(proxies) self.tor_get_new_id() - time.sleep(55) - return self.get_page(url, method, data) - - if self.tor_enabled: - self.tor_requests+=1 - if self.tor_max_requests>0 and self.tor_requests>self.tor_max_requests: - self.tor_get_new_id() - - return page - except 
requests.exceptions.ConnectionError as exc: - logger.info(f'Erreur de connexion, temporisation...') - self.tor_get_new_id() - time.sleep(55) - return self.get_page(url, method, data) - - def update_user_agent(self, user_agent): - self.user_agent = user_agent - self.session.headers.update({'User-Agent': self.user_agent}) - - def download_file(self, raa): - try: - os.makedirs(os.path.dirname(f'{self.data_dir}{raa.get_sha256()}.pdf'), exist_ok=True) - file = self.get_page(raa.url, 'get') - f = open(f'{self.data_dir}{raa.get_sha256()}.pdf','wb') - f.write(file.content) - f.close() - except (requests.exceptions.ConnectionError, requests.exceptions.ChunkedEncodingError): - logger.warning(f'ATTENTION: la connexion a été interrompue pendant le téléchargement de {raa.url}, nouvelle tentative...') - self.download_file(raa) - except Exception as exc: - logger.warning(f'ATTENTION: Impossible de télécharger le fichier {raa.url}: {exc}') - - def parse_pdf(self, raa, keywords): - if not os.path.isfile(f'{self.data_dir}{raa.get_sha256()}.pdf'): - logger.warning(f'ATTENTION: le fichier {raa.get_sha256()}.pdf n\'existe pas') - else: - text = "" - try: - text = extract_text(f'{self.data_dir}{raa.get_sha256()}.pdf') - except Exception as exc: - logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {raa.get_sha256()}.pdf : {exc}') - - found = False - found_keywords = [] - for keyword in keywords: - if re.search(keyword, text, re.IGNORECASE|re.MULTILINE): - if not found: - url = quote(raa.url, safe='/:') - self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})') - self.print_output(f'URL : {url}') - found = True - self.found = True - self.print_output(f' Le terme \033[1m{keyword}\033[0m a été trouvé.') - found_keywords.append(keyword) - - # Écrit le texte du PDF dans un fichier texte pour une analyse future, puis supprime le PDF - f = open(f'{self.data_dir}{raa.get_sha256()}.txt','w') - f.write(text) - f.close() - os.remove(f'{self.data_dir}{raa.get_sha256()}.pdf') - if found: - self.print_output('') - url = quote(raa.url, safe='/:') - found_keywords_str = ', '.join([str(x) for x in found_keywords]) - self.mastodon_toot(f"{raa.name} ({raa.date_str})\n\nLes termes suivants ont été trouvés : {found_keywords_str}.\n\nURL : {url}") - - - def ocr(self, raa, retry_on_failure=True): - cmd = [ - 'ocrmypdf', - '-l', 'eng+fra', - '--output-type', 'pdfa', - '--redo-ocr', - '--skip-big', '500', - '--invalidate-digital-signatures', - f'{self.data_dir}{raa.get_sha256()}.pdf', - f'{self.data_dir}{raa.get_sha256()}.pdf' - ] - logger.debug(f'Lancement de ocrmypdf: {cmd}') - try: - output = subprocess.check_output(cmd, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as exc: - if exc.returncode == 2 and retry_on_failure: - logger.warning('ATTENTION : Le fichier n\'est pas un PDF correct, nouvelle tentative de le télécharger') - if self.tor_enabled: - self.tor_get_new_id() - self.download_file(raa) - self.ocr(raa,False) - elif (not exc.returncode == 6) and (not exc.returncode == 10): - logger.warning('ATTENTION : Impossible d\'OCRiser le document', exc.returncode, exc.output) - - def parse_raa(self, elements, keywords): - for raa in elements: - # Si le fichier n'a pas déjà été parsé et qu'il est postérieur à la date maximale d'analyse, - # on le télécharge et on le parse - if (raa.date >= self.not_before) and (not os.path.isfile(f'{self.data_dir}{raa.get_sha256()}.txt')): - url = quote(raa.url, safe='/:') - logger.info(f'Nouveau fichier : {raa.name} ({raa.date_str}). 
URL : {url}') - self.download_file(raa) - self.ocr(raa, True) - self.parse_pdf(raa, keywords) - - def get_raa(self, page_content): - logger.error('Cette fonction doit être surchargée') - - def configure_mailer(self, smtp_host, smtp_username, smtp_password, - smtp_port, smtp_starttls, smtp_ssl, email_from, - email_to, email_object): - self.smtp_host = smtp_host - self.smtp_username = smtp_username - self.smtp_password = smtp_password - if smtp_port <= 0: - self.smtp_port = 587 - else: - self.smtp_port = int(smtp_port) - self.smtp_starttls = smtp_starttls - self.smtp_ssl = smtp_ssl - self.email_from = email_from - self.email_to = email_to - self.email_object = email_object - - if smtp_host and smtp_username and smtp_password and email_from and email_to and email_object: - self.smtp_configured = True - - def mailer(self): - if self.smtp_configured and self.found: - try: - message = email.message.EmailMessage() - message.set_content(open(self.output_file_path).read()) - - message['Subject'] = self.email_object - message['From'] = self.email_from - message['Message-ID'] = email.utils.make_msgid(domain=self.email_from.split('@')[-1]) - message['Date'] = email.utils.formatdate() - - context = ssl.create_default_context() - - if self.smtp_ssl == True: - for address in self.email_to.split(','): - del message['To'] - message['To'] = address - smtp = smtplib.SMTP_SSL(self.smtp_host, port, context=context) - if self.smtp_username: - smtp.login(self.smtp_username, self.smtp_password) - smtp.send_message(message) - smtp.quit() - elif self.smtp_starttls == True: - for address in self.email_to.split(','): - del message['To'] - message['To'] = address - smtp = smtplib.SMTP(self.smtp_host) - smtp.starttls(context=context) - if self.smtp_username: - smtp.login(self.smtp_username, self.smtp_password) - smtp.send_message(message) - smtp.quit() + + def disable_tor(self): + proxies = {} + self.tor_enabled = False + self.tor_max_requests = 0 + self.tor_requests = 0 + self.session.proxies.update(proxies) + + def tor_get_new_id(self): + logger.info('Changement d\'identité Tor') + try: + controller = Controller.from_port(port=9051) + controller.authenticate() + controller.signal(Signal.NEWNYM) + time.sleep(5) + self.tor_requests = 0 + except Exception as exc: + logger.debug(f'Impossible de changer d\'identité Tor: {exc}') + + def get_sub_pages(self, page_content, element, host, recursive_until_pdf): + soup = BeautifulSoup(page_content, 'html.parser') + sub_pages = [] + for a in soup.select(element): + if a.get('href'): + url = f"{host}{a['href']}" + if recursive_until_pdf: + sub_page_content = self.get_page(url, 'get').content + if not self.has_pdf(sub_page_content): + logger.info( + f'{url} ne contient pas de PDF, on ' + 'récupère ses sous-pages' + ) + for sub_sub_page in self.get_sub_pages( + sub_page_content, + element, + host, + recursive_until_pdf + ): + sub_pages.append(sub_sub_page) + else: + sub_page = { + 'url': url, + 'name': a.get_text().strip() + } + sub_pages.append(sub_page) + else: + sub_page = { + 'url': url, + 'name': a.get_text().strip() + } + sub_pages.append(sub_page) + return sub_pages + + def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, + host): + pages = [] + page_content = self.get_page(page, 'get').content + + # On initialise le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On recherche les sous-pages + sub_pages = soup.select(sub_page_element) + for sub_page in sub_pages: + if sub_page.get('href'): + page = { + 'url': f"{host}{sub_page['href']}", 
+ 'name': sub_page.get_text().strip() + } + pages.append(page) + + # On recherche un pager, et si on le trouve on le suit + pager = soup.select(pager_element) + if pager and pager[0] and pager[0].get('href'): + for sub_page in self.get_sub_pages_with_pager( + f"{host}{pager[0]['href']}", + sub_page_element, + pager_element, + host + ): + pages.append(sub_page) + + return pages + + def get_raa_with_pager(self, pages_list, pager_element, host): + elements = [] + # On parse chaque page passée en paramètre + for page in pages_list: + page_content = self.get_page(page, 'get').content + + # Pour chaque page, on récupère les PDF + for raa in self.get_raa_elements(page_content): + elements.append(raa) + + # On regarde également s'il n'y aurait pas un pager + sub_pages = [] + for sub_page in self.get_sub_pages( + page_content, + pager_element, + host, + True + ): + sub_pages.append(sub_page['url']) + for sub_raa in self.get_raa_with_pager( + sub_pages, + pager_element, + host + ): + elements.append(sub_raa) + return elements + + def set_sleep_time(self, sleep_time): + self.sleep_time = sleep_time + + def has_pdf(self, page_content): + elements = [] + soup = BeautifulSoup(page_content, 'html.parser') + for a in soup.find_all('a', href=True): + if a['href'].endswith('.pdf'): + return True + return False + + # On démarre le navigateur + def get_session(self, url, wait_element=""): + webdriver_options = webdriver.ChromeOptions() + webdriver_options.add_argument("--no-sandbox") + webdriver_options.add_argument("--disable-extensions") + webdriver_options.add_argument("--disable-gpu") + webdriver_options.add_argument("--disable-dev-shm-usage") + webdriver_options.add_argument("--use_subprocess") + webdriver_options.add_argument( + "--disable-blink-features=AutomationControlled" + ) + + if not self.user_agent == "": + webdriver_options.add_argument(f"--user-agent={self.user_agent}") + + webdriver_options.add_argument("--headless") + webdriver_options.add_argument("--window-size=1024,768") + display = Display(visible=False, size=(1024, 768)) + display.start() + + browser = webdriver.Chrome(options=webdriver_options) + + # Téléchargement de l'URL + browser.get(url) + + if not wait_element == "": + # On attend que le navigateur ait passé les tests anti-robots et + # que le contenu s'affiche + WebDriverWait(browser, 120).until( + expected_conditions.presence_of_element_located( + ( + By.ID, + wait_element + ) + ) + ) + page_content = browser.page_source + + # On récupère les cookies du navigateur pour les réutiliser plus tard + for cookie in browser.get_cookies(): + self.session.cookies.set(cookie['name'], cookie['value']) + + # On arrête le navigateur + browser.quit() + display.stop() + + return page_content + + def print_output(self, data): + print(data) + data = data.replace('\033[92m', '') + data = data.replace('\033[0m', '') + data = data.replace('\033[1m', '') + f = open(self.output_file_path, 'a') + f.write(data+"\n") + f.close() + + def get_page(self, url, method, data={}): + try: + logger.debug(f'Chargement de la page {url}') + if self.sleep_time > 0: + time.sleep(self.sleep_time) + + page = None + if method == 'get': + page = self.session.get(url) + if method == 'post': + page = self.session.post(url, data=data) + + if page.status_code == 429: + logger.info( + 'Erreur 429 Too Many Requests reçue, temporisation...' 
+ ) + self.tor_get_new_id() + time.sleep(55) + return self.get_page(url, method, data) + + if self.tor_enabled: + self.tor_requests += 1 + if self.tor_max_requests > 0 and \ + self.tor_requests > self.tor_max_requests: + self.tor_get_new_id() + + return page + except requests.exceptions.ConnectionError as exc: + logger.info(f'Erreur de connexion, temporisation...') + self.tor_get_new_id() + time.sleep(55) + return self.get_page(url, method, data) + + def update_user_agent(self, user_agent): + self.user_agent = user_agent + self.session.headers.update({'User-Agent': self.user_agent}) + + def download_file(self, raa): + try: + os.makedirs( + os.path.dirname( + f'{self.data_dir}{raa.get_sha256()}.pdf' + ), + exist_ok=True + ) + file = self.get_page(raa.url, 'get') + f = open(f'{self.data_dir}{raa.get_sha256()}.pdf', 'wb') + f.write(file.content) + f.close() + except (requests.exceptions.ConnectionError, + requests.exceptions.ChunkedEncodingError): + logger.warning( + 'ATTENTION: la connexion a été interrompue pendant le ' + f'téléchargement de {raa.url}, nouvelle tentative...' + ) + self.download_file(raa) + except Exception as exc: + logger.warning( + f'ATTENTION: Impossible de télécharger le fichier {raa.url}: ' + f'{exc}' + ) + + def parse_pdf(self, raa, keywords): + if not os.path.isfile(f'{self.data_dir}{raa.get_sha256()}.pdf'): + logger.warning( + f'ATTENTION: le fichier {raa.get_sha256()}.pdf n\'existe pas' + ) + else: + text = "" + try: + text = extract_text(f'{self.data_dir}{raa.get_sha256()}.pdf') + except Exception as exc: + logger.warning( + 'ATTENTION: Impossible d\'extraire le texte du fichier ' + f'{raa.get_sha256()}.pdf : {exc}' + ) + + found = False + found_keywords = [] + for keyword in keywords: + if re.search(keyword, text, re.IGNORECASE | re.MULTILINE): + if not found: + url = quote(raa.url, safe='/:') + self.print_output( + f'\033[92m{raa.name}\033[0m ({raa.date_str})' + ) + self.print_output(f'URL : {url}') + found = True + self.found = True + self.print_output( + f' Le terme \033[1m{keyword}\033[0m a été trouvé.' 
+ ) + found_keywords.append(keyword) + + # Écrit le texte du PDF dans un fichier texte pour une analyse + # future, puis supprime le PDF + f = open(f'{self.data_dir}{raa.get_sha256()}.txt', 'w') + f.write(text) + f.close() + os.remove(f'{self.data_dir}{raa.get_sha256()}.pdf') + if found: + self.print_output('') + url = quote(raa.url, safe='/:') + found_keywords_str = ', '.join( + [str(x) for x in found_keywords] + ) + self.mastodon_toot( + f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont ' + f'été trouvés : {found_keywords_str}.\n\nURL : {url}' + ) + + def ocr(self, raa, retry_on_failure=True): + cmd = [ + 'ocrmypdf', + '-l', 'eng+fra', + '--output-type', 'pdfa', + '--redo-ocr', + '--skip-big', '500', + '--invalidate-digital-signatures', + f'{self.data_dir}{raa.get_sha256()}.pdf', + f'{self.data_dir}{raa.get_sha256()}.pdf' + ] + logger.debug(f'Lancement de ocrmypdf: {cmd}') + try: + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as exc: + if exc.returncode == 2 and retry_on_failure: + logger.warning( + 'ATTENTION : Le fichier n\'est pas un PDF correct, ' + 'nouvelle tentative de le télécharger' + ) + if self.tor_enabled: + self.tor_get_new_id() + self.download_file(raa) + self.ocr(raa, False) + elif (not exc.returncode == 6) and (not exc.returncode == 10): + logger.warning( + 'ATTENTION : Impossible d\'OCRiser le document: ' + f'{exc.returncode} {exc.output}' + ) + + def parse_raa(self, elements, keywords): + for raa in elements: + # Si le fichier n'a pas déjà été parsé et qu'il est postérieur à la + # date maximale d'analyse, on le télécharge et on le parse + if raa.date >= self.not_before and \ + not os.path.isfile( + f'{self.data_dir}{raa.get_sha256()}.txt' + ): + url = quote(raa.url, safe='/:') + logger.info( + f'Nouveau fichier : {raa.name} ({raa.date_str}). URL : ' + f'{url}' + ) + self.download_file(raa) + self.ocr(raa, True) + self.parse_pdf(raa, keywords) + + def get_raa(self, page_content): + logger.error('Cette fonction doit être surchargée') + + def configure_mailer(self, smtp_host, smtp_username, smtp_password, + smtp_port, smtp_starttls, smtp_ssl, email_from, + email_to, email_object): + self.smtp_host = smtp_host + self.smtp_username = smtp_username + self.smtp_password = smtp_password + if smtp_port <= 0: + self.smtp_port = 587 else: - for address in self.email_to.split(','): - del message['To'] - message['To'] = address - smtp = smtplib.SMTP(self.smtp_host) - if self.smtp_username: - smtp.login(self.smtp_username, self.smtp_password) - smtp.send_message(message) - smtp.quit() - except Exception as exc: - logger.warning(f'Impossible d\'envoyer le courrier électronique : {exc}') - - # Fonction qui essaie de deviner la date d'un RAA à partir de son nom. - # Utile pour limiter les requêtes lors de l'obtention des RAA à scanner. 
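For context on the ocr() hunk above: the retry logic keys off ocrmypdf's documented exit codes. Code 2 signals an invalid input PDF (worth re-downloading once), while 6 (a prior text layer was found) and 10 (PDF/A conversion failed) are treated as benign. A minimal standalone sketch of the same invocation and exit-code handling, assuming ocrmypdf is installed; 'sample.pdf' is an illustrative path, not one the project uses:

import subprocess

# Same flags as the ocr() method in this patch; the file is OCRed in
# place (input and output are the same path).
cmd = [
    'ocrmypdf',
    '-l', 'eng+fra',                     # OCR in English and French
    '--output-type', 'pdfa',
    '--redo-ocr',
    '--skip-big', '500',                 # skip images over 500 megapixels
    '--invalidate-digital-signatures',
    'sample.pdf',
    'sample.pdf',
]
try:
    subprocess.check_output(cmd, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as exc:
    if exc.returncode == 2:
        print('input is not a valid PDF; re-download and retry once')
    elif exc.returncode not in (6, 10):
        print(f'OCR failed with exit code {exc.returncode}')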
- def guess_date(string, regex): - try: - search = re.search(regex, string, re.IGNORECASE) - guessed_date = dateparser.parse(search.group(1)) - if guessed_date == None: - raise Exception('La date est un objet None') - else: - return guessed_date - except Exception as exc: - logger.warning(f"Impossible de deviner la date du terme {string} : {exc}") - return datetime.datetime(9999, 1, 1) + self.smtp_port = int(smtp_port) + self.smtp_starttls = smtp_starttls + self.smtp_ssl = smtp_ssl + self.email_from = email_from + self.email_to = email_to + self.email_object = email_object + + if smtp_host and smtp_username and smtp_password and email_from and \ + email_to and email_object: + self.smtp_configured = True + + def mailer(self): + if self.smtp_configured and self.found: + try: + message = email.message.EmailMessage() + message.set_content(open(self.output_file_path).read()) + + message['Subject'] = self.email_object + message['From'] = self.email_from + message['Message-ID'] = email.utils.make_msgid( + domain=self.email_from.split('@')[-1] + ) + message['Date'] = email.utils.formatdate() + + context = ssl.create_default_context() + + if self.smtp_ssl is True: + for address in self.email_to.split(','): + del message['To'] + message['To'] = address + smtp = smtplib.SMTP_SSL( + self.smtp_host, + self.smtp_port, + context=context + ) + if self.smtp_username: + smtp.login(self.smtp_username, self.smtp_password) + smtp.send_message(message) + smtp.quit() + elif self.smtp_starttls is True: + for address in self.email_to.split(','): + del message['To'] + message['To'] = address + smtp = smtplib.SMTP(self.smtp_host) + smtp.starttls(context=context) + if self.smtp_username: + smtp.login(self.smtp_username, self.smtp_password) + smtp.send_message(message) + smtp.quit() + else: + for address in self.email_to.split(','): + del message['To'] + message['To'] = address + smtp = smtplib.SMTP(self.smtp_host) + if self.smtp_username: + smtp.login(self.smtp_username, self.smtp_password) + smtp.send_message(message) + smtp.quit() + except Exception as exc: + logger.warning( + f'Impossible d\'envoyer le courrier électronique : {exc}' + ) + + # Fonction qui essaie de deviner la date d'un RAA à partir de son nom. + # Utile pour limiter les requêtes lors de l'obtention des RAA à scanner. 
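The guess_date() helper that follows relies on dateparser, which parses French month names out of the box. A short illustration of its regex-then-parse flow, using a hypothetical recueil title together with the pattern RAAspotter_pref38 passes in later in this patch:

import re

import dateparser

# Hypothetical title; the regex is the one used by RAAspotter_pref38.
name = 'RAA spécial n° 12 du 3 mars 2024'
regex = '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)'

search = re.search(regex, name, re.IGNORECASE)
print(dateparser.parse(search.group(1)))  # 2024-03-03 00:00:00

# On any parsing failure the method logs a warning and returns the
# far-future sentinel datetime.datetime(9999, 1, 1), so an unguessable
# RAA is never filtered out by the not_before check.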
+ def guess_date(string, regex): + try: + search = re.search(regex, string, re.IGNORECASE) + guessed_date = dateparser.parse(search.group(1)) + if guessed_date is None: + raise Exception('La date est un objet None') + else: + return guessed_date + except Exception as exc: + logger.warning( + f'Impossible de deviner la date du terme {string} : {exc}' + ) + return datetime.datetime(9999, 1, 1) diff --git a/RAAspotter_ppparis.py b/RAAspotter_ppparis.py index f2ebb8b..d605eb3 100644 --- a/RAAspotter_ppparis.py +++ b/RAAspotter_ppparis.py @@ -5,50 +5,60 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_ppparis(RAAspotter): - # Config - __HOST = 'https://www.prefecturedepolice.interieur.gouv.fr' - __RAA_PAGE = f'{__HOST}/actualites-et-presse/arretes/accueil-arretes' - __WAIT_ELEMENT = 'block-decree-list-block' - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' - full_name = 'Préfecture de police de Paris' - short_code = 'ppparis' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - - def get_raa(self, keywords): - self.print_output('RAAspotter_ppparis') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') - - page_content = self.get_session() - raa_elements = self.get_raa_elements(page_content) - self.parse_raa(raa_elements, keywords.split(',')) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse - for a in soup.find_all('a', href=True): - if a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = 'https://www.prefecturedepolice.interieur.gouv.fr'+a['href'] - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').get_text() - date = datetime.datetime.strptime(a.find('div', class_="field--type-datetime").get_text().strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements - - def get_session(self): - return super().get_session(self.__RAA_PAGE, self.__WAIT_ELEMENT) + # Config + __HOST = 'https://www.prefecturedepolice.interieur.gouv.fr' + __RAA_PAGE = f'{__HOST}/actualites-et-presse/arretes/accueil-arretes' + __WAIT_ELEMENT = 'block-decree-list-block' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \ + '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture de police de Paris' + short_code = 'ppparis' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + + def get_raa(self, keywords): + self.print_output('RAAspotter_ppparis') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + page_content = self.get_session() + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords.split(',')) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le + # parse + for a in soup.find_all('a', href=True): + if a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = 'https://www.prefecturedepolice.interieur.gouv.fr' \ + + a['href'] + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').get_text() + date = datetime.datetime.strptime( 
+ a.find( + 'div', + class_="field--type-datetime" + ).get_text().strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements + + def get_session(self): + return super().get_session(self.__RAA_PAGE, self.__WAIT_ELEMENT) diff --git a/RAAspotter_pref04.py b/RAAspotter_pref04.py index 118afb5..f4953fe 100644 --- a/RAAspotter_pref04.py +++ b/RAAspotter_pref04.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,52 +6,72 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref04(RAAspotter): - # Config - __HOST = 'https://www.alpes-de-haute-provence.gouv.fr' - __RAA_PAGE = f'{__HOST}/Publications/Publications-administratives-et-legales/Recueil-des-Actes-Administratifs' - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture des Alpes-de-Haute-Provence' - short_code = 'pref04' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - self.print_output('RAAspotter_pref04') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') - - pages = [] - page_content = self.get_page(self.__RAA_PAGE, 'get').content - for sub_page in self.get_sub_pages(page_content, 'div.fr-card__body div.fr-card__content h2.fr-card__title a', self.__HOST, False): - if RAAspotter.guess_date(sub_page['name'], '([0-9]{4}).*').year >= self.not_before.year: - sub_page_content = self.get_page(sub_page['url'], 'get').content - raa_elements = self.get_raa_elements(sub_page_content) - self.parse_raa(raa_elements, keywords.split(',')) - - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + # Config + __HOST = 'https://www.alpes-de-haute-provence.gouv.fr' + __RAA_PAGE = f'{__HOST}/Publications/Publications-administratives-et-'\ + 'legales/Recueil-des-Actes-Administratifs' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 '\ + 'Firefox/115.0' + full_name = 'Préfecture des Alpes-de-Haute-Provence' + short_code = 'pref04' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + self.print_output('RAAspotter_pref04') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + pages = [] + page_content = self.get_page(self.__RAA_PAGE, 'get').content + for sub_page in self.get_sub_pages( + page_content, + 'div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + ): + if RAAspotter.guess_date( + sub_page['name'], + '([0-9]{4}).*' + ).year >= self.not_before.year: + sub_page_content = self.get_page( + sub_page['url'], + 'get' + ).content + raa_elements = 
self.get_raa_elements(sub_page_content) + self.parse_raa(raa_elements, keywords.split(',')) + + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le + # parse + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y') + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref06.py b/RAAspotter_pref06.py index e58f018..8f8e272 100644 --- a/RAAspotter_pref06.py +++ b/RAAspotter_pref06.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,91 +6,131 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref06(RAAspotter): - # Config - __HOST = 'https://www.alpes-maritimes.gouv.fr' - __RAA_PAGE = {'2024': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-mensuels', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-speciaux', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-specifiques'], - '2023': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-mensuels', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-speciaux', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-specifiques'], - '2022': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-mensuels', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-speciaux', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-specifiques'], - '2021': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-mensuels', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-speciaux', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-specifiques'], - '2020': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-mensuels', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-speciaux', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-specifiques'], - '2019': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-mensuels', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-speciaux', - f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-specifiques']} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture des Alpes-Maritimes' - short_code = 'pref06' + # Config + __HOST = 'https://www.alpes-maritimes.gouv.fr' + __RAA_PAGE = { + '2024': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2024/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2024/Recueils-speciaux', + 
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2024/Recueils-specifiques' + ], + '2023': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2023/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2023/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2023/Recueils-specifiques' + ], + '2022': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2022/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2022/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2022/Recueils-specifiques' + ], + '2021': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2021/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2021/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2021/Recueils-specifiques' + ], + '2020': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2020/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2020/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2020/Recueils-specifiques' + ], + '2019': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2019/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2019/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA' + '/Annee-2019/Recueils-specifiques' + ] + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \ + 'Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture des Alpes-Maritimes' + short_code = 'pref06' - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(20) + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(20) - def get_raa(self, keywords): - self.print_output('RAAspotter_pref06') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') + def get_raa(self, keywords): + self.print_output('RAAspotter_pref06') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') - pages_to_parse = [] - if self.not_before.year <= 2024: - for page in self.__RAA_PAGE['2024']: - pages_to_parse.append(page) - if self.not_before.year <= 2023: - for page in self.__RAA_PAGE['2023']: - pages_to_parse.append(page) - if self.not_before.year <= 2022: - for page in self.__RAA_PAGE['2022']: - pages_to_parse.append(page) - if self.not_before.year <= 2021: - for page in self.__RAA_PAGE['2021']: - pages_to_parse.append(page) - if self.not_before.year <= 2020: - for page in self.__RAA_PAGE['2020']: - pages_to_parse.append(page) - if self.not_before.year <= 2019: - for page in self.__RAA_PAGE['2019']: - pages_to_parse.append(page) + pages_to_parse = [] + if self.not_before.year <= 2024: + for page in self.__RAA_PAGE['2024']: + pages_to_parse.append(page) + if self.not_before.year <= 2023: + for page in self.__RAA_PAGE['2023']: + pages_to_parse.append(page) + if self.not_before.year <= 2022: + for page in self.__RAA_PAGE['2022']: + pages_to_parse.append(page) + if self.not_before.year <= 2021: + for page in self.__RAA_PAGE['2021']: + pages_to_parse.append(page) + if self.not_before.year <= 2020: + for page in self.__RAA_PAGE['2020']: + pages_to_parse.append(page) + if 
self.not_before.year <= 2019: + for page in self.__RAA_PAGE['2019']: + pages_to_parse.append(page) - elements = self.get_raa_with_pager(pages_to_parse, ".fr-pagination__link.fr-pagination__link--next", self.__HOST) - self.parse_raa(elements, keywords.split(',')) - self.mailer() + elements = self.get_raa_with_pager( + pages_to_parse, + ".fr-pagination__link.fr-pagination__link--next", + self.__HOST + ) + self.parse_raa(elements, keywords.split(',')) + self.mailer() - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') - # Pour chaque élément fr-card__content, on cherche sa balise a, et si c'est un PDF on le parse - cards = soup.find_all('div', class_='fr-card__content') - for card in cards: - a = card.find('a') - if a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] + # Pour chaque élément fr-card__content, on cherche sa balise a, et si + # c'est un PDF on le parse + cards = soup.find_all('div', class_='fr-card__content') + for card in cards: + a = card.find('a') + if a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] - url = unquote(url) - name = a.get_text().strip() - date = datetime.datetime.strptime(card.find('p', class_='fr-card__detail').get_text().replace('Publié le ', '').strip(), '%d/%m/%Y') - filename = url.split('/')[-1] + url = unquote(url) + name = a.get_text().strip() + date = datetime.datetime.strptime( + card.find( + 'p', + class_='fr-card__detail' + ).get_text().replace( + 'Publié le ', + '' + ).strip(), '%d/%m/%Y') + filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref13.py b/RAAspotter_pref13.py index 0285e12..48fab5c 100644 --- a/RAAspotter_pref13.py +++ b/RAAspotter_pref13.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,54 +6,69 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref13(RAAspotter): - # Config - __HOST = 'https://www.bouches-du-rhone.gouv.fr' - __RAA_PAGE = [f'{__HOST}/Publications/RAA-et-Archives/RAA-2024', - f'{__HOST}/Publications/RAA-et-Archives/RAA-2023', - f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2022', - f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2021', - f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2020', - f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2019'] - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' - full_name = 'Préfecture des Bouches-du-Rhône' - short_code = 'pref13' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - self.print_output('RAAspotter_pref13') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') - - for raa_page in self.__RAA_PAGE: - page_content = self.get_page(raa_page, 'get').content - raa_elements = self.get_raa_elements(page_content) - self.parse_raa(raa_elements, 
keywords.split(',')) - - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + # Config + __HOST = 'https://www.bouches-du-rhone.gouv.fr' + __RAA_PAGE = [ + f'{__HOST}/Publications/RAA-et-Archives/RAA-2024', + f'{__HOST}/Publications/RAA-et-Archives/RAA-2023', + f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-' + 'Rhone/RAA-2022', + f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-' + 'Rhone/RAA-2021', + f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-' + 'Rhone/RAA-2020', + f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-' + 'Rhone/RAA-2019' + ] + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \ + '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture des Bouches-du-Rhône' + short_code = 'pref13' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + self.print_output('RAAspotter_pref13') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + for raa_page in self.__RAA_PAGE: + page_content = self.get_page(raa_page, 'get').content + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords.split(',')) + + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le + # parse + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref34.py b/RAAspotter_pref34.py index 7681435..f52c531 100644 --- a/RAAspotter_pref34.py +++ b/RAAspotter_pref34.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,68 +6,84 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref34(RAAspotter): - # Config - __HOST = 'https://www.herault.gouv.fr' - __RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2024', - '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023', - '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022', - '2021': 
f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021', - '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020', - '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Archives/Recueil-des-actes-administratifs-2019'} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' - full_name = 'Préfecture de l\'Hérault' - short_code = 'pref34' + # Config + __HOST = 'https://www.herault.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2024', + '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2023', + '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2022', + '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2021', + '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2020', + '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/Archives/Recueil-des-actes-administratifs-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \ + '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture de l\'Hérault' + short_code = 'pref34' - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) - def get_raa(self, keywords): - self.print_output('RAAspotter_pref34') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') + def get_raa(self, keywords): + self.print_output('RAAspotter_pref34') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - raa_elements = self.get_raa_elements(page_content) - self.parse_raa(raa_elements, keywords.split(',')) + for raa_page in pages_to_parse: + page_content = self.get_page(raa_page, 'get').content + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords.split(',')) - self.mailer() + self.mailer() - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = 
BeautifulSoup(page_content, 'html.parser') + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] + # On récupère chaque balise a + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref35.py b/RAAspotter_pref35.py index 6d18b4f..8e16411 100644 --- a/RAAspotter_pref35.py +++ b/RAAspotter_pref35.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,54 +6,76 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref35(RAAspotter): - # Config - __HOST = 'https://www.ille-et-vilaine.gouv.fr' - __RAA_PAGE = [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2019'] - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' - full_name = 'Préfecture d\'Ille-et-Vilaine' - short_code = 'pref35' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - self.print_output('RAAspotter_pref35') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') - - for raa_page in self.__RAA_PAGE: - page_content = self.get_page(raa_page, 'get').content - raa_elements = self.get_raa_elements(page_content) - self.parse_raa(raa_elements, keywords.split(',')) - - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse - for a in soup.find_all('a', href=True, class_='fr-link--download'): - if a['href'].endswith('.pdf'): - if 
a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + # Config + __HOST = 'https://www.ille-et-vilaine.gouv.fr' + __RAA_PAGE = [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueil-des-' + 'actes-administratifs-2024', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-' + 'des-recueils-des-actes-administratifs/Recueil-des-actes-' + 'administratifs-2023', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-' + 'des-recueils-des-actes-administratifs/Recueil-des-actes-' + 'administratifs-2022', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-' + 'des-recueils-des-actes-administratifs/Recueil-des-actes-' + 'administratifs-2021', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-' + 'des-recueils-des-actes-administratifs/Recueil-des-actes-' + 'administratifs-2020', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-' + 'des-recueils-des-actes-administratifs/Recueil-des-actes-' + 'administratifs-2019' + ] + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \ + '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture d\'Ille-et-Vilaine' + short_code = 'pref35' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + self.print_output('RAAspotter_pref35') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + for raa_page in self.__RAA_PAGE: + page_content = self.get_page(raa_page, 'get').content + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords.split(',')) + + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le + # parse + for a in soup.find_all('a', href=True, class_='fr-link--download'): + if a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref38.py b/RAAspotter_pref38.py index 8b5123a..42a521a 100644 --- a/RAAspotter_pref38.py +++ b/RAAspotter_pref38.py @@ -1,4 +1,4 @@ -import os, sys, re +import os import datetime import logging @@ -9,81 +9,120 @@ from RAAspotter import RAAspotter logger = logging.getLogger(__name__) + class RAAspotter_pref38(RAAspotter): - # Config - __HOST = 'https://www.isere.gouv.fr' - __RAA_PAGE = {'2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024', - '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023', - '2022': 
f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022', - '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021', - '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020', - '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019'} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture de l\'Isère' - short_code = 'pref38' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(20) - - def get_raa(self, keywords): - self.print_output('RAAspotter_pref38') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') - - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - raa_elements = self.get_raa_elements(page_content, raa_page) - self.parse_raa(raa_elements, keywords.split(',')) - self.mailer() - - def get_raa_elements(self, page_content, raa_page): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère le select qui contient la liste des RAA - select_list = soup.select('select#-liste-docs')[0] - # On analyse chaque résultat - for option in select_list.find_all('option'): - if not option['value'] == "": - # On estime la date à partir du nom de fichier - guessed_date = RAAspotter.guess_date(option['title'], '.* n°[ 0-9]* du ([0-9]*(?:er)? 
[a-zéû]* [0-9]*)') - - # Si la date estimée correspond à la plage d'analyse, on demande au serveur les détails du RAA - if guessed_date >= self.not_before: - page_content = self.get_page(raa_page, 'post', {'-liste-docs':option['value']}).content - # On parse la page de détails pour obtenir les propriétés du RAA - soup = BeautifulSoup(page_content, 'html.parser') - a = soup.select('div.liste_deroulante a.fr-link.fr-link--download')[0] - - # Si la page contient une balise a qui renvoie vers un pdf, c'est qu'on a obtenu les détails du RAA demandé, donc on le parse - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + # Config + __HOST = 'https://www.isere.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024', + '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023', + '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-' + 'Isere-2022', + '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Archives/Archives-des-recueils-des-actes-administratifs-de-la-' + 'prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-' + 'prefecture-de-l-Isere-2021', + '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Archives/Archives-des-recueils-des-actes-administratifs-de-la-' + 'prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-' + 'Prefecture-de-l-Isere-2020', + '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-' + 'prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-' + 'Administratifs-de-la-prefecture-de-l-Isere-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \ + 'Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture de l\'Isère' + short_code = 'pref38' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(20) + + def get_raa(self, keywords): + self.print_output('RAAspotter_pref38') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) + + for raa_page in pages_to_parse: + page_content = self.get_page(raa_page, 'get').content + raa_elements = self.get_raa_elements(page_content, raa_page) + self.parse_raa(raa_elements, keywords.split(',')) + self.mailer() + + def get_raa_elements(self, page_content, raa_page): + elements = [] + # On charge 
le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère le select qui contient la liste des RAA + select_list = soup.select('select#-liste-docs')[0] + # On analyse chaque résultat + for option in select_list.find_all('option'): + if not option['value'] == "": + # On estime la date à partir du nom de fichier + guessed_date = RAAspotter.guess_date( + option['title'], + '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)' + ) + + # Si la date estimée correspond à la plage d'analyse, on + # demande au serveur les détails du RAA + if guessed_date >= self.not_before: + page_content = self.get_page( + raa_page, + 'post', + { + '-liste-docs': option['value'] + } + ).content + + # On parse la page de détails pour obtenir les propriétés + # du RAA + soup = BeautifulSoup(page_content, 'html.parser') + a = soup.select( + 'div.liste_deroulante a.fr-link.fr-link--download' + )[0] + + # Si la page contient une balise a qui renvoie vers un pdf, + # c'est qu'on a obtenu les détails du RAA demandé, donc + # on le parse + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py index e5078d8..7911488 100644 --- a/RAAspotter_pref59.py +++ b/RAAspotter_pref59.py @@ -1,4 +1,4 @@ -import os, sys, re +import os import datetime import dateparser import logging @@ -10,70 +10,94 @@ from RAAspotter import RAAspotter logger = logging.getLogger(__name__) + class RAAspotter_pref59(RAAspotter): - # Config - __HOST = 'https://www.nord.gouv.fr' - __RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2024', - '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2023', - '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2022', - '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2021', - '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2020', - '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2019'} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture du Nord' - short_code = 'pref59' + # Config + __HOST = 'https://www.nord.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/RAA-du-departement-du-Nord/2024', + '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/RAA-du-departement-du-Nord/2023', + '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/RAA-du-departement-du-Nord/2022', + '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/RAA-du-departement-du-Nord/2021', + '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/RAA-du-departement-du-Nord/2020', + '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs' + '/RAA-du-departement-du-Nord/2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \ + 'Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture du 
Nord' + short_code = 'pref59' - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(20) + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(20) - def get_raa(self, keywords): - self.print_output('RAAspotter_pref59') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') + def get_raa(self, keywords): + self.print_output('RAAspotter_pref59') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True) - for sub_page in sub_pages[::-1]: - sub_page_content = self.get_page(sub_page['url'], 'get').content - sub_raa_elements = self.get_raa_elements(sub_page_content) - self.parse_raa(sub_raa_elements, keywords.split(',')) - self.mailer() + for raa_page in pages_to_parse: + page_content = self.get_page(raa_page, 'get').content + sub_pages = self.get_sub_pages( + page_content, + "div.fr-card__body div.fr-card__content h2.fr-card__title a", + self.__HOST, + True + ) + for sub_page in sub_pages[::-1]: + sub_page_content = self.get_page( + sub_page['url'], + 'get' + ).content + sub_raa_elements = self.get_raa_elements(sub_page_content) + self.parse_raa(sub_raa_elements, keywords.split(',')) + self.mailer() - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] + # On récupère chaque balise a + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 
'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref62.py b/RAAspotter_pref62.py index 7b1767d..c792ff9 100644 --- a/RAAspotter_pref62.py +++ b/RAAspotter_pref62.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,86 +6,118 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref62(RAAspotter): - # Config - __HOST = 'https://www.pas-de-calais.gouv.fr' - __RAA_PAGE = {'2024': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs/2024-Recueils-des-actes-administratifs'], - '2023': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-des-actes-administratifs', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-speciaux-des-actes-administratifs'], - '2022': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-des-Actes-Administratifs', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-Speciaux-des-Actes-Administratifs'], - '2021': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-des-actes-administratifs', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-speciaux-des-actes-administratifs'], - '2020': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-des-actes-administratifs', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-speciaux-des-actes-administratifs'], - '2019': - [f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueil-des-actes-administratifs', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueils-speciaux-des-actes-administratifs']} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture du Pas-de-Calais' - short_code = 'pref62' + # Config + __HOST = 'https://www.pas-de-calais.gouv.fr' + __RAA_PAGE = { + '2024': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2024-Recueils-des-actes-administratifs' + ], + '2023': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2023-Recueils-des-actes-administratifs', + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2023-Recueils-speciaux-des-actes-administratifs' + ], + '2022': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2022-Recueils-des-Actes-Administratifs', + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2022-Recueils-Speciaux-des-Actes-Administratifs' + ], + '2021': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2021-Recueils-des-actes-administratifs', + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2021-Recueils-speciaux-des-actes-administratifs' + ], + '2020': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2020-Recueils-des-actes-administratifs', + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2020-Recueils-speciaux-des-actes-administratifs' + ], + '2019': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2019-Recueil-des-actes-administratifs', + f'{__HOST}/Publications/Recueil-des-actes-administratifs' + '/2019-Recueils-speciaux-des-actes-administratifs' + ] + } + __USER_AGENT = 
'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \ + 'Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture du Pas-de-Calais' + short_code = 'pref62' - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(20) + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(20) - def get_raa(self, keywords): - self.print_output('RAAspotter_pref62') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') + def get_raa(self, keywords): + self.print_output('RAAspotter_pref62') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') - pages_to_parse = [] - if self.not_before.year <= 2024: - for page in self.__RAA_PAGE['2024']: - pages_to_parse.append(page) - if self.not_before.year <= 2023: - for page in self.__RAA_PAGE['2023']: - pages_to_parse.append(page) - if self.not_before.year <= 2022: - for page in self.__RAA_PAGE['2022']: - pages_to_parse.append(page) - if self.not_before.year <= 2021: - for page in self.__RAA_PAGE['2021']: - pages_to_parse.append(page) - if self.not_before.year <= 2020: - for page in self.__RAA_PAGE['2020']: - pages_to_parse.append(page) - if self.not_before.year <= 2019: - for page in self.__RAA_PAGE['2019']: - pages_to_parse.append(page) + pages_to_parse = [] + if self.not_before.year <= 2024: + for page in self.__RAA_PAGE['2024']: + pages_to_parse.append(page) + if self.not_before.year <= 2023: + for page in self.__RAA_PAGE['2023']: + pages_to_parse.append(page) + if self.not_before.year <= 2022: + for page in self.__RAA_PAGE['2022']: + pages_to_parse.append(page) + if self.not_before.year <= 2021: + for page in self.__RAA_PAGE['2021']: + pages_to_parse.append(page) + if self.not_before.year <= 2020: + for page in self.__RAA_PAGE['2020']: + pages_to_parse.append(page) + if self.not_before.year <= 2019: + for page in self.__RAA_PAGE['2019']: + pages_to_parse.append(page) - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - raa_elements = self.get_raa_elements(page_content) - self.parse_raa(raa_elements, keywords.split(',')) - self.mailer() + for raa_page in pages_to_parse: + page_content = self.get_page( + raa_page, + 'get' + ).content + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords.split(',')) + self.mailer() - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') - # On récupère le div qui contient la liste des RAA - cards = soup.select('div.fr-downloads-group.fr-downloads-group--bordered')[0] - # On analyse chaque balise a dans ce div - for a in cards.find_all('a', href=True): - if a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] + # On récupère le div qui contient la liste des RAA + cards = soup.select( + 'div.fr-downloads-group.fr-downloads-group--bordered' + )[0] + # On analyse chaque balise a dans ce div + for a in cards.find_all('a', href=True): + if a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = 
url.split('/')[-1] + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements[::-1] + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements[::-1] diff --git a/RAAspotter_pref65.py b/RAAspotter_pref65.py index 77de89a..2e5aaa7 100644 --- a/RAAspotter_pref65.py +++ b/RAAspotter_pref65.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,68 +6,85 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref65(RAAspotter): - # Config - __HOST = 'https://www.hautes-pyrenees.gouv.fr' - __RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2024', - '2023': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2023', - '2022': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2022', - '2021': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2021', - '2020': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2020', - '2019': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2019'} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture des Hautes-Pyrénées' - short_code = 'pref65' + # Config + __HOST = 'https://www.hautes-pyrenees.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/Recueil-d-actes-administratifs' + '/RAA-2024', + '2023': f'{__HOST}/Publications/Recueil-d-actes-administratifs' + '/RAA-2023', + '2022': f'{__HOST}/Publications/Recueil-d-actes-administratifs' + '/RAA-2022', + '2021': f'{__HOST}/Publications/Recueil-d-actes-administratifs' + '/RAA-2021', + '2020': f'{__HOST}/Publications/Recueil-d-actes-administratifs' + '/RAA-2020', + '2019': f'{__HOST}/Publications/Recueil-d-actes-administratifs' + '/RAA-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \ + 'Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture des Hautes-Pyrénées' + short_code = 'pref65' - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) - def get_raa(self, keywords): - self.print_output('RAAspotter_pref65') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') + def get_raa(self, keywords): + self.print_output('RAAspotter_pref65') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if 
self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - raa_elements = self.get_raa_elements(page_content) - self.parse_raa(raa_elements, keywords.split(',')) + for raa_page in pages_to_parse: + page_content = self.get_page(raa_page, 'get').content + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords.split(',')) - self.mailer() + self.mailer() - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') - # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] + # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le + # parse + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py index be16a7d..a3c0d8e 100644 --- a/RAAspotter_pref69.py +++ b/RAAspotter_pref69.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,80 +6,101 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref69(RAAspotter): - # Config - __HOST = 'https://www.rhone.gouv.fr' - __RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024', - '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023', - '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022', - '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021', - '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020', - '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019'} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture du Rhône' - short_code = 'pref69' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(20) - - def get_raa(self, keywords): - self.print_output('RAAspotter_pref69') - self.print_output(f'Termes recherchés: {keywords}') - 
self.print_output('') - - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - sub_pages_to_parse = [] - - for raa_page in pages_to_parse: - sub_pages = self.get_sub_pages_with_pager(raa_page, - "div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link", - "ul.fr-pagination__list li a.fr-pagination__link--next", - self.__HOST)[::-1] - for sub_page in sub_pages: - sub_pages_to_parse.append(sub_page['url']) - - elements = [] - for sub_page_to_parse in sub_pages_to_parse: - page_content = self.get_page(sub_page_to_parse, 'get').content - for element in self.get_raa_elements(page_content)[::-1]: - elements.append(element) - - self.parse_raa(elements, keywords.split(',')) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + # Config + __HOST = 'https://www.rhone.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-' + 'Rhone-RAA/Recueils-de-2024', + '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-' + 'Rhone-RAA/Recueils-de-2023', + '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-' + 'Rhone-RAA/Recueils-de-2022', + '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-' + 'Rhone-RAA/Recueils-de-2021', + '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-' + 'Rhone-RAA/Recueils-de-2020', + '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-' + 'Rhone-RAA/Recueils-de-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \ + 'Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture du Rhône' + short_code = 'pref69' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(20) + + def get_raa(self, keywords): + self.print_output('RAAspotter_pref69') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) + + sub_pages_to_parse = [] + + for 
raa_page in pages_to_parse: + sub_pages = self.get_sub_pages_with_pager( + raa_page, + 'div.fr-card__body div.fr-card__content ' + 'h2.fr-card__title a.fr-card__link', + "ul.fr-pagination__list li a.fr-pagination__link--next", + self.__HOST)[::-1] + for sub_page in sub_pages: + sub_pages_to_parse.append(sub_page['url']) + + elements = [] + for sub_page_to_parse in sub_pages_to_parse: + page_content = self.get_page( + sub_page_to_parse, + 'get' + ).content + for element in self.get_raa_elements(page_content)[::-1]: + elements.append(element) + + self.parse_raa(elements, keywords.split(',')) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère chaque balise a + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref83.py b/RAAspotter_pref83.py index 791d72c..7e6d9cf 100644 --- a/RAAspotter_pref83.py +++ b/RAAspotter_pref83.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,82 +6,105 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref83(RAAspotter): - # Config - __HOST = 'https://www.var.gouv.fr' - __RAA_PAGE = {'2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024', - '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2023', - '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2022', - '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2021', - '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2020', - '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2019'} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture du Var' - short_code = 'pref83' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - self.print_output('RAAspotter_pref83') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') - - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - sub_pages_to_parse = [] - - # Pour chaque année, on cherche les sous-pages de mois - for raa_page in pages_to_parse: - sub_pages_to_parse.append(raa_page) - page_content = self.get_page(raa_page, 
'get').content - month_pages = self.get_sub_pages( - page_content, - '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', - self.__HOST, - False - )[::-1] - for month_page in month_pages: - sub_pages_to_parse.append(month_page['url']) - - # On parse les pages contenant des RAA - elements = self.get_raa_with_pager(sub_pages_to_parse[::-1], ".fr-pagination__link.fr-pagination__link--next", self.__HOST) - self.parse_raa(elements, keywords.split(',')) - - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque section contenant un RAA - for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.get_text().strip() - date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + # Config + __HOST = 'https://www.var.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2024', + '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2023', + '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2022', + '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2021', + '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2020', + '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs' + '/Recueil-des-actes-administratifs-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \ + 'Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture du Var' + short_code = 'pref83' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + self.print_output('RAAspotter_pref83') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) + + sub_pages_to_parse = [] + + # Pour chaque année, on cherche les sous-pages de mois + for raa_page in pages_to_parse: + sub_pages_to_parse.append(raa_page) + page_content = self.get_page(raa_page, 'get').content + month_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link ' + 'div.fr-card__body ' + 'div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + for month_page in month_pages: + sub_pages_to_parse.append(month_page['url']) + + # On parse les pages contenant des RAA + elements = self.get_raa_with_pager( + 
sub_pages_to_parse[::-1], + ".fr-pagination__link.fr-pagination__link--next", + self.__HOST + ) + self.parse_raa(elements, keywords.split(',')) + + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère chaque section contenant un RAA + cards = soup.select( + 'div.fr-card__body div.fr-card__content ' + 'h2.fr-card__title a.fr-card__link.menu-item-link' + ) + for a in cards: + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.get_text().strip() + date = datetime.datetime.strptime( + a['title'].split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py index 39b28b6..f06eea5 100644 --- a/RAAspotter_pref976.py +++ b/RAAspotter_pref976.py @@ -1,4 +1,4 @@ -import os, sys +import os import datetime from bs4 import BeautifulSoup @@ -6,108 +6,137 @@ from urllib.parse import unquote from RAAspotter import RAAspotter + class RAAspotter_pref976(RAAspotter): - # Config - __HOST = 'https://www.mayotte.gouv.fr' - __RAA_PAGE = {'default': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A', - '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2024', - '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2023', - '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2022', - '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2021', - '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2020', - '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2019'} - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' - full_name = 'Préfecture de Mayotte' - short_code = 'pref976' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - self.print_output('RAAspotter_pref976') - self.print_output(f'Termes recherchés: {keywords}') - self.print_output('') - - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - sub_pages_to_parse = [self.__RAA_PAGE['default']] - - # Pour chaque année, on cherche les sous-pages de mois - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - month_pages = self.get_sub_pages( - page_content, - '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', - self.__HOST, - False - )[::-1] - - # On regarde aussi si sur la page de l'année il n'y aurait pas un RAA mal catégorisé - for page_to_parse in self.find_raa_card(raa_page): - sub_pages_to_parse.append(page_to_parse) - - # Pour chaque mois, on cherche les pages des RAA - for month_page in 
month_pages: - year = RAAspotter.guess_date(month_page['name'], '(.*)').year - for page_to_parse in self.find_raa_card(month_page['url'], year): - sub_pages_to_parse.append(page_to_parse) - - # On parse les pages contenant des RAA - for page in sub_pages_to_parse: - page_content = self.get_page(page, 'get').content - raa_elements = self.get_raa_elements(page_content) - self.parse_raa(raa_elements, keywords.split(',')) - self.mailer() - - def find_raa_card(self, page, year=None): - pages = [] - card_pages = self.get_sub_pages_with_pager( - page, - 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', - 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', - self.__HOST - )[::-1] - for card_page in card_pages: - # On filtre les pages de RAA ne correspondant pas à la période analysée - guessed_date = RAAspotter.guess_date(card_page['name'], 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)') - if year: - guessed_date = guessed_date.replace(year = year) - if guessed_date >= self.not_before: - pages.append(card_page['url']) - return pages - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - filename = url.split('/')[-1] - - raa = RAAspotter.RAA(url, date, name, filename) - elements.append(raa) - return elements + # Config + __HOST = 'https://www.mayotte.gouv.fr' + __RAA_PAGE = { + 'default': f'{__HOST}/Publications/Recueil-des-actes-administratifs-' + 'R.A.A', + '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A' + '/RAA-2024', + '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A' + '/RAA-2023', + '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A' + '/RAA-2022', + '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A' + '/RAA-2021', + '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A' + '/RAA-2020', + '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A' + '/RAA-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \ + '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture de Mayotte' + short_code = 'pref976' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + self.print_output('RAAspotter_pref976') + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) + + sub_pages_to_parse = [self.__RAA_PAGE['default']] + + # Pour chaque année, on cherche les 
sous-pages de mois + for raa_page in pages_to_parse: + page_content = self.get_page(raa_page, 'get').content + month_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link ' + 'div.fr-card__body div.fr-card__content ' + 'h2.fr-card__title a', + self.__HOST, + False + )[::-1] + + # On regarde aussi si sur la page de l'année il n'y aurait pas un + # RAA mal catégorisé + for page_to_parse in self.find_raa_card(raa_page): + sub_pages_to_parse.append(page_to_parse) + + # Pour chaque mois, on cherche les pages des RAA + for month_page in month_pages: + year = RAAspotter.guess_date(month_page['name'], '(.*)').year + for page_to_parse in self.find_raa_card( + month_page['url'], + year + ): + sub_pages_to_parse.append(page_to_parse) + + # On parse les pages contenant des RAA + for page in sub_pages_to_parse: + page_content = self.get_page(page, 'get').content + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords.split(',')) + self.mailer() + + def find_raa_card(self, page, year=None): + pages = [] + card_pages = self.get_sub_pages_with_pager( + page, + 'div.fr-card__body div.fr-card__content h2.fr-card__title ' + 'a.fr-card__link', + 'ul.fr-pagination__list li ' + 'a.fr-pagination__link.fr-pagination__link--next', + self.__HOST + )[::-1] + for card_page in card_pages: + # On filtre les pages de RAA ne correspondant pas à la période + # analysée + guessed_date = RAAspotter.guess_date( + card_page['name'], + 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)' + ) + if year: + guessed_date = guessed_date.replace(year=year) + if guessed_date >= self.not_before: + pages.append(card_page['url']) + return pages + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère chaque balise a + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace( + 'Télécharger ', + '' + ).strip() + date = datetime.datetime.strptime( + a.find('span').get_text().split(' - ')[-1].strip(), + '%d/%m/%Y' + ) + filename = url.split('/')[-1] + + raa = RAAspotter.RAA(url, date, name, filename) + elements.append(raa) + return elements diff --git a/cli.py b/cli.py index 150fbb5..995c87a 100755 --- a/cli.py +++ b/cli.py @@ -7,7 +7,8 @@ import importlib from RAAspotter import RAAspotter # Config -__KEYWORDS = os.getenv('KEYWORDS') or 'vidéoprotection,caméras,captation,aéronef' +__KEYWORDS = os.getenv('KEYWORDS') or \ + 'vidéoprotection,caméras,captation,aéronef' __DATA_DIR_ROOT = os.path.dirname(os.path.abspath(__file__))+'/data/' __SMTP_HOSTNAME = os.getenv('SMTP_HOSTNAME') or 'localhost' __SMTP_USERNAME = os.getenv('SMTP_USERNAME') or '' @@ -15,120 +16,204 @@ __SMTP_PASSWORD = os.getenv('SMTP_PASSWORD') or '' __EMAIL_FROM = os.getenv('EMAIL_FROM') __EMAIL_TO = os.getenv('EMAIL_TO') if os.getenv('SMTP_PORT'): - __SMTP_PORT = int(os.getenv('SMTP_PORT')) + __SMTP_PORT = int(os.getenv('SMTP_PORT')) else: - __SMTP_PORT = 587 + __SMTP_PORT = 587 if os.getenv('SMTP_STARTTLS'): - __SMTP_STARTTLS = True + __SMTP_STARTTLS = True else: - __SMTP_STARTTLS = False + __SMTP_STARTTLS = False if os.getenv('SMTP_SSL'): - __SMTP_SSL = True + __SMTP_SSL = True else: - __SMTP_SSL = False + __SMTP_SSL = False if os.getenv('NOT_BEFORE'): - __NOT_BEFORE = 
datetime.datetime.strptime(os.getenv('NOT_BEFORE'), '%Y-%m-%d') + __NOT_BEFORE = datetime.datetime.strptime( + os.getenv('NOT_BEFORE'), '%Y-%m-%d' + ) else: - __NOT_BEFORE = datetime.datetime(2024, 1, 1) + __NOT_BEFORE = datetime.datetime(2024, 1, 1) __MASTODON_ACCESS_TOKEN = os.getenv('MASTODON_ACCESS_TOKEN') __MASTODON_INSTANCE = os.getenv('MASTODON_INSTANCE') # Liste des préfectures supportées available_prefs = [ - 'ppparis', - 'pref04', - 'pref06', - 'pref13', - 'pref34', - 'pref35', - 'pref38', - 'pref59', - 'pref62', - 'pref65', - 'pref69', - 'pref83', - 'pref976' + 'ppparis', + 'pref04', + 'pref06', + 'pref13', + 'pref34', + 'pref35', + 'pref38', + 'pref59', + 'pref62', + 'pref65', + 'pref69', + 'pref83', + 'pref976' ] # Début du script -parser = argparse.ArgumentParser(prog='cli.py', description='Télécharge les RAA d\'une préfecture donnée et recherche des mots-clés') -parser.add_argument('-p', '--pref', action='store', help='identifiant de la préfecture', required=True, choices=available_prefs) -parser.add_argument('-k', '--keywords', action='store', help='liste des termes recherchés, séparés par une virgule (par défaut : vidéoprotection,caméras,captation,aéronef)') -parser.add_argument('--not-before', action='store', help='n\'analyse pas les RAA datant d\'avant la date indiquée, au format YYYY-MM-DD (par défaut : 2024-01-01)') -parser.add_argument('--smtp-hostname', action='store', help='nom d\'hôte SMTP (par défaut : localhost)') -parser.add_argument('--smtp-username', action='store', help='nom d\'utilisateur SMTP (par défaut : vide)') -parser.add_argument('--smtp-password', action='store', help='mot de passe SMTP (par défaut : vide)') -parser.add_argument('--smtp-port', action='store', help='port SMTP (par défaut : 587)') -parser.add_argument('--smtp-starttls', action='store_true', help='connexion SMTP avec STARTTLS') -parser.add_argument('--smtp-ssl', action='store_true', help='connexion SMTP avec SSL') -parser.add_argument('-f', '--email-from', action='store', help='adresse de courrier électronique expéditrice des notifications') -parser.add_argument('-t', '--email-to', action='store', help='adresses de courrier électronique destinataires des notifications (séparées par une virgule)') +parser = argparse.ArgumentParser( + prog='cli.py', + description='Télécharge les RAA d\'une préfecture donnée et recherche des ' + 'mots-clés' +) +parser.add_argument( + '-p', + '--pref', + action='store', + help='identifiant de la préfecture', + required=True, + choices=available_prefs +) +parser.add_argument( + '-k', + '--keywords', + action='store', + help='liste des termes recherchés, séparés par une virgule (par défaut : ' + 'vidéoprotection,caméras,captation,aéronef)' +) +parser.add_argument( + '--not-before', + action='store', + help='n\'analyse pas les RAA datant d\'avant la date indiquée, au format ' + 'YYYY-MM-DD (par défaut : 2024-01-01)' +) +parser.add_argument( + '--smtp-hostname', + action='store', + help='nom d\'hôte SMTP (par défaut : localhost)' +) +parser.add_argument( + '--smtp-username', + action='store', + help='nom d\'utilisateur SMTP (par défaut : vide)' +) +parser.add_argument( + '--smtp-password', + action='store', + help='mot de passe SMTP (par défaut : vide)' +) +parser.add_argument( + '--smtp-port', + action='store', + help='port SMTP (par défaut : 587)' +) +parser.add_argument( + '--smtp-starttls', + action='store_true', + help='connexion SMTP avec STARTTLS' +) +parser.add_argument( + '--smtp-ssl', + action='store_true', + help='connexion SMTP avec SSL' +) 
+parser.add_argument(
+    '-f',
+    '--email-from',
+    action='store',
+    help='adresse de courrier électronique expéditrice des notifications'
+)
+parser.add_argument(
+    '-t',
+    '--email-to',
+    action='store',
+    help='adresses de courrier électronique destinataires des notifications '
+         '(séparées par une virgule)'
+)
 
 for pref in available_prefs:
-    parser.add_argument(f'--{pref}-email-to', action='store', help=f'adresses de courrier électronique destinataires des notifications (séparées par une virgule) uniquement si l\'analyse concerne {pref} (s\'ajoute à celles précisées dans --email-to)')
-
-parser.add_argument('--mastodon-access-token', action='store', help='jeton d\'accès pour publier sur Mastodon (par défaut : vide)')
-parser.add_argument('--mastodon-instance', action='store', help='URL de l\'instance (doit inclure "http://" ou "https://" ; par défaut : vide)')
-parser.add_argument('-v', action='store_true', help='relève le niveau de verbosité à INFO')
-parser.add_argument('-vv', action='store_true', help='relève le niveau de verbosité à DEBUG')
+    parser.add_argument(
+        f'--{pref}-email-to',
+        action='store',
+        help='adresses de courrier électronique destinataires des '
+             'notifications (séparées par une virgule) uniquement si '
+             f'l\'analyse concerne {pref} (s\'ajoute à celles précisées '
+             'dans --email-to)'
+    )
+
+parser.add_argument(
+    '--mastodon-access-token',
+    action='store',
+    help='jeton d\'accès pour publier sur Mastodon (par défaut : vide)'
+)
+parser.add_argument(
+    '--mastodon-instance',
+    action='store',
+    help='URL de l\'instance (doit inclure "http://" ou "https://" ; par '
+         'défaut : vide)'
+)
+parser.add_argument(
+    '-v',
+    action='store_true',
+    help='relève le niveau de verbosité à INFO'
+)
+parser.add_argument(
+    '-vv',
+    action='store_true',
+    help='relève le niveau de verbosité à DEBUG'
+)
 
 args = parser.parse_args()
 
 if args.v or os.getenv('VERBOSE'):
-    logging.basicConfig(level=logging.INFO)
+    logging.basicConfig(level=logging.INFO)
 if args.vv or os.getenv('VVERBOSE'):
-    logging.basicConfig(level=logging.DEBUG)
+    logging.basicConfig(level=logging.DEBUG)
 
 if args.keywords:
-    __KEYWORDS = args.keywords
+    __KEYWORDS = args.keywords
 
 if args.not_before:
-    __NOT_BEFORE = datetime.datetime.strptime(args.not_before, '%Y-%m-%d')
+    __NOT_BEFORE = datetime.datetime.strptime(args.not_before, '%Y-%m-%d')
 
 if args.smtp_hostname:
-    __SMTP_HOSTNAME = args.smtp_hostname
+    __SMTP_HOSTNAME = args.smtp_hostname
 
 if args.smtp_username:
-    __SMTP_USERNAME = args.smtp_username
+    __SMTP_USERNAME = args.smtp_username
 
 if args.smtp_password:
-    __SMTP_PASSWORD = args.smtp_password
+    __SMTP_PASSWORD = args.smtp_password
 
 if args.smtp_port:
-    __SMTP_PORT = int(args.smtp_port)
+    __SMTP_PORT = int(args.smtp_port)
 
 if args.smtp_starttls:
-    __SMTP_STARTTLS = True
+    __SMTP_STARTTLS = True
 
 if args.smtp_ssl:
-    __SMTP_SSL = True
+    __SMTP_SSL = True
 
 if args.email_from:
-    __EMAIL_FROM = args.email_from
+    __EMAIL_FROM = args.email_from
 
 if args.email_to:
-    __EMAIL_TO = args.email_to
+    __EMAIL_TO = args.email_to
 
 if args.mastodon_access_token:
-    __MASTODON_ACCESS_TOKEN = args.mastodon_access_token
+    __MASTODON_ACCESS_TOKEN = args.mastodon_access_token
 
 if args.mastodon_instance:
-    __MASTODON_INSTANCE = args.mastodon_instance
+    __MASTODON_INSTANCE = args.mastodon_instance
 
 __DATA_DIR = f'{__DATA_DIR_ROOT}{args.pref}/'
 
-# On calcule la liste des mails à notifier (liste générale EMAIL_TO + liste prefecture EMAIL_TO_PREF**)
+# On calcule la liste des mails à notifier (liste générale EMAIL_TO + liste
+# prefecture EMAIL_TO_PREF**)
 __PREF_EMAIL_TO = ''
 pref_var_name = f'{args.pref}_EMAIL_TO'.upper()
 if os.getenv(pref_var_name):
-    __PREF_EMAIL_TO = os.getenv(pref_var_name)
+    __PREF_EMAIL_TO = os.getenv(pref_var_name)
 else:
-    for arg in vars(args).items():
-        if arg[0] == f'{args.pref}_email_to':
-            __PREF_EMAIL_TO = arg[1]
+    for arg in vars(args).items():
+        if arg[0] == f'{args.pref}_email_to':
+            __PREF_EMAIL_TO = arg[1]
 
 if __PREF_EMAIL_TO and not __PREF_EMAIL_TO == '':
-    __EMAIL_TO = f'{__EMAIL_TO},{__PREF_EMAIL_TO}'
+    __EMAIL_TO = f'{__EMAIL_TO},{__PREF_EMAIL_TO}'
 
 # On crée le dossier de téléchargement
 os.makedirs(__DATA_DIR, exist_ok=True)
@@ -137,8 +222,12 @@ module = importlib.import_module(f'RAAspotter_{args.pref}')
 raa_spotter = getattr(module, f'RAAspotter_{args.pref}')(__DATA_DIR)
 
 raa_spotter.not_before = __NOT_BEFORE
-raa_spotter.configure_mailer(__SMTP_HOSTNAME, __SMTP_USERNAME, __SMTP_PASSWORD, __SMTP_PORT,
-                             __SMTP_STARTTLS, __SMTP_SSL, __EMAIL_FROM, __EMAIL_TO,
-                             f'[RAAspotter] [{raa_spotter.full_name}] Nouveaux éléments trouvés')
-raa_spotter.configure_mastodon(__MASTODON_ACCESS_TOKEN, __MASTODON_INSTANCE, f'[{raa_spotter.full_name}]', f'#{raa_spotter.short_code}')
+raa_spotter.configure_mailer(__SMTP_HOSTNAME, __SMTP_USERNAME, __SMTP_PASSWORD,
+                             __SMTP_PORT, __SMTP_STARTTLS, __SMTP_SSL,
+                             __EMAIL_FROM, __EMAIL_TO,
+                             f'[RAAspotter] [{raa_spotter.full_name}] '
+                             'Nouveaux éléments trouvés')
+raa_spotter.configure_mastodon(__MASTODON_ACCESS_TOKEN, __MASTODON_INSTANCE,
+                               f'[{raa_spotter.full_name}]',
+                               f'#{raa_spotter.short_code}')
 raa_spotter.get_raa(__KEYWORDS)
-- 
GitLab