From c64b88407a60000885ccabdb2504dd9968b6186a Mon Sep 17 00:00:00 2001 From: Bastien Le Querrec <blq@laquadrature.net> Date: Fri, 19 Apr 2024 17:45:36 +0200 Subject: [PATCH] RAAspotter --> Attrap --- Attrap.py | 595 ++++++++++++++++++++++++++++++++++++++++++++++ Attrap_ppparis.py | 48 ++++ Attrap_pref04.py | 60 +++++ Attrap_pref05.py | 99 ++++++++ Attrap_pref06.py | 105 ++++++++ Attrap_pref09.py | 72 ++++++ Attrap_pref13.py | 59 +++++ Attrap_pref31.py | 71 ++++++ Attrap_pref33.py | 108 +++++++++ Attrap_pref34.py | 73 ++++++ Attrap_pref35.py | 60 +++++ Attrap_pref38.py | 101 ++++++++ Attrap_pref42.py | 78 ++++++ Attrap_pref44.py | 108 +++++++++ Attrap_pref59.py | 85 +++++++ Attrap_pref62.py | 98 ++++++++ Attrap_pref64.py | 101 ++++++++ Attrap_pref65.py | 73 ++++++ Attrap_pref66.py | 133 +++++++++++ Attrap_pref69.py | 85 +++++++ Attrap_pref80.py | 98 ++++++++ Attrap_pref81.py | 117 +++++++++ Attrap_pref83.py | 90 +++++++ Attrap_pref87.py | 106 +++++++++ Attrap_pref976.py | 120 ++++++++++ 25 files changed, 2743 insertions(+) create mode 100644 Attrap.py create mode 100644 Attrap_ppparis.py create mode 100644 Attrap_pref04.py create mode 100644 Attrap_pref05.py create mode 100644 Attrap_pref06.py create mode 100644 Attrap_pref09.py create mode 100644 Attrap_pref13.py create mode 100644 Attrap_pref31.py create mode 100644 Attrap_pref33.py create mode 100644 Attrap_pref34.py create mode 100644 Attrap_pref35.py create mode 100644 Attrap_pref38.py create mode 100644 Attrap_pref42.py create mode 100644 Attrap_pref44.py create mode 100644 Attrap_pref59.py create mode 100644 Attrap_pref62.py create mode 100644 Attrap_pref64.py create mode 100644 Attrap_pref65.py create mode 100644 Attrap_pref66.py create mode 100644 Attrap_pref69.py create mode 100644 Attrap_pref80.py create mode 100644 Attrap_pref81.py create mode 100644 Attrap_pref83.py create mode 100644 Attrap_pref87.py create mode 100644 Attrap_pref976.py diff --git a/Attrap.py b/Attrap.py new file mode 100644 index 0000000..273de90 --- /dev/null +++ b/Attrap.py @@ -0,0 +1,595 @@ +import os +import re +import ssl +import subprocess +import shutil +import logging +import requests +import time +import datetime +import json +from urllib.parse import quote + +from selenium import webdriver +from selenium.common.exceptions import TimeoutException +from selenium.webdriver.common.by import By +from selenium.webdriver.support.wait import WebDriverWait +from selenium.webdriver.support import expected_conditions + +import dateparser + +from bs4 import BeautifulSoup +from pyvirtualdisplay import Display + +from pypdf import PdfReader +from pypdf import PdfWriter +from pypdf.generic import NameObject, NumberObject + +from stem import Signal +from stem.control import Controller + +import hashlib +import smtplib +import email + +from mastodon import Mastodon + +logger = logging.getLogger(__name__) + + +class Attrap: + class RAA: + url = "" + date = datetime.datetime(1970, 1, 1) + date_str = "" + name = "" + sha256 = "" + pdf_creation_date = None + pdf_modification_date = None + + def __init__(self, url, date, name): + if not url == "": + self.url = url + if not date == "": + self.date = date + self.date_str = date.strftime("%d/%m/%Y") + if not name == "": + self.name = name + + def get_sha256(self): + if (self.sha256 == ""): + self.sha256 = hashlib.sha256(self.url.encode('utf-8')).hexdigest() + return self.sha256 + + def get_pdf_dates(self, data_dir): + raa_data_dir = f'{data_dir}/raa/' + + reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.pdf') + 
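+            # pypdf parses the PDF's DocInfo CreationDate/ModDate entries into
+            # datetime objects; either attribute may be None if the producer
+            # omitted it, hence the two guards below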
pdf_metadata = reader.metadata + + if pdf_metadata.creation_date: + self.pdf_creation_date = pdf_metadata.creation_date + + if pdf_metadata.modification_date: + self.pdf_modification_date = pdf_metadata.modification_date + + def extract_content(self, data_dir): + raa_data_dir = f'{data_dir}/raa/' + + text = "" + + reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + for page in reader.pages: + try: + text = text + "\n" + page.extract_text() + except Exception as exc: + logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}') + + # Écrit le texte du PDF dans un fichier texte pour une analyse future + f = open(f'{raa_data_dir}{self.get_sha256()}.txt', 'w') + f.write(text) + f.close() + + # Supprime le PDF d'origine et la version OCRisée + os.remove(f'{raa_data_dir}{self.get_sha256()}.pdf') + os.remove(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf') + os.remove(f'{raa_data_dir}{self.get_sha256()}.flat.pdf') + + def write_properties(self, data_dir): + raa_data_dir = f'{data_dir}/raa/' + + pdf_creation_date_json = None + pdf_modification_date_json = None + + if self.pdf_creation_date: + pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S") + if self.pdf_modification_date: + pdf_modification_date_json = self.pdf_modification_date.strftime("%d/%m/%Y %H:%M:%S") + + properties = { + 'name': self.name, + 'date': self.date_str, + 'url': quote(self.url, safe='/:'), + 'first_saw_on': datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S"), + 'pdf_creation_date': pdf_creation_date_json, + 'pdf_modification_date': pdf_modification_date_json + } + f = open(f'{raa_data_dir}{self.get_sha256()}.json', 'w') + f.write(json.dumps(properties)) + f.close() + + def parse_metadata(self, data_dir): + self.get_pdf_dates(data_dir) + self.write_properties(data_dir) + + def __init__(self, data_dir, user_agent=''): + logger.debug('Initialisation de Attrap') + + # On crée le dossier de téléchargement + os.makedirs(data_dir, exist_ok=True) + + self.session = requests.Session() + self.data_dir = data_dir + self.found = False + self.output_file_path = os.path.dirname(os.path.abspath(__file__)) + f'/output_{self.short_code}.log' + self.sleep_time = 0 + self.tor_enabled = False + self.tor_max_requests = 0 + self.tor_requests = 0 + self.not_before = datetime.datetime(2024, 1, 1) + self.smtp_configured = False + self.mastodon = None + self.mastodon_prefix = '' + self.mastodon_suffix = '' + + self.update_user_agent(user_agent) + + f = open(self.output_file_path, 'w') + f.write('') + f.close() + + self.print_output(str(self.__class__.__name__)) + + def configure_mastodon(self, access_token, instance, mastodon_prefix, mastodon_suffix): + if access_token and access_token != "" and instance and instance != "": + self.mastodon = Mastodon( + access_token=access_token, + api_base_url=instance + ) + self.mastodon_prefix = mastodon_prefix + self.mastodon_suffix = mastodon_suffix + + def mastodon_toot(self, content): + if self.mastodon: + toot = content + if not self.mastodon_prefix == '': + toot = f"{self.mastodon_prefix}\n\n{toot}" + if not self.mastodon_suffix == '': + toot = f"{toot}\n\n{self.mastodon_suffix}" + self.mastodon.toot(toot) + + def enable_tor(self, max_requests=0): + proxies = { + "http": f"socks5h://127.0.0.1:9050", + "https": f"socks5h://127.0.0.1:9050", + } + self.tor_enabled = True + self.tor_max_requests = max_requests + self.tor_requests = 0 + self.session.proxies.update(proxies) + self.tor_get_new_id() + + def disable_tor(self): + 
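+        # update() with an empty dict removes nothing, so drop the Tor proxy
+        # explicitly to really revert to a direct connection
+        self.session.proxies.clear()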
proxies = {} + self.tor_enabled = False + self.tor_max_requests = 0 + self.tor_requests = 0 + self.session.proxies.update(proxies) + + def tor_get_new_id(self): + if self.tor_enabled: + logger.info('Changement d\'identité Tor') + try: + self.session.close() + controller = Controller.from_port(port=9051) + controller.authenticate() + controller.signal(Signal.NEWNYM) + time.sleep(5) + self.tor_requests = 0 + except Exception as exc: + logger.debug(f'Impossible de changer d\'identité Tor: {exc}') + + def get_sub_pages(self, page_content, element, host, recursive_until_pdf): + soup = BeautifulSoup(page_content, 'html.parser') + sub_pages = [] + for a in soup.select(element): + if a.get('href'): + url = f"{host}{a['href']}" + if recursive_until_pdf: + sub_page_content = self.get_page(url, 'get').content + if not self.has_pdf(sub_page_content): + logger.info( + f'{url} ne contient pas de PDF, on récupère ses sous-pages' + ) + for sub_sub_page in self.get_sub_pages( + sub_page_content, + element, + host, + recursive_until_pdf + ): + sub_pages.append(sub_sub_page) + else: + sub_page = { + 'url': url, + 'name': a.get_text().strip() + } + sub_pages.append(sub_page) + else: + sub_page = { + 'url': url, + 'name': a.get_text().strip() + } + sub_pages.append(sub_page) + return sub_pages + + def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host): + pages = [] + page_content = self.get_page(page, 'get').content + + # On initialise le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On recherche les sous-pages + sub_pages = soup.select(sub_page_element) + sub_pages_details = None + if details_element is not None: + sub_pages_details = soup.select(details_element) + i = 0 + for sub_page in sub_pages: + if sub_page.get('href'): + page = { + 'url': f"{host}{sub_page['href']}", + 'name': sub_page.get_text().strip(), + 'details': '' + } + if details_element is not None: + page['details'] = sub_pages_details[i].get_text().strip() + pages.append(page) + i = i + 1 + + # On recherche un pager, et si on le trouve on le suit + pager = soup.select(pager_element) + if pager and pager[0] and pager[0].get('href'): + for sub_page in self.get_sub_pages_with_pager( + f"{host}{pager[0]['href']}", + sub_page_element, + pager_element, + details_element, + host + ): + pages.append(sub_page) + + return pages + + def get_raa_with_pager(self, pages_list, pager_element, host): + elements = [] + # On parse chaque page passée en paramètre + for page in pages_list: + page_content = self.get_page(page, 'get').content + + # Pour chaque page, on récupère les PDF + for raa in self.get_raa_elements(page_content): + elements.append(raa) + + # On regarde également s'il n'y aurait pas un pager + sub_pages = [] + for sub_page in self.get_sub_pages( + page_content, + pager_element, + host, + True + ): + sub_pages.append(sub_page['url']) + for sub_raa in self.get_raa_with_pager( + sub_pages, + pager_element, + host + ): + elements.append(sub_raa) + return elements + + def set_sleep_time(self, sleep_time): + self.sleep_time = sleep_time + + def has_pdf(self, page_content): + elements = [] + soup = BeautifulSoup(page_content, 'html.parser') + for a in soup.find_all('a', href=True): + if a['href'].endswith('.pdf'): + return True + return False + + # On démarre le navigateur + def get_session(self, url, wait_element, remaining_retries=0): + webdriver_options = webdriver.ChromeOptions() + webdriver_options.add_argument("--no-sandbox") + webdriver_options.add_argument("--disable-extensions") + 
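+        # The flags below keep headless Chrome stable in containers
+        # (--disable-dev-shm-usage) and mask the automation fingerprint
+        # (--disable-blink-features=AutomationControlled) from basic anti-bot checks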
webdriver_options.add_argument("--disable-gpu") + webdriver_options.add_argument("--disable-dev-shm-usage") + webdriver_options.add_argument("--use_subprocess") + webdriver_options.add_argument("--disable-blink-features=AutomationControlled") + + if not self.user_agent == "": + webdriver_options.add_argument(f"--user-agent={self.user_agent}") + + webdriver_options.add_argument("--headless") + webdriver_options.add_argument("--window-size=1024,768") + display = Display(visible=False, size=(1024, 768)) + display.start() + + browser = webdriver.Chrome(options=webdriver_options) + + # Téléchargement de l'URL + browser.get(url) + + if wait_element is not None: + # On attend que le navigateur ait passé les tests anti-robots et + # que le contenu s'affiche + try: + WebDriverWait(browser, 60).until( + expected_conditions.presence_of_element_located( + ( + By.ID, + wait_element + ) + ) + ) + except TimeoutException as exc: + logger.warning(f'TimeoutException: {exc}') + if remaining_retries > 0: + time.sleep(5) + return self.get_session(url, wait_element, (remaining_retries - 1)) + else: + raise TimeoutException(exc) + + page_content = browser.page_source + + # On récupère les cookies du navigateur pour les réutiliser plus tard + for cookie in browser.get_cookies(): + self.session.cookies.set(cookie['name'], cookie['value']) + + # On arrête le navigateur + browser.quit() + display.stop() + + return page_content + + def print_output(self, data): + print(data) + data = data.replace('\033[92m', '') + data = data.replace('\033[0m', '') + data = data.replace('\033[1m', '') + f = open(self.output_file_path, 'a') + f.write(data + "\n") + f.close() + + def get_page(self, url, method, data={}): + try: + logger.debug(f'Chargement de la page {url}') + if self.sleep_time > 0: + time.sleep(self.sleep_time) + + page = None + if method == 'get': + page = self.session.get(url, timeout=(10, 120)) + if method == 'post': + page = self.session.post(url, data=data, timeout=(10, 120)) + + if page.status_code == 429: + logger.warning('Erreur 429 Too Many Requests reçue, temporisation...') + self.tor_get_new_id() + time.sleep(55) + return self.get_page(url, method, data) + + if self.tor_enabled: + self.tor_requests += 1 + if self.tor_max_requests > 0 and \ + self.tor_requests > self.tor_max_requests: + self.tor_get_new_id() + + return page + except requests.exceptions.ConnectionError: + logger.warning(f'Erreur de connexion, temporisation...') + self.tor_get_new_id() + time.sleep(55) + return self.get_page(url, method, data) + except requests.exceptions.Timeout: + logger.warning(f'Timeout, on relance la requête...') + return self.get_page(url, method, data) + + def update_user_agent(self, user_agent): + self.user_agent = user_agent + self.session.headers.update({'User-Agent': self.user_agent}) + + def download_file(self, raa): + try: + os.makedirs( + os.path.dirname(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'), + exist_ok=True + ) + file = self.get_page(raa.url, 'get') + f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb') + f.write(file.content) + f.close() + except (requests.exceptions.ConnectionError, + requests.exceptions.ChunkedEncodingError): + logger.warning(f'ATTENTION: la connexion a été interrompue pendant le téléchargement de {raa.url}, nouvelle tentative...') + self.download_file(raa) + except Exception as exc: + logger.warning(f'ATTENTION: Impossible de télécharger le fichier {raa.url}: {exc}') + + def ocr(self, raa, retry_on_failure=True): + cmd = [ + 'ocrmypdf', + '-l', 'eng+fra', + 
'--output-type', 'pdf', + '--redo-ocr', + '--skip-big', '500', + '--invalidate-digital-signatures', + '--optimize', '0', + f'{self.data_dir}/raa/{raa.get_sha256()}.flat.pdf', + f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf' + ] + logger.debug(f'Lancement de ocrmypdf: {cmd}') + try: + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as exc: + if exc.returncode == 2 and retry_on_failure: + logger.warning('ATTENTION : Le fichier n\'est pas un PDF correct, nouvelle tentative de le télécharger') + if self.tor_enabled: + self.tor_get_new_id() + self.download_file(raa) + self.ocr(raa, False) + elif (not exc.returncode == 6) and (not exc.returncode == 10) and (not exc.returncode == 4): + logger.warning('ATTENTION : Impossible d\'OCRiser le document', exc.returncode, exc.output) + shutil.copy(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf') + + def flatten_pdf(self, raa): + # OCRmyPDF ne sait pas gérer les formulaires, donc on les enlève avant OCRisation + reader = PdfReader(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf') + writer = PdfWriter() + + for page in reader.pages: + if page.get('/Annots'): + for annot in page.get('/Annots'): + writer_annot = annot.get_object() + writer_annot.update({ + NameObject("/Ff"): NumberObject(1) + }) + writer.add_page(page) + writer.write(f'{self.data_dir}/raa/{raa.get_sha256()}.flat.pdf') + + def search_keywords(self, raa, keywords): + if keywords and not keywords == '': + text = open(f'{self.data_dir}/raa/{raa.get_sha256()}.txt').read() + + found = False + found_keywords = [] + for keyword in keywords.split(','): + if re.search(keyword, text, re.IGNORECASE | re.MULTILINE): + if not found: + url = quote(raa.url, safe='/:') + self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})') + self.print_output(f'URL : {url}') + found = True + self.found = True + self.print_output(f' Le terme \033[1m{keyword}\033[0m a été trouvé.') + found_keywords.append(keyword) + + if found: + self.print_output('') + url = quote(raa.url, safe='/:') + found_keywords_str = ', '.join( + [str(x) for x in found_keywords] + ) + self.mastodon_toot( + f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont ' + f'été trouvés : {found_keywords_str}.\n\nURL : {url}' + ) + + def parse_raa(self, elements, keywords): + self.print_output(f'Termes recherchés: {keywords}') + self.print_output('') + + for raa in elements: + # Si le fichier n'a pas déjà été parsé et qu'il est postérieur à la + # date maximale d'analyse, on le télécharge et on le parse + if raa.date >= self.not_before and not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.txt'): + url = quote(raa.url, safe='/:') + logger.info(f'Nouveau fichier : {raa.name} ({raa.date_str}). 
URL : {url}')
+                self.download_file(raa)
+                raa.parse_metadata(self.data_dir)
+                self.flatten_pdf(raa)
+                self.ocr(raa, True)
+                raa.extract_content(self.data_dir)
+                self.search_keywords(raa, keywords)
+
+    def get_raa(self, page_content):
+        logger.error('Cette fonction doit être surchargée')
+
+    def configure_mailer(self, smtp_host, smtp_username, smtp_password,
+                         smtp_port, smtp_starttls, smtp_ssl, email_from,
+                         email_to, email_object):
+        self.smtp_host = smtp_host
+        self.smtp_username = smtp_username
+        self.smtp_password = smtp_password
+        if smtp_port <= 0:
+            self.smtp_port = 587
+        else:
+            self.smtp_port = int(smtp_port)
+        self.smtp_starttls = smtp_starttls
+        self.smtp_ssl = smtp_ssl
+        self.email_from = email_from
+        self.email_to = email_to
+        self.email_object = email_object
+
+        if smtp_host and smtp_username and smtp_password and email_from and email_to and email_object:
+            self.smtp_configured = True
+
+    def mailer(self):
+        if self.smtp_configured and self.found:
+            try:
+                message = email.message.EmailMessage()
+                message.set_content(open(self.output_file_path).read())
+
+                message['Subject'] = self.email_object
+                message['From'] = self.email_from
+                message['Message-ID'] = email.utils.make_msgid(domain=self.email_from.split('@')[-1])
+                message['Date'] = email.utils.formatdate()
+
+                context = ssl.create_default_context()
+
+                if self.smtp_ssl is True:
+                    for address in self.email_to.split(','):
+                        del message['To']
+                        message['To'] = address
+                        smtp = smtplib.SMTP_SSL(self.smtp_host, self.smtp_port, context=context)
+                        if self.smtp_username:
+                            smtp.login(self.smtp_username, self.smtp_password)
+                        smtp.send_message(message)
+                        smtp.quit()
+                elif self.smtp_starttls is True:
+                    for address in self.email_to.split(','):
+                        del message['To']
+                        message['To'] = address
+                        smtp = smtplib.SMTP(self.smtp_host)
+                        smtp.starttls(context=context)
+                        if self.smtp_username:
+                            smtp.login(self.smtp_username, self.smtp_password)
+                        smtp.send_message(message)
+                        smtp.quit()
+                else:
+                    for address in self.email_to.split(','):
+                        del message['To']
+                        message['To'] = address
+                        smtp = smtplib.SMTP(self.smtp_host)
+                        if self.smtp_username:
+                            smtp.login(self.smtp_username, self.smtp_password)
+                        smtp.send_message(message)
+                        smtp.quit()
+            except Exception as exc:
+                logger.warning(f'Impossible d\'envoyer le courrier électronique : {exc}')
+
+    # Fonction qui essaie de deviner la date d'un RAA à partir de son nom.
+    # Utile pour limiter les requêtes lors de l'obtention des RAA à scanner.
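+    # Hypothetical example (not part of the original code):
+    # guess_date('RAA du 3 mars 2024', '.*du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
+    # captures '3 mars 2024', which dateparser resolves to datetime(2024, 3, 3);
+    # on any failure a far-future sentinel is returned so the page still gets crawled.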
+ def guess_date(string, regex): + try: + search = re.search(regex, string, re.IGNORECASE) + guessed_date = dateparser.parse(search.group(1)) + if guessed_date is None: + raise Exception('La date est un objet None') + else: + return guessed_date + except Exception as exc: + logger.warning(f'Impossible de deviner la date du terme {string} : {exc}') + return datetime.datetime(9999, 1, 1) diff --git a/Attrap_ppparis.py b/Attrap_ppparis.py new file mode 100644 index 0000000..938100a --- /dev/null +++ b/Attrap_ppparis.py @@ -0,0 +1,48 @@ +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_ppparis(Attrap): + + # Config + __HOST = 'https://www.prefecturedepolice.interieur.gouv.fr' + __RAA_PAGE = f'{__HOST}/actualites-et-presse/arretes/accueil-arretes' + __WAIT_ELEMENT = 'block-decree-list-block' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture de police de Paris' + short_code = 'ppparis' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + + def get_raa(self, keywords): + page_content = self.get_session(self.__RAA_PAGE, self.__WAIT_ELEMENT, 6) + raa_elements = self.get_raa_elements(page_content) + self.parse_raa(raa_elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le + # parse + for a in soup.find_all('a', href=True): + if a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = 'https://www.prefecturedepolice.interieur.gouv.fr' + a['href'] + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').get_text() + date = datetime.datetime.strptime(a.find('div', class_="field--type-datetime").get_text().strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref04.py b/Attrap_pref04.py new file mode 100644 index 0000000..7ab8422 --- /dev/null +++ b/Attrap_pref04.py @@ -0,0 +1,60 @@ +import os +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_pref04(Attrap): + + # Config + __HOST = 'https://www.alpes-de-haute-provence.gouv.fr' + __RAA_PAGE = f'{__HOST}/Publications/Publications-administratives-et-legales/Recueil-des-Actes-Administratifs' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture des Alpes-de-Haute-Provence' + short_code = 'pref04' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + elements = [] + page_content = self.get_page(self.__RAA_PAGE, 'get').content + for sub_page in self.get_sub_pages( + page_content, + 'div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + ): + if Attrap.guess_date(sub_page['name'], '([0-9]{4}).*').year >= self.not_before.year: + sub_page_content = self.get_page(sub_page['url'], 'get').content + for element in self.get_raa_elements(sub_page_content): + elements.append(element) + + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque balise a, on regarde si c'est un PDF, et 
si oui on le + # parse + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() + date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref05.py b/Attrap_pref05.py new file mode 100644 index 0000000..7c5da60 --- /dev/null +++ b/Attrap_pref05.py @@ -0,0 +1,99 @@ +import os +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_pref05(Attrap): + + # Config + __HOST = 'https://www.hautes-alpes.gouv.fr' + __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture des Hautes-Alpes' + short_code = 'pref05' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + year_pages_to_parse = [] + + # On détermine quelles pages d'année parser + page_content = self.get_page(self.__RAA_PAGE, 'get').content + year_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + ) + for year_page in year_pages: + if int(year_page['name'].replace('Année ', '').strip()) >= self.not_before.year: + year_pages_to_parse.append(year_page['url']) + + month_pages_to_parse = [] + # Pour chaque année, on cherche les sous-pages de mois + for year_page in year_pages_to_parse: + page_content = self.get_page(year_page, 'get').content + month_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + for month_page in month_pages: + # On filtre les mois ne correspondant pas à la période analysée + guessed_date = Attrap.guess_date(month_page['name'], '.*([a-zéû]* [0-9]{4})') + if guessed_date.replace(day=1) >= self.not_before.replace(day=1): + month_pages_to_parse.append(month_page['url']) + + pages_to_parse = [] + # Pour chaque page de mois, on cherche les pages de RAA + for month_page in month_pages_to_parse: + pages = self.get_sub_pages_with_pager( + month_page, + 'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', + 'nav.fr-pagination ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label', + 'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail', + self.__HOST + )[::-1] + for page in pages: + guessed_date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y') + if guessed_date.replace(day=1) >= self.not_before.replace(day=1): + pages_to_parse.append(page['url']) + + elements = [] + # On parse les pages contenant des RAA + for page in pages_to_parse: + page_content = self.get_page(page, 'get').content + for element in self.get_raa_elements(page_content): + elements.append(element) + + # On parse les RAA + self.parse_raa(elements, keywords) + 
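+        # mailer() only sends the report when search_keywords() flagged at
+        # least one match (self.found) and SMTP credentials are configured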
self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère chaque balise a + for a in soup.select('div.fr-grid-row div.fr-downloads-group.fr-downloads-group--bordered ul li a'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() + date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref06.py b/Attrap_pref06.py new file mode 100644 index 0000000..e4ae0c3 --- /dev/null +++ b/Attrap_pref06.py @@ -0,0 +1,105 @@ +import os +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_pref06(Attrap): + + # Config + __HOST = 'https://www.alpes-maritimes.gouv.fr' + __RAA_PAGE = { + '2024': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-specifiques' + ], + '2023': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-specifiques' + ], + '2022': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-specifiques' + ], + '2021': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-specifiques' + ], + '2020': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-specifiques' + ], + '2019': [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-mensuels', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-speciaux', + f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-specifiques' + ] + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture des Alpes-Maritimes' + short_code = 'pref06' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(20) + + def get_raa(self, keywords): + pages_to_parse = [] + if self.not_before.year <= 2024: + for page in self.__RAA_PAGE['2024']: + pages_to_parse.append(page) + if self.not_before.year <= 2023: + for page in self.__RAA_PAGE['2023']: + pages_to_parse.append(page) + if self.not_before.year <= 2022: + for page in self.__RAA_PAGE['2022']: + pages_to_parse.append(page) + if 
self.not_before.year <= 2021: + for page in self.__RAA_PAGE['2021']: + pages_to_parse.append(page) + if self.not_before.year <= 2020: + for page in self.__RAA_PAGE['2020']: + pages_to_parse.append(page) + if self.not_before.year <= 2019: + for page in self.__RAA_PAGE['2019']: + pages_to_parse.append(page) + + elements = self.get_raa_with_pager( + pages_to_parse, + ".fr-pagination__link.fr-pagination__link--next", + self.__HOST + ) + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque élément fr-card__content, on cherche sa balise a, et si + # c'est un PDF on le parse + cards = soup.find_all('div', class_='fr-card__content') + for card in cards: + a = card.find('a') + if a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.get_text().strip() + date = datetime.datetime.strptime(card.find('p', class_='fr-card__detail').get_text().replace('Publié le ', '').strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref09.py b/Attrap_pref09.py new file mode 100644 index 0000000..d5388ca --- /dev/null +++ b/Attrap_pref09.py @@ -0,0 +1,72 @@ +import os +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_pref09(Attrap): + + # Config + __HOST = 'https://www.ariege.gouv.fr' + __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-l-Ariege-a-partir-du-28-avril-2015' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture de l\'Ariège' + short_code = 'pref09' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + pages_to_parse = [] + + # Les RAA de l'Ariège sont éparpillés sur des sous-pages par mois. + # Donc on parse la page principale à la recherche des sous-pages. 
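+        # get_sub_pages_with_pager() follows the "next" pager link recursively
+        # and returns url/name/details dicts; [::-1] restores chronological order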
+        sub_pages = self.get_sub_pages_with_pager(
+            self.__RAA_PAGE,
+            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
+            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
+            self.__HOST
+        )[::-1]
+
+        # On filtre par date les sous-pages pour limiter les requêtes
+        for sub_page in sub_pages:
+            guessed_date = datetime.datetime.strptime(sub_page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
+            guessed_date = guessed_date.replace(day=1)
+            if guessed_date >= self.not_before:
+                pages_to_parse.append(sub_page['url'])
+
+        # On parse les pages contenant des RAA
+        elements = []
+        for page in pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
diff --git a/Attrap_pref13.py b/Attrap_pref13.py
new file mode 100644
index 0000000..c0f6922
--- /dev/null
+++ b/Attrap_pref13.py
@@ -0,0 +1,59 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref13(Attrap):
+
+    # Config
+    __HOST = 'https://www.bouches-du-rhone.gouv.fr'
+    __RAA_PAGE = [
+        f'{__HOST}/Publications/RAA-et-Archives/RAA-2024',
+        f'{__HOST}/Publications/RAA-et-Archives/RAA-2023',
+        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2022',
+        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2021',
+        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2020',
+        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2019'
+    ]
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    full_name = 'Préfecture des Bouches-du-Rhône'
+    short_code = 'pref13'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        elements = []
+        for raa_page in self.__RAA_PAGE:
+            page_content = self.get_page(raa_page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = 
datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref31.py b/Attrap_pref31.py new file mode 100644 index 0000000..9080b4e --- /dev/null +++ b/Attrap_pref31.py @@ -0,0 +1,71 @@ +import os +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_pref31(Attrap): + + # Config + __HOST = 'https://www.haute-garonne.gouv.fr' + __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs/Recueil-des-Actes-Administratifs-Haute-Garonne' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture de la Haute-Garonne' + short_code = 'pref31' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + + def get_raa(self, keywords): + # On cherche les pages de chaque mois + page_content = self.get_page(self.__RAA_PAGE, 'get').content + month_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + + pages_to_parse = [] + + # On filtre les pages de mois pour limiter le nombre de requêtes + for month_page in month_pages: + guessed_date = Attrap.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})') + if guessed_date >= self.not_before.replace(day=1): + pages_to_parse.append(month_page['url']) + + elements = [] + # On parse les pages des mois qu'on veut analyser + for element in self.get_raa_with_pager( + pages_to_parse, + ".fr-pagination__link.fr-pagination__link--next", + self.__HOST + ): + elements.append(element) + + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère chaque balise a + for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.get_text().strip().capitalize() + date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref33.py b/Attrap_pref33.py new file mode 100644 index 0000000..5a99f21 --- /dev/null +++ b/Attrap_pref33.py @@ -0,0 +1,108 @@ +import os +import re +import datetime +import logging + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + +logger = logging.getLogger(__name__) + + +class Attrap_pref33(Attrap): + + # Config + __HOST = 'https://www.gironde.gouv.fr' + __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture de la Gironde' + short_code = 'pref33' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + pages_to_parse = [] + + # Parfois un RAA est mal catégorisé et se retrouve sur la page racine, donc on la parse + pages_to_parse.append(self.__RAA_PAGE) + + # On détermine quelles pages d'année parser + year_pages_to_parse = [] + page_content = 
self.get_page(self.__RAA_PAGE, 'get').content + year_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + ) + for year_page in year_pages: + year = 9999 + try: + year = int(re.search('.*([0-9]{4})', year_page['name'].strip(), re.IGNORECASE).group(1)) + if year is None: + year = 9999 + except Exception as exc: + logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}") + year = 9999 + + if year >= self.not_before.year: + year_pages_to_parse.append(year_page['url']) + + # Pour chaque année, on cherche les sous-pages de mois + month_pages_to_parse = [] + for year_page in year_pages_to_parse: + page_content = self.get_page(year_page, 'get').content + month_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + + for month_page in month_pages: + guessed_date = Attrap.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})') + if guessed_date >= self.not_before.replace(day=1): + pages_to_parse.append(month_page['url']) + + # On parse les pages sélectionnées + elements = self.get_raa_with_pager( + pages_to_parse, + "ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label", + self.__HOST + )[::-1] + + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + + # On récupère chaque carte avec un RAA + for card in BeautifulSoup(page_content, 'html.parser').select('div.fr-card.fr-card--horizontal div.fr-card__body div.fr-card__content'): + # On récupère le lien + links = card.select('h2.fr-card__title a.fr-card__link.menu-item-link') + # On récupère la date + dates_raw = card.select('div.fr-card__end p.fr-card__detail') + + # Si on a toutes les infos, on continue + if links and links[0] and dates_raw and dates_raw[0]: + a = links[0] + date_raw = dates_raw[0] + + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.get_text().strip() + date = datetime.datetime.strptime(date_raw.get_text().replace('Publié le', '').strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref34.py b/Attrap_pref34.py new file mode 100644 index 0000000..162d27c --- /dev/null +++ b/Attrap_pref34.py @@ -0,0 +1,73 @@ +import os +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_pref34(Attrap): + + # Config + __HOST = 'https://www.herault.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2024', + '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023', + '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022', + '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021', + '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020', + '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Archives/Recueil-des-actes-administratifs-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture de l\'Hérault' + short_code = 'pref34' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) + + elements = [] + for raa_page in pages_to_parse: + page_content = self.get_page(raa_page, 'get').content + for element in self.get_raa_elements(page_content): + elements.append(element) + + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère chaque balise a + for a in soup.select('a.fr-link.fr-link--download'): + if a.get('href') and a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() + date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref35.py b/Attrap_pref35.py new file mode 100644 index 0000000..8a6195e --- /dev/null +++ b/Attrap_pref35.py @@ -0,0 +1,60 @@ +import os +import datetime + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + + +class Attrap_pref35(Attrap): + + # Config + __HOST = 'https://www.ille-et-vilaine.gouv.fr' + __RAA_PAGE = [ + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020', + f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2019' + ] + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + full_name = 'Préfecture d\'Ille-et-Vilaine' + short_code = 'pref35' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + elements = [] + for raa_page in self.__RAA_PAGE: + page_content = self.get_page(raa_page, 'get').content + for element in self.get_raa_elements(page_content): + elements.append(element) + + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + # On 
charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le + # parse + for a in soup.find_all('a', href=True, class_='fr-link--download'): + if a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() + date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref38.py b/Attrap_pref38.py new file mode 100644 index 0000000..da82791 --- /dev/null +++ b/Attrap_pref38.py @@ -0,0 +1,101 @@ +import os +import datetime +import logging + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + +logger = logging.getLogger(__name__) + + +class Attrap_pref38(Attrap): + + # Config + __HOST = 'https://www.isere.gouv.fr' + __RAA_PAGE = { + '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024', + '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023', + '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022', + '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021', + '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020', + '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019' + } + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture de l\'Isère' + short_code = 'pref38' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(20) + + def get_raa(self, keywords): + pages_to_parse = [] + if self.not_before.year <= 2024: + pages_to_parse.append(self.__RAA_PAGE['2024']) + if self.not_before.year <= 2023: + pages_to_parse.append(self.__RAA_PAGE['2023']) + if self.not_before.year <= 2022: + pages_to_parse.append(self.__RAA_PAGE['2022']) + if self.not_before.year <= 2021: + pages_to_parse.append(self.__RAA_PAGE['2021']) + if self.not_before.year <= 2020: + pages_to_parse.append(self.__RAA_PAGE['2020']) + if self.not_before.year <= 2019: + pages_to_parse.append(self.__RAA_PAGE['2019']) + + elements = [] + for raa_page in pages_to_parse: + page_content = self.get_page(raa_page, 'get').content + for element in self.get_raa_elements(page_content, raa_page): + elements.append(element) + + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content, raa_page): + elements = [] + # On charge le parser + soup = BeautifulSoup(page_content, 'html.parser') + + # On récupère le select qui contient la liste des RAA + select_list = soup.select('select#-liste-docs')[0] + # On analyse chaque résultat + 
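+        # Each non-empty <option> is POSTed back to the same page to obtain the
+        # RAA's detail view, so its date is estimated first to skip useless requests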
for option in select_list.find_all('option'):
+            if not option['value'] == "":
+                # On estime la date à partir du nom de fichier
+                guessed_date = Attrap.guess_date(option['title'], '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
+
+                # Si la date estimée correspond à la plage d'analyse, on
+                # demande au serveur les détails du RAA
+                if guessed_date >= self.not_before:
+                    page_content = self.get_page(
+                        raa_page,
+                        'post',
+                        {
+                            '-liste-docs': option['value']
+                        }
+                    ).content
+
+                    # On parse la page de détails pour obtenir les propriétés
+                    # du RAA
+                    soup = BeautifulSoup(page_content, 'html.parser')
+                    a = soup.select('div.liste_deroulante a.fr-link.fr-link--download')[0]
+
+                    # Si la page contient une balise a qui renvoie vers un pdf,
+                    # c'est qu'on a obtenu les détails du RAA demandé, donc
+                    # on le parse
+                    if a.get('href') and a['href'].endswith('.pdf'):
+                        if a['href'].startswith('/'):
+                            url = f"{self.__HOST}{a['href']}"
+                        else:
+                            url = a['href']
+
+                        url = unquote(url)
+                        name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                        date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                        raa = Attrap.RAA(url, date, name)
+                        elements.append(raa)
+        return elements
diff --git a/Attrap_pref42.py b/Attrap_pref42.py
new file mode 100644
index 0000000..e7ba5ac
--- /dev/null
+++ b/Attrap_pref42.py
@@ -0,0 +1,78 @@
+import os
+import datetime
+import logging
+import re
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+logger = logging.getLogger(__name__)
+
+
+class Attrap_pref42(Attrap):
+
+    # Config
+    __HOST = 'https://www.loire.gouv.fr'
+    __RAA_PAGE = f'{__HOST}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture de la Loire'
+    short_code = 'pref42'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        year_pages_to_parse = []
+
+        # On détermine quelles pages d'année parser
+        year_pages = self.get_sub_pages_with_pager(
+            self.__RAA_PAGE,
+            'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
+            'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
+            self.__HOST
+        )
+        for year_page in year_pages:
+            year = 9999
+            try:
+                year = int(re.search('([0-9]{4})', year_page['name'], re.IGNORECASE).group(1))
+                if year is None:
+                    year = 9999
+            except Exception as exc:
+                logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
+                year = 9999
+
+            if year >= self.not_before.year:
+                year_pages_to_parse.append(year_page['url'])
+
+        elements = []
+        # Pour chaque année, on parse les RAA
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            for element in self.get_raa_elements(page_content)[::-1]:
+                elements.append(element)
+
+        # On parse les RAA
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
+            if a.get('href') and 
a['href'].endswith('.pdf'): + if a['href'].startswith('/'): + url = f"{self.__HOST}{a['href']}" + else: + url = a['href'] + + url = unquote(url) + name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() + date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') + + raa = Attrap.RAA(url, date, name) + elements.append(raa) + return elements diff --git a/Attrap_pref44.py b/Attrap_pref44.py new file mode 100644 index 0000000..22a33b3 --- /dev/null +++ b/Attrap_pref44.py @@ -0,0 +1,108 @@ +import os +import datetime +import logging + +from bs4 import BeautifulSoup +from urllib.parse import unquote + +from Attrap import Attrap + +logger = logging.getLogger(__name__) + + +class Attrap_pref44(Attrap): + + # Config + __HOST = 'https://www.loire-atlantique.gouv.fr' + __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA-en-Loire-Atlantique' + __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' + full_name = 'Préfecture de la Loire-Atlantique' + short_code = 'pref44' + + def __init__(self, data_dir): + super().__init__(data_dir, self.__USER_AGENT) + self.enable_tor(10) + + def get_raa(self, keywords): + pages_to_parse = [] + + # Parfois un RAA est mal catégorisé et se retrouve sur la page racine, donc on la parse + pages_to_parse.append(self.__RAA_PAGE) + + # On détermine quelles pages d'année parser + year_pages_to_parse = [] + page_content = self.get_page(self.__RAA_PAGE, 'get').content + year_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + ) + for year_page in year_pages: + year = 9999 + try: + year = int(year_page['name'].strip()) + if year is None: + year = 9999 + except Exception as exc: + logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}") + year = 9999 + + if year >= self.not_before.year: + year_pages_to_parse.append(year_page['url']) + + # Parfois un RAA est mal catégorisé et se retrouve sur la page de l'année, donc on la parse + pages_to_parse.append(year_page['url']) + + # Pour chaque année, on cherche les sous-pages de mois + month_pages_to_parse = [] + for year_page in year_pages_to_parse: + page_content = self.get_page(year_page, 'get').content + month_pages = self.get_sub_pages( + page_content, + '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', + self.__HOST, + False + )[::-1] + + for month_page in month_pages: + pages_to_parse.append(month_page['url']) + + # On parse les pages sélectionnées + elements = self.get_raa_with_pager( + pages_to_parse, + "ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label", + self.__HOST + )[::-1] + + self.parse_raa(elements, keywords) + self.mailer() + + def get_raa_elements(self, page_content): + elements = [] + + # On récupère chaque carte avec un RAA + for card in BeautifulSoup(page_content, 'html.parser').select('div.fr-card.fr-card--horizontal div.fr-card__body div.fr-card__content'): + # On récupère le lien + links = card.select('h2.fr-card__title a.fr-card__link.menu-item-link') + # On récupère la date + dates_raw = card.select('div.fr-card__end p.fr-card__detail') + + # Si on a toutes les infos, on continue + if links and links[0] and dates_raw and dates_raw[0]: + a = links[0] + date_raw = dates_raw[0] + + if a.get('href') and a['href'].endswith('.pdf'): + if 
diff --git a/Attrap_pref59.py b/Attrap_pref59.py
new file mode 100644
index 0000000..d800c5b
--- /dev/null
+++ b/Attrap_pref59.py
@@ -0,0 +1,85 @@
+import os
+import datetime
+import logging
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+logger = logging.getLogger(__name__)
+
+
+class Attrap_pref59(Attrap):
+
+    # Config
+    __HOST = 'https://www.nord.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2024',
+        '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2023',
+        '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2022',
+        '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2021',
+        '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2020',
+        '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Nord'
+    short_code = 'pref59'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(20)
+
+    def get_raa(self, keywords):
+        pages_to_parse = []
+        if self.not_before.year <= 2024:
+            pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        elements = []
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            sub_pages = self.get_sub_pages(
+                page_content,
+                'div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                True
+            )
+            for sub_page in sub_pages[::-1]:
+                sub_page_content = self.get_page(sub_page['url'], 'get').content
+                for element in self.get_raa_elements(sub_page_content):
+                    elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
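The download-link pattern above recurs in most of the scrapers that follow: the RAA name is the text node just before the <span>, and the date is the last ' - '-separated field inside it. A minimal sketch against invented markup:

import datetime
from bs4 import BeautifulSoup

# Invented markup matching the 'a.fr-link.fr-link--download' pattern parsed above
html = ('<a class="fr-link fr-link--download" href="/raa/raa-12.pdf">'
        'Télécharger RAA n°12 <span>PDF - 1,2 Mo - 15/03/2024</span></a>')

a = BeautifulSoup(html, 'html.parser').select_one('a.fr-link.fr-link--download')
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
print(name, date.date())  # RAA n°12 2024-03-15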
diff --git a/Attrap_pref62.py b/Attrap_pref62.py
new file mode 100644
index 0000000..4e4f64c
--- /dev/null
+++ b/Attrap_pref62.py
@@ -0,0 +1,98 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref62(Attrap):
+
+    # Config
+    __HOST = 'https://www.pas-de-calais.gouv.fr'
+    __RAA_PAGE = {
+        '2024': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2024-Recueils-des-actes-administratifs'
+        ],
+        '2023': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-des-actes-administratifs',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-speciaux-des-actes-administratifs'
+        ],
+        '2022': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-des-Actes-Administratifs',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-Speciaux-des-Actes-Administratifs'
+        ],
+        '2021': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-des-actes-administratifs',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-speciaux-des-actes-administratifs'
+        ],
+        '2020': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-des-actes-administratifs',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-speciaux-des-actes-administratifs'
+        ],
+        '2019': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueil-des-actes-administratifs',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueils-speciaux-des-actes-administratifs'
+        ]
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Pas-de-Calais'
+    short_code = 'pref62'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(20)
+
+    def get_raa(self, keywords):
+        pages_to_parse = []
+        if self.not_before.year <= 2024:
+            for page in self.__RAA_PAGE['2024']:
+                pages_to_parse.append(page)
+        if self.not_before.year <= 2023:
+            for page in self.__RAA_PAGE['2023']:
+                pages_to_parse.append(page)
+        if self.not_before.year <= 2022:
+            for page in self.__RAA_PAGE['2022']:
+                pages_to_parse.append(page)
+        if self.not_before.year <= 2021:
+            for page in self.__RAA_PAGE['2021']:
+                pages_to_parse.append(page)
+        if self.not_before.year <= 2020:
+            for page in self.__RAA_PAGE['2020']:
+                pages_to_parse.append(page)
+        if self.not_before.year <= 2019:
+            for page in self.__RAA_PAGE['2019']:
+                pages_to_parse.append(page)
+
+        elements = []
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère le div qui contient la liste des RAA
+        cards = soup.select('div.fr-downloads-group.fr-downloads-group--bordered')[0]
+        # On analyse chaque balise a dans ce div
+        for a in cards.find_all('a', href=True):
+            if a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements[::-1]
diff --git a/Attrap_pref64.py b/Attrap_pref64.py
new file mode 100644
index 0000000..e25a097
--- /dev/null
+++ b/Attrap_pref64.py
@@ -0,0 +1,101 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref64(Attrap):
+
+    # Config
+    __HOST = 'https://www.pyrenees-atlantiques.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2024',
+        '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2023',
+        '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2022',
+        '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2021',
+        '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2020',
+        '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture des Pyrénées-Atlantiques'
+    short_code = 'pref64'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        year_pages_to_parse = []
+        if self.not_before.year <= 2024:
+            year_pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            year_pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            year_pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            year_pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            year_pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            year_pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        pages_to_parse = []
+        # Pour chaque année, on cherche les sous-pages de mois
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+
+            # Pour chaque page de mois, on récupère les liens vers des pages de RAA
+            for month_page in month_pages:
+                raa_links = self.get_sub_pages_with_pager(
+                    month_page['url'],
+                    'div.content-view-line div.class-file h2 a',
+                    'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
+                    None,
+                    self.__HOST
+                )[::-1]
+
+                # Pour chaque lien vers un RAA, on filtre ceux ne correspondant pas à la période analysée
+                for raa_link in raa_links:
+                    guessed_date = Attrap.guess_date(raa_link['name'], 'n°[ 0-9-]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
+                    if guessed_date >= self.not_before:
+                        pages_to_parse.append(raa_link['url'])
+
+        # On parse les pages contenant des RAA
+        elements = []
+        for page in pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            for raa in self.get_raa_elements(page_content):
+                elements.append(raa)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
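Attrap.guess_date() is defined in Attrap.py, outside this part of the patch. The sketch below is only an assumption about its contract: extract the first regex group, parse it as a French date with dateparser, and fall back to a year-9999 sentinel, which is how the callers above detect failure:

import re
import datetime
import dateparser

def guess_date(string, regex):
    # Hypothetical stand-in for Attrap.guess_date(): pull the date fragment
    # out of a RAA name with a regex, then parse it as French text.
    match = re.search(regex, string, flags=re.IGNORECASE)
    if match:
        parsed = dateparser.parse(match.group(1), languages=['fr'])
        if parsed:
            return parsed
    # Sentinel "unknown" date, tested later with date.year == 9999
    return datetime.datetime(9999, 1, 1)

print(guess_date('RAA n° 24-056 du 1er mars 2024', 'n°[ 0-9-]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)'))
# 2024-03-01 00:00:00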
diff --git a/Attrap_pref65.py b/Attrap_pref65.py
new file mode 100644
index 0000000..cdce4e5
--- /dev/null
+++ b/Attrap_pref65.py
@@ -0,0 +1,73 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref65(Attrap):
+
+    # Config
+    __HOST = 'https://www.hautes-pyrenees.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2024',
+        '2023': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2023',
+        '2022': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2022',
+        '2021': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2021',
+        '2020': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2020',
+        '2019': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture des Hautes-Pyrénées'
+    short_code = 'pref65'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        pages_to_parse = []
+        if self.not_before.year <= 2024:
+            pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        elements = []
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
diff --git a/Attrap_pref66.py b/Attrap_pref66.py
new file mode 100644
index 0000000..bc30ab3
--- /dev/null
+++ b/Attrap_pref66.py
@@ -0,0 +1,133 @@
+import os
+import sys
+import datetime
+import logging
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+logger = logging.getLogger(__name__)
+
+
+class Attrap_pref66(Attrap):
+
+    # Config
+    __HOST = 'https://www.pyrenees-orientales.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2024',
+        '2023': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2023',
+        '2022': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2022',
+        '2021': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2021',
+        '2020': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2020',
+        '2019': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture des Pyrénées-Orientales'
+    short_code = 'pref66'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        elements = []
+
+        # La préfecture des Pyrénées-Orientales est une originale : avant 2024,
+        # chaque page annuelle contient l'ensemble des RAA, mais pas tout le
+        # temps avec leur date, qu'il faut deviner à partir du nom du RAA.
+        # Mais en 2024, ça change ! La page de 2024 contient un tableau
+        # récapitulatif avec toutes les dates de publication des RAA, mais
+        # aussi un pager. Sauf qu'il s'avère que le tableau récapitulatif
+        # n'est pas exhaustif. On doit donc parser toutes les sous-pages de
+        # 2024 puisqu'on ne peut se fier au tableau récapitulatif.
+        # Grrr.
+        if self.not_before.year <= 2024:
+            for element in self.get_raa_elements_since_2024(self.__RAA_PAGE['2024']):
+                elements.append(element)
+        if self.not_before.year <= 2023:
+            for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2023']):
+                elements.append(element)
+        if self.not_before.year <= 2022:
+            for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2022']):
+                elements.append(element)
+        if self.not_before.year <= 2021:
+            for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2021']):
+                elements.append(element)
+        if self.not_before.year <= 2020:
+            for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2020']):
+                elements.append(element)
+        if self.not_before.year <= 2019:
+            for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2019']):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    # On parse un lien d'avant 2024
+    def get_raa_elements_before_2024(self, page):
+        elements = []
+        page_content = self.get_page(page, 'get').content
+        soup = BeautifulSoup(page_content, 'html.parser')
+        for a in soup.select('div.fr-table.fr-table--bordered.list a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                date = None
+                try:
+                    # Lorsque la date n'est pas affichée à l'écran, elle est en
+                    # fait cachée dans la propriété "title" du lien
+                    details = ''
+                    if a.find('span'):
+                        details = a.find('span').get_text().split(' - ')[-1].strip()
+                    else:
+                        details = a['title'].split(' - ')[-1].strip()
+                    date = datetime.datetime.strptime(details, '%d/%m/%Y')
+                except Exception as exc:
+                    logger.error(f'Impossible de trouver la date pour le lien {a.get_text().strip()} : {exc}')
+                    sys.exit(1)
+
+                if date >= self.not_before:
+                    url = ''
+                    if a['href'].startswith('/'):
+                        url = f"{self.__HOST}{a['href']}"
+                    else:
+                        url = a['href']
+
+                    url = unquote(url)
+                    name = ''
+                    if a.find('span') and a.find('span').previous_sibling:
+                        name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                    else:
+                        name = a.get_text().replace('Télécharger ', '').strip()
+
+                    elements.append(Attrap.RAA(url, date, name))
+        return elements
+
+    # On parse les RAA depuis 2024
+    def get_raa_elements_since_2024(self, root_page):
+        pages = self.get_sub_pages_with_pager(
+            root_page,
+            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
+            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
+            self.__HOST
+        )[::-1]
+
+        elements = []
+
+        for page in pages:
+            if not page['url'].endswith('.pdf'):
+                logger.warning(f"Attention, le lien vers {page['url']} n'est pas bon !")
+            else:
+                if page['url'].startswith('/'):
+                    url = f"{self.__HOST}{page['url']}"
+                else:
+                    url = page['url']
+
+                url = unquote(url)
+                name = page['name'].replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
+
+                elements.append(Attrap.RAA(url, date, name))
+        return elements
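The title-attribute fallback in get_raa_elements_before_2024() can be exercised on its own. A minimal sketch, again with invented markup:

import datetime
from bs4 import BeautifulSoup

# A hypothetical link with no <span>: the date hides in the "title" attribute,
# exactly the case handled by the else branch above
html = ('<a class="fr-link fr-link--download" href="/a.pdf" title="RAA spécial - 02/01/2023">'
        'Télécharger RAA spécial</a>')

a = BeautifulSoup(html, 'html.parser').find('a')
if a.find('span'):
    details = a.find('span').get_text().split(' - ')[-1].strip()
else:
    details = a['title'].split(' - ')[-1].strip()
print(datetime.datetime.strptime(details, '%d/%m/%Y'))  # 2023-01-02 00:00:00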
diff --git a/Attrap_pref69.py b/Attrap_pref69.py
new file mode 100644
index 0000000..1d7844b
--- /dev/null
+++ b/Attrap_pref69.py
@@ -0,0 +1,85 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref69(Attrap):
+
+    # Config
+    __HOST = 'https://www.rhone.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024',
+        '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023',
+        '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022',
+        '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021',
+        '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020',
+        '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Rhône'
+    short_code = 'pref69'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(20)
+
+    def get_raa(self, keywords):
+        pages_to_parse = []
+        if self.not_before.year <= 2024:
+            pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        sub_pages_to_parse = []
+
+        for raa_page in pages_to_parse:
+            sub_pages = self.get_sub_pages_with_pager(
+                raa_page,
+                'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+                'ul.fr-pagination__list li a.fr-pagination__link--next',
+                None,
+                self.__HOST)[::-1]
+            for sub_page in sub_pages:
+                sub_pages_to_parse.append(sub_page['url'])
+
+        elements = []
+        for sub_page_to_parse in sub_pages_to_parse:
+            page_content = self.get_page(sub_page_to_parse, 'get').content
+            for element in self.get_raa_elements(page_content)[::-1]:
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
diff --git a/Attrap_pref80.py b/Attrap_pref80.py
new file mode 100644
index 0000000..391eb89
--- /dev/null
+++ b/Attrap_pref80.py
@@ -0,0 +1,98 @@
+import os
+import datetime
+import logging
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+logger = logging.getLogger(__name__)
+
+
+class Attrap_pref80(Attrap):
+
+    # Config
+    __HOST = 'https://www.somme.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2024',
+        '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2023',
+        '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2022',
+        '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2021',
+        '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2020',
+        '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    full_name = 'Préfecture de la Somme'
+    short_code = 'pref80'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        year_pages_to_parse = []
+        if self.not_before.year <= 2024:
+            year_pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            year_pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            year_pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            year_pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            year_pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            year_pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        # Pour chaque page Année, on récupère la liste des RAA
+        elements = []
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
+        # parse
+        for a in soup.select('div.fr-text--lead.fr-my-3w p a.fr-link'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                # On enlève les espaces insécables, les double-espaces, et le texte « Télécharger » de certains liens
+                name = a.get_text().replace('Télécharger ', '').strip().replace(u"\u00A0", ' ').replace('  ', ' ')
+                if name:
+                    # Certains RAA de la Somme ont une ligne avec les détails du fichier. Si cette ligne
+                    # est disponible, on la parse, sinon on devine la date à partir du nom
+                    date = None
+                    if a.find('span'):
+                        date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+                    else:
+                        regex = '.* n°.*(?:du)? ([0-9]*(?:er)? [a-zéû]* (?:[0-9]{4}|[0-9]{2}))'
+                        date = Attrap.guess_date(name, regex)
+                        # Parfois, il manque l'année dans le nom du RAA, alors on essaie avec l'année de la page
+                        if date.year == 9999:
+                            page_year = soup.select('nav.fr-breadcrumb div.fr-collapse ol.fr-breadcrumb__list li a.fr-breadcrumb__link.breadcrumb-item-link')[-1].get_text().replace('Année ', '').strip()
+                            date = Attrap.guess_date(f'{name} {page_year}', regex)
+
+                        # Parfois, c'est que le fichier n'est pas un RAA mais un arrêté seul
+                        if date.year == 9999:
+                            date = Attrap.guess_date(name, '([0-9]*(?:er)? [a-zéû]* [0-9]{4})')
+
+                    if date.year == 9999:
+                        logger.warning(f'On ignore {name} (URL : {url})')
+                    else:
+                        raa = Attrap.RAA(url, date, name)
+                        elements.append(raa)
+        return elements[::-1]
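The year-backfill above exists because names like « RAA n° 12 du 3 mai » carry no year. A small illustration with dateparser (already a dependency of Attrap.py); the REQUIRE_PARTS setting only serves to make the failure explicit:

import dateparser

# Without a year, strict parsing fails; pref80 therefore appends the year
# scraped from the breadcrumb ('Année 2023') and tries again.
strict = {'REQUIRE_PARTS': ['day', 'month', 'year']}
print(dateparser.parse('3 mai', languages=['fr'], settings=strict))       # None
print(dateparser.parse('3 mai 2023', languages=['fr'], settings=strict))  # 2023-05-03 00:00:00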
diff --git a/Attrap_pref81.py b/Attrap_pref81.py
new file mode 100644
index 0000000..6d6943b
--- /dev/null
+++ b/Attrap_pref81.py
@@ -0,0 +1,117 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref81(Attrap):
+
+    # Config
+    __HOST = 'https://www.tarn.gouv.fr'
+    __RAA_PAGE = {
+        'default': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA',
+        '2024': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2024',
+        '2023': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2023',
+        '2022': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2022',
+        '2021': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2021',
+        '2020': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2020',
+        '2019': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2019',
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Tarn'
+    short_code = 'pref81'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        pages_to_parse = []
+        if self.not_before.year <= 2024:
+            pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        sub_pages_to_parse = [self.__RAA_PAGE['default']]
+
+        # Pour chaque année, on cherche les sous-pages de mois
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+
+            # On regarde aussi si sur la page de l'année il n'y aurait pas un
+            # RAA mal catégorisé
+            for page_to_parse in self.find_raa_card(raa_page):
+                sub_pages_to_parse.append(page_to_parse)
+
+            # Pour chaque mois, on cherche les pages des RAA
+            for month_page in month_pages:
+                year = Attrap.guess_date(month_page['name'], '(.*)').year
+                for page_to_parse in self.find_raa_card(month_page['url'], year):
+                    sub_pages_to_parse.append(page_to_parse)
+                # On ajoute aussi la page des mois à parser au cas où il y ait
+                # eu une redirection vers un RAA
+                sub_pages_to_parse.append(month_page['url'])
+
+        # On parse les pages contenant des RAA
+        elements = []
+        for page in sub_pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def find_raa_card(self, page, year=None):
+        pages = []
+        card_pages = self.get_sub_pages_with_pager(
+            page,
+            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
+            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
+            self.__HOST
+        )[::-1]
+        for card_page in card_pages:
+            # On filtre les pages de RAA ne correspondant pas à la période analysée
+            guessed_date = datetime.datetime.strptime(card_page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
+            if guessed_date >= self.not_before:
+                pages.append(card_page['url'])
+        return pages
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
diff --git a/Attrap_pref83.py b/Attrap_pref83.py
new file mode 100644
index 0000000..112b420
--- /dev/null
+++ b/Attrap_pref83.py
@@ -0,0 +1,90 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref83(Attrap):
+
+    # Config
+    __HOST = 'https://www.var.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024',
+        '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2023',
+        '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2022',
+        '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2021',
+        '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2020',
+        '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Var'
+    short_code = 'pref83'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        pages_to_parse = []
+        if self.not_before.year <= 2024:
+            pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        sub_pages_to_parse = []
+
+        # Pour chaque année, on cherche les sous-pages de mois
+        for raa_page in pages_to_parse:
+            sub_pages_to_parse.append(raa_page)
+            page_content = self.get_page(raa_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+            for month_page in month_pages:
+                sub_pages_to_parse.append(month_page['url'])
+
+        # On parse les pages contenant des RAA
+        elements = self.get_raa_with_pager(
+            sub_pages_to_parse[::-1],
+            '.fr-pagination__link.fr-pagination__link--next',
+            self.__HOST
+        )
+        self.parse_raa(elements, keywords)
+
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque section contenant un RAA
+        cards = soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link')
+        for a in cards:
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.get_text().strip()
+                date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
diff --git a/Attrap_pref87.py b/Attrap_pref87.py
new file mode 100644
index 0000000..354d853
--- /dev/null
+++ b/Attrap_pref87.py
@@ -0,0 +1,106 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref87(Attrap):
+
+    # Config
+    __HOST = 'https://www.haute-vienne.gouv.fr'
+    __RAA_PAGE = {
+        '2024': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/JANVIER-JUIN-2024/JANVIER-JUIN-2024',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/JUILLET-DECEMBRE-2024'
+        ],
+        '2023': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/JANVIER-JUIN-2023',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/JUILLET-DECEMBRE-2023/JUILLET-DECEMBRE-2023'
+        ],
+        '2022': [
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/JANVIER-JUIN-2022',
+            f'{__HOST}/Publications/Recueil-des-actes-administratifs/JUILLET-DECEMBRE-2022/Recueil-des-actes-administratifs-2022',
+        ],
+        '2021': [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/2021'],
+        '2020': [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/2020'],
+        '2019': [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/2019']
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture de la Haute-Vienne'
+    short_code = 'pref87'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        year_pages_to_parse = []
+        if self.not_before.year <= 2024:
+            for year_page in self.__RAA_PAGE['2024']:
+                year_pages_to_parse.append(year_page)
+        if self.not_before.year <= 2023:
+            for year_page in self.__RAA_PAGE['2023']:
+                year_pages_to_parse.append(year_page)
+        if self.not_before.year <= 2022:
+            for year_page in self.__RAA_PAGE['2022']:
+                year_pages_to_parse.append(year_page)
+        if self.not_before.year <= 2021:
+            for year_page in self.__RAA_PAGE['2021']:
+                year_pages_to_parse.append(year_page)
+        if self.not_before.year <= 2020:
+            for year_page in self.__RAA_PAGE['2020']:
+                year_pages_to_parse.append(year_page)
+        if self.not_before.year <= 2019:
+            for year_page in self.__RAA_PAGE['2019']:
+                year_pages_to_parse.append(year_page)
+
+        # On copie la liste pour ne pas la modifier pendant qu'on l'itère
+        pages_to_parse = year_pages_to_parse.copy()
+        # Pour chaque année, on cherche les éventuelles sous-pages de mois
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+
+            # On filtre les pages de mois ne correspondant pas à la période analysée
+            for month_page in month_pages:
+                guessed_date = Attrap.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})').replace(day=1)
+                if guessed_date >= self.not_before.replace(day=1):
+                    pages_to_parse.append(month_page['url'])
+
+        # On parse les pages contenant des RAA
+        elements = []
+        for page in pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            for raa in self.get_raa_elements(page_content):
+                elements.append(raa)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
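The replace(day=1) on both sides of the month filter above is what keeps the month containing not_before itself. A worked example:

import datetime

# Month pages carry only 'mars 2024'-style names, so the guessed date always
# falls on some day of that month; comparing first-of-month dates avoids
# dropping the very month that contains not_before.
not_before = datetime.datetime(2024, 3, 15)
guessed = datetime.datetime(2024, 3, 1)      # parsed from 'Mars 2024'
print(guessed >= not_before)                 # False: month wrongly rejected
print(guessed >= not_before.replace(day=1))  # True: month kept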
diff --git a/Attrap_pref976.py b/Attrap_pref976.py
new file mode 100644
index 0000000..32a4990
--- /dev/null
+++ b/Attrap_pref976.py
@@ -0,0 +1,120 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref976(Attrap):
+
+    # Config
+    __HOST = 'https://www.mayotte.gouv.fr'
+    __RAA_PAGE = {
+        'default': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A',
+        '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2024',
+        '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2023',
+        '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2022',
+        '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2021',
+        '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2020',
+        '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    full_name = 'Préfecture de Mayotte'
+    short_code = 'pref976'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        pages_to_parse = []
+        if self.not_before.year <= 2024:
+            pages_to_parse.append(self.__RAA_PAGE['2024'])
+        if self.not_before.year <= 2023:
+            pages_to_parse.append(self.__RAA_PAGE['2023'])
+        if self.not_before.year <= 2022:
+            pages_to_parse.append(self.__RAA_PAGE['2022'])
+        if self.not_before.year <= 2021:
+            pages_to_parse.append(self.__RAA_PAGE['2021'])
+        if self.not_before.year <= 2020:
+            pages_to_parse.append(self.__RAA_PAGE['2020'])
+        if self.not_before.year <= 2019:
+            pages_to_parse.append(self.__RAA_PAGE['2019'])
+
+        sub_pages_to_parse = [self.__RAA_PAGE['default']]
+
+        # Pour chaque année, on cherche les sous-pages de mois
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+
+            # On regarde aussi si sur la page de l'année il n'y aurait pas un
+            # RAA mal catégorisé
+            for page_to_parse in self.find_raa_card(raa_page):
+                sub_pages_to_parse.append(page_to_parse)
+
+            # Pour chaque mois, on cherche les pages des RAA
+            for month_page in month_pages:
+                year = Attrap.guess_date(month_page['name'], '(.*)').year
+                for page_to_parse in self.find_raa_card(
+                    month_page['url'],
+                    year
+                ):
+                    sub_pages_to_parse.append(page_to_parse)
+
+        # On parse les pages contenant des RAA
+        elements = []
+        for page in sub_pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def find_raa_card(self, page, year=None):
+        pages = []
+        card_pages = self.get_sub_pages_with_pager(
+            page,
+            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
+            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
+            None,
+            self.__HOST
+        )[::-1]
+        for card_page in card_pages:
+            # On filtre les pages de RAA ne correspondant pas à la période
+            # analysée
+            guessed_date = Attrap.guess_date(card_page['name'], 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
+            if year:
+                guessed_date = guessed_date.replace(year=year)
+            if guessed_date >= self.not_before:
+                pages.append(card_page['url'])
+        return pages
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # On charge le parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # On récupère chaque balise a
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
--
GitLab
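For orientation only: a scraper from this series would presumably be driven along these lines. The data directory and keywords are invented, and the real entry point is not part of this patch:

from Attrap_pref44 import Attrap_pref44

scraper = Attrap_pref44('data/pref44')         # data_dir, created on init
scraper.get_raa(['vidéoprotection', 'drone'])  # keywords searched in the RAAs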