diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3063ad7f97f9b580badb181039445ff2f41d77f8..a7f28dcde363e9150bc17dbd2873824a9e3b5b27 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -33,7 +33,7 @@ docker:
 
 install:
   stage: install
-  image: registry.git.laquadrature.net/la-quadrature-du-net/raaspotter/base:latest
+  image: registry.git.laquadrature.net/la-quadrature-du-net/attrap/base:latest
   tags:
     - unprivileged
   script:
@@ -51,7 +51,7 @@ install:
 
 pep8:
   stage: lint
-  image: registry.git.laquadrature.net/la-quadrature-du-net/raaspotter/base:latest
+  image: registry.git.laquadrature.net/la-quadrature-du-net/attrap/base:latest
   needs: [install]
   tags:
     - unprivileged
@@ -70,7 +70,7 @@ pep8:
 
 .default_pref:
   stage: test
-  image: registry.git.laquadrature.net/la-quadrature-du-net/raaspotter/base:latest
+  image: registry.git.laquadrature.net/la-quadrature-du-net/attrap/base:latest
   tags:
     - unprivileged
   needs: [install]
diff --git a/Dockerfile-base b/Dockerfile-base
index 3ef580d143f91e4b1dab43871d39c18e56609e2c..433cc25108de0d39ecb821c78fe1ae7cee401c4b 100644
--- a/Dockerfile-base
+++ b/Dockerfile-base
@@ -1,4 +1,4 @@
-FROM debian:trixie
+FROM debian:sid
 
 ENV DEBIAN_FRONTEND="noninteractive"
 
diff --git a/RAAspotter.py b/RAAspotter.py
deleted file mode 100644
index e42095d9a36bdfdf7e3cacba7819f2ff875031cb..0000000000000000000000000000000000000000
--- a/RAAspotter.py
+++ /dev/null
@@ -1,595 +0,0 @@
-import os
-import re
-import ssl
-import subprocess
-import shutil
-import logging
-import requests
-import time
-import datetime
-import json
-from urllib.parse import quote
-
-from selenium import webdriver
-from selenium.common.exceptions import TimeoutException
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.wait import WebDriverWait
-from selenium.webdriver.support import expected_conditions
-
-import dateparser
-
-from bs4 import BeautifulSoup
-from pyvirtualdisplay import Display
-
-from pypdf import PdfReader
-from pypdf import PdfWriter
-from pypdf.generic import NameObject, NumberObject
-
-from stem import Signal
-from stem.control import Controller
-
-import hashlib
-import smtplib
-import email
-
-from mastodon import Mastodon
-
-logger = logging.getLogger(__name__)
-
-
-class RAAspotter:
-    class RAA:
-        url = ""
-        date = datetime.datetime(1970, 1, 1)
-        date_str = ""
-        name = ""
-        sha256 = ""
-        pdf_creation_date = None
-        pdf_modification_date = None
-
-        def __init__(self, url, date, name):
-            if not url == "":
-                self.url = url
-            if not date == "":
-                self.date = date
-                self.date_str = date.strftime("%d/%m/%Y")
-            if not name == "":
-                self.name = name
-
-        def get_sha256(self):
-            if (self.sha256 == ""):
-                self.sha256 = hashlib.sha256(self.url.encode('utf-8')).hexdigest()
-            return self.sha256
-
-        def get_pdf_dates(self, data_dir):
-            raa_data_dir = f'{data_dir}/raa/'
-
-            reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.pdf')
-            pdf_metadata = reader.metadata
-
-            if pdf_metadata.creation_date:
-                self.pdf_creation_date = pdf_metadata.creation_date
-
-            if pdf_metadata.modification_date:
-                self.pdf_modification_date = pdf_metadata.modification_date
-
-        def extract_content(self, data_dir):
-            raa_data_dir = f'{data_dir}/raa/'
-
-            text = ""
-
-            reader = PdfReader(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf')
-            for page in reader.pages:
-                try:
-                    text = text + "\n" + page.extract_text()
-                except Exception as exc:
-                    logger.warning(f'ATTENTION: Impossible d\'extraire le texte du fichier {self.get_sha256()}.pdf : {exc}')
-
-            # Écrit le texte du PDF dans un fichier texte pour une analyse future
-            f = open(f'{raa_data_dir}{self.get_sha256()}.txt', 'w')
-            f.write(text)
-            f.close()
-
-            # Supprime le PDF d'origine et la version OCRisée
-            os.remove(f'{raa_data_dir}{self.get_sha256()}.pdf')
-            os.remove(f'{raa_data_dir}{self.get_sha256()}.ocr.pdf')
-            os.remove(f'{raa_data_dir}{self.get_sha256()}.flat.pdf')
-
-        def write_properties(self, data_dir):
-            raa_data_dir = f'{data_dir}/raa/'
-
-            pdf_creation_date_json = None
-            pdf_modification_date_json = None
-
-            if self.pdf_creation_date:
-                pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S")
-            if self.pdf_modification_date:
-                pdf_modification_date_json = self.pdf_modification_date.strftime("%d/%m/%Y %H:%M:%S")
-
-            properties = {
-                'name': self.name,
-                'date': self.date_str,
-                'url': quote(self.url, safe='/:'),
-                'first_saw_on': datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S"),
-                'pdf_creation_date': pdf_creation_date_json,
-                'pdf_modification_date': pdf_modification_date_json
-            }
-            f = open(f'{raa_data_dir}{self.get_sha256()}.json', 'w')
-            f.write(json.dumps(properties))
-            f.close()
-
-        def parse_metadata(self, data_dir):
-            self.get_pdf_dates(data_dir)
-            self.write_properties(data_dir)
-
-    def __init__(self, data_dir, user_agent=''):
-        logger.debug('Initialisation de RAAspotter')
-
-        # On crée le dossier de téléchargement
-        os.makedirs(data_dir, exist_ok=True)
-
-        self.session = requests.Session()
-        self.data_dir = data_dir
-        self.found = False
-        self.output_file_path = os.path.dirname(os.path.abspath(__file__)) + f'/output_{self.short_code}.log'
-        self.sleep_time = 0
-        self.tor_enabled = False
-        self.tor_max_requests = 0
-        self.tor_requests = 0
-        self.not_before = datetime.datetime(2024, 1, 1)
-        self.smtp_configured = False
-        self.mastodon = None
-        self.mastodon_prefix = ''
-        self.mastodon_suffix = ''
-
-        self.update_user_agent(user_agent)
-
-        f = open(self.output_file_path, 'w')
-        f.write('')
-        f.close()
-
-        self.print_output(str(self.__class__.__name__))
-
-    def configure_mastodon(self, access_token, instance, mastodon_prefix, mastodon_suffix):
-        if access_token and access_token != "" and instance and instance != "":
-            self.mastodon = Mastodon(
-                access_token=access_token,
-                api_base_url=instance
-            )
-            self.mastodon_prefix = mastodon_prefix
-            self.mastodon_suffix = mastodon_suffix
-
-    def mastodon_toot(self, content):
-        if self.mastodon:
-            toot = content
-            if not self.mastodon_prefix == '':
-                toot = f"{self.mastodon_prefix}\n\n{toot}"
-            if not self.mastodon_suffix == '':
-                toot = f"{toot}\n\n{self.mastodon_suffix}"
-            self.mastodon.toot(toot)
-
-    def enable_tor(self, max_requests=0):
-        proxies = {
-            "http": f"socks5h://127.0.0.1:9050",
-            "https": f"socks5h://127.0.0.1:9050",
-        }
-        self.tor_enabled = True
-        self.tor_max_requests = max_requests
-        self.tor_requests = 0
-        self.session.proxies.update(proxies)
-        self.tor_get_new_id()
-
-    def disable_tor(self):
-        proxies = {}
-        self.tor_enabled = False
-        self.tor_max_requests = 0
-        self.tor_requests = 0
-        self.session.proxies.update(proxies)
-
-    def tor_get_new_id(self):
-        if self.tor_enabled:
-            logger.info('Changement d\'identité Tor')
-            try:
-                self.session.close()
-                controller = Controller.from_port(port=9051)
-                controller.authenticate()
-                controller.signal(Signal.NEWNYM)
-                time.sleep(5)
-                self.tor_requests = 0
-            except Exception as exc:
-                logger.debug(f'Impossible de changer d\'identité Tor: {exc}')
-
-    def get_sub_pages(self, page_content, element, host, recursive_until_pdf):
-        soup = BeautifulSoup(page_content, 'html.parser')
-        sub_pages = []
-        for a in soup.select(element):
-            if a.get('href'):
-                url = f"{host}{a['href']}"
-                if recursive_until_pdf:
-                    sub_page_content = self.get_page(url, 'get').content
-                    if not self.has_pdf(sub_page_content):
-                        logger.info(
-                            f'{url} ne contient pas de PDF, on récupère ses sous-pages'
-                        )
-                        for sub_sub_page in self.get_sub_pages(
-                            sub_page_content,
-                            element,
-                            host,
-                            recursive_until_pdf
-                        ):
-                            sub_pages.append(sub_sub_page)
-                    else:
-                        sub_page = {
-                            'url': url,
-                            'name': a.get_text().strip()
-                        }
-                        sub_pages.append(sub_page)
-                else:
-                    sub_page = {
-                        'url': url,
-                        'name': a.get_text().strip()
-                    }
-                    sub_pages.append(sub_page)
-        return sub_pages
-
-    def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
-        pages = []
-        page_content = self.get_page(page, 'get').content
-
-        # On initialise le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On recherche les sous-pages
-        sub_pages = soup.select(sub_page_element)
-        sub_pages_details = None
-        if details_element is not None:
-            sub_pages_details = soup.select(details_element)
-        i = 0
-        for sub_page in sub_pages:
-            if sub_page.get('href'):
-                page = {
-                    'url': f"{host}{sub_page['href']}",
-                    'name': sub_page.get_text().strip(),
-                    'details': ''
-                }
-                if details_element is not None:
-                    page['details'] = sub_pages_details[i].get_text().strip()
-                pages.append(page)
-                i = i + 1
-
-        # On recherche un pager, et si on le trouve on le suit
-        pager = soup.select(pager_element)
-        if pager and pager[0] and pager[0].get('href'):
-            for sub_page in self.get_sub_pages_with_pager(
-                f"{host}{pager[0]['href']}",
-                sub_page_element,
-                pager_element,
-                details_element,
-                host
-            ):
-                pages.append(sub_page)
-
-        return pages
-
-    def get_raa_with_pager(self, pages_list, pager_element, host):
-        elements = []
-        # On parse chaque page passée en paramètre
-        for page in pages_list:
-            page_content = self.get_page(page, 'get').content
-
-            # Pour chaque page, on récupère les PDF
-            for raa in self.get_raa_elements(page_content):
-                elements.append(raa)
-
-            # On regarde également s'il n'y aurait pas un pager
-            sub_pages = []
-            for sub_page in self.get_sub_pages(
-                page_content,
-                pager_element,
-                host,
-                True
-            ):
-                sub_pages.append(sub_page['url'])
-            for sub_raa in self.get_raa_with_pager(
-                sub_pages,
-                pager_element,
-                host
-            ):
-                elements.append(sub_raa)
-        return elements
-
-    def set_sleep_time(self, sleep_time):
-        self.sleep_time = sleep_time
-
-    def has_pdf(self, page_content):
-        elements = []
-        soup = BeautifulSoup(page_content, 'html.parser')
-        for a in soup.find_all('a', href=True):
-            if a['href'].endswith('.pdf'):
-                return True
-        return False
-
-    # On démarre le navigateur
-    def get_session(self, url, wait_element, remaining_retries=0):
-        webdriver_options = webdriver.ChromeOptions()
-        webdriver_options.add_argument("--no-sandbox")
-        webdriver_options.add_argument("--disable-extensions")
-        webdriver_options.add_argument("--disable-gpu")
-        webdriver_options.add_argument("--disable-dev-shm-usage")
-        webdriver_options.add_argument("--use_subprocess")
-        webdriver_options.add_argument("--disable-blink-features=AutomationControlled")
-
-        if not self.user_agent == "":
-            webdriver_options.add_argument(f"--user-agent={self.user_agent}")
-
-        webdriver_options.add_argument("--headless")
-        webdriver_options.add_argument("--window-size=1024,768")
-        display = Display(visible=False, size=(1024, 768))
-        display.start()
-
-        browser = webdriver.Chrome(options=webdriver_options)
-
-        # Téléchargement de l'URL
-        browser.get(url)
-
-        if wait_element is not None:
-            # On attend que le navigateur ait passé les tests anti-robots et
-            # que le contenu s'affiche
-            try:
-                WebDriverWait(browser, 60).until(
-                    expected_conditions.presence_of_element_located(
-                        (
-                            By.ID,
-                            wait_element
-                        )
-                    )
-                )
-            except TimeoutException as exc:
-                logger.warning(f'TimeoutException: {exc}')
-                if remaining_retries > 0:
-                    time.sleep(5)
-                    return self.get_session(url, wait_element, (remaining_retries - 1))
-                else:
-                    raise TimeoutException(exc)
-
-        page_content = browser.page_source
-
-        # On récupère les cookies du navigateur pour les réutiliser plus tard
-        for cookie in browser.get_cookies():
-            self.session.cookies.set(cookie['name'], cookie['value'])
-
-        # On arrête le navigateur
-        browser.quit()
-        display.stop()
-
-        return page_content
-
-    def print_output(self, data):
-        print(data)
-        data = data.replace('\033[92m', '')
-        data = data.replace('\033[0m', '')
-        data = data.replace('\033[1m', '')
-        f = open(self.output_file_path, 'a')
-        f.write(data + "\n")
-        f.close()
-
-    def get_page(self, url, method, data={}):
-        try:
-            logger.debug(f'Chargement de la page {url}')
-            if self.sleep_time > 0:
-                time.sleep(self.sleep_time)
-
-            page = None
-            if method == 'get':
-                page = self.session.get(url, timeout=(10, 120))
-            if method == 'post':
-                page = self.session.post(url, data=data, timeout=(10, 120))
-
-            if page.status_code == 429:
-                logger.warning('Erreur 429 Too Many Requests reçue, temporisation...')
-                self.tor_get_new_id()
-                time.sleep(55)
-                return self.get_page(url, method, data)
-
-            if self.tor_enabled:
-                self.tor_requests += 1
-                if self.tor_max_requests > 0 and \
-                   self.tor_requests > self.tor_max_requests:
-                    self.tor_get_new_id()
-
-            return page
-        except requests.exceptions.ConnectionError:
-            logger.warning(f'Erreur de connexion, temporisation...')
-            self.tor_get_new_id()
-            time.sleep(55)
-            return self.get_page(url, method, data)
-        except requests.exceptions.Timeout:
-            logger.warning(f'Timeout, on relance la requête...')
-            return self.get_page(url, method, data)
-
-    def update_user_agent(self, user_agent):
-        self.user_agent = user_agent
-        self.session.headers.update({'User-Agent': self.user_agent})
-
-    def download_file(self, raa):
-        try:
-            os.makedirs(
-                os.path.dirname(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'),
-                exist_ok=True
-            )
-            file = self.get_page(raa.url, 'get')
-            f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
-            f.write(file.content)
-            f.close()
-        except (requests.exceptions.ConnectionError,
-                requests.exceptions.ChunkedEncodingError):
-            logger.warning(f'ATTENTION: la connexion a été interrompue pendant le téléchargement de {raa.url}, nouvelle tentative...')
-            self.download_file(raa)
-        except Exception as exc:
-            logger.warning(f'ATTENTION: Impossible de télécharger le fichier {raa.url}: {exc}')
-
-    def ocr(self, raa, retry_on_failure=True):
-        cmd = [
-            'ocrmypdf',
-            '-l', 'eng+fra',
-            '--output-type', 'pdf',
-            '--redo-ocr',
-            '--skip-big', '500',
-            '--invalidate-digital-signatures',
-            '--optimize', '0',
-            f'{self.data_dir}/raa/{raa.get_sha256()}.flat.pdf',
-            f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf'
-        ]
-        logger.debug(f'Lancement de ocrmypdf: {cmd}')
-        try:
-            output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
-        except subprocess.CalledProcessError as exc:
-            if exc.returncode == 2 and retry_on_failure:
-                logger.warning('ATTENTION : Le fichier n\'est pas un PDF correct, nouvelle tentative de le télécharger')
-                if self.tor_enabled:
-                    self.tor_get_new_id()
-                self.download_file(raa)
-                self.ocr(raa, False)
-            elif (not exc.returncode == 6) and (not exc.returncode == 10) and (not exc.returncode == 4):
-                logger.warning('ATTENTION : Impossible d\'OCRiser le document', exc.returncode, exc.output)
-                shutil.copy(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', f'{self.data_dir}/raa/{raa.get_sha256()}.ocr.pdf')
-
-    def flatten_pdf(self, raa):
-        # OCRmyPDF ne sait pas gérer les formulaires, donc on les enlève avant OCRisation
-        reader = PdfReader(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf')
-        writer = PdfWriter()
-
-        for page in reader.pages:
-            if page.get('/Annots'):
-                for annot in page.get('/Annots'):
-                    writer_annot = annot.get_object()
-                    writer_annot.update({
-                        NameObject("/Ff"): NumberObject(1)
-                    })
-            writer.add_page(page)
-        writer.write(f'{self.data_dir}/raa/{raa.get_sha256()}.flat.pdf')
-
-    def search_keywords(self, raa, keywords):
-        if keywords and not keywords == '':
-            text = open(f'{self.data_dir}/raa/{raa.get_sha256()}.txt').read()
-
-            found = False
-            found_keywords = []
-            for keyword in keywords.split(','):
-                if re.search(keyword, text, re.IGNORECASE | re.MULTILINE):
-                    if not found:
-                        url = quote(raa.url, safe='/:')
-                        self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})')
-                        self.print_output(f'URL : {url}')
-                        found = True
-                        self.found = True
-                    self.print_output(f' Le terme \033[1m{keyword}\033[0m a été trouvé.')
-                    found_keywords.append(keyword)
-
-            if found:
-                self.print_output('')
-                url = quote(raa.url, safe='/:')
-                found_keywords_str = ', '.join(
-                    [str(x) for x in found_keywords]
-                )
-                self.mastodon_toot(
-                    f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont '
-                    f'été trouvés : {found_keywords_str}.\n\nURL : {url}'
-                )
-
-    def parse_raa(self, elements, keywords):
-        self.print_output(f'Termes recherchés: {keywords}')
-        self.print_output('')
-
-        for raa in elements:
-            # Si le fichier n'a pas déjà été parsé et qu'il est postérieur à la
-            # date maximale d'analyse, on le télécharge et on le parse
-            if raa.date >= self.not_before and not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.txt'):
-                url = quote(raa.url, safe='/:')
-                logger.info(f'Nouveau fichier : {raa.name} ({raa.date_str}). URL : {url}')
-                self.download_file(raa)
-                raa.parse_metadata(self.data_dir)
-                self.flatten_pdf(raa)
-                self.ocr(raa, True)
-                raa.extract_content(self.data_dir)
-                self.search_keywords(raa, keywords)
-
-    def get_raa(self, page_content):
-        logger.error('Cette fonction doit être surchargée')
-
-    def configure_mailer(self, smtp_host, smtp_username, smtp_password,
-                         smtp_port, smtp_starttls, smtp_ssl, email_from,
-                         email_to, email_object):
-        self.smtp_host = smtp_host
-        self.smtp_username = smtp_username
-        self.smtp_password = smtp_password
-        if smtp_port <= 0:
-            self.smtp_port = 587
-        else:
-            self.smtp_port = int(smtp_port)
-        self.smtp_starttls = smtp_starttls
-        self.smtp_ssl = smtp_ssl
-        self.email_from = email_from
-        self.email_to = email_to
-        self.email_object = email_object
-
-        if smtp_host and smtp_username and smtp_password and email_from and email_to and email_object:
-            self.smtp_configured = True
-
-    def mailer(self):
-        if self.smtp_configured and self.found:
-            try:
-                message = email.message.EmailMessage()
-                message.set_content(open(self.output_file_path).read())
-
-                message['Subject'] = self.email_object
-                message['From'] = self.email_from
-                message['Message-ID'] = email.utils.make_msgid(domain=self.email_from.split('@')[-1])
-                message['Date'] = email.utils.formatdate()
-
-                context = ssl.create_default_context()
-
-                if self.smtp_ssl is True:
-                    for address in self.email_to.split(','):
-                        del message['To']
-                        message['To'] = address
-                        smtp = smtplib.SMTP_SSL(self.smtp_host, port, context=context)
-                        if self.smtp_username:
-                            smtp.login(self.smtp_username, self.smtp_password)
-                        smtp.send_message(message)
-                        smtp.quit()
-                elif self.smtp_starttls is True:
-                    for address in self.email_to.split(','):
-                        del message['To']
-                        message['To'] = address
-                        smtp = smtplib.SMTP(self.smtp_host)
-                        smtp.starttls(context=context)
-                        if self.smtp_username:
-                            smtp.login(self.smtp_username, self.smtp_password)
-                        smtp.send_message(message)
-                        smtp.quit()
-                else:
-                    for address in self.email_to.split(','):
-                        del message['To']
-                        message['To'] = address
-                        smtp = smtplib.SMTP(self.smtp_host)
-                        if self.smtp_username:
-                            smtp.login(self.smtp_username, self.smtp_password)
-                        smtp.send_message(message)
-                        smtp.quit()
-            except Exception as exc:
-                logger.warning(f'Impossible d\'envoyer le courrier électronique : {exc}')
-
-    # Fonction qui essaie de deviner la date d'un RAA à partir de son nom.
-    # Utile pour limiter les requêtes lors de l'obtention des RAA à scanner.
-    def guess_date(string, regex):
-        try:
-            search = re.search(regex, string, re.IGNORECASE)
-            guessed_date = dateparser.parse(search.group(1))
-            if guessed_date is None:
-                raise Exception('La date est un objet None')
-            else:
-                return guessed_date
-        except Exception as exc:
-            logger.warning(f'Impossible de deviner la date du terme {string} : {exc}')
-            return datetime.datetime(9999, 1, 1)
diff --git a/RAAspotter_ppparis.py b/RAAspotter_ppparis.py
deleted file mode 100644
index f3afe3044b3ad57c6121ead0916138426ba43ce4..0000000000000000000000000000000000000000
--- a/RAAspotter_ppparis.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_ppparis(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.prefecturedepolice.interieur.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/actualites-et-presse/arretes/accueil-arretes'
-    __WAIT_ELEMENT = 'block-decree-list-block'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
-    full_name = 'Préfecture de police de Paris'
-    short_code = 'ppparis'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-
-    def get_raa(self, keywords):
-        page_content = self.get_session(self.__RAA_PAGE, self.__WAIT_ELEMENT, 6)
-        raa_elements = self.get_raa_elements(page_content)
-        self.parse_raa(raa_elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
-        # parse
-        for a in soup.find_all('a', href=True):
-            if a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = 'https://www.prefecturedepolice.interieur.gouv.fr' + a['href']
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').get_text()
-                date = datetime.datetime.strptime(a.find('div', class_="field--type-datetime").get_text().strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref04.py b/RAAspotter_pref04.py
deleted file mode 100644
index eccaea17945a5c89526db2bdebc66e5dca9dec79..0000000000000000000000000000000000000000
--- a/RAAspotter_pref04.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref04(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.alpes-de-haute-provence.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Publications-administratives-et-legales/Recueil-des-Actes-Administratifs'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture des Alpes-de-Haute-Provence'
-    short_code = 'pref04'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        elements = []
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        for sub_page in self.get_sub_pages(
-            page_content,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        ):
-            if RAAspotter.guess_date(sub_page['name'], '([0-9]{4}).*').year >= self.not_before.year:
-                sub_page_content = self.get_page(sub_page['url'], 'get').content
-                for element in self.get_raa_elements(sub_page_content):
-                    elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
-        # parse
-        for a in soup.select('a.fr-link.fr-link--download'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref05.py b/RAAspotter_pref05.py
deleted file mode 100644
index c9f7c60ae945cf21ed05974b6274c1dd842ef76a..0000000000000000000000000000000000000000
--- a/RAAspotter_pref05.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref05(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.hautes-alpes.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture des Hautes-Alpes'
-    short_code = 'pref05'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        year_pages_to_parse = []
-
-        # On détermine quelles pages d'année parser
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        year_pages = self.get_sub_pages(
-            page_content,
-            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        )
-        for year_page in year_pages:
-            if int(year_page['name'].replace('Année ', '').strip()) >= self.not_before.year:
-                year_pages_to_parse.append(year_page['url'])
-
-        month_pages_to_parse = []
-        # Pour chaque année, on cherche les sous-pages de mois
-        for year_page in year_pages_to_parse:
-            page_content = self.get_page(year_page, 'get').content
-            month_pages = self.get_sub_pages(
-                page_content,
-                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-                self.__HOST,
-                False
-            )[::-1]
-            for month_page in month_pages:
-                # On filtre les mois ne correspondant pas à la période analysée
-                guessed_date = RAAspotter.guess_date(month_page['name'], '.*([a-zéû]* [0-9]{4})')
-                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
-                    month_pages_to_parse.append(month_page['url'])
-
-        pages_to_parse = []
-        # Pour chaque page de mois, on cherche les pages de RAA
-        for month_page in month_pages_to_parse:
-            pages = self.get_sub_pages_with_pager(
-                month_page,
-                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-                'nav.fr-pagination ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
-                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
-                self.__HOST
-            )[::-1]
-            for page in pages:
-                guessed_date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
-                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
-                    pages_to_parse.append(page['url'])
-
-        elements = []
-        # On parse les pages contenant des RAA
-        for page in pages_to_parse:
-            page_content = self.get_page(page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        # On parse les RAA
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-grid-row div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref06.py b/RAAspotter_pref06.py
deleted file mode 100644
index 9c3fc13d7f77610827acf9d674c6adb30b8a5288..0000000000000000000000000000000000000000
--- a/RAAspotter_pref06.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref06(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.alpes-maritimes.gouv.fr'
-    __RAA_PAGE = {
-        '2024': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-specifiques'
-        ],
-        '2023': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-specifiques'
-        ],
-        '2022': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-specifiques'
-        ],
-        '2021': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-specifiques'
-        ],
-        '2020': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-specifiques'
-        ],
-        '2019': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-specifiques'
-        ]
-    }
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture des Alpes-Maritimes'
-    short_code = 'pref06'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(20)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            for page in self.__RAA_PAGE['2024']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2023:
-            for page in self.__RAA_PAGE['2023']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2022:
-            for page in self.__RAA_PAGE['2022']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2021:
-            for page in self.__RAA_PAGE['2021']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2020:
-            for page in self.__RAA_PAGE['2020']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2019:
-            for page in self.__RAA_PAGE['2019']:
-                pages_to_parse.append(page)
-
-        elements = self.get_raa_with_pager(
-            pages_to_parse,
-            ".fr-pagination__link.fr-pagination__link--next",
-            self.__HOST
-        )
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # Pour chaque élément fr-card__content, on cherche sa balise a, et si
-        # c'est un PDF on le parse
-        cards = soup.find_all('div', class_='fr-card__content')
-        for card in cards:
-            a = card.find('a')
-            if a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.get_text().strip()
-                date = datetime.datetime.strptime(card.find('p', class_='fr-card__detail').get_text().replace('Publié le ', '').strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref09.py b/RAAspotter_pref09.py
deleted file mode 100644
index 8ce0cb0e6f44eacc0b8d1935833c86e77f97701f..0000000000000000000000000000000000000000
--- a/RAAspotter_pref09.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref09(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.ariege.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-l-Ariege-a-partir-du-28-avril-2015'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture de l\'Ariège'
-    short_code = 'pref09'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-
-        # Les RAA de l'Ariège sont éparpillés sur des sous-pages par mois.
-        # Donc on parse la page principale à la recherche des sous-pages.
-        sub_pages = self.get_sub_pages_with_pager(
-            self.__RAA_PAGE,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
-            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
-            self.__HOST
-        )[::-1]
-
-        # On filtre par date les sous-pages pour limiter les requêtes
-        for sub_page in sub_pages:
-            guessed_date = datetime.datetime.strptime(sub_page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
-            guessed_date.replace(day=1)
-            if guessed_date >= self.not_before:
-                pages_to_parse.append(sub_page['url'])
-
-        # On parse les pages contenant des RAA
-        elements = []
-        for page in pages_to_parse:
-            page_content = self.get_page(page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref13.py b/RAAspotter_pref13.py
deleted file mode 100644
index 2e0e884f04fd99fa2cf8abe941a2cc3746387919..0000000000000000000000000000000000000000
--- a/RAAspotter_pref13.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref13(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.bouches-du-rhone.gouv.fr'
-    __RAA_PAGE = [
-        f'{__HOST}/Publications/RAA-et-Archives/RAA-2024',
-        f'{__HOST}/Publications/RAA-et-Archives/RAA-2023',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2022',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2021',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2020',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2019'
-    ]
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
-    full_name = 'Préfecture des Bouches-du-Rhône'
-    short_code = 'pref13'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        elements = []
-        for raa_page in self.__RAA_PAGE:
-            page_content = self.get_page(raa_page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
-        for a in soup.select('a.fr-link.fr-link--download'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref31.py b/RAAspotter_pref31.py
deleted file mode 100644
index 3a2d1bb337648d2f40847844c56b8a708a2bb1dc..0000000000000000000000000000000000000000
--- a/RAAspotter_pref31.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref31(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.haute-garonne.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs/Recueil-des-Actes-Administratifs-Haute-Garonne'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
-    full_name = 'Préfecture de la Haute-Garonne'
-    short_code = 'pref31'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-
-    def get_raa(self, keywords):
-        # On cherche les pages de chaque mois
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        month_pages = self.get_sub_pages(
-            page_content,
-            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        )[::-1]
-
-        pages_to_parse = []
-
-        # On filtre les pages de mois pour limiter le nombre de requêtes
-        for month_page in month_pages:
-            guessed_date = RAAspotter.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})')
-            if guessed_date >= self.not_before.replace(day=1):
-                pages_to_parse.append(month_page['url'])
-
-        elements = []
-        # On parse les pages des mois qu'on veut analyser
-        for element in self.get_raa_with_pager(
-            pages_to_parse,
-            ".fr-pagination__link.fr-pagination__link--next",
-            self.__HOST
-        ):
-            elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.get_text().strip().capitalize()
-                date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref33.py b/RAAspotter_pref33.py
deleted file mode 100644
index b6280288aeb1b9be89fdbcc6811948150aadb72f..0000000000000000000000000000000000000000
--- a/RAAspotter_pref33.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import os
-import re
-import datetime
-import logging
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-logger = logging.getLogger(__name__)
-
-
-class RAAspotter_pref33(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.gironde.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture de la Gironde'
-    short_code = 'pref33'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-
-        # Parfois un RAA est mal catégorisé et se retrouve sur la page racine, donc on la parse
-        pages_to_parse.append(self.__RAA_PAGE)
-
-        # On détermine quelles pages d'année parser
-        year_pages_to_parse = []
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        year_pages = self.get_sub_pages(
-            page_content,
-            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        )
-        for year_page in year_pages:
-            year = 9999
-            try:
-                year = int(re.search('.*([0-9]{4})', year_page['name'].strip(), re.IGNORECASE).group(1))
-                if year is None:
-                    year = 9999
-            except Exception as exc:
-                logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
-                year = 9999
-
-            if year >= self.not_before.year:
-                year_pages_to_parse.append(year_page['url'])
-
-        # Pour chaque année, on cherche les sous-pages de mois
-        month_pages_to_parse = []
-        for year_page in year_pages_to_parse:
-            page_content = self.get_page(year_page, 'get').content
-            month_pages = self.get_sub_pages(
-                page_content,
-                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-                self.__HOST,
-                False
-            )[::-1]
-
-            for month_page in month_pages:
-                guessed_date = RAAspotter.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})')
-                if guessed_date >= self.not_before.replace(day=1):
-                    pages_to_parse.append(month_page['url'])
-
-        # On parse les pages sélectionnées
-        elements = self.get_raa_with_pager(
-            pages_to_parse,
-            "ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label",
-            self.__HOST
-        )[::-1]
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-
-        # On récupère chaque carte avec un RAA
-        for card in BeautifulSoup(page_content, 'html.parser').select('div.fr-card.fr-card--horizontal div.fr-card__body div.fr-card__content'):
-            # On récupère le lien
-            links = card.select('h2.fr-card__title a.fr-card__link.menu-item-link')
-            # On récupère la date
-            dates_raw = card.select('div.fr-card__end p.fr-card__detail')
-
-            # Si on a toutes les infos, on continue
-            if links and links[0] and dates_raw and dates_raw[0]:
-                a = links[0]
-                date_raw = dates_raw[0]
-
-                if a.get('href') and a['href'].endswith('.pdf'):
-                    if a['href'].startswith('/'):
-                        url = f"{self.__HOST}{a['href']}"
-                    else:
-                        url = a['href']
-
-                    url = unquote(url)
-                    name = a.get_text().strip()
-                    date = datetime.datetime.strptime(date_raw.get_text().replace('Publié le', '').strip(), '%d/%m/%Y')
-
-                    raa = RAAspotter.RAA(url, date, name)
-                    elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref34.py b/RAAspotter_pref34.py
deleted file mode 100644
index ff163ed1ae549fcef0a2342b6e08101e10571343..0000000000000000000000000000000000000000
--- a/RAAspotter_pref34.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref34(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.herault.gouv.fr'
-    __RAA_PAGE = {
-        '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2024',
-        '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023',
-        '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022',
-        '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021',
-        '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020',
-        '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Archives/Recueil-des-actes-administratifs-2019'
-    }
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
-    full_name = 'Préfecture de l\'Hérault'
-    short_code = 'pref34'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            pages_to_parse.append(self.__RAA_PAGE['2024'])
-        if self.not_before.year <= 2023:
-            pages_to_parse.append(self.__RAA_PAGE['2023'])
-        if self.not_before.year <= 2022:
-            pages_to_parse.append(self.__RAA_PAGE['2022'])
-        if self.not_before.year <= 2021:
-            pages_to_parse.append(self.__RAA_PAGE['2021'])
-        if self.not_before.year <= 2020:
-            pages_to_parse.append(self.__RAA_PAGE['2020'])
-        if self.not_before.year <= 2019:
-            pages_to_parse.append(self.__RAA_PAGE['2019'])
-
-        elements = []
-        for raa_page in pages_to_parse:
-            page_content = self.get_page(raa_page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On récupère chaque balise a
-        for a in soup.select('a.fr-link.fr-link--download'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref35.py b/RAAspotter_pref35.py
deleted file mode 100644
index 3192c64e7e2639ff08f5b4cd1ec79ab692b0fc72..0000000000000000000000000000000000000000
--- a/RAAspotter_pref35.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref35(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.ille-et-vilaine.gouv.fr'
-    __RAA_PAGE = [
-        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024',
-        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023',
-        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022',
-        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021',
-        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020',
-        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2019'
-    ]
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
-    full_name = 'Préfecture d\'Ille-et-Vilaine'
-    short_code = 'pref35'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        elements = []
-        for raa_page in self.__RAA_PAGE:
-            page_content = self.get_page(raa_page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
-        # parse
-        for a in soup.find_all('a', href=True, class_='fr-link--download'):
-            if a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref38.py b/RAAspotter_pref38.py
deleted file mode 100644
index 576e1a5efc82a38c199233cbaa1efca6237cdbb1..0000000000000000000000000000000000000000
--- a/RAAspotter_pref38.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import os
-import datetime
-import logging
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-logger = logging.getLogger(__name__)
-
-
-class RAAspotter_pref38(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.isere.gouv.fr'
-    __RAA_PAGE = {
-        '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024',
-        '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023',
-        '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022',
-        '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021',
-        '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020',
-        '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019'
-    }
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture de l\'Isère'
-    short_code = 'pref38'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(20)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            pages_to_parse.append(self.__RAA_PAGE['2024'])
-        if self.not_before.year <= 2023:
-            pages_to_parse.append(self.__RAA_PAGE['2023'])
-        if self.not_before.year <= 2022:
-            pages_to_parse.append(self.__RAA_PAGE['2022'])
-        if self.not_before.year <= 2021:
-            pages_to_parse.append(self.__RAA_PAGE['2021'])
-        if self.not_before.year <= 2020:
-            pages_to_parse.append(self.__RAA_PAGE['2020'])
-        if self.not_before.year <= 2019:
-            pages_to_parse.append(self.__RAA_PAGE['2019'])
-
-        elements = []
-        for raa_page in pages_to_parse:
-            page_content = self.get_page(raa_page, 'get').content
-            for element in self.get_raa_elements(page_content, raa_page):
-                elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content, raa_page):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On récupère le select qui contient la liste des RAA
-        select_list = soup.select('select#-liste-docs')[0]
-        # On analyse chaque résultat
-        for option in select_list.find_all('option'):
-            if not option['value'] == "":
-                # On estime la date à partir du nom de fichier
-                guessed_date = RAAspotter.guess_date(option['title'], '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
-
-                # Si la date estimée correspond à la plage d'analyse, on
-                # demande au serveur les détails du RAA
-                if guessed_date >= self.not_before:
-                    page_content = self.get_page(
-                        raa_page,
-                        'post',
-                        {
-                            '-liste-docs': option['value']
-                        }
-                    ).content
-
-                    # On parse la page de détails pour obtenir les propriétés
-                    # du RAA
-                    soup = BeautifulSoup(page_content, 'html.parser')
-                    a = soup.select('div.liste_deroulante a.fr-link.fr-link--download')[0]
-
-                    # Si la page contient une balise a qui renvoie vers un pdf,
-                    # c'est qu'on a obtenu les détails du RAA demandé, donc
-                    # on le parse
-                    if a.get('href') and a['href'].endswith('.pdf'):
-                        if a['href'].startswith('/'):
-                            url = f"{self.__HOST}{a['href']}"
-                        else:
-                            url = a['href']
-
-                        url = unquote(url)
-                        name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                        date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                        raa = RAAspotter.RAA(url, date, name)
-                        elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref42.py b/RAAspotter_pref42.py
deleted file mode 100644
index ef8b2def3a3708e70ea070ca294bd2a878b89863..0000000000000000000000000000000000000000
--- a/RAAspotter_pref42.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import os
-import datetime
-import re
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref42(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.loire.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture de de la Loire'
-    short_code = 'pref42'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        year_pages_to_parse = []
-
-        # On détermine quelles pages d'année parser
-        year_pages = self.get_sub_pages_with_pager(
-            self.__RAA_PAGE,
-            'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
-            'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
-            self.__HOST
-        )
-        for year_page in year_pages:
-            year = 9999
-            try:
-                year = int(re.search('([0-9]{4})', year_page['name'], re.IGNORECASE).group(1))
-                if year is None:
-                    year = 9999
-            except Exception as exc:
-                logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
-                year = 9999
-
-            if year >= self.not_before.year:
-                year_pages_to_parse.append(year_page['url'])
-
-        elements = []
-        # Pour chaque année, on parse les RAA
-        for year_page in year_pages_to_parse:
-            page_content = self.get_page(year_page, 'get').content
-            for element in self.get_raa_elements(page_content)[::-1]:
-                elements.append(element)
-
-        # On parse les RAA
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref44.py b/RAAspotter_pref44.py
deleted file mode 100644
index 783977559594629e7def7aba1012955423896046..0000000000000000000000000000000000000000
--- a/RAAspotter_pref44.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import os
-import datetime
-import logging
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-logger = logging.getLogger(__name__)
-
-
-class RAAspotter_pref44(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.loire-atlantique.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA-en-Loire-Atlantique'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture de la Loire-Atlantique'
-    short_code = 'pref44'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(10)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-
-        # Parfois un RAA est mal catégorisé et se retrouve sur la page racine, donc on la parse
-        pages_to_parse.append(self.__RAA_PAGE)
-
-        # On détermine quelles pages d'année parser
-        year_pages_to_parse = []
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        year_pages = self.get_sub_pages(
-            page_content,
-            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        )
-        for year_page in year_pages:
-            year = 9999
-            try:
-                year = int(year_page['name'].strip())
-                if year is None:
-                    year = 9999
-            except Exception as exc:
-                logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
-                year = 9999
-
-            if year >= self.not_before.year:
-                year_pages_to_parse.append(year_page['url'])
-
-                # Parfois un RAA est mal catégorisé et se retrouve sur la page de l'année, donc on la parse
-                pages_to_parse.append(year_page['url'])
-
-        # Pour chaque année, on cherche les sous-pages de mois
-        month_pages_to_parse = []
-        for year_page in year_pages_to_parse:
-            page_content = self.get_page(year_page, 'get').content
-            month_pages = self.get_sub_pages(
-                page_content,
-                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-                self.__HOST,
-                False
-            )[::-1]
-
-            for month_page in month_pages:
-                pages_to_parse.append(month_page['url'])
-
-        # On parse les pages sélectionnées
-        elements = self.get_raa_with_pager(
-            pages_to_parse,
-            "ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label",
-            self.__HOST
-        )[::-1]
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-
-        # On récupère chaque carte avec un RAA
-        for card in BeautifulSoup(page_content, 'html.parser').select('div.fr-card.fr-card--horizontal div.fr-card__body div.fr-card__content'):
-            # On récupère le lien
-            links = card.select('h2.fr-card__title a.fr-card__link.menu-item-link')
-            # On récupère la date
-            dates_raw = card.select('div.fr-card__end p.fr-card__detail')
-
-            # Si on a toutes les infos, on continue
-            if links and links[0] and dates_raw and dates_raw[0]:
-                a = links[0]
-                date_raw = dates_raw[0]
-
-                if a.get('href') and a['href'].endswith('.pdf'):
-                    if a['href'].startswith('/'):
-                        url = f"{self.__HOST}{a['href']}"
-                    else:
-                        url = a['href']
-
-                    url = unquote(url)
-                    name = a.get_text().strip()
-                    date = datetime.datetime.strptime(date_raw.get_text().replace('Publié le', '').strip(), '%d/%m/%Y')
-
-                    raa = RAAspotter.RAA(url, date, name)
-                    elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py
deleted file mode 100644
index cf2e9fb8f6403690e8ff312bc11a97b8cc963689..0000000000000000000000000000000000000000
--- a/RAAspotter_pref59.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import os
-import datetime
-import dateparser
-import logging
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-logger = logging.getLogger(__name__)
-
-
-class RAAspotter_pref59(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.nord.gouv.fr'
-    __RAA_PAGE = {
-        '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2024',
-        '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2023',
-        '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2022',
-        '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2021',
-        '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2020',
-        '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2019'
-    }
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture du Nord'
-    short_code = 'pref59'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(20)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            pages_to_parse.append(self.__RAA_PAGE['2024'])
-        if self.not_before.year <= 2023:
-            pages_to_parse.append(self.__RAA_PAGE['2023'])
-        if self.not_before.year <= 2022:
-            pages_to_parse.append(self.__RAA_PAGE['2022'])
-        if self.not_before.year <= 2021:
-            pages_to_parse.append(self.__RAA_PAGE['2021'])
-        if self.not_before.year <= 2020:
-            pages_to_parse.append(self.__RAA_PAGE['2020'])
-        if self.not_before.year <= 2019:
-            pages_to_parse.append(self.__RAA_PAGE['2019'])
-
-        elements = []
-        for raa_page in pages_to_parse:
-            page_content = self.get_page(raa_page, 'get').content
-            sub_pages = self.get_sub_pages(
-                page_content,
-                'div.fr-card__body div.fr-card__content h2.fr-card__title a',
-                self.__HOST,
-                True
-            )
-            for sub_page in sub_pages[::-1]:
-                sub_page_content = self.get_page(sub_page['url'], 'get').content
-                for element in self.get_raa_elements(sub_page_content):
-                    elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # On récupère chaque balise a
-        for a in soup.select('a.fr-link.fr-link--download'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-
-                raa = RAAspotter.RAA(url, date, name)
-                elements.append(raa)
-        return elements
diff --git a/RAAspotter_pref62.py b/RAAspotter_pref62.py
deleted file mode 100644
index bc1d2f18600a92c7dbacb887de1f8e29fda5c7fe..0000000000000000000000000000000000000000
--- a/RAAspotter_pref62.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from RAAspotter import RAAspotter
-
-
-class RAAspotter_pref62(RAAspotter):
-
-    # Config
-    __HOST = 'https://www.pas-de-calais.gouv.fr'
-    __RAA_PAGE = {
-        '2024': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2024-Recueils-des-actes-administratifs'
-        ],
-        '2023': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-des-actes-administratifs',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-speciaux-des-actes-administratifs'
-        ],
-        '2022': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-des-Actes-Administratifs',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-Speciaux-des-Actes-Administratifs'
-        ],
-        '2021': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-des-actes-administratifs',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-speciaux-des-actes-administratifs'
-        ],
-        '2020': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-des-actes-administratifs',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-speciaux-des-actes-administratifs'
-        ],
-        '2019': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueil-des-actes-administratifs',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueils-speciaux-des-actes-administratifs'
-        ]
-    }
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
-    full_name = 'Préfecture du Pas-de-Calais'
-    short_code = 'pref62'
-
-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.enable_tor(20)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            for page in self.__RAA_PAGE['2024']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2023:
-            for page in self.__RAA_PAGE['2023']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2022:
-            for page in self.__RAA_PAGE['2022']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2021:
-            for page in self.__RAA_PAGE['2021']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2020:
-            for page in self.__RAA_PAGE['2020']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2019:
-            for page in self.__RAA_PAGE['2019']:
-                pages_to_parse.append(page)
-
-        elements = []
-        for raa_page in pages_to_parse:
-            page_content = self.get_page(raa_page,
'get').content - for element in self.get_raa_elements(page_content): - elements.append(element) - - self.parse_raa(elements, keywords) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère le div qui contient la liste des RAA - cards = soup.select('div.fr-downloads-group.fr-downloads-group--bordered')[0] - # On analyse chaque balise a dans ce div - for a in cards.find_all('a', href=True): - if a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements[::-1] diff --git a/RAAspotter_pref64.py b/RAAspotter_pref64.py deleted file mode 100644 index 044d9145ce8b91d22973184a40bc5a0f281e8cd9..0000000000000000000000000000000000000000 --- a/RAAspotter_pref64.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import datetime - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - - -class RAAspotter_pref64(RAAspotter): - - # Config - __HOST = 'https://www.pyrenees-atlantiques.gouv.fr' - __RAA_PAGE = { - '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2024', - '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2023', - '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2022', - '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2021', - '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2020', - '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs/Annee-2019' - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture des Pyrénées-Atlantiques' - short_code = 'pref64' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - year_pages_to_parse = [] - if self.not_before.year <= 2024: - year_pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - year_pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - year_pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - year_pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - year_pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - year_pages_to_parse.append(self.__RAA_PAGE['2019']) - - pages_to_parse = [] - # Pour chaque année, on cherche les sous-pages de mois - for year_page in year_pages_to_parse: - page_content = self.get_page(year_page, 'get').content - month_pages = self.get_sub_pages( - page_content, - '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', - self.__HOST, - False - )[::-1] - - # Pour chaque page de mois, on récupère les liens vers des pages de RAA - for month_page in month_pages: - raa_links = self.get_sub_pages_with_pager( - month_page['url'], - 'div.content-view-line div.class-file h2 a', - 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', - None, - self.__HOST - )[::-1] - - # Pour chaque lien vers un RAA, on filtre ceux ne correspondant pas à la 
période analysée - for raa_link in raa_links: - guessed_date = RAAspotter.guess_date(raa_link['name'], 'n°[ 0-9-]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)') - if guessed_date >= self.not_before: - pages_to_parse.append(raa_link['url']) - - # On parse les pages contenant des RAA - elements = [] - for page in pages_to_parse: - page_content = self.get_page(page, 'get').content - for raa in self.get_raa_elements(page_content): - elements.append(raa) - - self.parse_raa(elements, keywords) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements diff --git a/RAAspotter_pref65.py b/RAAspotter_pref65.py deleted file mode 100644 index 242997e7c866884cc093f6979b695a0146839293..0000000000000000000000000000000000000000 --- a/RAAspotter_pref65.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import datetime - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - - -class RAAspotter_pref65(RAAspotter): - - # Config - __HOST = 'https://www.hautes-pyrenees.gouv.fr' - __RAA_PAGE = { - '2024': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2024', - '2023': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2023', - '2022': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2022', - '2021': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2021', - '2020': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2020', - '2019': f'{__HOST}/Publications/Recueil-d-actes-administratifs/RAA-2019' - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture des Hautes-Pyrénées' - short_code = 'pref65' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - elements = [] - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - for element in self.get_raa_elements(page_content): - elements.append(element) - - self.parse_raa(elements, keywords) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" 
- else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements diff --git a/RAAspotter_pref66.py b/RAAspotter_pref66.py deleted file mode 100644 index 0ed6a8d553f0cae89fb288295a384a9ae00f008f..0000000000000000000000000000000000000000 --- a/RAAspotter_pref66.py +++ /dev/null @@ -1,133 +0,0 @@ -import os -import sys -import datetime -import logging - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - -logger = logging.getLogger(__name__) - - -class RAAspotter_pref66(RAAspotter): - - # Config - __HOST = 'https://www.pyrenees-orientales.gouv.fr' - __RAA_PAGE = { - '2024': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2024', - '2023': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2023', - '2022': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2022', - '2021': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2021', - '2020': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2020', - '2019': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2019' - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture des Pyrénées-Orientales' - short_code = 'pref66' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - elements = [] - - # La préfecture des Pyrénées-Orientales est une originale : avant 2024, - # chaque page annuelle contient l'ensemble des RAA, mais pas tout le - # temps avec leur date, qu'il faut deviner à partir du nom du RAA. - # Mais en 2024, ça change ! La page de 2024 contient un tableau - # récapitulatif avec toutes les dates de publication des RAA, mais - # aussi un pager. Sauf qu'il s'avère que le tableau récapitulatif - # n'est pas exhaustif. On doit donc parser toutes les sous-pages de - # 2024 puisqu'on ne peut se fier au tableau récapitulatif. - # Grrr. 
- if self.not_before.year <= 2024: - for element in self.get_raa_elements_since_2024(self.__RAA_PAGE['2024']): - elements.append(element) - if self.not_before.year <= 2023: - for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2023']): - elements.append(element) - if self.not_before.year <= 2022: - for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2022']): - elements.append(element) - if self.not_before.year <= 2021: - for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2021']): - elements.append(element) - if self.not_before.year <= 2020: - for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2020']): - elements.append(element) - if self.not_before.year <= 2019: - for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2019']): - elements.append(element) - - self.parse_raa(elements, keywords) - self.mailer() - - # On parse un lien d'avant 2024 - def get_raa_elements_before_2024(self, page): - elements = [] - page_content = self.get_page(page, 'get').content - soup = BeautifulSoup(page_content, 'html.parser') - for a in soup.select('div.fr-table.fr-table--bordered.list a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - date = None - try: - # Lorsque la date n'est pas affichée à l'écran, elle est en - # fait cachée dans la propriété "title" du lien - details = '' - if a.find('span'): - details = a.find('span').get_text().split(' - ')[-1].strip() - else: - details = a['title'].split(' - ')[-1].strip() - date = datetime.datetime.strptime(details, '%d/%m/%Y') - except Exception as exc: - logger.error(f'Impossible de trouver de date pour le texte : {text_raw}: {exc}') - sys.exit(1) - - if date >= self.not_before: - url = '' - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = '' - if a.find('span') and a.find('span').previous_sibling: - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - else: - name = a.get_text().replace('Télécharger ', '').strip() - - elements.append(RAAspotter.RAA(url, date, name)) - return elements - - # On parse les RAA depuis 2024 - def get_raa_elements_since_2024(self, root_page): - pages = self.get_sub_pages_with_pager( - root_page, - 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', - 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', - 'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail', - self.__HOST - )[::-1] - - pages_to_parse = [] - elements = [] - - for page in pages: - if not page['url'].endswith('.pdf'): - logger.warning(f"Attention, le lien vers {page['url']} n'est pas bon !") - else: - if page['url'].startswith('/'): - url = f"{self.__HOST}{page['url']}" - else: - url = page['url'] - - url = unquote(url) - name = page['name'].replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y') - - elements.append(RAAspotter.RAA(url, date, name)) - return elements diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py deleted file mode 100644 index ae7e3cf312191e7fc57b0d34c3599de527752f74..0000000000000000000000000000000000000000 --- a/RAAspotter_pref69.py +++ /dev/null @@ -1,85 +0,0 @@ -import os -import datetime - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - - -class RAAspotter_pref69(RAAspotter): - - # Config - __HOST = 'https://www.rhone.gouv.fr' - __RAA_PAGE = 
{ - '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024', - '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023', - '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022', - '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021', - '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020', - '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019' - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture du Rhône' - short_code = 'pref69' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(20) - - def get_raa(self, keywords): - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - sub_pages_to_parse = [] - - for raa_page in pages_to_parse: - sub_pages = self.get_sub_pages_with_pager( - raa_page, - 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', - 'ul.fr-pagination__list li a.fr-pagination__link--next', - None, - self.__HOST)[::-1] - for sub_page in sub_pages: - sub_pages_to_parse.append(sub_page['url']) - - elements = [] - for sub_page_to_parse in sub_pages_to_parse: - page_content = self.get_page(sub_page_to_parse, 'get').content - for element in self.get_raa_elements(page_content)[::-1]: - elements.append(element) - - self.parse_raa(elements, keywords) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements diff --git a/RAAspotter_pref80.py b/RAAspotter_pref80.py deleted file mode 100644 index 818c25c13f283eab312728e5100e4ea1831008fd..0000000000000000000000000000000000000000 --- a/RAAspotter_pref80.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import datetime -import logging - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - -logger = logging.getLogger(__name__) - - -class RAAspotter_pref80(RAAspotter): - - # Config - __HOST = 'https://www.somme.gouv.fr' - __RAA_PAGE = { - '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2024', - '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2023', - '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2022', - 
'2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2021', - '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2020', - '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-departement-de-la-Somme/Annee-2019' - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' - full_name = 'Préfecture de la Somme' - short_code = 'pref80' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - year_pages_to_parse = [] - if self.not_before.year <= 2024: - year_pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - year_pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - year_pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - year_pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - year_pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - year_pages_to_parse.append(self.__RAA_PAGE['2019']) - - # Pour chaque page Année, on récupère la liste des RAA - elements = [] - for year_page in year_pages_to_parse: - page_content = self.get_page(year_page, 'get').content - for element in self.get_raa_elements(page_content): - elements.append(element) - - self.parse_raa(elements, keywords) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le - # parse - for a in soup.select('div.fr-text--lead.fr-my-3w p a.fr-link'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - # On enlève les espaces insécables, les double-espaces, et le texte « Télécharger » de certains liens - name = a.get_text().replace('Télécharger ', '').strip().replace(u"\u00A0", ' ').replace(' ', ' ') - if name and not name == '': - # Certains RAA de la Somme ont une ligne avec les détails du fichier. Si cette ligne - # est disponible, on la parse, sinon on devine la date à partir du nom - date = None - if a.find('span'): - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - else: - regex = '.* n°.*(?:du)? ([0-9]*(?:er)? [a-zéû]* (?:[0-9]{4}|[0-9]{2}))' - date = RAAspotter.guess_date(name, regex) - # Parfois, il manque l'année dans le nom du RAA, alors on essaie avec l'année de la page - if date.year == 9999: - page_year = soup.select('nav.fr-breadcrumb div.fr-collapse ol.fr-breadcrumb__list li a.fr-breadcrumb__link.breadcrumb-item-link')[-1].get_text().replace('Année ', '').strip() - date = RAAspotter.guess_date(f'{name} {page_year}', regex) - - # Parfois, c'est que le fichier n'est pas un RAA mais un arrêté seul - if date.year == 9999: - date = RAAspotter.guess_date(name, '([0-9]*(?:er)? 
[a-zéû]* [0-9]{4})') - - if date.year == 9999: - logger.warning(f'On ignore {name} (URL : {url})') - else: - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements[::-1] diff --git a/RAAspotter_pref81.py b/RAAspotter_pref81.py deleted file mode 100644 index 5a3d1e2e3285e72dbe2bbc3843a177680adb9c81..0000000000000000000000000000000000000000 --- a/RAAspotter_pref81.py +++ /dev/null @@ -1,117 +0,0 @@ -import os -import datetime - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - - -class RAAspotter_pref81(RAAspotter): - - # Config - __HOST = 'https://www.tarn.gouv.fr' - __RAA_PAGE = { - 'default': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA', - '2024': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2024', - '2023': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2023', - '2022': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2022', - '2021': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2021', - '2020': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2020', - '2019': f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA/2019', - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture du Tarn' - short_code = 'pref81' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - sub_pages_to_parse = [self.__RAA_PAGE['default']] - - # Pour chaque année, on cherche les sous-pages de mois - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - month_pages = self.get_sub_pages( - page_content, - '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', - self.__HOST, - False - )[::-1] - - # On regarde aussi si sur la page de l'année il n'y aurait pas un - # RAA mal catégorisé - for page_to_parse in self.find_raa_card(raa_page): - sub_pages_to_parse.append(page_to_parse) - - # Pour chaque mois, on cherche les pages des RAA - for month_page in month_pages: - year = RAAspotter.guess_date(month_page['name'], '(.*)').year - for page_to_parse in self.find_raa_card(month_page['url'], year): - sub_pages_to_parse.append(page_to_parse) - # On ajoute aussi la page des mois à parser au cas où il y ait - # eu une redirection vers un RAA - sub_pages_to_parse.append(month_page['url']) - - # On parse les pages contenant des RAA - elements = [] - for page in sub_pages_to_parse: - page_content = self.get_page(page, 'get').content - for element in self.get_raa_elements(page_content): - elements.append(element) - - self.parse_raa(elements, keywords) - self.mailer() - - def find_raa_card(self, page, year=None): - pages = [] - card_pages = self.get_sub_pages_with_pager( - page, - 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', - 'ul.fr-pagination__list li 
a.fr-pagination__link.fr-pagination__link--next', - 'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail', - self.__HOST - )[::-1] - for card_page in card_pages: - # On filtre les pages de RAA ne correspondant pas à la période analysée - guessed_date = datetime.datetime.strptime(card_page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y') - if guessed_date >= self.not_before: - pages.append(card_page['url']) - return pages - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque balise a - for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements diff --git a/RAAspotter_pref83.py b/RAAspotter_pref83.py deleted file mode 100644 index ccc2af0358246cee75b4b340f67178e804f8f19a..0000000000000000000000000000000000000000 --- a/RAAspotter_pref83.py +++ /dev/null @@ -1,90 +0,0 @@ -import os -import datetime - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - - -class RAAspotter_pref83(RAAspotter): - - # Config - __HOST = 'https://www.var.gouv.fr' - __RAA_PAGE = { - '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024', - '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2023', - '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2022', - '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2021', - '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2020', - '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2019' - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture du Var' - short_code = 'pref83' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - sub_pages_to_parse = [] - - # Pour chaque année, on cherche les sous-pages de mois - for raa_page in pages_to_parse: - sub_pages_to_parse.append(raa_page) - page_content = self.get_page(raa_page, 'get').content - month_pages = self.get_sub_pages( - page_content, - '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', - self.__HOST, - False - )[::-1] - for month_page in month_pages: - 
sub_pages_to_parse.append(month_page['url']) - - # On parse les pages contenant des RAA - elements = self.get_raa_with_pager( - sub_pages_to_parse[::-1], - '.fr-pagination__link.fr-pagination__link--next', - self.__HOST - ) - self.parse_raa(elements, keywords) - - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque section contenant un RAA - cards = soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link') - for a in cards: - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.get_text().strip() - date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements diff --git a/RAAspotter_pref87.py b/RAAspotter_pref87.py deleted file mode 100644 index 3a008f4a036b9a620df6a29ef2b69e9f80577aa8..0000000000000000000000000000000000000000 --- a/RAAspotter_pref87.py +++ /dev/null @@ -1,106 +0,0 @@ -import os -import datetime - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - - -class RAAspotter_pref87(RAAspotter): - - # Config - __HOST = 'https://www.haute-vienne.gouv.fr' - __RAA_PAGE = { - '2024': [ - f'{__HOST}/Publications/Recueil-des-actes-administratifs/JANVIER-JUIN-2024/JANVIER-JUIN-2024', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/JUILLET-DECEMBRE-2024' - ], - '2023': [ - f'{__HOST}/Publications/Recueil-des-actes-administratifs/JANVIER-JUIN-2023', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/JUILLET-DECEMBRE-2023/JUILLET-DECEMBRE-2023' - ], - '2022': [ - f'{__HOST}/Publications/Recueil-des-actes-administratifs/JANVIER-JUIN-2022', - f'{__HOST}/Publications/Recueil-des-actes-administratifs/JUILLET-DECEMBRE-2022/Recueil-des-actes-administratifs-2022', - ], - '2021': [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/2021'], - '2020': [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/2020'], - '2019': [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/2019'] - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0' - full_name = 'Préfecture de la Haute-Vienne' - short_code = 'pref87' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - year_pages_to_parse = [] - if self.not_before.year <= 2024: - for year_page in self.__RAA_PAGE['2024']: - year_pages_to_parse.append(year_page) - if self.not_before.year <= 2023: - for year_page in self.__RAA_PAGE['2023']: - year_pages_to_parse.append(year_page) - if self.not_before.year <= 2022: - for year_page in self.__RAA_PAGE['2022']: - year_pages_to_parse.append(year_page) - if self.not_before.year <= 2021: - for year_page in self.__RAA_PAGE['2021']: - year_pages_to_parse.append(year_page) - if self.not_before.year <= 2020: - for year_page in self.__RAA_PAGE['2020']: - year_pages_to_parse.append(year_page) - if self.not_before.year <= 2019: - for year_page in self.__RAA_PAGE['2019']: - year_pages_to_parse.append(year_page) - - pages_to_parse = year_pages_to_parse - # Pour chaque année, on cherche les 
éventuelles sous-pages de mois - for year_page in year_pages_to_parse: - page_content = self.get_page(year_page, 'get').content - month_pages = self.get_sub_pages( - page_content, - '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', - self.__HOST, - False - )[::-1] - - # On filtre les pages de mois ne correspondant pas à la période analysée - for month_page in month_pages: - guessed_date = RAAspotter.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})').replace(day=1) - if guessed_date >= self.not_before.replace(day=1): - pages_to_parse.append(month_page['url']) - - # On parse les pages contenant des RAA - elements = [] - for page in pages_to_parse: - page_content = self.get_page(page, 'get').content - for raa in self.get_raa_elements(page_content): - elements.append(raa) - - self.parse_raa(elements, keywords) - self.mailer() - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py deleted file mode 100644 index 4e42957c98ed1a877beea7c21060e6d68e64d655..0000000000000000000000000000000000000000 --- a/RAAspotter_pref976.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import datetime - -from bs4 import BeautifulSoup -from urllib.parse import unquote - -from RAAspotter import RAAspotter - - -class RAAspotter_pref976(RAAspotter): - - # Config - __HOST = 'https://www.mayotte.gouv.fr' - __RAA_PAGE = { - 'default': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A', - '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2024', - '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2023', - '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2022', - '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2021', - '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2020', - '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2019' - } - __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' - full_name = 'Préfecture de Mayotte' - short_code = 'pref976' - - def __init__(self, data_dir): - super().__init__(data_dir, self.__USER_AGENT) - self.enable_tor(10) - - def get_raa(self, keywords): - pages_to_parse = [] - if self.not_before.year <= 2024: - pages_to_parse.append(self.__RAA_PAGE['2024']) - if self.not_before.year <= 2023: - pages_to_parse.append(self.__RAA_PAGE['2023']) - if self.not_before.year <= 2022: - pages_to_parse.append(self.__RAA_PAGE['2022']) - if self.not_before.year <= 2021: - pages_to_parse.append(self.__RAA_PAGE['2021']) - if self.not_before.year <= 2020: - pages_to_parse.append(self.__RAA_PAGE['2020']) - if self.not_before.year <= 2019: - pages_to_parse.append(self.__RAA_PAGE['2019']) - - sub_pages_to_parse = [self.__RAA_PAGE['default']] - - # Pour chaque année, on 
cherche les sous-pages de mois - for raa_page in pages_to_parse: - page_content = self.get_page(raa_page, 'get').content - month_pages = self.get_sub_pages( - page_content, - '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a', - self.__HOST, - False - )[::-1] - - # On regarde aussi si sur la page de l'année il n'y aurait pas un - # RAA mal catégorisé - for page_to_parse in self.find_raa_card(raa_page): - sub_pages_to_parse.append(page_to_parse) - - # Pour chaque mois, on cherche les pages des RAA - for month_page in month_pages: - year = RAAspotter.guess_date(month_page['name'], '(.*)').year - for page_to_parse in self.find_raa_card( - month_page['url'], - year - ): - sub_pages_to_parse.append(page_to_parse) - - # On parse les pages contenant des RAA - elements = [] - for page in sub_pages_to_parse: - page_content = self.get_page(page, 'get').content - for element in self.get_raa_elements(page_content): - elements.append(element) - - self.parse_raa(elements, keywords) - self.mailer() - - def find_raa_card(self, page, year=None): - pages = [] - card_pages = self.get_sub_pages_with_pager( - page, - 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', - 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', - None, - self.__HOST - )[::-1] - for card_page in card_pages: - # On filtre les pages de RAA ne correspondant pas à la période - # analysée - guessed_date = RAAspotter.guess_date(card_page['name'], 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)') - if year: - guessed_date = guessed_date.replace(year=year) - if guessed_date >= self.not_before: - pages.append(card_page['url']) - return pages - - def get_raa_elements(self, page_content): - elements = [] - # On charge le parser - soup = BeautifulSoup(page_content, 'html.parser') - - # On récupère chaque balise a - for a in soup.select('a.fr-link.fr-link--download'): - if a.get('href') and a['href'].endswith('.pdf'): - if a['href'].startswith('/'): - url = f"{self.__HOST}{a['href']}" - else: - url = a['href'] - - url = unquote(url) - name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() - date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') - - raa = RAAspotter.RAA(url, date, name) - elements.append(raa) - return elements diff --git a/README.md b/README.md index 04fd6738f0925523a27f5a5107a8357a9966951f..2740dd793862ab76bff8b0b215a808e3f053e46b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# RAAspotter +# Attrap -Une série de scripts en Python qui récupère les derniers recueils des actes administratifs (RAA) pour y rechercher certains mots-clés prédéfinis. +Un logiciel qui récupère les derniers recueils des actes administratifs (RAA) pour y rechercher certains mots-clés prédéfinis. Conçu pour être utilisé dans une CI. 
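Every RAAspotter_pref* scraper deleted above repeats the same routine for the DSFR download links found on prefecture sites: resolve the href against the host when it is relative, unquote it, take the document name from the text node preceding the embedded <span>, and read the publication date from the last ' - '-separated field of that <span>. The following is a minimal, self-contained sketch of that shared pattern; the host and sample HTML are illustrative stand-ins, not taken from the repository.

    import datetime
    from urllib.parse import unquote

    from bs4 import BeautifulSoup

    # Hypothetical host and markup, shaped like the DSFR links the scrapers target.
    HOST = 'https://www.example.gouv.fr'
    HTML = ('<a class="fr-link fr-link--download" href="/files/raa-n1.pdf">'
            'Télécharger RAA n°1 du 15 janvier 2024 '
            '<span>PDF - 0,45 Mo - 15/01/2024</span></a>')

    soup = BeautifulSoup(HTML, 'html.parser')
    for a in soup.select('a.fr-link.fr-link--download'):
        if a.get('href') and a['href'].endswith('.pdf'):
            # Relative links are resolved against the prefecture's host.
            url = f"{HOST}{a['href']}" if a['href'].startswith('/') else a['href']
            url = unquote(url)
            # The text node before the <span> carries the document name.
            name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
            # The last ' - '-separated field of the <span> is the publication date.
            date = datetime.datetime.strptime(
                a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
            print(name, date.date(), url)

As the deleted files show, mostly the CSS selectors and page layouts differ from one administration to the next, which is why the per-prefecture classes stay small.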
diff --git a/cli.py b/cli.py index ffc424049cd2595e56355efbce60ee80adf2b57f..6bf99c768a38d62de9abac612cb5e727a55a82d9 100755 --- a/cli.py +++ b/cli.py @@ -4,7 +4,7 @@ import logging import datetime import importlib -from RAAspotter import RAAspotter +from Attrap import Attrap # Config __KEYWORDS = os.getenv('KEYWORDS') or '' @@ -220,12 +220,11 @@ if __ADMINISTRATION_EMAIL_TO and not __ADMINISTRATION_EMAIL_TO == '': else: __EMAIL_TO = __ADMINISTRATION_EMAIL_TO -module = importlib.import_module(f'RAAspotter_{args.administration}') -raa_spotter = getattr(module, f'RAAspotter_{args.administration}')(__DATA_DIR) +module = importlib.import_module(f'Attrap_{args.administration}') +attrap = getattr(module, f'Attrap_{args.administration}')(__DATA_DIR) -raa_spotter.not_before = __NOT_BEFORE -raa_spotter.configure_mailer(__SMTP_HOSTNAME, __SMTP_USERNAME, __SMTP_PASSWORD, __SMTP_PORT, __SMTP_STARTTLS, __SMTP_SSL, - __EMAIL_FROM, __EMAIL_TO, f'[RAAspotter] [{raa_spotter.full_name}] Nouveaux éléments trouvés') -raa_spotter.configure_mastodon(__MASTODON_ACCESS_TOKEN, __MASTODON_INSTANCE, - f'[{raa_spotter.full_name}]', f'#{raa_spotter.short_code}') -raa_spotter.get_raa(__KEYWORDS) +attrap.not_before = __NOT_BEFORE +attrap.configure_mailer(__SMTP_HOSTNAME, __SMTP_USERNAME, __SMTP_PASSWORD, __SMTP_PORT, __SMTP_STARTTLS, __SMTP_SSL, + __EMAIL_FROM, __EMAIL_TO, f'[Attrap] [{attrap.full_name}] Nouveaux éléments trouvés') +attrap.configure_mastodon(__MASTODON_ACCESS_TOKEN, __MASTODON_INSTANCE, f'[{attrap.full_name}]', f'#{attrap.short_code}') +attrap.get_raa(__KEYWORDS)
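The cli.py hunk shows the dispatch mechanism that made the rename mechanical: the administration short code names both the module and the class, so the move from RAAspotter_* to Attrap_* boils down to the f-strings and identifiers visible above. Below is a condensed sketch of that pattern, assuming the Attrap_* modules are importable; the short code, data directory, and keyword are example values, not project defaults.

    import datetime
    import importlib


    def load_scraper(administration, data_dir):
        # e.g. administration='pref65' imports the module Attrap_pref65 and
        # instantiates the Attrap_pref65 class defined inside it.
        module = importlib.import_module(f'Attrap_{administration}')
        return getattr(module, f'Attrap_{administration}')(data_dir)


    # Hypothetical invocation; cli.py derives the equivalent values from its
    # arguments and environment.
    scraper = load_scraper('pref65', 'data/')
    scraper.not_before = datetime.datetime(2024, 1, 1)
    scraper.get_raa('vidéoprotection')

New administrations thus plug in without touching cli.py: dropping in an Attrap_pref* module with a matching class name is enough.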