# Origin: patch 476d866a634f784cdb4bd26a613cc351d203bc41
# From: Felix Lena <felix.lena@epita.fr>
# Date: Sun, 9 Jun 2024 20:28:49 +0200
# Subject: [PATCH] pref2b: Ajout de la prefecture de la haute corse
# Adds the new file Attrap_pref2B.py.
import os
import datetime

from bs4 import BeautifulSoup
from urllib.parse import unquote

from Attrap import Attrap


class Attrap_pref2B(Attrap):
    """Collect the "recueils des actes administratifs" (RAA) of Haute-Corse.

    The class lists the yearly RAA index pages of the prefecture's website,
    follows the month sub-pages linked from them, and extracts every PDF
    link together with its publication date.
    """

    # Config
    __HOST = 'https://www.haute-corse.gouv.fr'
    # Yearly index pages, newest first. get_raa() relies on this ordering.
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2024',
        '2023': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2023',
        '2022': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2022',
        '2021': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2021',
        '2020': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2020',
        '2019': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2019'
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture de Haute-Corse'
    short_code = 'pref2B'

    def __init__(self, data_dir):
        """Initialise the base scraper with this site's user agent.

        Args:
            data_dir: forwarded unchanged to ``Attrap.__init__``.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        # NOTE(review): the meaning of the argument 10 is defined by
        # Attrap.enable_tor, which is not visible here — confirm.
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Find every RAA published since ``self.not_before`` and process it.

        Args:
            keywords: forwarded unchanged to ``self.parse_raa``.
        """
        # Keep the yearly pages whose year is not older than not_before.
        # Dict order (newest first) is preserved by the comprehension.
        pages_to_parse = [
            page_url
            for year, page_url in self.__RAA_PAGE.items()
            if self.not_before.year <= int(year)
        ]

        sub_pages_to_parse = []

        # For each year, look for the month sub-pages
        for raa_page in pages_to_parse:
            sub_pages_to_parse.append(raa_page)
            page_content = self.get_page(raa_page, 'get').content
            # The card classes are chained into ONE compound selector.
            # Separated by spaces they would be read as descendant tag
            # names and the selector would match nothing.
            # NOTE(review): the trailing False is interpreted by
            # Attrap.get_sub_pages, which is not visible here — confirm.
            month_pages = self.get_sub_pages(
                page_content,
                '.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a',
                self.__HOST,
                False
            )[::-1]
            for month_page in month_pages:
                sub_pages_to_parse.append(month_page['url'])

        # Parse the pages that contain RAA, following the "next" pager link
        elements = self.get_raa_with_pager(
            sub_pages_to_parse[::-1],
            '.fr-pagination__link.fr-pagination__link--next',
            self.__HOST
        )
        self.parse_raa(elements, keywords)

        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAA entries listed on one page.

        Args:
            page_content: HTML of the page, as accepted by BeautifulSoup.

        Returns:
            A list of ``Attrap.RAA`` objects, one per link whose ``href``
            ends with ``.pdf``.

        Raises:
            KeyError: if a PDF link has no ``title`` attribute.
            ValueError: if the text after the last ``' - '`` of the title
                is not a ``%d/%m/%Y`` date.
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Fetch every card link that may point to an RAA
        cards = soup.select(
            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link')
        for a in cards:
            href = a.get('href')
            if not href or not href.endswith('.pdf'):
                continue

            # Site-relative links are made absolute; others are kept as-is.
            url = f"{self.__HOST}{href}" if href.startswith('/') else href
            url = unquote(url)
            name = a.get_text().strip()
            # The date is the last ' - '-separated part of the title.
            date = datetime.datetime.strptime(
                a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')

            elements.append(Attrap.RAA(url, date, name))
        return elements


# Backward-compatible alias: the class was originally committed under the
# name of another prefecture (pref83) although this file is pref2B.
Attrap_pref83 = Attrap_pref2B