import datetime
import locale
from urllib.parse import unquote

from bs4 import BeautifulSoup

from Attrap import Attrap

locale.setlocale(locale.LC_TIME, "fr_FR.UTF-8")


# Example listing page:
# https://www.guadeloupe.gouv.fr/Publications/Le-Recueil-des-actes-administratifs/2021/Decembre
class Attrap_pref971(Attrap):

    # Month names as they appear in the site's URLs (no accents)
    months_fr = [
        "Janvier",
        "Fevrier",
        "Mars",
        "Avril",
        "Mai",
        "Juin",
        "Juillet",
        "Aout",
        "Septembre",
        "Octobre",
        "Novembre",
        "Decembre",
    ]

    # Config
    __HOST = "https://www.guadeloupe.gouv.fr"

    # One listing page per month, for every year from 2015 to 2024
    __RAA_PAGE = {}
    for year in range(2015, 2025):
        __RAA_PAGE[str(year)] = []
        for month_name in months_fr:
            __RAA_PAGE[str(year)].append(
                f"{__HOST}/Publications/Le-Recueil-des-actes-administratifs/{year}/{month_name}"
            )

    __USER_AGENT = (
        "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0"
    )
    full_name = "Préfecture de la Guadeloupe"
    short_code = "pref971"

    def __init__(self, data_dir):
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        # Only keep the years at or after the not_before cut-off
        year_pages_to_parse = []
        for year, pages in self.__RAA_PAGE.items():
            if self.not_before.year <= int(year):
                year_pages_to_parse.extend(pages)

        # Parse the pages containing RAAs
        elements = []
        for page in year_pages_to_parse:
            page_content = self.get_page(page, "get").content
            for raa in self.get_raa_elements(page_content):
                elements.append(raa)

        self.parse_raa(elements, keywords)
        self.mailer()

    def get_raa_elements(self, page_content):
        elements = []

        # Load the HTML parser
        soup = BeautifulSoup(page_content, "html.parser")

        # Get every download link pointing to a PDF
        for a in soup.select("a.fr-link.fr-link--download"):
            if a.get("href") and a["href"].endswith(".pdf"):
                if a["href"].startswith("/"):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a["href"]
                url = unquote(url)

                name = a.get_text().replace("Télécharger ", "").strip()
                date_str = a.find_all("span")[-1].get_text().split(" - ")[-1].strip()
                date = datetime.datetime.strptime(date_str, "%d/%m/%Y")

                raa = Attrap.RAA(url, date, name)
                elements.append(raa)

        return elements
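

# Minimal usage sketch, not part of the scraper itself. Assumptions are marked
# below: the Attrap base class is taken to expose a settable `not_before`
# datetime (read above as the year cut-off), and `keywords` is taken to be a
# list of search terms; the real project may instead drive this class through
# its own command-line entry point.
if __name__ == "__main__":
    pref = Attrap_pref971("./data/pref971")          # hypothetical data directory
    pref.not_before = datetime.datetime(2023, 1, 1)  # assumption: settable attribute
    pref.get_raa(["vidéoprotection", "drone"])       # assumption: keywords as a list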