Skip to content
Extraits de code Groupes Projets

pref971

Ouvert ketsapiwiq a demandé de fusionner ketsapiwiq/Attrap:pref971 vers main
1 fichier
+ 5
5
Comparer les modifications
  • Côte à côte
  • En ligne
Attrap_pref971.py 0 → 100644
+ 127
0
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
import locale
locale.setlocale(locale.LC_TIME, "fr_FR.UTF-8")
# https://www.guadeloupe.gouv.fr/Publications/Le-Recueil-des-actes-administratifs/2021/Decembre
class Attrap_pref971(Attrap):
    """Scraper for the RAA (recueil des actes administratifs) of the
    Préfecture de la Guadeloupe.

    Monthly RAA index pages live at
    https://www.guadeloupe.gouv.fr/Publications/Le-Recueil-des-actes-administratifs/<year>/<month>
    (e.g. .../2021/Decembre). One URL per month and per year is
    pre-computed at class-definition time in ``__RAA_PAGE``.
    """

    # Month names exactly as they appear in the site's URL slugs —
    # deliberately without accents ("Fevrier", "Aout", "Decembre").
    months_fr = [
        "Janvier",
        "Fevrier",
        "Mars",
        "Avril",
        "Mai",
        "Juin",
        "Juillet",
        "Aout",
        "Septembre",
        "Octobre",
        "Novembre",
        "Decembre",
    ]

    # Config
    __HOST = "https://www.guadeloupe.gouv.fr"

    # Maps a year (as a string key) to the list of its twelve monthly
    # index-page URLs.
    __RAA_PAGE = {
        "2015": [],
        "2016": [],
        "2017": [],
        "2018": [],
        "2019": [],
        "2020": [],
        "2021": [],
        "2022": [],
        "2023": [],
        "2024": [],
    }
    # BUG FIX: range(2015, 2024) left the "2024" list empty even though
    # get_raa reads it; range(1, 12) skipped months_fr[0] ("Janvier")
    # and produced only 11 months. Now covers 2015-2024 and all 12 months.
    for year in range(2015, 2025):
        for month in range(12):
            __RAA_PAGE[str(year)].append(
                f"{__HOST}/Publications/Le-Recueil-des-actes-administratifs/{year}/{months_fr[month]}"
            )
    # Don't leak the loop variables as class attributes.
    del year, month

    __USER_AGENT = (
        "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0"
    )
    full_name = "Préfecture de la Guadeloupe"
    short_code = "pref971"

    def __init__(self, data_dir):
        """Initialise the scraper and route its traffic through Tor.

        data_dir: directory where downloaded documents are stored
        (forwarded to the Attrap base class).
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Fetch every relevant monthly index page, collect the RAA PDFs
        it links to, then run keyword analysis and send the report mail.

        keywords: search terms forwarded to Attrap.parse_raa().
        """
        # Keep only the years that can contain documents at or after
        # self.not_before.
        # BUG FIX: the original built this list through ten copy-pasted
        # `if` blocks and then never used it — every year was fetched
        # regardless of not_before. The filtered list is now actually
        # the one that gets parsed.
        pages_to_parse = []
        for year in self.__RAA_PAGE:
            if self.not_before.year <= int(year):
                pages_to_parse.extend(self.__RAA_PAGE[year])

        # Parse the pages that list the RAA documents.
        elements = []
        for page in pages_to_parse:
            page_content = self.get_page(page, "get").content
            elements.extend(self.get_raa_elements(page_content))

        self.parse_raa(elements, keywords)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAA entries (url, date, name) from one monthly
        index page.

        page_content: raw HTML bytes of the page.
        Returns a list of Attrap.RAA objects.
        """
        elements = []
        # Load the parser.
        soup = BeautifulSoup(page_content, "html.parser")
        # Each downloadable document is an <a class="fr-link fr-link--download">.
        for a in soup.select("a.fr-link.fr-link--download"):
            if a.get("href") and a["href"].endswith(".pdf"):
                # Site-relative links need the host prepended.
                if a["href"].startswith("/"):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a["href"]
                url = unquote(url)
                name = a.get_text().replace("Télécharger ", "").strip()
                # The publication date is the last " - "-separated field
                # of the link's final <span>, formatted dd/mm/yyyy.
                date_str = a.find_all("span")[-1].get_text().split(" - ")[-1].strip()
                date = datetime.datetime.strptime(date_str, "%d/%m/%Y")
                raa = Attrap.RAA(url, date, name)
                elements.append(raa)
        return elements
Chargement en cours