Skip to content
Extraits de code Groupes Projets
Valider 476d866a rédigé par Felix Lena's avatar Felix Lena Validation de Elunias
Parcourir les fichiers

pref2b: Ajout de la prefecture de la haute corse

parent 5ed1d5c1
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref83(Attrap):
# Config
__HOST = 'https://www.haute-corse.gouv.fr'
__RAA_PAGE = {
'2024': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2024',
'2023': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2023',
'2022': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2022',
'2021': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2021',
'2020': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2020',
'2019': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2019'
}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture de Haute-Corse'
short_code = 'pref2B'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
def get_raa(self, keywords):
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
sub_pages_to_parse = []
# Pour chaque année, on cherche les sous-pages de mois
for raa_page in pages_to_parse:
sub_pages_to_parse.append(raa_page)
page_content = self.get_page(raa_page, 'get').content
month_pages = self.get_sub_pages(
page_content,
'.fr-card fr-card--horizontal fr-card--sm fr-enlarge-link fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
False
)[::-1]
for month_page in month_pages:
sub_pages_to_parse.append(month_page['url'])
# On parse les pages contenant des RAA
elements = self.get_raa_with_pager(
sub_pages_to_parse[::-1],
'.fr-pagination__link.fr-pagination__link--next',
self.__HOST
)
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque section contenant un RAA
cards = soup.select(
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link')
for a in cards:
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.get_text().strip()
date = datetime.datetime.strptime(
a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
elements.append(raa)
return elements
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter