Skip to content
Extraits de code Groupes Projets
Valider 9945ed17 rédigé par Joseki's avatar Joseki
Parcourir les fichiers

Ajout de la prefecture 73 Savoie

parent ce75d11b
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
Pipeline #5884 en échec
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref73(Attrap):
    """Scraper for the administrative act collections (RAA) of the Savoie prefecture.

    The class lists the yearly RAA pages of www.savoie.gouv.fr, extracts every
    PDF download link found on them and hands the resulting ``Attrap.RAA``
    objects to the processing methods called on ``self`` (``parse_raa`` and
    ``mailer`` are not defined here; presumably inherited from ``Attrap``).
    """

    # Config
    __HOST = 'https://www.savoie.gouv.fr'
    # One listing page per publication year, keyed by the year as a string.
    # Entries are ordered newest first; get_raa() visits them in this order.
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/Recueils-hebdomadaires-et-speciaux-des-actes-administratifs/2024',
        '2023': f'{__HOST}/Publications/Recueils-hebdomadaires-et-speciaux-des-actes-administratifs/2023',
        '2022': f'{__HOST}/Publications/Recueils-hebdomadaires-et-speciaux-des-actes-administratifs/2022',
        '2021': f'{__HOST}/Publications/Recueils-hebdomadaires-et-speciaux-des-actes-administratifs/2021',
        '2020': f'{__HOST}/Publications/Recueils-hebdomadaires-et-speciaux-des-actes-administratifs/2020',
        '2019': f'{__HOST}/Publications/Recueils-hebdomadaires-et-speciaux-des-actes-administratifs/2019',
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture de la Savoie'
    short_code = 'pref73'

    def __init__(self, data_dir):
        """Initialise the base scraper with this prefecture's user agent.

        Args:
            data_dir: Forwarded unchanged to ``Attrap.__init__``.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        # NOTE(review): the meaning of the argument 10 is defined by
        # Attrap.enable_tor, which is not visible here -- confirm there.
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Collect the RAA of every relevant year, then process them.

        A yearly page is fetched when its year is greater than or equal to
        ``self.not_before.year`` (``not_before`` is not set in this class;
        presumably provided by ``Attrap``).

        Args:
            keywords: Forwarded unchanged to ``self.parse_raa``.
        """
        # Select the yearly pages to visit, keeping the newest-first order
        # of __RAA_PAGE.
        pages_to_parse = [
            page_url
            for year, page_url in self.__RAA_PAGE.items()
            if self.not_before.year <= int(year)
        ]

        # Parse every page that lists RAA.
        elements = []
        for page in pages_to_parse:
            page_content = self.get_page(page, 'get').content
            elements.extend(self.get_raa_elements(page_content))

        self.parse_raa(elements, keywords)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAA download links from one listing page.

        Args:
            page_content: HTML of the listing page, as accepted by
                ``BeautifulSoup``.

        Returns:
            A list of ``Attrap.RAA`` objects, one per ``.pdf`` download link,
            in document order.

        Raises:
            AttributeError: If a matching link has no ``<span>`` child or no
                text node before it.
            ValueError: If the last `` - ``-separated field of the span text
                is not a ``dd/mm/YYYY`` date.
        """
        elements = []
        # Load the parser.
        soup = BeautifulSoup(page_content, 'html.parser')

        # Walk through every download link.
        for a in soup.select('a.fr-link.fr-link--download'):
            href = a.get('href')
            if not href or not href.endswith('.pdf'):
                continue

            # Site-relative links are made absolute; the URL is then
            # percent-decoded.
            if href.startswith('/'):
                url = f"{self.__HOST}{href}"
            else:
                url = href
            url = unquote(url)

            # The label is the text node right before the <span>; the date is
            # the last ' - '-separated field of the <span> text.
            detail = a.find('span')
            name = detail.previous_sibling.replace('Télécharger ', '').strip()
            date = datetime.datetime.strptime(
                detail.get_text().split(' - ')[-1].strip(), '%d/%m/%Y')

            elements.append(Attrap.RAA(url, date, name))
        return elements
......@@ -59,6 +59,7 @@ available_administrations = [
'pref65',
'pref66',
'pref69',
'pref73',
'pref80',
'pref81',
'pref83',
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter