Skip to content
Extraits de code Groupes Projets
Valider 820ae2a1 rédigé par Bastien Le Querrec's avatar Bastien Le Querrec
Parcourir les fichiers

pref73: détecte l'URL de l'année voulue automatiquement

parent d9a274eb
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
import os
import re
import datetime
from bs4 import BeautifulSoup
......@@ -11,16 +12,9 @@ class Attrap_pref73(Attrap):
# Config
# NOTE(review): diff residue removed — `raa_page` was assigned twice (the old
# per-year dict superseded by the single index URL) and `full_name` twice
# ('Préfecture de la Savoie' superseded by 'Savoie'); only the final values
# are kept.
hostname = 'https://www.savoie.gouv.fr'
# Index page listing the per-year cards; year pages are discovered from it.
raa_page = f'{hostname}/Publications/Recueils-hebdomadaires-et-speciaux-des-actes-administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Savoie'
short_code = 'pref73'
timezone = 'Europe/Paris'
......@@ -28,39 +22,53 @@ class Attrap_pref73(Attrap):
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
    """Crawl the RAA index, discover year and element pages, then parse
    every collected element against *keywords*.

    Diff residue removed: the superseded per-year implementation (hard-coded
    year URLs and a debug ``print``) duplicated this method; only the
    worklist-based version is kept.

    :param keywords: keywords forwarded to ``self.parse_raa``.
    """
    # Worklist of pages still to crawl, seeded with the index page.
    # Year pages and element pages found along the way are appended.
    self.page_urls_to_parse = [self.raa_page]
    self.elements = []

    # Crawl until the worklist is exhausted.
    while self.page_urls_to_parse:
        page_url = self.page_urls_to_parse[-1]
        # Fetch the page HTML.
        page_content = self.get_page(page_url, 'get').content
        self.get_year_pages(page_content)    # queue year cards
        self.get_elements_pages(page_url)    # queue element cards (months or standalone orders)
        # Collect the downloadable RAA elements of this page.
        for element in self.get_raa_elements(page_content):
            self.elements.append(element)
        # Done with this page; drop it from the worklist.
        self.page_urls_to_parse.remove(page_url)

    # Elements were collected newest-first; parse them oldest-first.
    self.parse_raa(self.elements[::-1], keywords)
    self.mailer()
def get_year_pages(self, page_content):
    """Find the per-year cards in *page_content* and queue the pages of the
    years not older than ``self.not_before`` for later crawling.

    :param page_content: raw HTML of an already-fetched page.
    """
    # Year cards on the index page.
    year_cards = self.get_sub_pages(
        page_content,
        'div.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',  # Pages d'années
        self.hostname,
        False
    )
    for card in year_cards:
        # The card title carries the year; normalise to January 1st of it.
        year_start = Attrap.guess_date(card['name'].strip(), '([0-9]{4})').replace(day=1, month=1)
        if year_start.year >= self.not_before.year:
            self.page_urls_to_parse.append(card['url'])
def get_elements_pages(self, page_url):
    """Find the element cards (monthly collections or standalone orders)
    reachable from *page_url* — following the pager — and queue those
    published after ``self.not_before``.

    Fix: leftover debug ``print(page_url)`` removed.

    :param page_url: URL of the page whose cards are listed.
    """
    for card in self.get_sub_pages_with_pager(
        page_url,
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',  # Carte avec un élément (mois ou arrêté seul)
        'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',  # Pager
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',  # Détails (avec la date de publication)
        self.hostname
    ):
        # Card details read "Publié le DD/MM/YYYY"; strip the prefix and parse.
        date = datetime.datetime.strptime(card['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
        if date >= self.not_before:
            self.page_urls_to_parse.append(card['url'])
def get_raa_elements(self, page_content):
    """Extract the PDF download links of *page_content* and return them as
    ``Attrap.RAA`` objects.

    Diff residue removed: the superseded selector and the superseded
    multi-line name/date extraction duplicated the kept lines; only the
    final versions remain.

    :param page_content: raw HTML of an already-fetched page.
    :return: list of ``Attrap.RAA``.
    """
    elements = []
    # On charge le parser
    soup = BeautifulSoup(page_content, 'html.parser')
    # Matches both bordered download-group lists and standalone download links.
    for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a,div a.fr-link.fr-link--download'):
        if a.get('href') and a['href'].endswith('.pdf'):
            # Relative links are rooted at the prefecture hostname.
            if a['href'].startswith('/'):
                url = f"{self.hostname}{a['href']}"
            else:
                url = a['href']
            url = unquote(url)
            # Link text is "Télécharger <name>" followed by a <span>
            # whose text ends with " - DD/MM/YYYY".
            name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
            date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
            raa = Attrap.RAA(url, date, name, timezone=self.timezone)
            elements.append(raa)
    return elements
......
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous connecter pour commenter