Newer
Older
import os
import datetime
import re
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref42(Attrap):
# Config
hostname = 'https://www.loire.gouv.fr'
raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
timezone = 'Europe/Paris'
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
year_pages_to_parse = []
# On détermine quelles pages d'année parser
year_pages = self.get_sub_pages_with_pager(
self.raa_page,
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
self.hostname
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
)
for year_page in year_pages:
year = 9999
try:
year = int(re.search('([0-9]{4})', year_page['name'], re.IGNORECASE).group(1))
if year is None:
year = 9999
except Exception as exc:
logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
year = 9999
if year >= self.not_before.year:
year_pages_to_parse.append(year_page['url'])
elements = []
# Pour chaque année, on parse les RAA
for year_page in year_pages_to_parse:
page_content = self.get_page(year_page, 'get').content
for element in self.get_raa_elements(page_content)[::-1]:
elements.append(element)
# On parse les RAA
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque balise a
for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name, timezone=self.timezone)