Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import re
import datetime
import logging
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class RAAspotter_pref33(RAAspotter):
    """Spotter for the "Recueil des Actes Administratifs" (RAA) pages of the Gironde prefecture.

    The class walks the RAA section of ``www.gironde.gouv.fr`` (root page,
    then per-year pages, then per-month pages), collects every PDF card it
    finds and hands the result to the parsing / mailing machinery provided
    by the ``RAAspotter`` base class.
    """

    # Config
    __HOST = 'https://www.gironde.gouv.fr'
    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs'
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'

    # CSS selector of the links to sub-pages; used on the root page (to find
    # year pages) and on each year page (to find month pages).
    __SUB_PAGE_SELECTOR = '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a'
    # CSS selector of the "next page" link of the pager.
    __NEXT_PAGE_SELECTOR = "ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label"
    # CSS selectors used to extract one RAA from a listing page.
    __RAA_CARD_SELECTOR = 'div.fr-card.fr-card--horizontal div.fr-card__body div.fr-card__content'
    __RAA_LINK_SELECTOR = 'h2.fr-card__title a.fr-card__link.menu-item-link'
    __RAA_DATE_SELECTOR = 'div.fr-card__end p.fr-card__detail'

    # Year assumed when none can be read from a page title. It is far in the
    # future so that such a page always passes the "not before" filter.
    __UNKNOWN_YEAR = 9999

    full_name = 'Préfecture de la Gironde'
    short_code = 'pref33'

    def __init__(self, data_dir):
        """Initialise the base spotter with this site's user agent and enable Tor.

        Args:
            data_dir: forwarded unchanged to ``RAAspotter.__init__``.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        # NOTE(review): the meaning of the argument 10 is defined by
        # RAAspotter.enable_tor, which is not visible here.
        self.enable_tor(10)

    def __guess_year(self, name):
        """Return the year found in a page title, or the "unknown" sentinel.

        The last run of four digits in ``name`` is used (the leading ``.*``
        is greedy). When no such run exists, a warning is logged and
        ``9999`` is returned so the caller keeps the page.
        """
        match = re.search(r'.*([0-9]{4})', name.strip()) if name else None
        if match is None:
            logger.warning("Impossible de deviner l'année de la page %s", name)
            return self.__UNKNOWN_YEAR
        return int(match.group(1))

    def __select_year_pages(self):
        """Return the URLs of the year pages that are not older than ``self.not_before``."""
        page_content = self.get_page(self.__RAA_PAGE, 'get').content
        year_pages = self.get_sub_pages(
            page_content,
            self.__SUB_PAGE_SELECTOR,
            self.__HOST,
            False
        )
        return [
            year_page['url']
            for year_page in year_pages
            if self.__guess_year(year_page['name']) >= self.not_before.year
        ]

    def get_raa(self, keywords):
        """Find, parse and report the RAA published by the prefecture.

        Args:
            keywords: comma-separated search terms; the string is split on
                ``','`` before being passed to ``parse_raa``.
        """
        self.print_output('RAAspotter_pref33')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')

        # An RAA is sometimes miscategorised and ends up on the root page,
        # so the root page is always parsed.
        pages_to_parse = [self.__RAA_PAGE]

        # For every selected year, look for the month sub-pages.
        for year_page_url in self.__select_year_pages():
            page_content = self.get_page(year_page_url, 'get').content
            month_pages = self.get_sub_pages(
                page_content,
                self.__SUB_PAGE_SELECTOR,
                self.__HOST,
                False
            )[::-1]
            for month_page in month_pages:
                guessed_date = RAAspotter.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})')
                # Compare against the first day of the "not before" month so
                # that the month containing that date is still parsed.
                if guessed_date >= self.not_before.replace(day=1):
                    pages_to_parse.append(month_page['url'])

        # Parse the selected pages, following the pager on each of them.
        elements = self.get_raa_with_pager(
            pages_to_parse,
            self.__NEXT_PAGE_SELECTOR,
            self.__HOST
        )[::-1]

        self.parse_raa(elements, keywords.split(','))
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAA listed on one page.

        Args:
            page_content: HTML of a listing page, as accepted by
                ``BeautifulSoup``.

        Returns:
            A list of ``RAAspotter.RAA`` objects, one per card that has both
            a link ending in ``.pdf`` and a publication date.

        Raises:
            ValueError: if a card's date text does not match ``%d/%m/%Y``
                once the ``'Publié le'`` prefix is removed.
        """
        elements = []
        soup = BeautifulSoup(page_content, 'html.parser')

        # Each card holds one RAA.
        for card in soup.select(self.__RAA_CARD_SELECTOR):
            links = card.select(self.__RAA_LINK_SELECTOR)
            dates_raw = card.select(self.__RAA_DATE_SELECTOR)

            # Skip cards that lack either the link or the date.
            if not links or not dates_raw:
                continue

            a = links[0]
            href = a.get('href')
            # Only PDF documents are considered.
            if not href or not href.endswith('.pdf'):
                continue

            # Site-relative links are made absolute.
            url = f"{self.__HOST}{href}" if href.startswith('/') else href
            url = unquote(url)
            name = a.get_text().strip()
            date = datetime.datetime.strptime(
                dates_raw[0].get_text().replace('Publié le', '').strip(),
                '%d/%m/%Y'
            )

            elements.append(RAAspotter.RAA(url, date, name))

        return elements