# Newer / Older  (navigation labels left over from the web blame view; not code)
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref69(RAAspotter):
    """RAA scraper for the Préfecture du Rhône website (www.rhone.gouv.fr).

    Walks the yearly listing pages declared in ``__RAA_PAGE``, collects the
    PDF download links found on their sub-pages and passes the resulting
    ``RAAspotter.RAA`` objects to ``parse_raa`` and ``mailer``. Those two
    methods, like ``get_page``, ``get_sub_pages_with_pager``,
    ``print_output``, ``enable_tor`` and ``not_before``, are not defined in
    this class (presumably inherited from ``RAAspotter``).
    """

    # Config
    __HOST = 'https://www.rhone.gouv.fr'
    # One listing page per year, keyed by the year as a 4-digit string.
    # NOTE: kept as a literal on purpose; the f-strings read ``__HOST``
    # directly from the class body scope.
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-'
                'Rhone-RAA/Recueils-de-2024',
        '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-'
                'Rhone-RAA/Recueils-de-2023',
        '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-'
                'Rhone-RAA/Recueils-de-2022',
        '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-'
                'Rhone-RAA/Recueils-de-2021',
        '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-'
                'Rhone-RAA/Recueils-de-2020',
        '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-'
                'Rhone-RAA/Recueils-de-2019'
    }
    __USER_AGENT = (
        'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) '
        'Gecko/20100101 Firefox/115.0'
    )
    full_name = 'Préfecture du Rhône'
    short_code = 'pref69'

    def __init__(self, data_dir):
        """Initialise the parent with ``data_dir`` and this class's user agent.

        Args:
            data_dir: Forwarded unchanged to the parent constructor.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        # NOTE(review): the meaning of the argument 20 is defined by
        # enable_tor(), which is not visible here -- confirm before changing.
        self.enable_tor(20)

    def get_raa(self, keywords):
        """Collect every RAA from the relevant yearly pages and process them.

        Args:
            keywords: Comma-separated search terms; the string is split on
                ``','`` before being handed to ``parse_raa``.
        """
        self.print_output('RAAspotter_pref69')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')

        # Keep the yearly pages whose year is not earlier than
        # ``not_before``, newest year first. Driving this from the keys of
        # ``__RAA_PAGE`` means a new year only has to be added to the dict.
        first_year = self.not_before.year
        pages_to_parse = [
            self.__RAA_PAGE[year]
            for year in sorted(self.__RAA_PAGE, reverse=True)
            if first_year <= int(year)
        ]

        sub_pages_to_parse = []
        for raa_page in pages_to_parse:
            # The list returned by the helper is walked in reverse order.
            sub_pages = self.get_sub_pages_with_pager(
                raa_page,
                'div.fr-card__body div.fr-card__content '
                'h2.fr-card__title a.fr-card__link',
                'ul.fr-pagination__list li a.fr-pagination__link--next',
                None,
                self.__HOST
            )[::-1]
            sub_pages_to_parse.extend(
                sub_page['url'] for sub_page in sub_pages
            )

        elements = []
        for sub_page_to_parse in sub_pages_to_parse:
            page_content = self.get_page(sub_page_to_parse, 'get').content
            # Elements of each page are also added in reverse page order.
            elements.extend(self.get_raa_elements(page_content)[::-1])

        self.parse_raa(elements, keywords.split(','))
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAA PDF links from one HTML page.

        Args:
            page_content: HTML markup handed to BeautifulSoup's
                ``html.parser``.

        Returns:
            A list of ``RAAspotter.RAA`` objects, one per
            ``a.fr-link.fr-link--download`` link whose ``href`` ends with
            ``.pdf``, in document order.

        Raises:
            AttributeError: If a matching link contains no ``<span>``.
            ValueError: If the date found in the span is not ``dd/mm/YYYY``.
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Go through every download link
        for a in soup.select('a.fr-link.fr-link--download'):
            href = a.get('href')
            if not href or not href.endswith('.pdf'):
                continue

            # Site-relative links are made absolute with the host name.
            url = f"{self.__HOST}{href}" if href.startswith('/') else href
            url = unquote(url)

            span = a.find('span')
            # The title is read from the node located just before the span,
            # with 'Télécharger ' removed.
            name = span.previous_sibling.replace('Télécharger ', '').strip()
            # The date is the last ' - '-separated field of the span's text.
            date = datetime.datetime.strptime(
                span.get_text().split(' - ')[-1].strip(),
                '%d/%m/%Y'
            )
            filename = url.split('/')[-1]

            elements.append(RAAspotter.RAA(url, date, name, filename))
        return elements