From 47808327ba45ee222f83ba7b7be8e4984b431ea1 Mon Sep 17 00:00:00 2001
From: Luc Pellissier <luc.pellissier@inria.fr>
Date: Tue, 18 Jun 2024 18:29:44 +0200
Subject: [PATCH] =?UTF-8?q?prefIdf:=20ajout=20de=20la=20pr=C3=A9fecture=20?=
 =?UTF-8?q?d'=C3=8Ele-de-France?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes !12
---
 Attrap_prefIdf.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++
 Makefile          |   4 +-
 README.md         |   1 +
 cli.py            |   3 +-
 4 files changed, 110 insertions(+), 2 deletions(-)
 create mode 100644 Attrap_prefIdf.py

diff --git a/Attrap_prefIdf.py b/Attrap_prefIdf.py
new file mode 100644
index 0000000..cd857e9
--- /dev/null
+++ b/Attrap_prefIdf.py
@@ -0,0 +1,104 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_prefIdf(Attrap):
+    """Collect the RAA (recueils des actes administratifs) PDFs of the Île-de-France prefecture."""
+
+    # Config
+    __HOST = 'https://www.prefectures-regions.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/RAA-de-la-region-Ile-de-France-2024',
+        '2023': f'{__HOST}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/RAA-de-la-region-Ile-de-France-2023',
+        '2022': f'{__HOST}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/RAA-de-la-region-Ile-de-France-2022',
+        '2021': f'{__HOST}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/RAA-de-la-region-Ile-de-France-2021',
+        '2020': f'{__HOST}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/RAA-de-la-region-Ile-de-France-2020',
+        '2019': f'{__HOST}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/RAA-de-la-region-Ile-de-France-2019',
+        '2018': f'{__HOST}/ile-de-france/ile-de-france/ile-de-france/Documents-publications/Recueil-des-actes-administratifs/RAA-de-la-region-Ile-de-France-2018'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture d\'Île-de-France'
+    short_code = 'prefIdf'
+
+    def __init__(self, data_dir):
+        """Set up the base scraper with this class's user agent, then call ``enable_tor(10)``.
+
+        :param data_dir: forwarded unchanged to ``Attrap.__init__``.
+        """
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        """Gather the RAA of every month not older than ``self.not_before`` and process them.
+
+        :param keywords: forwarded unchanged to ``parse_raa``.
+        """
+        # The RAA are spread over one sub-page per month, so each yearly page
+        # is parsed first to find its monthly sub-pages.
+        # NOTE(review): __RAA_PAGE also declares a 2018 page that is never
+        # requested here; confirm whether that is deliberate.
+        year_pages_to_parse = [
+            self.__RAA_PAGE[str(year)]
+            for year in range(2024, 2018, -1)
+            if self.not_before.year <= year
+        ]
+
+        oldest_month = self.not_before.replace(day=1)
+        pages_to_parse = []
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            # The year is the text after the last '-' of the active breadcrumb item.
+            breadcrumb = BeautifulSoup(page_content, 'html.parser').select('div.breadcrumb div.container p span.active')[0]
+            year = breadcrumb.get_text().split('-')[-1].strip()
+            month_pages = self.get_sub_pages(
+                page_content,
+                'div.sommaire-bloc div.sommaire-content ol li a',
+                self.__HOST,
+                False
+            )[::-1]
+            for month_page in month_pages:
+                month_date = Attrap.guess_date(f"{month_page['name']} {year}", "(.*)").replace(day=1)
+                if month_date >= oldest_month:
+                    pages_to_parse.append(month_page['url'])
+
+        elements = []
+        for page in pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            elements.extend(self.get_raa_elements(page_content))
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        """Build one ``Attrap.RAA`` per PDF download link found in ``page_content``.
+
+        :param page_content: HTML document to search for download links.
+        :return: list of ``Attrap.RAA(url, date, name)``; ``date`` is ``None`` when
+            ``guess_date`` returns ``datetime.datetime(9999, 1, 1)``.
+        """
+        elements = []
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        for a in soup.select('main div.container.main-container div.col-main article.article div.texte div a.link-download'):
+            href = a.get('href')
+            if not href or not href.endswith('.pdf'):
+                continue
+            # Site-relative links are made absolute before unquoting.
+            url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)
+            # Use the whole link text when the link has no <span>, rather than
+            # failing on the None returned by find().
+            span = a.find('span')
+            name = (a if span is None else span).get_text().strip()
+            # The RAA date is guessed from the link name.
+            # NOTE(review): in the second alternative, '\D*^' requires the start of
+            # the string after the date; check whether '$' was intended.
+            guessed = Attrap.guess_date(name, '((?:[0-9]{2}(?:-|\\.)[0-9]{2}(?:-|\\.)20[0-9]{2})|(?:20[0-9]{2}(?:-|\\.)[0-9]{2}(?:-|\\.)[0-9]{2})\\D*^)')
+            date = None if guessed == datetime.datetime(9999, 1, 1, 0, 0) else guessed
+
+            elements.append(Attrap.RAA(url, date, name))
+        return elements
diff --git a/Makefile b/Makefile
index 68f9a25..823925f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref04 pref05 pref06 pref09 pref13 pref25 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref73 pref75 pref80 pref81 pref83 pref87 pref93 pref94 pref976
+make: ppparis pref04 pref05 pref06 pref09 pref13 pref25 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref73 pref75 pref80 pref81 pref83 pref87 pref93 pref94 pref976 prefIdf
 ppparis:
 	bin/python3 cli.py ppparis
 pref04:
@@ -59,5 +59,7 @@ pref94:
 	bin/python3 cli.py pref94
 pref976:
 	bin/python3 cli.py pref976
+prefIdf:
+	bin/python3 cli.py prefIdf
 lint:
 	bin/pycodestyle --first --show-source --ignore=E501 *.py
diff --git a/README.md b/README.md
index adbbfb7..2011c87 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ Les options suivantes peuvent être précisées, par un paramètre si l'utilitai
 - Préfecture de Seine-Saint-Denis (identifiant : `pref93`)
 - Préfecture du Val-de-Marne (identifiant : `pref94`)
 - Préfecture de Mayotte (identifiant : `pref976`)
+- Préfecture d'Île-de-France (identifiant : `prefIdf`)
 
 ## Contributions
 
diff --git a/cli.py b/cli.py
index c6f7041..8d06c62 100755
--- a/cli.py
+++ b/cli.py
@@ -68,7 +68,8 @@ available_administrations = [
     'pref87',
     'pref93',
     'pref94',
-    'pref976'
+    'pref976',
+    'prefIdf'
 ]
 
 # Début du script
-- 
GitLab