"""RAA scraper for the Isère prefecture website (short code ``pref38``)."""

import os
import sys
import re
import datetime
import dateparser
import logging

from bs4 import BeautifulSoup
from urllib.parse import unquote

from RAAspotter import RAAspotter

logger = logging.getLogger(__name__)


class RAAspotter_pref38(RAAspotter):
    """Collect the RAA (recueils des actes administratifs) published by the
    Isère prefecture and hand them to the base-class parsing pipeline.

    Each yearly page exposes a ``<select>`` listing the available RAA; the
    details of one RAA (PDF link, name, date) are obtained by POSTing the
    selected option value back to the same page.
    """

    # Config
    __HOST = 'https://www.isere.gouv.fr'
    __RAA_ROOT = f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
    # Yearly listing pages, most recent first. Insertion order matters: it is
    # the order in which get_raa() visits the pages.
    __RAA_PAGE = {
        '2024': f'{__RAA_ROOT}/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024',
        '2023': f'{__RAA_ROOT}/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023',
        '2022': f'{__RAA_ROOT}/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022',
        '2021': f'{__RAA_ROOT}/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021',
        '2020': f'{__RAA_ROOT}/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020',
        '2019': f'{__RAA_ROOT}/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019',
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    # Matches titles such as "... n° 12 du 1er mars 2024" and captures the date.
    __RAA_DATE_RE = re.compile(
        '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)', re.IGNORECASE
    )
    # Sentinel used when a date cannot be guessed: being in the far future, it
    # always passes the ``>= not_before`` filter, so the RAA is still fetched.
    __UNKNOWN_DATE = datetime.datetime(9999, 1, 1)

    full_name = "Préfecture de l'Isère"
    short_code = 'pref38'

    def __init__(self, data_dir):
        """Initialise the spotter with its data directory and user agent.

        :param data_dir: forwarded unchanged to the ``RAAspotter`` constructor.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        # NOTE(review): the meaning of 20 is defined by RAAspotter.enable_tor,
        # which is not visible from this file — confirm before changing it.
        self.enable_tor(20)

    def get_raa(self, keywords):
        """Fetch every relevant yearly page, parse its RAA, then call the mailer.

        :param keywords: comma-separated search terms; split on ``,`` before
            being passed to ``parse_raa``.
        """
        self.print_output('RAAspotter_pref38')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')

        # A yearly page is relevant when its year is not earlier than the
        # year of the analysis lower bound.
        first_year = self.not_before.year
        pages_to_parse = [
            page for year, page in self.__RAA_PAGE.items()
            if first_year <= int(year)
        ]

        keyword_list = keywords.split(',')
        for raa_page in pages_to_parse:
            page_content = self.get_page(raa_page, 'get').content
            raa_elements = self.get_raa_elements(page_content, raa_page)
            self.parse_raa(raa_elements, keyword_list)
        self.mailer()

    def get_raa_elements(self, page_content, raa_page):
        """Return the RAA listed on one yearly page that fall in the date range.

        :param page_content: HTML of the yearly listing page.
        :param raa_page: URL of that page; the details of each RAA are
            requested by POSTing to it.
        :return: list of ``RAAspotter.RAA`` objects (possibly empty).
        """
        elements = []
        soup = BeautifulSoup(page_content, 'html.parser')

        # The <select> holds one <option> per published RAA. If it is absent
        # (layout change, error page...), report it instead of crashing with
        # an IndexError so that the other yearly pages are still processed.
        select_list = soup.select_one('select#-liste-docs')
        if select_list is None:
            logger.warning('Liste des RAA introuvable sur la page %s', raa_page)
            return elements

        for option in select_list.find_all('option'):
            value = option.get('value')
            if not value:
                # Placeholder entry of the drop-down, not an actual RAA.
                continue

            title = option.get('title', '')
            # Only ask the server for details when the date guessed from the
            # title is within the requested range.
            if self._guess_date(title) < self.not_before:
                continue

            raa = self._fetch_raa(raa_page, value, title)
            if raa is not None:
                elements.append(raa)
        return elements

    def _guess_date(self, title):
        """Guess the publication date of an RAA from its title.

        Returns the far-future sentinel when the date cannot be determined.
        """
        match = self.__RAA_DATE_RE.search(title)
        if match is None:
            logger.warning(
                'Impossible de deviner la date du RAA %s : format inattendu', title
            )
            return self.__UNKNOWN_DATE

        try:
            guessed = dateparser.parse(match.group(1))
        except Exception as exc:
            # Deliberately broad: date guessing is best effort and must never
            # abort the whole run.
            logger.warning('Impossible de deviner la date du RAA %s : %s', title, exc)
            return self.__UNKNOWN_DATE

        if guessed is None:
            # dateparser signals an unparseable string by returning None, not
            # by raising; comparing None with a datetime would raise TypeError.
            logger.warning(
                'Impossible de deviner la date du RAA %s : date non reconnue', title
            )
            return self.__UNKNOWN_DATE
        return guessed

    def _fetch_raa(self, raa_page, value, title):
        """Request the details of one RAA and build the matching RAA object.

        Returns ``None`` when the details page has no link to a PDF file.
        """
        details = self.get_page(raa_page, 'post', {'-liste-docs': value}).content
        details_soup = BeautifulSoup(details, 'html.parser')
        link = details_soup.select_one(
            'div.liste_deroulante a.fr-link.fr-link--download'
        )
        if link is None:
            logger.warning('Aucun lien de téléchargement trouvé pour le RAA %s', title)
            return None

        href = link.get('href')
        # A link to a PDF means the server returned the requested RAA details.
        if not href or not href.endswith('.pdf'):
            return None

        url = unquote(f'{self.__HOST}{href}' if href.startswith('/') else href)

        # The link text is "Télécharger <name>" followed by a <span> whose
        # text ends with " - dd/mm/yyyy".
        span = link.find('span')
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        date = datetime.datetime.strptime(
            span.get_text().split(' - ')[-1].strip(), '%d/%m/%Y'
        )
        filename = url.split('/')[-1]

        return RAAspotter.RAA(url, date, name, filename)
b/requirements.txt @@ -5,3 +5,4 @@ pdfminer.six requests stem Mastodon.py +dateparser -- GitLab