From 4ce909ed89030a9c3115945246ba470620b40757 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Fri, 31 May 2024 12:00:30 +0200
Subject: [PATCH] =?UTF-8?q?pref93:=20ajout=20de=20la=20pr=C3=A9fecture=20d?=
 =?UTF-8?q?e=20Seine-Saint-Denis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Close !1

Co-authored-by: Bastien Le Querrec <blq@laquadrature.net>
Co-authored-by: smyds <smyds@lebib.org>
---
Review note: revised from the original commit (module-level logger added,
HTML parsing hardened, README spacing fixed). The blob ids on the "index"
lines below are those of the original patch and were not recomputed.

 .gitlab-ci.yml   |   5 +++
 Attrap_pref93.py | 111 +++++++++++++++++++++++++++++++++++++++++++++++
 Makefile         |   4 ++-
 README.md        |   1 +
 cli.py           |   1 +
 5 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 Attrap_pref93.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 44b1308..df21d60 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -213,6 +213,11 @@ test_pref87:
     PREF: "pref87"
   extends: .default_pref
 
+test_pref93:
+  variables:
+    PREF: "pref93"
+  extends: .default_pref
+
 test_pref976:
   variables:
     PREF: "pref976"
diff --git a/Attrap_pref93.py b/Attrap_pref93.py
new file mode 100644
index 0000000..26f0458
--- /dev/null
+++ b/Attrap_pref93.py
@@ -0,0 +1,111 @@
+import os
+import re
+import datetime
+import logging
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+logger = logging.getLogger(__name__)
+
+
+class Attrap_pref93(Attrap):
+    """Scraper for the RAA (recueils des actes administratifs) of the Seine-Saint-Denis prefecture."""
+
+    # Config
+    __HOST = 'https://www.seine-saint-denis.gouv.fr'
+    __RAA_PAGE = f'{__HOST}/Publications/Bulletin-d-informations-administratives-Recueil-des-actes-administratifs/'
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture de Seine-Saint-Denis'
+    short_code = 'pref93'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        """Collect the RAA published since self.not_before, pass them to parse_raa() with keywords, then call mailer().
+
+        The site is browsed as index page -> year pages -> month pages -> PDF cards.
+        """
+        pages_to_parse = []
+
+        # Fetch the year pages linked from the RAA index page
+        page_content = self.get_page(self.__RAA_PAGE, 'get').content
+        year_pages = self.get_sub_pages(
+            page_content,
+            'div.fr-card__body div.fr-card__content h2.fr-card__title a',
+            self.__HOST,
+            False,
+        )[::-1]
+
+        # Filter by year to limit the number of requests
+        year_pages_to_parse = []
+        for year_page in year_pages:
+            year_match = re.search('.*([0-9]{4})', year_page['name'].strip())
+            if year_match:
+                year = int(year_match.group(1))
+            else:
+                # Unknown year: 9999 keeps the page so that no RAA is missed
+                logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
+                year = 9999
+            if year >= self.not_before.year:
+                year_pages_to_parse.append(year_page['url'])
+
+        # For each year, look for the month sub-pages
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+
+            # Filter on the requested date, compared at month granularity
+            for month_page in month_pages:
+                guessed_date = Attrap.guess_date(month_page['name'].strip(), '([a-zéû]*).*')
+                if guessed_date >= self.not_before.replace(day=1):
+                    pages_to_parse.append(month_page['url'])
+
+        # Parse the pages that contain RAA
+        elements = []
+        for page in pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        self.parse_raa(elements[::-1], keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        """Return an Attrap.RAA for every card of page_content that links to a PDF file."""
+        elements = []
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        for card in soup.select('div.fr-card__body div.fr-card__content'):
+            a = card.select_one('h2.fr-card__title a.fr-card__link')
+            # select_one() returns None when the card has no title link
+            if a is None or not a.get('href') or not a['href'].endswith('.pdf'):
+                continue
+
+            if a['href'].startswith('/'):
+                url = f"{self.__HOST}{a['href']}"
+            else:
+                url = a['href']
+            url = unquote(url)
+            name = a.text.strip()
+
+            # Extract the dd/mm/yyyy date itself instead of stripping the 'Publié le' prefix,
+            # so the kind of whitespace that follows the prefix does not matter
+            detail = card.select_one('div.fr-card__end p.fr-card__detail')
+            date_match = re.search('[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}', detail.get_text()) if detail is not None else None
+            if date_match is None:
+                logger.warning(f"Impossible de trouver la date de publication de {url}")
+                continue
+            date = datetime.datetime.strptime(date_match.group(0), '%d/%m/%Y')
+
+            elements.append(Attrap.RAA(url, date, name))
+        return elements
diff --git a/Makefile b/Makefile
index 78e7ce6..dbbd7dd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref976
+make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref93 pref976
 ppparis:
 	bin/python3 cli.py ppparis
 pref04:
@@ -47,6 +47,8 @@ pref83:
 	bin/python3 cli.py pref83
 pref87:
 	bin/python3 cli.py pref87
+pref93:
+	bin/python3 cli.py pref93
 pref976:
 	bin/python3 cli.py pref976
 lint:
diff --git a/README.md b/README.md
index a2d2f03..5627db0 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ Les options suivantes peuvent être précisées, par un paramètre si l'utilitai
 - Préfecture du Tarn (identifiant : `pref81`)
 - Préfecture du Var (identifiant : `pref83`)
 - Préfecture de la Haute-Vienne (identifiant : `pref87`)
+- Préfecture de Seine-Saint-Denis (identifiant : `pref93`)
 - Préfecture de Mayotte (identifiant : `pref976`)
 
 ## Contributions
diff --git a/cli.py b/cli.py
index 6a2feb0..72a9ec2 100755
--- a/cli.py
+++ b/cli.py
@@ -61,6 +61,7 @@ available_administrations = [
     'pref81',
     'pref83',
     'pref87',
+    'pref93',
     'pref976'
 ]
 
-- 
GitLab