From 89b8eca7eb90d31e4169644b570233bcbc58fdb1 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Sun, 25 Aug 2024 13:13:28 +0200
Subject: [PATCH] =?UTF-8?q?prefPaca:=20ajout=20de=20la=20pr=C3=A9fecture?=
 =?UTF-8?q?=20de=20Provence-Alpes-C=C3=B4te-d'Azur?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: the Attrap_prefPaca.py hunk was edited by hand in this patch, so
the blob hash on its "index" line no longer matches the added content.

 .gitlab-ci.yml     |  5 ++++
 Attrap_prefPaca.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 Makefile           |  4 ++-
 README.md          |  1 +
 cli.py             |  3 +-
 5 files changed, 91 insertions(+), 2 deletions(-)
 create mode 100644 Attrap_prefPaca.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e87a404..7635e95 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -274,3 +274,8 @@ test_prefIdf:
   variables:
     PREF: "prefIdf"
   extends: .default_pref
+
+test_prefPaca:
+  variables:
+    PREF: "prefPaca"
+  extends: .default_pref
diff --git a/Attrap_prefPaca.py b/Attrap_prefPaca.py
new file mode 100644
index 0000000..869d665
--- /dev/null
+++ b/Attrap_prefPaca.py
@@ -0,0 +1,80 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_prefPaca(Attrap):
+    """Attrap module for the Provence-Alpes-Côte-d'Azur prefecture (short code prefPaca)."""
+
+    # Config
+    __HOST = 'https://www.prefectures-regions.gouv.fr'
+    __RAA_PAGE = f'{__HOST}/provence-alpes-cote-dazur/Documents-publications'
+    __USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture de Provence-Alpes-Côte-d\'Azur'
+    short_code = 'prefPaca'
+
+    def __init__(self, data_dir):
+        """Initialise the parent class with data_dir and this module's user agent, then call enable_tor(10)."""
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        """Gather the RAA from every yearly page, then run parse_raa() and mailer().
+
+        keywords is handed to parse_raa() unchanged.
+        """
+        # Collect the yearly pages. A year of 9999 appears to be what guess_date()
+        # yields when no year is found, so those pages are skipped, as are years
+        # older than not_before.
+        year_pages = []
+        for year_page in self.get_sub_pages_with_pager(
+            self.__RAA_PAGE,
+            'article.news-list-item header h2.news-list-title a',
+            'article.article div.content-pagination ul.pagination li.next a',
+            None,
+            self.__HOST
+        ):
+            year = Attrap.guess_date(year_page['name'].strip(), 'RAA ([0-9]{4})').year
+            if year < 9999 and year >= self.not_before.year:
+                year_pages.append(year_page['url'])
+
+        elements = []
+        for year_page in year_pages:
+            page_content = self.get_page(year_page, 'get').content
+            elements.extend(self.get_raa_elements(page_content))
+
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        """Return one Attrap.RAA per PDF download link found in page_content."""
+        elements = []
+        # Load the parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # For each download link, keep it only when it points to a PDF
+        for a in soup.select('main div.container.main-container div.col-main article.article div.texte div a.link-download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+                url = unquote(url)
+                # The title is expected in a <span>; fall back to the text of the
+                # link itself so that one unexpected link cannot abort the run
+                span = a.find('span')
+                name = (span if span is not None else a).get_text().strip()
+                # Guess the RAA date from its name
+                guessed = Attrap.guess_date(name, '((?:[0-9]{2}|[0-9]{1})(?:er){0,1}[ _](?:[a-zéû]{3,9})[ _](?:[0-9]{4}|[0-9]{2}))')
+                # 9999-01-01 is handled as "no date found"
+                if guessed == datetime.datetime(9999, 1, 1, 0, 0):
+                    date = None
+                else:
+                    date = guessed
+
+                elements.append(Attrap.RAA(url, date, name))
+        return elements
diff --git a/Makefile b/Makefile
index f210897..aeafb44 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref2b pref03 pref04 pref05 pref06 pref09 pref10 pref13 pref25 pref31 pref33 pref34 pref35 pref38 pref39 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref73 pref75 pref80 pref81 pref83 pref87 pref92 pref93 pref94 pref976 prefIdf
+make: ppparis pref2b pref03 pref04 pref05 pref06 pref09 pref10 pref13 pref25 pref31 pref33 pref34 pref35 pref38 pref39 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref73 pref75 pref80 pref81 pref83 pref87 pref92 pref93 pref94 pref976 prefIdf prefPaca
 ppparis:
 	bin/python3 cli.py ppparis
 pref2b:
@@ -71,5 +71,7 @@ pref976:
 	bin/python3 cli.py pref976
 prefIdf:
 	bin/python3 cli.py prefIdf
+prefPaca:
+	bin/python3 cli.py prefPaca
 lint:
 	bin/pycodestyle --first --show-source --ignore=E501 *.py
diff --git a/README.md b/README.md
index 1ead382..c83bb93 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ Vous pouvez également activer le safe mode en spécifiant la variable d'environ
 - Préfecture du Val-de-Marne (identifiant : `pref94`)
 - Préfecture de Mayotte (identifiant : `pref976`)
 - Préfecture d'Île-de-France (identifiant : `prefIdf`)
+- Préfecture de Provence-Alpes-Côte-d'Azur (identifiant : `prefPaca`)
 
 ## Contributions
 
diff --git a/cli.py b/cli.py
index de41ff5..e84db71 100755
--- a/cli.py
+++ b/cli.py
@@ -79,7 +79,8 @@ available_administrations = [
     'pref93',
     'pref94',
     'pref976',
-    'prefIdf'
+    'prefIdf',
+    'prefPaca'
 ]
 
 # Début du script
-- 
GitLab