From e2a3d0240c7e18fc007d50cae35a31669f148c35 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Wed, 20 Mar 2024 00:51:21 +0100
Subject: [PATCH] =?UTF-8?q?pref69:=20ajout=20de=20la=20pr=C3=A9fecture=20d?=
 =?UTF-8?q?u=20Rh=C3=B4ne?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this area is not part of the commit message):
* get_sub_pages_with_pager() now checks that the pager selector matched
  something before indexing the result, instead of raising IndexError on a
  listing page that has no "next" link.
* get_raa() builds its list of yearly pages from __RAA_PAGE in one pass
  instead of one hand-written test per year; order and condition are the same.
* Line breaks and indentation were rebuilt from a whitespace-flattened copy of
  this patch. The indentation of the context lines and the alignment inside
  the added lines are a best guess: check them against the tree, or apply
  with --ignore-whitespace.
* The blob ids on the "index" lines of RAAspotter.py and RAAspotter_pref69.py
  predate these changes.

 .gitlab-ci.yml       | 28 +++++++++++++++
 Makefile             |  4 ++-
 RAAspotter.py        | 28 ++++++++++++++-
 RAAspotter_pref69.py | 85 ++++++++++++++++++++++++++++++++++++++++++++
 README.md            |  1 +
 cli.py               |  3 +-
 6 files changed, 146 insertions(+), 3 deletions(-)
 create mode 100644 RAAspotter_pref69.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 352574e..62dabd9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -238,3 +238,31 @@ test_pref62:
       - data/pref62/*.txt
       - output.log
     expire_in: 1 hour
+
+test_pref69:
+  stage: test
+  image: registry.git.laquadrature.net/bastien/raaspotter/base:latest
+  tags:
+    - unprivileged
+  needs: [install]
+  script:
+    - curl --silent --location --output artifacts.zip "${CI_SERVER_PROTOCOL}://${CI_SERVER_HOST}:${CI_SERVER_PORT}/api/v4/projects/${CI_PROJECT_ID}/jobs/artifacts/${CI_COMMIT_BRANCH}/download?job=${CI_JOB_NAME}&job_token=${CI_JOB_TOKEN}" || true
+    - unzip -q artifacts.zip data/pref69/* || true
+    - rm artifacts.zip || true
+    - source bin/activate
+    - /etc/init.d/tor start
+    - python ./cli.py --pref pref69
+  retry: 2
+  only:
+    - main
+  cache:
+    key: $CI_COMMIT_REF_SLUG
+    paths:
+      - bin/
+      - lib/
+      - pyvenv.cfg
+  artifacts:
+    paths:
+      - data/pref69/*.txt
+      - output.log
+    expire_in: 1 hour
diff --git a/Makefile b/Makefile
index 5d1cd97..fa10661 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref06 pref13 pref35 pref38 pref59 pref62
+make: ppparis pref06 pref13 pref35 pref38 pref59 pref62 pref69
 ppparis:
 	python cli.py --pref ppparis
 pref06:
@@ -13,3 +13,5 @@ pref59:
 	python cli.py --pref pref59
 pref62:
 	python cli.py --pref pref62
+pref69:
+	python cli.py --pref pref69
diff --git a/RAAspotter.py b/RAAspotter.py
index 05b847f..f7fc686 100644
--- a/RAAspotter.py
+++ b/RAAspotter.py
@@ -1,4 +1,4 @@
-import os, re, ssl
+import os, re, ssl, sys
 import subprocess
 import logging
 import requests
@@ -136,6 +136,32 @@ class RAAspotter:
       sub_pages.append(url)
     return sub_pages
 
+  def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, host=""):
+    """Collect the sub-page URLs of a listing page, following its pager.
+
+    The href of every sub_page_element match is kept, then the first
+    pager_element match is followed recursively when it has an href.
+    Each href is prefixed with host. Returns a list of URLs.
+    """
+    pages = []
+    page_content = self.get_page(page, 'get').content
+
+    # Initialise the parser
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    # Look for the sub-pages
+    for sub_page in soup.select(sub_page_element):
+      if sub_page.get('href'):
+        pages.append(f"{host}{sub_page['href']}")
+
+    # Look for a pager and follow it when there is one. select() returns an
+    # empty list when nothing matches, so test the list before indexing it.
+    pager = soup.select(pager_element)
+    if pager and pager[0].get('href'):
+      pages.extend(self.get_sub_pages_with_pager(f"{host}{pager[0]['href']}", sub_page_element, pager_element, host))
+
+    return pages
+
   def get_raa_with_pager(self, pages_list, pager_element, host=""):
     elements = []
     # On parse chaque page passée en paramètre
diff --git a/RAAspotter_pref69.py b/RAAspotter_pref69.py
new file mode 100644
index 0000000..63ac570
--- /dev/null
+++ b/RAAspotter_pref69.py
@@ -0,0 +1,85 @@
+import os, sys
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from RAAspotter import RAAspotter
+
+class RAAspotter_pref69(RAAspotter):
+  """RAAspotter for the Préfecture du Rhône (short code pref69)."""
+
+  # Config
+  __HOST = 'https://www.rhone.gouv.fr'
+  __RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024',
+                '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023',
+                '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022',
+                '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021',
+                '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020',
+                '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019'}
+  __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+  full_name = 'Préfecture du Rhône'
+  short_code = 'pref69'
+
+  def __init__(self, data_dir):
+    """Initialise the base class with this site's user agent, then call enable_tor(20)."""
+    super().__init__(data_dir, self.__USER_AGENT)
+    self.enable_tor(20)
+
+  def get_raa(self, keywords):
+    """Gather the RAA of every configured year, then run parse_raa() and mailer().
+
+    keywords is a comma-separated string. Years older than
+    self.not_before.year are skipped.
+    """
+    self.print_output('RAAspotter_pref69')
+    self.print_output(f'Termes recherchés: {keywords}')
+    self.print_output('')
+
+    # __RAA_PAGE is declared newest year first and dicts keep insertion
+    # order, so the yearly pages are visited from 2024 down to 2019.
+    pages_to_parse = [url for year, url in self.__RAA_PAGE.items() if self.not_before.year <= int(year)]
+
+    sub_pages_to_parse = []
+    for raa_page in pages_to_parse:
+      sub_pages = self.get_sub_pages_with_pager(raa_page,
+                                                "div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link",
+                                                "ul.fr-pagination__list li a.fr-pagination__link--next",
+                                                self.__HOST)
+      # Each year's listing is reversed before being queued
+      sub_pages_to_parse.extend(reversed(sub_pages))
+
+    elements = []
+    for sub_page_to_parse in sub_pages_to_parse:
+      page_content = self.get_page(sub_page_to_parse, 'get').content
+      # Same here: the links of a page are queued in reverse order
+      elements.extend(reversed(self.get_raa_elements(page_content)))
+
+    self.parse_raa(elements, keywords.split(','))
+    self.mailer()
+
+  def get_raa_elements(self, page_content):
+    """Return a RAAspotter.RAA for each PDF download link of page_content."""
+    elements = []
+    # Load the parser
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    # Go through every download link
+    for a in soup.select('a.fr-link.fr-link--download'):
+      if a.get('href') and a['href'].endswith('.pdf'):
+        if a['href'].startswith('/'):
+          url = f"{self.__HOST}{a['href']}"
+        else:
+          url = a['href']
+
+        url = unquote(url)
+        # The name is the text in front of the link's <span>; the date is
+        # whatever follows the last ' - ' inside that <span>.
+        details = a.find('span')
+        name = details.previous_sibling.replace('Télécharger ', '').strip()
+        date = datetime.datetime.strptime(details.get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+        filename = url.split('/')[-1]
+
+        raa = RAAspotter.RAA(url, date, name, filename)
+        elements.append(raa)
+    return elements
diff --git a/README.md b/README.md
index 9b79bfa..75a4818 100644
--- a/README.md
+++ b/README.md
@@ -38,6 +38,7 @@ Il est possible de ne lancer l'analyse que pour une seule administration, avec l
 - Préfecture d'Ille-et-Vilaine (identifiant : `pref35`)
 - Préfecture du Nord (identifiant : `pref59`)
 - Préfecture du Pas-de-Calais (identifiant : `pref62`)
+- Préfecture du Rhône (identifiant : `pref69`)
 
 ## Licence
 
diff --git a/cli.py b/cli.py
index 33f05d9..850cbf7 100755
--- a/cli.py
+++ b/cli.py
@@ -41,7 +41,8 @@ available_prefs = [
   'pref35',
   'pref38',
   'pref59',
-  'pref62'
+  'pref62',
+  'pref69'
 ]
 
 # Début du script
-- 
GitLab