diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9b1f2aa04f1899654372ff40ad31fc4a2010eb19..67d00bee6cb7b9010a4177b87a5f5b66b8ef42eb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -157,6 +157,34 @@ test_pref38:
       - output.log
     expire_in: 1 hour
 
+test_pref59:
+  stage: test
+  image: registry.git.laquadrature.net/bastien/raaspotter/base:latest
+  tags:
+    - unprivileged
+  needs: [install]
+  script:
+    - curl --silent --location --output artifacts.zip "${CI_SERVER_PROTOCOL}://${CI_SERVER_HOST}:${CI_SERVER_PORT}/api/v4/projects/${CI_PROJECT_ID}/jobs/artifacts/${CI_COMMIT_BRANCH}/download?job=${CI_JOB_NAME}&job_token=${CI_JOB_TOKEN}" || true
+    - unzip -q artifacts.zip data/pref59/* || true
+    - rm artifacts.zip || true
+    - source bin/activate
+    - /etc/init.d/tor start
+    - python ./cli.py --pref pref59
+  retry: 2
+  only:
+    - main
+  cache:
+    key: $CI_COMMIT_REF_SLUG
+    paths:
+      - bin/
+      - lib/
+      - pyvenv.cfg
+  artifacts:
+    paths:
+      - data/pref59/*.txt
+      - output.log
+    expire_in: 1 hour
+
 test_pref62:
   stage: test
   image: registry.git.laquadrature.net/bastien/raaspotter/base:latest
diff --git a/Makefile b/Makefile
index 37086afd2b4907973085e66681cf0fa79d5c8de9..1e339f19fb6422d7439a0ed80eace105b913671d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: pref06 pref13 pref35 pref38 pref62 ppparis
+make: pref06 pref13 pref35 pref38 pref59 pref62 ppparis
 pref06:
 	python cli.py --pref pref06
 pref13:
@@ -7,6 +7,8 @@ pref35:
 	python cli.py --pref pref35
 pref38:
 	python cli.py --pref pref38
+pref59:
+	python cli.py --pref pref59
 pref62:
 	python cli.py --pref pref62
 ppparis:
diff --git a/RAAspotter_pref59.py b/RAAspotter_pref59.py
new file mode 100644
index 0000000000000000000000000000000000000000..26004cd6e12719f5f3495747caaeac092ed06f28
--- /dev/null
+++ b/RAAspotter_pref59.py
@@ -0,0 +1,118 @@
+import datetime
+import logging
+import os
+import re
+import sys
+from urllib.parse import unquote
+
+import dateparser
+from bs4 import BeautifulSoup
+
+from RAAspotter import RAAspotter
+
+logger = logging.getLogger(__name__)
+
+
+class RAAspotter_pref59(RAAspotter):
+    """Scraper for the RAA pages of the Préfecture du Nord (pref59)."""
+
+    # Config
+    __HOST = 'https://www.nord.gouv.fr'
+    # One index page per year, newest first; get_raa() walks this dict in
+    # insertion order, so add new years at the top.
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2024',
+        '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2023',
+        '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2022',
+        '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2021',
+        '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2020',
+        '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2019',
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Nord'
+    short_code = 'pref59'
+
+    def __init__(self, data_dir):
+        """Initialise the spotter with this prefecture's user agent and Tor.
+
+        data_dir is passed through unchanged to RAAspotter.__init__.
+        """
+        super().__init__(data_dir, self.__USER_AGENT)
+        # NOTE(review): what 20 means is defined by RAAspotter.enable_tor,
+        # which is not part of this file.
+        self.enable_tor(20)
+
+    def get_raa(self, keywords):
+        """Download and search every RAA from self.not_before.year onwards.
+
+        keywords is a comma-separated string; it is split on ',' before
+        being handed to parse_raa(). mailer() runs once, after every page
+        has been processed.
+        """
+        self.print_output('RAAspotter_pref59')
+        self.print_output(f'Termes recherchés: {keywords}')
+        self.print_output('')
+
+        # Keep the yearly index pages that are not older than the cut-off.
+        first_year = self.not_before.year
+        pages_to_parse = [
+            page
+            for year, page in self.__RAA_PAGE.items()
+            if first_year <= int(year)
+        ]
+
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST)
+            # Sub-pages are visited in the reverse of the returned order.
+            for sub_page in sub_pages[::-1]:
+                sub_page_content = self.get_page(sub_page, 'get').content
+                sub_raa_elements = self.get_raa_elements(sub_page_content)
+                self.parse_raa(sub_raa_elements, keywords.split(','))
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        """Build the list of RAA documents linked from an HTML page.
+
+        Only 'a.fr-link.fr-link--download' anchors whose href ends with
+        '.pdf' are kept. Each anchor is expected to hold a span whose text
+        ends with ' - dd/mm/YYYY', preceded by a text node carrying the
+        title. Anchors that do not fit this layout are logged and skipped so
+        that a single malformed link does not abort the whole run.
+
+        Returns a list of RAAspotter.RAA objects.
+        """
+        elements = []
+        # Load the parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Go through every download link
+        for a in soup.select('a.fr-link.fr-link--download'):
+            href = a.get('href')
+            if not href or not href.endswith('.pdf'):
+                continue
+
+            # Site-relative links are made absolute.
+            if href.startswith('/'):
+                url = f"{self.__HOST}{href}"
+            else:
+                url = href
+            url = unquote(url)
+
+            details = a.find('span')
+            label = details.previous_sibling if details is not None else None
+            # The sibling can be missing, or be a tag instead of text.
+            if not isinstance(label, str):
+                logger.warning('Skipping RAA link with unexpected markup: %s', url)
+                continue
+
+            try:
+                date = datetime.datetime.strptime(details.get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+            except ValueError:
+                logger.warning('Skipping RAA link with unparseable date: %s', url)
+                continue
+
+            name = label.replace('Télécharger ', '').strip()
+            filename = url.split('/')[-1]
+            elements.append(RAAspotter.RAA(url, date, name, filename))
+        return elements
diff --git a/README.md b/README.md
index 047494e0d4c90ad7337963973b469b1e19582371..93c8aa3fb0ded09f943460aff544c20a1b2655ea 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ Il est possible de ne lancer l'analyse que pour une seule administration, avec l
 - Préfecture des Alpes-Maritimes (identifiant : `pref06`)
 - Préfecture des Bouches-du-Rhône (identifiant : `pref13`)
 - Préfecture d'Ille-et-Vilaine (identifiant : `pref35`)
+- Préfecture du Nord (identifiant : `pref59`)
 - Préfecture du Pas-de-Calais (identifiant : `pref62`)
 - Préfecture de police de Paris (identifiant : `ppparis`)
 
diff --git a/cli.py b/cli.py
index 8776b02c6dfabb0d7ff6c0b3fd0354839061b760..4efb8619b999ec65fbb24ea9f43d6dd2e120550d 100755
--- a/cli.py
+++ b/cli.py
@@ -39,6 +39,7 @@ available_prefs = [
     'pref13',
     'pref35',
     'pref38',
+    'pref59',
     'pref62',
     'ppparis'
 ]