diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 982f4664acefbfca5f75fcecef9f9134e3dd862c..f3224e0e60c8b15ea5092ae5d1c843450eff2ac3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -168,6 +168,11 @@ test_pref62:
     PREF: "pref62"
   extends: .default_pref
 
+test_pref63:
+  variables:
+    PREF: "pref63"
+  extends: .default_pref
+
 test_pref64:
   variables:
     PREF: "pref64"
diff --git a/Attrap_pref63.py b/Attrap_pref63.py
new file mode 100644
index 0000000000000000000000000000000000000000..24adc41593928d38cea3f6d19c774784c1c6fdea
--- /dev/null
+++ b/Attrap_pref63.py
@@ -0,0 +1,78 @@
+import logging
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+logger = logging.getLogger(__name__)
+
+
+class Attrap_pref63(Attrap):
+
+    # Config
+    __HOST = 'https://www.puy-de-dome.gouv.fr'
+    __RAA_PAGE = f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-Puy-de-Dome'
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Puy-de-Dôme'
+    short_code = 'pref63'
+
+    def __init__(self, data_dir):
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        year_pages_to_parse = []
+
+        # Work out which year pages need to be parsed
+        page_content = self.get_page(self.__RAA_PAGE, 'get').content
+        year_pages = self.get_sub_pages(
+            page_content,
+            'div.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+            self.__HOST,
+            False
+        )
+        for year_page in year_pages:
+            if year_page['name'].strip() != 'Archives':
+                # A page whose title is not a year gets the sentinel 9999 so it is never skipped
+                try:
+                    year = int(year_page['name'].strip())
+                except ValueError:
+                    logger.warning(f"Unable to guess the year of page {year_page['name']}")
+                    year = 9999
+
+                if year >= self.not_before.year:
+                    year_pages_to_parse.append(year_page['url'])
+
+        elements = []
+        # Collect the RAA listed on each year page
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            for element in self.get_raa_elements(page_content):
+                elements.append(element)
+
+        # Parse the collected RAA against the keywords
+        self.parse_raa(elements, keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        elements = []
+        # Load the parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Collect every download link pointing to a PDF
+        for a in soup.select('a.fr-link.fr-link--download'):
+            if a.get('href') and a['href'].endswith('.pdf'):
+                if a['href'].startswith('/'):
+                    url = f"{self.__HOST}{a['href']}"
+                else:
+                    url = a['href']
+
+                url = unquote(url)
+                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
+                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+
+                raa = Attrap.RAA(url, date, name)
+                elements.append(raa)
+        return elements
diff --git a/Makefile b/Makefile
index cca25f62d6367c1e683225feb5aec8e4ed90744b..78e7ce678caf26ab079f6e0ab64a6562844808ff 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref976
+make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref976
 ppparis:
 	bin/python3 cli.py ppparis
 pref04:
@@ -29,6 +29,8 @@ pref59:
 	bin/python3 cli.py pref59
 pref62:
 	bin/python3 cli.py pref62
+pref63:
+	bin/python3 cli.py pref63
 pref64:
 	bin/python3 cli.py pref64
 pref65:
diff --git a/cli.py b/cli.py
index 6bf99c768a38d62de9abac612cb5e727a55a82d9..6a2feb085d17167fcb5c0040652fa54d1a0e4247 100755
--- a/cli.py
+++ b/cli.py
@@ -52,6 +52,7 @@ available_administrations = [
    'pref44',
    'pref59',
    'pref62',
+   'pref63',
    'pref64',
    'pref65',
    'pref66',
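Once the patch is applied, the new administration runs like any other, either directly (bin/python3 cli.py pref63) or through the new Makefile target (make pref63).

The sketch below is not part of the patch; it is a minimal standalone check of the extraction steps that get_raa_elements() relies on. The SAMPLE markup is a hypothetical reconstruction of one fr-link--download link (anchor text starting with "Télécharger", a span whose text ends in a dd/mm/yyyy date); the real pages may differ.

import datetime

from bs4 import BeautifulSoup
from urllib.parse import unquote

# Hypothetical sample of a single download link, assumed to match
# the 'a.fr-link.fr-link--download' selector used by the scraper
SAMPLE = '''
<a class="fr-link fr-link--download" href="/contenu/telechargement/12345/raa%20special.pdf">
    Télécharger Recueil des actes administratifs n°63-2024-001
    <span>PDF - 12/01/2024</span>
</a>
'''

soup = BeautifulSoup(SAMPLE, 'html.parser')
a = soup.select_one('a.fr-link.fr-link--download')

# Same three extraction steps as get_raa_elements(): decode the URL,
# strip the "Télécharger " prefix from the text node before the span,
# and parse the date from the end of the span's text
url = unquote(f"https://www.puy-de-dome.gouv.fr{a['href']}")
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')

print(url)   # https://www.puy-de-dome.gouv.fr/contenu/telechargement/12345/raa special.pdf
print(name)  # Recueil des actes administratifs n°63-2024-001
print(date)  # 2024-01-12 00:00:00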