diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5d38ceaddd7b11cfabba139a70e1f95e93355583..d7e2cebe4b06e3931e9e86e3ecd98b40c1d3b955 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -120,3 +120,8 @@ test_pref69:
   variables:
     PREF: "pref69"
   extends: .default_pref
+
+test_pref976:
+  variables:
+    PREF: "pref976"
+  extends: .default_pref
diff --git a/Makefile b/Makefile
index 2a378914bbf418108c36f208e1df1dc8a4cf01de..7ba36ae6b329cff15d84014a9fc76298c9134942 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref06 pref13 pref34 pref35 pref38 pref59 pref62 pref69
+make: ppparis pref06 pref13 pref34 pref35 pref38 pref59 pref62 pref69 pref976
 ppparis:
 	python cli.py --pref ppparis
 pref06:
@@ -17,3 +17,5 @@ pref62:
 	python cli.py --pref pref62
 pref69:
 	python cli.py --pref pref69
+pref976:
+	python cli.py --pref pref976
diff --git a/RAAspotter_pref976.py b/RAAspotter_pref976.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eb92783d0718da134dac891bc4c5543f0966383
--- /dev/null
+++ b/RAAspotter_pref976.py
@@ -0,0 +1,89 @@
+import os
+import sys
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from RAAspotter import RAAspotter
+
+class RAAspotter_pref976(RAAspotter):
+    """RAAspotter for the Préfecture de Mayotte (pref976).
+
+    Crawls the "Recueil des actes administratifs" pages of
+    www.mayotte.gouv.fr and turns their PDF download links into RAA objects.
+    """
+
+    # Config
+    __HOST = 'https://www.mayotte.gouv.fr'
+    __RAA_PAGE = {'default': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A',
+                  '2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2024',
+                  '2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2023',
+                  '2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2022',
+                  '2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2021',
+                  '2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2020',
+                  '2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2019'}
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    full_name = 'Préfecture de Mayotte'
+    short_code = 'pref976'
+
+    def __init__(self, data_dir):
+        """Initialise the base spotter with this site's user agent and call `enable_tor(10)`."""
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        """Crawl the RAA pages and run the keyword search on every RAA found.
+
+        keywords is a comma-separated string; it is split on ',' before being
+        handed to `parse_raa`. `mailer()` is called once all pages are parsed.
+        """
+        self.print_output('RAAspotter_pref976')
+        self.print_output(f'Termes recherchés: {keywords}')
+        self.print_output('')
+
+        # Yearly pages to crawl, most recent first: the dict keeps insertion
+        # order, and a year is kept when `not_before` is not later than it.
+        pages_to_parse = [
+            url for year, url in self.__RAA_PAGE.items()
+            if year != 'default' and self.not_before.year <= int(year)
+        ]
+
+        # The default page is always parsed, followed by the sub-pages found
+        # on each yearly page (in the reverse of the order `get_sub_pages` returns).
+        sub_pages_to_parse = [self.__RAA_PAGE['default']]
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            sub_pages = self.get_sub_pages(page_content, ":is(.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a,div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link,ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next)", self.__HOST)[::-1]
+            sub_pages_to_parse.extend(sub_pages)
+
+        for page in sub_pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            raa_elements = self.get_raa_elements(page_content)
+            self.parse_raa(raa_elements, keywords.split(','))
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        """Return the list of `RAAspotter.RAA` built from the PDF download links of a page."""
+        elements = []
+        # Load the parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Walk through every download link
+        for a in soup.select('a.fr-link.fr-link--download'):
+            href = a.get('href')
+            if not href or not href.endswith('.pdf'):
+                continue
+
+            # Site-relative links are prefixed with the host
+            url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)
+
+            # The title is the text node preceding the <span>; the date is
+            # the last ' - '-separated field of the <span> text (dd/mm/yyyy)
+            span = a.find('span')
+            name = span.previous_sibling.replace('Télécharger ', '').strip()
+            date = datetime.datetime.strptime(span.get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
+            filename = url.split('/')[-1]
+
+            elements.append(RAAspotter.RAA(url, date, name, filename))
+        return elements
diff --git a/cli.py b/cli.py
index 9eef3a87e1b8514e33efaafe9281a6c9926e98da..317c2b0242e8d622088f0a581e001cef81a4d9d0 100755
--- a/cli.py
+++ b/cli.py
@@ -43,7 +43,8 @@ available_prefs = [
     'pref38',
     'pref59',
     'pref62',
-    'pref69'
+    'pref69',
+    'pref976'
 ]
 
 # Début du script