diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 98c491357dc25cf82d77249f9dcf5e89a9e25987..12e7001b8c9fb57f922c4d5a3e25e02965bb4c38 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -152,6 +152,11 @@ test_pref69:
     PREF: "pref69"
   extends: .default_pref
 
+test_pref81:
+  variables:
+    PREF: "pref81"
+  extends: .default_pref
+
 test_pref83:
   variables:
     PREF: "pref83"
diff --git a/Makefile b/Makefile
index f53a8ce4045e8a8d799e091c66de29997ed922c4..c15abee5118ea437c6496c2c93e4eff64c15479d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref04 pref06 pref13 pref34 pref35 pref38 pref59 pref62 pref65 pref69 pref83 pref976
+make: ppparis pref04 pref06 pref13 pref34 pref35 pref38 pref59 pref62 pref65 pref69 pref81 pref83 pref976
 ppparis:
 	python cli.py --pref ppparis
 pref04:
@@ -21,6 +21,8 @@ pref65:
 	python cli.py --pref pref65
 pref69:
 	python cli.py --pref pref69
+pref81:
+	python cli.py --pref pref81
 pref83:
 	python cli.py --pref pref83
 pref976:
diff --git a/RAAspotter_pref81.py b/RAAspotter_pref81.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2c6182ce27034412ac68fc0b707d5e2e10ada11
--- /dev/null
+++ b/RAAspotter_pref81.py
@@ -0,0 +1,163 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from RAAspotter import RAAspotter
+
+
+class RAAspotter_pref81(RAAspotter):
+    """Scraper for the RAA pages of the Préfecture du Tarn website.
+
+    Year listing pages are walked down to month pages and RAA cards, and
+    the PDF links found on the resulting pages are handed to parse_raa.
+    """
+
+    # Config
+    __HOST = 'https://www.tarn.gouv.fr'
+    # 'default' is the RAA landing page; the other keys are years, listed
+    # from the most recent to the oldest (get_raa relies on that order)
+    __RAA_PAGE = {
+        'default': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
+                   'Administratifs/RAA',
+        '2024': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
+                'Administratifs/RAA/2024',
+        '2023': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
+                'Administratifs/RAA/2023',
+        '2022': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
+                'Administratifs/RAA/2022',
+        '2021': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
+                'Administratifs/RAA/2021',
+        '2020': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
+                'Administratifs/RAA/2020',
+        '2019': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
+                'Administratifs/RAA/2019',
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
+                   'Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture du Tarn'
+    short_code = 'pref81'
+
+    def __init__(self, data_dir):
+        """Initialise the base spotter with data_dir and the user agent."""
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        """Collect the RAA pages, parse them and send the report.
+
+        keywords is a comma-separated string; it is split and passed to
+        parse_raa. Only year pages not older than self.not_before.year are
+        visited. Ends by calling self.mailer().
+        """
+        self.print_output('RAAspotter_pref81')
+        self.print_output(f'Termes recherchés: {keywords}')
+        self.print_output('')
+
+        # Year pages to visit, driven by the configured years so that a
+        # new entry in __RAA_PAGE is picked up without touching this method
+        pages_to_parse = [
+            url
+            for year, url in self.__RAA_PAGE.items()
+            if year != 'default' and self.not_before.year <= int(year)
+        ]
+
+        sub_pages_to_parse = [self.__RAA_PAGE['default']]
+
+        # For each year, look for the month sub-pages
+        for raa_page in pages_to_parse:
+            page_content = self.get_page(raa_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link '
+                'div.fr-card__body div.fr-card__content '
+                'h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+
+            # Also check the year page itself, in case a RAA was
+            # miscategorised there
+            sub_pages_to_parse.extend(self.find_raa_card(raa_page))
+
+            # For each month, look for the RAA pages
+            for month_page in month_pages:
+                year = RAAspotter.guess_date(month_page['name'], '(.*)').year
+                sub_pages_to_parse.extend(
+                    self.find_raa_card(month_page['url'], year)
+                )
+                # Also parse the month page itself, in case it redirected
+                # to a RAA
+                sub_pages_to_parse.append(month_page['url'])
+
+        # Parse the pages that contain RAA
+        for page in sub_pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            raa_elements = self.get_raa_elements(page_content)
+            self.parse_raa(raa_elements, keywords.split(','))
+
+        self.mailer()
+
+    def find_raa_card(self, page, year=None):
+        """Return the URLs of the cards on page dated within the period.
+
+        A card is kept when its 'Publié le dd/mm/yyyy' detail is not
+        earlier than self.not_before. year is accepted for compatibility
+        with callers but is not used here.
+        """
+        card_pages = self.get_sub_pages_with_pager(
+            page,
+            'div.fr-card__body div.fr-card__content h2.fr-card__title '
+            'a.fr-card__link',
+            'ul.fr-pagination__list li '
+            'a.fr-pagination__link.fr-pagination__link--next',
+            'div.fr-card__body div.fr-card__content div.fr-card__end '
+            'p.fr-card__detail',
+            self.__HOST
+        )[::-1]
+
+        pages = []
+        for card_page in card_pages:
+            # Drop the RAA pages that fall outside the analysed period
+            guessed_date = datetime.datetime.strptime(
+                card_page['details'].replace('Publié le ', '').strip(),
+                '%d/%m/%Y'
+            )
+            if guessed_date >= self.not_before:
+                pages.append(card_page['url'])
+        return pages
+
+    def get_raa_elements(self, page_content):
+        """Return one RAAspotter.RAA per PDF link found in page_content."""
+        elements = []
+        # Load the parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Go through each link of the downloads group
+        for a in soup.select(
+            'div.fr-downloads-group.fr-downloads-group--bordered ul li a'
+        ):
+            href = a.get('href')
+            if not href or not href.endswith('.pdf'):
+                continue
+
+            # Site-relative links are prefixed with the host
+            url = f'{self.__HOST}{href}' if href.startswith('/') else href
+            url = unquote(url)
+
+            # The <span> is looked up once: the name is the text right
+            # before it and the date is the last ' - ' field of its text
+            span = a.find('span')
+            name = span.previous_sibling.replace(
+                'Télécharger ',
+                ''
+            ).strip()
+            date = datetime.datetime.strptime(
+                span.get_text().split(' - ')[-1].strip(),
+                '%d/%m/%Y'
+            )
+            filename = url.split('/')[-1]
+
+            elements.append(RAAspotter.RAA(url, date, name, filename))
+        return elements
diff --git a/README.md b/README.md
index ce6a8c1932334d1b614600d5a831db0b1e8f8f05..f7d4d4351bcf6b140d26dd1b81bc6f4e4f1d2823 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,7 @@ Il est possible de ne lancer l'analyse que pour une seule administration, avec l
 - Préfecture du Pas-de-Calais (identifiant : `pref62`)
 - Préfecture des Hautes-Pyrénées (identifiant : `pref65`)
 - Préfecture du Rhône (identifiant : `pref69`)
+- Préfecture du Tarn (identifiant : `pref81`)
 - Préfecture du Var (identifiant : `pref83`)
 - Préfecture de Mayotte (identifiant : `pref976`)
 
diff --git a/cli.py b/cli.py
index 995c87adcb3989bf66d9e2d5e413b8241a990106..6210b1d33d12f4ce420ef5a501d0f491a999b6b5 100755
--- a/cli.py
+++ b/cli.py
@@ -49,6 +49,7 @@ available_prefs = [
     'pref62',
     'pref65',
     'pref69',
+    'pref81',
     'pref83',
     'pref976'
 ]