From 476d866a634f784cdb4bd26a613cc351d203bc41 Mon Sep 17 00:00:00 2001
From: Felix Lena <felix.lena@epita.fr>
Date: Sun, 9 Jun 2024 20:28:49 +0200
Subject: [PATCH] pref2b: Ajout de la prefecture de la haute corse

---
 Attrap_pref2B.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 Attrap_pref2B.py

diff --git a/Attrap_pref2B.py b/Attrap_pref2B.py
new file mode 100644
index 0000000..79726fb
--- /dev/null
+++ b/Attrap_pref2B.py
@@ -0,0 +1,92 @@
+import os
+import datetime
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+
+class Attrap_pref2B(Attrap):
+    """Scraper for the RAA published on the Haute-Corse prefecture website."""
+
+    # Config
+    __HOST = 'https://www.haute-corse.gouv.fr'
+    __RAA_PAGE = {
+        '2024': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2024',
+        '2023': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2023',
+        '2022': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2022',
+        '2021': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2021',
+        '2020': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2020',
+        '2019': f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-2016-a-2022/Recueils-des-actes-administratifs-2019'
+    }
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture de Haute-Corse'
+    short_code = 'pref2B'
+
+    def __init__(self, data_dir):
+        """Initialise the base class with this site's user agent, then call enable_tor(10)."""
+        super().__init__(data_dir, self.__USER_AGENT)
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        """Crawl the yearly RAA pages and pass the RAA found to parse_raa with keywords."""
+        # Year pages in dict order (newest first), dropping years before not_before
+        pages_to_parse = [
+            url for year, url in self.__RAA_PAGE.items()
+            if self.not_before.year <= int(year)
+        ]
+
+        sub_pages_to_parse = []
+
+        # For each year, look for the monthly sub-pages
+        for raa_page in pages_to_parse:
+            sub_pages_to_parse.append(raa_page)
+            page_content = self.get_page(raa_page, 'get').content
+            # The card's classes must be chained with dots: separated by spaces they
+            # are read as nested tag names, so the selector matched no month link.
+            # Reversed here and once more below, so months end up back in page order.
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+            sub_pages_to_parse.extend(page['url'] for page in month_pages)
+
+        # Parse the pages that contain RAA
+        elements = self.get_raa_with_pager(
+            sub_pages_to_parse[::-1],
+            '.fr-pagination__link.fr-pagination__link--next',
+            self.__HOST
+        )
+        self.parse_raa(elements, keywords)
+
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        """Return a list with one Attrap.RAA per PDF card link found in page_content."""
+        elements = []
+        # Load the HTML parser
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        # Each RAA is a card whose title link targets a PDF file
+        cards = soup.select(
+            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link')
+        for a in cards:
+            if not (a.get('href') and a['href'].endswith('.pdf')):
+                continue
+            href = a['href']
+            url = f"{self.__HOST}{href}" if href.startswith('/') else href
+            url = unquote(url)
+            name = a.get_text().strip()
+            # The date is the last ' - '-separated field of the link's title attribute
+            date = datetime.datetime.strptime(
+                a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
+            elements.append(Attrap.RAA(url, date, name))
+        return elements
+
+
+# The class was first committed under the copy-pasted name Attrap_pref83; keep that
+# name importable so any existing reference to it still resolves.
+Attrap_pref83 = Attrap_pref2B
-- 
GitLab