From 4ce909ed89030a9c3115945246ba470620b40757 Mon Sep 17 00:00:00 2001
From: Bastien Le Querrec <blq@laquadrature.net>
Date: Fri, 31 May 2024 12:00:30 +0200
Subject: [PATCH] =?UTF-8?q?pref93:=20ajout=20de=20la=20pr=C3=A9fecture=20d?=
 =?UTF-8?q?e=20Seine-Saint-Denis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Close !1

Co-authored-by: Bastien Le Querrec <blq@laquadrature.net>
Co-authored-by: smyds <smyds@lebib.org>
---
 .gitlab-ci.yml   |  5 +++
 Attrap_pref93.py | 94 ++++++++++++++++++++++++++++++++++++++++++++++++
 Makefile         |  4 ++-
 README.md        |  1 +
 cli.py           |  1 +
 5 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 Attrap_pref93.py

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 44b1308..df21d60 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -213,6 +213,11 @@ test_pref87:
     PREF: "pref87"
   extends: .default_pref
 
+test_pref93:
+  variables:
+    PREF: "pref93"
+  extends: .default_pref
+
 test_pref976:
   variables:
     PREF: "pref976"
diff --git a/Attrap_pref93.py b/Attrap_pref93.py
new file mode 100644
index 0000000..26f0458
--- /dev/null
+++ b/Attrap_pref93.py
@@ -0,0 +1,127 @@
+import os
+import re
+import datetime
+import logging
+
+from bs4 import BeautifulSoup
+from urllib.parse import unquote
+
+from Attrap import Attrap
+
+logger = logging.getLogger(__name__)
+
+
+class Attrap_pref93(Attrap):
+    """Fetch the RAA published by the Seine-Saint-Denis prefecture.
+
+    Starting from the publications index, it walks the year pages, then the
+    month pages, and collects the PDF cards found on the month pages.
+    """
+
+    # Config
+    __HOST = 'https://www.seine-saint-denis.gouv.fr'
+    __RAA_PAGE = f'{__HOST}/Publications/Bulletin-d-informations-administratives-Recueil-des-actes-administratifs/'
+    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    full_name = 'Préfecture de Seine-Saint-Denis'
+    short_code = 'pref93'
+
+    def __init__(self, data_dir):
+        """Set up the scraper.
+
+        :param data_dir: forwarded unchanged to ``Attrap.__init__``.
+        """
+        super().__init__(data_dir, self.__USER_AGENT)
+        # NOTE(review): the meaning of the argument is defined by Attrap.enable_tor
+        self.enable_tor(10)
+
+    def get_raa(self, keywords):
+        """Collect the RAA entries and hand them over for analysis.
+
+        Year pages, then month pages, are filtered against
+        ``self.not_before`` to limit the number of requests. The entries
+        found are passed to ``self.parse_raa`` together with ``keywords``,
+        and ``self.mailer`` is called afterwards.
+
+        :param keywords: forwarded unchanged to ``self.parse_raa``.
+        """
+        # Fetch the list of year pages
+        page_content = self.get_page(self.__RAA_PAGE, 'get').content
+        year_pages = self.get_sub_pages(
+            page_content,
+            'div.fr-card__body div.fr-card__content h2.fr-card__title a',
+            self.__HOST,
+            False,
+        )[::-1]
+
+        # Filter by year to limit the number of requests
+        year_pages_to_parse = []
+        for year_page in year_pages:
+            year_match = re.search(r'.*([0-9]{4})', year_page['name'].strip())
+            if year_match is None:
+                # Unknown year: 9999 is a sentinel that passes the filter below
+                logger.warning(f"Impossible de deviner l'année de la page {year_page['name']}")
+                year = 9999
+            else:
+                year = int(year_match.group(1))
+            if year >= self.not_before.year:
+                year_pages_to_parse.append(year_page['url'])
+
+        # For each year, look for the month sub-pages
+        pages_to_parse = []
+        for year_page in year_pages_to_parse:
+            page_content = self.get_page(year_page, 'get').content
+            month_pages = self.get_sub_pages(
+                page_content,
+                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
+                self.__HOST,
+                False
+            )[::-1]
+
+            # Keep the months that are not before the requested date
+            for month_page in month_pages:
+                guessed_date = Attrap.guess_date(month_page['name'].strip(), '([a-zéû]*).*')
+                if guessed_date >= self.not_before.replace(day=1):
+                    pages_to_parse.append(month_page['url'])
+
+        # Parse the pages that contain RAA
+        elements = []
+        for page in pages_to_parse:
+            page_content = self.get_page(page, 'get').content
+            elements.extend(self.get_raa_elements(page_content))
+
+        self.parse_raa(elements[::-1], keywords)
+        self.mailer()
+
+    def get_raa_elements(self, page_content):
+        """Extract the RAA entries from a month page.
+
+        Only cards whose title link points to a ``.pdf`` file are kept.
+        Relative links are prefixed with the prefecture host.
+
+        :param page_content: HTML of the page, as accepted by BeautifulSoup.
+        :return: list of ``Attrap.RAA`` objects, in page order.
+        :raises ValueError: if the publication date of a kept card does not
+            match ``%d/%m/%Y``.
+        """
+        elements = []
+        soup = BeautifulSoup(page_content, 'html.parser')
+
+        for card in soup.select('div.fr-card__body div.fr-card__content'):
+            a = card.select_one('h2.fr-card__title a.fr-card__link')
+            # select_one() returns None when the card has no title link
+            if a is None or not a.get('href') or not a['href'].endswith('.pdf'):
+                continue
+
+            if a['href'].startswith('/'):
+                url = f"{self.__HOST}{a['href']}"
+            else:
+                url = a['href']
+
+            url = unquote(url)
+            name = a.text.strip()
+            # Strip before removeprefix() so leading whitespace cannot hide the prefix
+            date_text = card.select_one('div.fr-card__end p.fr-card__detail').get_text().strip()
+            date = datetime.datetime.strptime(date_text.removeprefix('Publié le ').strip(), '%d/%m/%Y')
+
+            elements.append(Attrap.RAA(url, date, name))
+        return elements
diff --git a/Makefile b/Makefile
index 78e7ce6..dbbd7dd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref976
+make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref93 pref976
 ppparis:
 	bin/python3 cli.py ppparis
 pref04:
@@ -47,6 +47,8 @@ pref83:
 	bin/python3 cli.py pref83
 pref87:
 	bin/python3 cli.py pref87
+pref93:
+	bin/python3 cli.py pref93
 pref976:
 	bin/python3 cli.py pref976
 lint:
diff --git a/README.md b/README.md
index a2d2f03..5627db0 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ Les options suivantes peuvent être précisées, par un paramètre si l'utilitai
 - Préfecture du Tarn (identifiant : `pref81`)
 - Préfecture du Var (identifiant : `pref83`)
 - Préfecture de la Haute-Vienne (identifiant : `pref87`)
+- Préfecture de Seine-Saint-Denis (identifiant : `pref93`)
 - Préfecture de Mayotte (identifiant : `pref976`)
 
 ## Contributions
diff --git a/cli.py b/cli.py
index 6a2feb0..72a9ec2 100755
--- a/cli.py
+++ b/cli.py
@@ -61,6 +61,7 @@ available_administrations = [
     'pref81',
     'pref83',
     'pref87',
+    'pref93',
     'pref976'
 ]
 
-- 
GitLab