Attrap: compare revisions
.gitignore

@@ -6,3 +6,4 @@ data/
 pyvenv.cfg
 output_*.log
 *.patch
+CACHEDIR.TAG
.gitlab-ci.yml

@@ -74,11 +74,11 @@ pep8:
     - unprivileged
   needs: [install]
   script:
-    - s3/download-from-s3.sh "${PREF}" "${S3_KEY}" "${S3_SECRET}" "${S3_HOST}" "${S3_BUCKET}" data/ || true
+    - misc/download-from-s3.sh "${PREF}" "${S3_KEY}" "${S3_SECRET}" "${S3_HOST}" "${S3_BUCKET}" data/ || true
     - source bin/activate
     - /etc/init.d/tor start
     - make "${PREF}"
-    - s3/upload-to-s3.sh "${PREF}" "${S3_KEY}" "${S3_SECRET}" "${S3_HOST}" "${S3_BUCKET}" data/ || true
+    - misc/upload-to-s3.sh "${PREF}" "${S3_KEY}" "${S3_SECRET}" "${S3_HOST}" "${S3_BUCKET}" data/ || true
   cache:
     key: $CI_COMMIT_REF_SLUG
     paths:
@@ -99,9 +99,14 @@ test_ppparis:
     PREF: "ppparis"
   extends: .default_pref

-test_pref2b:
+test_pref01:
   variables:
-    PREF: "pref2b"
+    PREF: "pref01"
   extends: .default_pref

+test_pref02:
+  variables:
+    PREF: "pref02"
+  extends: .default_pref
+
 test_pref03:
@@ -134,16 +139,41 @@ test_pref10:
     PREF: "pref10"
   extends: .default_pref

+test_pref11:
+  variables:
+    PREF: "pref11"
+  extends: .default_pref
+
 test_pref13:
   variables:
     PREF: "pref13"
   extends: .default_pref

+test_pref2a:
+  variables:
+    PREF: "pref2a"
+  extends: .default_pref
+
+test_pref2b:
+  variables:
+    PREF: "pref2b"
+  extends: .default_pref
+
 test_pref25:
   variables:
     PREF: "pref25"
   extends: .default_pref

+test_pref29:
+  variables:
+    PREF: "pref29"
+  extends: .default_pref
+
+test_pref30:
+  variables:
+    PREF: "pref30"
+  extends: .default_pref
+
 test_pref31:
   variables:
     PREF: "pref31"
@@ -169,6 +199,11 @@ test_pref38:
     PREF: "pref38"
   extends: .default_pref

+test_pref39:
+  variables:
+    PREF: "pref39"
+  extends: .default_pref
+
 test_pref42:
   variables:
     PREF: "pref42"
@@ -179,11 +214,41 @@ test_pref44:
     PREF: "pref44"
   extends: .default_pref

+test_pref49:
+  variables:
+    PREF: "pref49"
+  extends: .default_pref
+
+test_pref50:
+  variables:
+    PREF: "pref50"
+  extends: .default_pref
+
+test_pref52:
+  variables:
+    PREF: "pref52"
+  extends: .default_pref
+
+test_pref54:
+  variables:
+    PREF: "pref54"
+  extends: .default_pref
+
+test_pref55:
+  variables:
+    PREF: "pref55"
+  extends: .default_pref
+
 test_pref59:
   variables:
     PREF: "pref59"
   extends: .default_pref

+test_pref61:
+  variables:
+    PREF: "pref61"
+  extends: .default_pref
+
 test_pref62:
   variables:
     PREF: "pref62"
@@ -224,6 +289,11 @@ test_pref75:
     PREF: "pref75"
   extends: .default_pref

+test_pref77:
+  variables:
+    PREF: "pref77"
+  extends: .default_pref
+
 test_pref80:
   variables:
     PREF: "pref80"
@@ -244,6 +314,11 @@ test_pref87:
     PREF: "pref87"
   extends: .default_pref

+test_pref91:
+  variables:
+    PREF: "pref91"
+  extends: .default_pref
+
 test_pref92:
   variables:
     PREF: "pref92"
@@ -264,7 +339,17 @@ test_pref976:
     PREF: "pref976"
   extends: .default_pref

-test_prefIdf:
+test_prefbretagne:
+  variables:
+    PREF: "prefbretagne"
+  extends: .default_pref
+
+test_prefidf:
   variables:
+    PREF: "prefidf"
+  extends: .default_pref
+
+test_prefpaca:
+  variables:
-    PREF: "prefIdf"
+    PREF: "prefpaca"
   extends: .default_pref
Attrap.py

@@ -1,5 +1,6 @@
 import os
+import sys
 import re
 import random
 import ssl

@@ -8,6 +9,7 @@ import string
 import logging
 import requests
 import time
+from types import SimpleNamespace
 import datetime
 import json
 from urllib.parse import quote
@@ -18,7 +20,9 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions
 import pytz
 import dateparser
+import urllib3
 from bs4 import BeautifulSoup
 from pyvirtualdisplay import Display
@@ -27,6 +31,7 @@ from pypdf import PdfReader
 from pypdf import PdfWriter
 from pypdf.generic import NameObject, NumberObject
 from pypdf.errors import PdfStreamError
+from pypdf.errors import EmptyFileError
 import hashlib
 import smtplib
@@ -45,18 +50,17 @@ class Attrap:
         url = ""
         date = None
-        date_str = ""
         name = ""
         sha256 = ""
         pdf_creation_date = None
         pdf_modification_date = None

-        def __init__(self, url, date, name):
+        def __init__(self, url, date, name, timezone='Europe/Paris'):
+            self.timezone = timezone
             if not url == "":
                 self.url = url
             if date is not None:
-                self.date = Attrap.get_aware_datetime(date)
-                self.date_str = date.strftime("%d/%m/%Y")
+                self.date = Attrap.get_aware_datetime(date, timezone=timezone)
             if not name == "":
                 self.name = name
@@ -75,16 +79,14 @@ class Attrap:
             if pdf_metadata:
                 if pdf_metadata.creation_date:
-                    self.pdf_creation_date = Attrap.get_aware_datetime(pdf_metadata.creation_date)
+                    self.pdf_creation_date = Attrap.get_aware_datetime(pdf_metadata.creation_date, timezone=self.timezone)
                     if self.date is None:
-                        self.date = Attrap.get_aware_datetime(pdf_metadata.creation_date)
-                        self.date_str = self.date.strftime("%d/%m/%Y")
+                        self.date = Attrap.get_aware_datetime(pdf_metadata.creation_date, timezone=self.timezone)
                 if pdf_metadata.modification_date:
-                    self.pdf_modification_date = Attrap.get_aware_datetime(pdf_metadata.modification_date)
+                    self.pdf_modification_date = Attrap.get_aware_datetime(pdf_metadata.modification_date, timezone=self.timezone)
                     if self.date is None:
-                        self.date = Attrap.get_aware_datetime(pdf_metadata.modification_date)
-                        self.date_str = self.date.strftime("%d/%m/%Y")
+                        self.date = Attrap.get_aware_datetime(pdf_metadata.modification_date, timezone=self.timezone)

         def extract_content(self, data_dir):
             """Extrait le contenu du PDF OCRisé pour l'écrire dans le fichier qui servira à faire la recherche de mots-clés. Supprime tous les PDF à la fin."""
@@ -118,17 +120,19 @@ class Attrap:
             pdf_modification_date_json = None
             if self.pdf_creation_date:
-                pdf_creation_date_json = self.pdf_creation_date.strftime("%d/%m/%Y %H:%M:%S%z")
+                pdf_creation_date_json = self.pdf_creation_date.astimezone(pytz.utc).isoformat(timespec="seconds")
             if self.pdf_modification_date:
-                pdf_modification_date_json = self.pdf_modification_date.strftime("%d/%m/%Y %H:%M:%S%z")
+                pdf_modification_date_json = self.pdf_modification_date.astimezone(pytz.utc).isoformat(timespec="seconds")

             properties = {
                 'version': 2,
                 'name': self.name,
-                'date': self.date_str,
+                'date': self.date.strftime("%Y-%m-%d"),
                 'url': quote(self.url, safe='/:'),
-                'first_saw_on': datetime.datetime.now(datetime.timezone.utc).strftime("%d/%m/%Y %H:%M:%S%z"),
+                'first_seen_on': datetime.datetime.now(pytz.utc).isoformat(timespec="seconds"),
                 'pdf_creation_date': pdf_creation_date_json,
-                'pdf_modification_date': pdf_modification_date_json
+                'pdf_modification_date': pdf_modification_date_json,
+                'timezone': self.timezone
             }
             f = open(f'{raa_data_dir}{self.get_sha256()}.json', 'w')
             f.write(json.dumps(properties))
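The properties JSON thus moves from locale-style timestamps ('%d/%m/%Y %H:%M:%S%z') to UTC ISO-8601 via pytz, and records which timezone was used. A minimal sketch of the difference, assuming an aware datetime (values illustrative):

    import datetime
    import pytz

    aware = pytz.timezone('Europe/Paris').localize(datetime.datetime(2024, 7, 1, 12, 0))
    # old format: day-first string, awkward to sort or re-parse
    print(aware.strftime('%d/%m/%Y %H:%M:%S%z'))                     # 01/07/2024 12:00:00+0200
    # new format: normalised to UTC, lexicographically sortable
    print(aware.astimezone(pytz.utc).isoformat(timespec='seconds'))  # 2024-07-01T10:00:00+00:00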
@@ -140,6 +144,11 @@ class Attrap:
             self.write_properties(data_dir)

     def __init__(self, data_dir, user_agent=''):
+        """
+        Initialise Attrap et le dossier de données.
+
+        data_dir -- le dossier où sont situées les données
+        user_agent -- le user_agent utilisé pour les requêtes
+        """
         logger.debug('Initialisation de Attrap')

         # On crée le dossier de téléchargement

@@ -150,6 +159,7 @@ class Attrap:
         self.found = False
         self.output_file_path = os.path.dirname(os.path.abspath(__file__)) + f'/output_{self.short_code}.log'
         self.sleep_time = 0
+        self.last_http_request = 0
         self.tor_enabled = False
         self.tor_max_requests = 0
         self.tor_requests = 0
@@ -160,6 +170,7 @@ class Attrap:
         self.mastodon_prefix = ''
         self.mastodon_suffix = ''
         self.safe_mode = False
+        self.timezone = datetime.datetime.now(datetime.timezone.utc).astimezone().tzname()

         self.update_user_agent(user_agent)
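Note that tzname() yields an abbreviation such as 'CET' or 'CEST', not an IANA identifier, so this system-derived value is only a fallback; every scraper class below overrides timezone with an explicit 'Europe/Paris'. A quick check of what the expression returns:

    import datetime

    # prints the local zone abbreviation, e.g. 'CET' or 'CEST', not 'Europe/Paris'
    print(datetime.datetime.now(datetime.timezone.utc).astimezone().tzname())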
@@ -223,7 +234,7 @@ class Attrap:
         self.session.proxies.update(proxies)
         self.tor_requests = 0

-    def get_sub_pages(self, page_content, element, host, recursive_until_pdf):
+    def get_sub_pages(self, page_content, element, host, recursive_until_pdf, selenium=False):
         """
         Récupère, à partir d'un chemin CSS, les sous-pages d'une page.

@@ -231,6 +242,7 @@ class Attrap:
         element -- Le chemin CSS vers l'objet renvoyant vers la sous-page recherchée
         host -- Le nom d'hôte du site
         recursive_until_pdf -- Un booléen pour savoir s'il faut rechercher un fichier PDF dans le chemin CSS. Le cas échéant, relance la recherche sur la sous-page si le lien n'est pas un PDF.
+        selenium -- lance un navigateur avec Selenium pour contourner les protections anti-robots
         """
         soup = BeautifulSoup(page_content, 'html.parser')
         sub_pages = []

@@ -247,7 +259,8 @@ class Attrap:
                         sub_page_content,
                         element,
                         host,
-                        recursive_until_pdf
+                        recursive_until_pdf,
+                        selenium=selenium
                     ):
                         sub_pages.append(sub_sub_page)
                 else:

@@ -264,18 +277,24 @@ class Attrap:
                 sub_pages.append(sub_page)
         return sub_pages
-    def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
+    def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host, selenium=False):
         """
         Récupère, à partir d'un chemin CSS, les sous-pages d'une page contenant un pager.

-        page -- L'URL de la page à analyser
+        page -- L'URL ou le contenu HTML de la page à analyser
         sub_page_element -- Le chemin CSS vers l'objet renvoyant vers la sous-page recherchée
         pager_element -- Le chemin CSS vers le lien de page suivante du pager
         details_element -- Le chemin CSS vers l'objet contenant les détails de la sous-page recherchée
         host -- Le nom d'hôte du site
+        selenium -- lance un navigateur avec Selenium pour contourner les protections anti-robots
         """
         pages = []
-        page_content = self.get_page(page, 'get').content
+
+        if isinstance(page, bytes):
+            page = page.decode('utf-8')
+
+        if page.startswith('https://') or page.startswith('http://'):
+            page_content = self.get_page(page, 'get', selenium=selenium).content
+        else:
+            page_content = page

         # On initialise le parser
         soup = BeautifulSoup(page_content, 'html.parser')
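get_sub_pages_with_pager therefore now accepts either a URL or already-downloaded HTML (str or bytes). The branching, restated as a standalone sketch with a stub fetcher standing in for self.get_page:

    def resolve_page_content(page, fetch):
        """Return the HTML of `page`, which may be a URL, or str/bytes HTML."""
        if isinstance(page, bytes):
            page = page.decode('utf-8')
        if page.startswith('https://') or page.startswith('http://'):
            return fetch(page)  # network path
        return page             # HTML was passed in directly

    print(resolve_page_content(b'<html>ok</html>', fetch=lambda url: ''))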
@@ -306,13 +325,14 @@ class Attrap:
                     sub_page_element,
                     pager_element,
                     details_element,
-                    host
+                    host,
+                    selenium=selenium
                 ):
                     pages.append(sub_page)

         return pages
-    def get_raa_with_pager(self, pages_list, pager_element, host, filter_from_last_element_date=False):
+    def get_raa_with_pager(self, pages_list, pager_element, host, filter_from_last_element_date=False, selenium=False):
         """
         Récupère et analyse les RAA d'une page contenant un pager.

@@ -322,11 +342,12 @@ class Attrap:
         filter_from_last_element_date -- (Optionnel) Si la date du dernier élément de la dernière page parsée
         n'est pas dans la plage temporelle voulue, ne charge pas les pages suivantes. Par défaut à False. Ne doit
         être activé que si l'ordre des éléments est chronologique.
+        selenium -- lance un navigateur avec Selenium pour contourner les protections anti-robots
         """
         elements = []

         # On parse chaque page passée en paramètre
         for page in pages_list:
-            page_content = self.get_page(page, 'get').content
+            page_content = self.get_page(page, 'get', selenium=selenium).content

             # Pour chaque page, on récupère les PDF
             for raa in self.get_raa_elements(page_content):
@@ -334,7 +355,7 @@ class Attrap:
             # Si la date du dernier RAA est dans la plage temporelle voulue,
             # on regarde également s'il n'y aurait pas un pager
-            if not filter_from_last_element_date or (filter_from_last_element_date and (elements[-1].date >= Attrap.get_aware_datetime(self.not_before))):
+            if not filter_from_last_element_date or (filter_from_last_element_date and (elements[-1].date >= Attrap.get_aware_datetime(self.not_before, timezone=self.timezone))):
                 sub_pages = []
                 for sub_page in self.get_sub_pages(
                     page_content,

@@ -346,7 +367,8 @@ class Attrap:
                     for sub_raa in self.get_raa_with_pager(
                         sub_pages,
                         pager_element,
-                        host
+                        host,
+                        filter_from_last_element_date=filter_from_last_element_date
                     ):
                         elements.append(sub_raa)
         return elements
@@ -384,12 +406,17 @@ class Attrap:
         webdriver_options.add_argument("--disable-dev-shm-usage")
         webdriver_options.add_argument("--use_subprocess")
         webdriver_options.add_argument("--disable-blink-features=AutomationControlled")
+        webdriver_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        webdriver_options.add_experimental_option('useAutomationExtension', False)

         if not self.user_agent == "":
             webdriver_options.add_argument(f"--user-agent={self.user_agent}")

-        webdriver_options.add_argument("--headless")
-        webdriver_options.add_argument("--window-size=1024,768")
+        if self.tor_enabled:
+            webdriver_options.add_argument(f'--proxy-server=socks5://127.0.0.1:9050')
+
+        webdriver_options.add_argument("--headless=new")
+        webdriver_options.add_argument("--start-maximized")

         display = Display(visible=False, size=(1024, 768))
         display.start()
@@ -414,6 +441,8 @@ class Attrap:
             logger.warning(f'TimeoutException: {exc}')
             if remaining_retries > 0:
                 time.sleep(5)
+                if self.tor_enabled:
+                    self.tor_get_new_id()
                 return self.get_session(url, wait_element, (remaining_retries - 1))
             else:
                 raise TimeoutException(exc)
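On a Selenium timeout the crawler now also rotates its Tor identity before retrying. tor_get_new_id is defined elsewhere in Attrap; if it follows the usual stem pattern, requesting a fresh identity looks roughly like this (an illustration, not Attrap's actual implementation):

    from stem import Signal
    from stem.control import Controller

    def tor_get_new_id(control_port=9051):
        # ask the local Tor daemon for a new circuit, hence a new exit IP
        with Controller.from_port(port=control_port) as controller:
            controller.authenticate()
            controller.signal(Signal.NEWNYM)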
@@ -440,24 +469,37 @@ class Attrap:
             f.write(data + "\n")
             f.close()

-    def get_page(self, url, method, data={}):
+    def get_page(self, url, method, data={}, selenium=False):
         """
         Récupère le contenu HTML d'une page web

         url -- L'URL de la page demandée
         method -- 'post' ou 'get', selon le type de requête
         data -- Un dictionnaire contenant les données à envoyer au site
+        selenium -- lance un navigateur avec Selenium pour contourner les protections anti-robots
         """
         try:
             logger.debug(f'Chargement de la page {url}')

             # Si un délai a été configuré, on vérifie qu'il n'est pas trop tôt pour lancer la requête
-            if self.sleep_time > 0:
-                time.sleep(self.sleep_time)
+            current_time = int(time.mktime(datetime.datetime.today().timetuple()))
+            remaining_sleep_time = self.last_http_request + self.sleep_time - current_time
+            if remaining_sleep_time > 0:
+                time.sleep(remaining_sleep_time)
+
+            self.last_http_request = int(time.mktime(datetime.datetime.today().timetuple()))

             page = None

-            if method == 'get':
-                page = self.session.get(url, timeout=(10, 120))
-            if method == 'post':
-                page = self.session.post(url, data=data, timeout=(10, 120))
+            if selenium and method == 'get':
+                page_content = self.get_session(url, None, 6)
+                page = {'content': page_content, 'status_code': 200}
+                page = SimpleNamespace(**page)
+            else:
+                if method == 'get':
+                    page = self.session.get(url, timeout=(10, 120))
+                if method == 'post':
+                    page = self.session.post(url, data=data, timeout=(10, 120))

             if page.status_code == 429:
                 logger.warning('Erreur 429 Too Many Requests reçue, temporisation...')
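The old code slept for the full sleep_time before every request; the new code only sleeps for whatever remains of the interval since the previous request, which matters when parsing itself takes time. The same idea in isolation (using time.time() where the diff goes through time.mktime):

    import time

    class Throttle:
        def __init__(self, min_interval):
            self.min_interval = min_interval  # seconds between requests
            self.last_request = 0

        def wait(self):
            remaining = self.last_request + self.min_interval - int(time.time())
            if remaining > 0:
                time.sleep(remaining)
            self.last_request = int(time.time())

    throttle = Throttle(30)
    throttle.wait()  # returns immediately
    throttle.wait()  # sleeps for roughly the remaining 30 s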
@@ -480,6 +522,9 @@ class Attrap:
         except requests.exceptions.Timeout:
             logger.warning(f'Timeout, on relance la requête...')
             return self.get_page(url, method, data)
+        except urllib3.exceptions.ProtocolError:
+            logger.warning(f'Erreur de connexion, on relance la requête...')
+            return self.get_page(url, method, data)

     def update_user_agent(self, user_agent):
         """Change la valeur du user-agent"""
@@ -552,6 +597,7 @@ class Attrap:
         """Recherche des mots-clés dans le texte extrait du PDF"""
         if keywords and not keywords == '':
             text = open(f'{self.data_dir}/raa/{raa.get_sha256()}.txt').read()
+            date_str = raa.date.strftime("%d/%m/%Y")

             found = False
             found_keywords = []
@@ -559,7 +605,7 @@ class Attrap:
                 if re.search(keyword, text, re.IGNORECASE | re.MULTILINE):
                     if not found:
                         url = quote(raa.url, safe='/:')
-                        self.print_output(f'\033[92m{raa.name}\033[0m ({raa.date_str})')
+                        self.print_output(f'\033[92m{raa.name}\033[0m ({date_str})')
                         self.print_output(f'URL : {url}')
                         found = True
                         self.found = True
@@ -573,7 +619,7 @@ class Attrap:
                     [str(x) for x in found_keywords]
                 )
                 self.mastodon_toot(
-                    f'{raa.name} ({raa.date_str})\n\nLes termes suivants ont '
+                    f'{raa.name} ({date_str})\n\nLes termes suivants ont '
                     f'été trouvés : {found_keywords_str}.\n\nURL : {url}'
                 )
@@ -590,25 +636,33 @@ class Attrap:
         for raa in elements:
             # Si le fichier n'a pas déjà été parsé et qu'il est postérieur à la
             # date maximale d'analyse, on le télécharge et on le parse
-            if not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.txt') and (not raa.date or (raa.date >= Attrap.get_aware_datetime(self.not_before))):
+            if not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.txt') and (not raa.date or (raa.date >= Attrap.get_aware_datetime(self.not_before, timezone=self.timezone))):
                 url = quote(raa.url, safe='/:')
                 self.download_file(raa)
                 try:
                     raa.parse_metadata(self.data_dir)
                     # Lorsque la date du RAA n'est pas connue, on a dû télécharger le PDF pour récupérer la date de ses métadonnées.
                     # Donc on vérifie à nouveau ici si la date correspond à ce qu'on veut analyser
-                    if (raa.date and raa.date >= Attrap.get_aware_datetime(self.not_before)):
-                        logger.info(f'Nouveau fichier : {raa.name} ({raa.date_str}). URL : {url}')
+                    if not raa.date:
+                        os.remove(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf')
+                        os.remove(f'{self.data_dir}/raa/{raa.get_sha256()}.json')
+                        logger.error(f'ERREUR: le RAA {raa.name} n\'a pas de date !')
+                        sys.exit(1)
+
+                    if raa.date >= Attrap.get_aware_datetime(self.not_before, timezone=self.timezone):
+                        date_str = raa.date.strftime("%d/%m/%Y")
+                        logger.info(f'Nouveau fichier : {raa.name} ({date_str}). URL : {url}')
                         self.flatten_pdf(raa)
                         self.ocr(raa, True)
                         raa.extract_content(self.data_dir)
                         self.search_keywords(raa, keywords)
                     else:
                         # On supprime le fichier de metadonnées puisqu'on ne le parsera pas
                         os.remove(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf')
-                        logger.error(f'ERREUR: le RAA {raa.name} n\'a pas de date !')
-                        sys.exit(1)
+                        os.remove(f'{self.data_dir}/raa/{raa.get_sha256()}.json')
                 except PdfStreamError as exc:
                     logger.warning(f'ATTENTION: le RAA à l\'adresse {raa.url} n\'est pas valide ! On l\'ignore...')
+                except EmptyFileError as exc:
+                    logger.warning(f'ATTENTION: le RAA à l\'adresse {raa.url} est vide ! On l\'ignore...')

     def get_raa(self, page_content):
         logger.error('Cette fonction doit être surchargée')
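pypdf raises EmptyFileError, not PdfStreamError, when a download turns out to be zero bytes, so the first except clause alone let empty files abort the whole run. A minimal reproduction of why the second handler is needed:

    import io

    from pypdf import PdfReader
    from pypdf.errors import EmptyFileError

    try:
        PdfReader(io.BytesIO(b''))  # an empty download
    except EmptyFileError:
        print('empty PDF, skipped')  # now ignored instead of crashing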
@@ -696,7 +750,7 @@ class Attrap:
         """
         try:
             search = re.search(regex, string, re.IGNORECASE)
-            guessed_date = dateparser.parse(search.group(1), languages=['fr'])
+            guessed_date = dateparser.parse(search.group(1), languages=['fr'], settings={'PREFER_DAY_OF_MONTH': 'last', 'PREFER_MONTH_OF_YEAR': 'last'})
             if guessed_date is None:
                 raise Exception('La date est un objet None')
             else:

@@ -705,14 +759,14 @@ class Attrap:
             logger.warning(f'Impossible de deviner la date du terme {string} : {exc}')
             return datetime.datetime(9999, 1, 1)
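The new dateparser settings change how partial dates are completed: guess_date feeds >= comparisons against not_before, so resolving a bare month or year to its last day keeps periods that still overlap the analysis window. For instance:

    import dateparser

    settings = {'PREFER_DAY_OF_MONTH': 'last', 'PREFER_MONTH_OF_YEAR': 'last'}
    # without settings, 'mars 2024' resolves to the current day of March 2024
    print(dateparser.parse('mars 2024', languages=['fr'], settings=settings))  # 2024-03-31 00:00:00
    print(dateparser.parse('2024', languages=['fr'], settings=settings))      # 2024-12-31 00:00:00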
-    def get_aware_datetime(unknown_datetime):
+    def get_aware_datetime(unknown_datetime, timezone='Europe/Paris'):
         """
         Retourne un objet datetime avisé.

-        datetime - L'objet datetime à aviser. Utilise le fuseau horaire du système si datetime est naïf.
+        datetime - L'objet datetime à aviser. Utilise le fuseau indiqué si datetime est naïf.
         """
         if unknown_datetime.tzinfo is not None and unknown_datetime.tzinfo.utcoffset(unknown_datetime) is not None:
             return unknown_datetime
         else:
-            return unknown_datetime.replace(tzinfo=datetime.datetime.now(datetime.timezone.utc).astimezone().tzinfo)
+            return pytz.timezone(timezone).localize(unknown_datetime)
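Two things change here: naive datetimes are now interpreted in the prefecture's declared timezone instead of whatever the host system uses, and the conversion goes through pytz's localize(), which applies the correct historical/DST offset. Assigning a pytz zone with replace(tzinfo=...) would silently pick the zone's first recorded offset (LMT):

    import datetime

    import pytz

    naive = datetime.datetime(2024, 7, 1, 12, 0)
    paris = pytz.timezone('Europe/Paris')
    print(paris.localize(naive))        # 2024-07-01 12:00:00+02:00 (CEST, correct)
    print(naive.replace(tzinfo=paris))  # 2024-07-01 12:00:00+00:09 (LMT, wrong)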
Attrap_ppparis.py

@@ -9,18 +9,19 @@ from Attrap import Attrap

 class Attrap_ppparis(Attrap):

     # Config
-    __HOST = 'https://www.prefecturedepolice.interieur.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/actualites-et-presse/arretes/accueil-arretes'
+    hostname = 'https://www.prefecturedepolice.interieur.gouv.fr'
+    raa_page = f'{hostname}/actualites-et-presse/arretes/accueil-arretes'
     __WAIT_ELEMENT = 'block-decree-list-block'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
     full_name = 'Préfecture de police de Paris'
     short_code = 'ppparis'
+    timezone = 'Europe/Paris'

     def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
+        super().__init__(data_dir, self.user_agent)

     def get_raa(self, keywords):
-        page_content = self.get_session(self.__RAA_PAGE, self.__WAIT_ELEMENT, 6)
+        page_content = self.get_session(self.raa_page, self.__WAIT_ELEMENT, 6)
         raa_elements = self.get_raa_elements(page_content)
         self.parse_raa(raa_elements, keywords)
         self.mailer()

@@ -43,6 +44,6 @@ class Attrap_ppparis(Attrap):
             name = a.find('span').get_text()
             date = datetime.datetime.strptime(a.find('div', class_="field--type-datetime").get_text().strip(), '%d/%m/%Y')
-            raa = Attrap.RAA(url, date, name)
+            raa = Attrap.RAA(url, date, name, timezone=self.timezone)
             elements.append(raa)
         return elements
Attrap_pref01.py (new file)

+from Attrap_prefdpt import Attrap_prefdpt
+
+
+class Attrap_pref01(Attrap_prefdpt):
+
+    # Configuration de la préfecture
+    hostname = 'https://www.ain.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-RAA'
+    full_name = 'Préfecture de l\'Ain'
+    short_code = 'pref01'
+    timezone = 'Europe/Paris'
+
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = '(?:Recueil|Recueils) (?:des actes administratifs)(?:[ -])*([0-9]{4})'
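Under the new Attrap_prefdpt base class a departmental scraper is pure configuration: the base class presumably walks the standard prefecture site layout and applies these regexes to widget titles to extract the covered year or month. Checking the pref01 pattern against a plausible card title:

    import re

    pattern = '(?:Recueil|Recueils) (?:des actes administratifs)(?:[ -])*([0-9]{4})'
    match = re.search(pattern, 'Recueil des actes administratifs - 2024')
    print(match.group(1))  # 2024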
Attrap_pref02.py (new file)

+from Attrap_prefdpt import Attrap_prefdpt
+
+
+class Attrap_pref02(Attrap_prefdpt):
+
+    # Configuration de la préfecture
+    hostname = 'https://www.aisne.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
+    full_name = 'Préfecture de l\'Aisne'
+    short_code = 'pref02'
+    timezone = 'Europe/Paris'
+
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = 'RAA [Aa]nnée ([0-9]{4})'
Attrap_pref03.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref03(Attrap):
+class Attrap_pref03(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.allier.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs-arretes'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.allier.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-arretes'
     full_name = 'Préfecture de l\'Allier'
     short_code = 'pref03'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        elements = []
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        for sub_page in self.get_sub_pages(
-            page_content,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        ):
-            if Attrap.guess_date(sub_page['name'], '.* ([0-9]{4})').year >= self.not_before.year:
-                sub_page_content = self.get_page(sub_page['url'], 'get').content
-                for element in self.get_raa_elements(sub_page_content):
-                    elements.append(element)
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
-        for a in soup.select('a.fr-link.fr-link--download'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
Attrap_pref04.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref04(Attrap):
+class Attrap_pref04(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.alpes-de-haute-provence.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Publications-administratives-et-legales/Recueil-des-Actes-Administratifs'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.alpes-de-haute-provence.gouv.fr'
+    raa_page = f'{hostname}/Publications/Publications-administratives-et-legales/Recueil-des-Actes-Administratifs'
     full_name = 'Préfecture des Alpes-de-Haute-Provence'
     short_code = 'pref04'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        elements = []
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        for sub_page in self.get_sub_pages(
-            page_content,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        ):
-            if Attrap.guess_date(sub_page['name'], '([0-9]{4}).*').year >= self.not_before.year:
-                sub_page_content = self.get_page(sub_page['url'], 'get').content
-                for element in self.get_raa_elements(sub_page_content):
-                    elements.append(element)
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
-        for a in soup.select('a.fr-link.fr-link--download'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
Attrap_pref05.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref05(Attrap):
+class Attrap_pref05(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.hautes-alpes.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.hautes-alpes.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
     full_name = 'Préfecture des Hautes-Alpes'
     short_code = 'pref05'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        year_pages_to_parse = []
-
-        # On détermine quelles pages d'année parser
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        year_pages = self.get_sub_pages(
-            page_content,
-            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        )
-        for year_page in year_pages:
-            if int(year_page['name'].replace('Année ', '').strip()) >= self.not_before.year:
-                year_pages_to_parse.append(year_page['url'])
-
-        month_pages_to_parse = []
-        # Pour chaque année, on cherche les sous-pages de mois
-        for year_page in year_pages_to_parse:
-            page_content = self.get_page(year_page, 'get').content
-            month_pages = self.get_sub_pages(
-                page_content,
-                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-                self.__HOST,
-                False
-            )[::-1]
-            for month_page in month_pages:
-                # On filtre les mois ne correspondant pas à la période analysée
-                guessed_date = Attrap.guess_date(month_page['name'], '(.*)')
-                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
-                    month_pages_to_parse.append(month_page['url'])
-
-        pages_to_parse = []
-        # Pour chaque page de mois, on cherche les pages de RAA
-        for month_page in month_pages_to_parse:
-            pages = self.get_sub_pages_with_pager(
-                month_page,
-                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-                'nav.fr-pagination ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
-                'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
-                self.__HOST
-            )[::-1]
-            for page in pages:
-                guessed_date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
-                if guessed_date.replace(day=1) >= self.not_before.replace(day=1):
-                    pages_to_parse.append(page['url'])
-
-        elements = []
-        # On parse les pages contenant des RAA
-        for page in pages_to_parse:
-            page_content = self.get_page(page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        # On parse les RAA
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-grid-row div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = 'Année *([0-9]{4})'
+    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* *[0-9]{4})'
Attrap_pref06.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref06(Attrap):
+class Attrap_pref06(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.alpes-maritimes.gouv.fr'
-    __RAA_PAGE = {
-        '2024': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-specifiques'
-        ],
-        '2023': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-specifiques'
-        ],
-        '2022': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-specifiques'
-        ],
-        '2021': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-specifiques'
-        ],
-        '2020': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-specifiques'
-        ],
-        '2019': [
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-mensuels',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-speciaux',
-            f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-specifiques'
-        ]
-    }
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.alpes-maritimes.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-RAA'
     full_name = 'Préfecture des Alpes-Maritimes'
     short_code = 'pref06'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-        if self.not_before.year <= 2024:
-            for page in self.__RAA_PAGE['2024']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2023:
-            for page in self.__RAA_PAGE['2023']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2022:
-            for page in self.__RAA_PAGE['2022']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2021:
-            for page in self.__RAA_PAGE['2021']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2020:
-            for page in self.__RAA_PAGE['2020']:
-                pages_to_parse.append(page)
-        if self.not_before.year <= 2019:
-            for page in self.__RAA_PAGE['2019']:
-                pages_to_parse.append(page)
-
-        elements = self.get_raa_with_pager(
-            pages_to_parse,
-            ".fr-pagination__link.fr-pagination__link--next",
-            self.__HOST
-        )
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        # Pour chaque élément fr-card__content, on cherche sa balise a, et si c'est un PDF on le parse
-        cards = soup.find_all('div', class_='fr-card__content')
-        for card in cards:
-            a = card.find('a')
-            if a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.get_text().strip()
-                date = datetime.datetime.strptime(card.find('p', class_='fr-card__detail').get_text().replace('Publié le ', '').strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = 'Année *([0-9]{4})'
Attrap_pref09.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref09(Attrap):
+class Attrap_pref09(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.ariege.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-l-Ariege-a-partir-du-28-avril-2015'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.ariege.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-l-Ariege-a-partir-du-28-avril-2015'
     full_name = 'Préfecture de l\'Ariège'
     short_code = 'pref09'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        pages_to_parse = []
-
-        # Les RAA de l'Ariège sont éparpillés sur des sous-pages par mois.
-        # Donc on parse la page principale à la recherche des sous-pages.
-        sub_pages = self.get_sub_pages_with_pager(
-            self.__RAA_PAGE,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
-            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
-            self.__HOST
-        )[::-1]
-
-        # On filtre par date les sous-pages pour limiter les requêtes
-        for sub_page in sub_pages:
-            guessed_date = datetime.datetime.strptime(sub_page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
-            guessed_date.replace(day=1)
-            if guessed_date >= self.not_before:
-                pages_to_parse.append(sub_page['url'])
-
-        # On parse les pages contenant des RAA
-        elements = []
-        for page in pages_to_parse:
-            page_content = self.get_page(page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
Attrap_pref10.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref10(Attrap):
+class Attrap_pref10(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.aube.gouv.fr'
-    __RAA_ARCHIVES_PAGE = f'{__HOST}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA-Archives/'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.aube.gouv.fr'
+    raa_page = [
+        f'{hostname}/Publications/RAA-Recueil-des-Actes-Administratifs',
+        f'{hostname}/Publications/RAA-Recueil-des-Actes-Administratifs/RAA-Archives'
+    ]
     full_name = 'Préfecture de l\'Aube'
     short_code = 'pref10'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        # La préfecture de l'Aube a une page avec ses archives, et une page avec l'année en cours. On parse
-        # donc le menu déroulant de la gauche de la page d'archives (et non le contenu de la page) pour récupérer,
-        # premièrement, la page d'année en cours et, deuxièmement, les années archivées.
-        pages_to_parse = []
-
-        archives_page_content = self.get_page(self.__RAA_ARCHIVES_PAGE, 'get').content
-        for year_page in self.get_sub_pages(
-            archives_page_content,
-            'ul.fr-sidemenu__list a.fr-sidemenu__link',
-            self.__HOST,
-            False,
-        ):
-            if Attrap.guess_date(year_page['name'], '.* ([0-9]{4})').year >= self.not_before.year:
-                pages_to_parse.append(year_page['url'])
-
-        elements = self.get_raa_with_pager(
-            pages_to_parse,
-            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
-            self.__HOST
-        )
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # On récupère chaque balise a
-        for a in soup.select('a.fr-card__link.menu-item-link'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.text.strip()
-                date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
-                if date >= self.not_before:
-                    raa = Attrap.RAA(url, date, name)
-                    elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = 'RAA *([0-9]{4})'
+
+    # On ajoute un widget custom représentant les liens sur la page d'accueil
+    Attrap_prefdpt.widgets.append(
+        Attrap_prefdpt.DptWidget(
+            'homepage_links',
+            regex={'year': 'Année *([0-9]{4})'},
+            css_path={'title': 'div.fr-text--lead p a.fr-link'}
+        )
+    )
Attrap_pref11.py (new file)

+from Attrap_prefdpt import Attrap_prefdpt
+
+
+class Attrap_pref11(Attrap_prefdpt):
+
+    # Configuration de la préfecture
+    hostname = 'https://www.aude.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
+    full_name = 'Préfecture de l\'Aude'
+    short_code = 'pref11'
+    timezone = 'Europe/Paris'
+
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = 'Année *([0-9]{4})'
Attrap_pref13.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref13(Attrap):
+class Attrap_pref13(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.bouches-du-rhone.gouv.fr'
-    __RAA_PAGE = [
-        f'{__HOST}/Publications/RAA-et-Archives/RAA-2024',
-        f'{__HOST}/Publications/RAA-et-Archives/RAA-2023',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2022',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2021',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2020',
-        f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2019'
-    ]
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    # Configuration de la préfecture
+    hostname = 'https://www.bouches-du-rhone.gouv.fr'
+    raa_page = [
+        f'{hostname}/Publications/RAA-et-Archives',
+        f'{hostname}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone'
+    ]
     full_name = 'Préfecture des Bouches-du-Rhône'
     short_code = 'pref13'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        elements = []
-        for raa_page in self.__RAA_PAGE:
-            page_content = self.get_page(raa_page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
-        for a in soup.select('a.fr-link.fr-link--download'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = 'RAA[- ]*([0-9]{4})'
+    Attrap_prefdpt.grey_card['follow_link_on_unrecognised_date'] = False
Attrap_pref25.py

-import os
-import datetime
-import logging
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
-
-logger = logging.getLogger(__name__)
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref25(Attrap):
+class Attrap_pref25(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.doubs.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Publications-Legales/Recueil-des-Actes-Administratifs-RAA'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.doubs.gouv.fr'
+    raa_page = f'{hostname}/Publications/Publications-Legales/Recueil-des-Actes-Administratifs-RAA'
     full_name = 'Préfecture du Doubs'
     short_code = 'pref25'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        sub_pages = self.get_sub_pages_with_pager(
-            self.__RAA_PAGE,
-            'a.fr-card__link',
-            'a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
-            None,
-            self.__HOST,
-        )
-        pages_to_parse = []
-
-        # TODO : détecter la date de la page à partir du parsing de la page principale et non à partir de son URL
-        for sub_page in sub_pages:
-            url = sub_page['url']
-            last_word = url.split('-')[-1]
-            year = 0
-            try:
-                year = int(last_word)
-                if self.not_before.year <= year:
-                    pages_to_parse.append(url)
-            except Exception as e:
-                logger.warning(f"Impossible de déterminer l'année de l'URL {url}")
-
-        elements = []
-        for raa_page in pages_to_parse:
-            page_content = self.get_page(raa_page, 'get').content
-            elements.extend(self.get_raa_elements(page_content))
-
-        self.parse_raa(elements[::-1], keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-
-        elements = []
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').text.split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
+    Attrap_prefdpt.grey_card['follow_link_on_unrecognised_date'] = False
Attrap_pref29.py (new file)

+from Attrap_prefdpt import Attrap_prefdpt
+
+
+class Attrap_pref29(Attrap_prefdpt):
+
+    # Configuration de la préfecture
+    hostname = 'https://www.finistere.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
+    full_name = 'Préfecture du Finistère'
+    short_code = 'pref29'
+    timezone = 'Europe/Paris'
+
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = '(?:Recueils publiés en ).*([0-9]{4})'
Attrap_pref2a.py (new file)

+from Attrap_prefdpt import Attrap_prefdpt
+
+
+class Attrap_pref2a(Attrap_prefdpt):
+
+    # Configuration de la préfecture
+    hostname = 'https://www.corse-du-sud.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-de-la-prefecture-de-la-Corse-du-Sud'
+    full_name = 'Préfecture de la Corse-du-Sud'
+    short_code = 'pref2a'
+    timezone = 'Europe/Paris'
+
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
Attrap_pref2b.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref2b(Attrap):
+class Attrap_pref2b(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.haute-corse.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0'
+    # Configuration de la préfecture
+    hostname = 'https://www.haute-corse.gouv.fr'
+    raa_page = f'{hostname}/Publications/Publications-administratives-et-legales/Recueils-des-actes-administratifs'
     full_name = 'Préfecture de Haute-Corse'
     short_code = 'pref2b'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        # La préfecture de Haute-Corse organise son site avec une page dédiée à l'année N, une autre dédiée à l'année N-1,
-        # puis les années d'avant sont regroupées ensemble sur deux pages. On doit donc parser les pages jusqu'à ce qu'on ne
-        # tombe plus sur des cartes d'années.
-        pages_to_parse = []
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        for card in self.get_sub_pages(
-            page_content,
-            'div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        ):
-            if Attrap.guess_date(card['name'], '[a-z]* ([0-9]{4})').year >= self.not_before.year:
-                pages_to_parse.append(card['url'])
-            else:
-                # Si on n'a pas trouvé une page d'année, on tente de parser la page à la recherche
-                # de sous-pages (et sinon on ignore la page)
-                page_content = self.get_page(card['url'], 'get').content
-                for card in self.get_sub_pages(
-                    page_content,
-                    'div.fr-card__body div.fr-card__content h2.fr-card__title a',
-                    self.__HOST,
-                    False
-                ):
-                    if Attrap.guess_date(card['name'], '[a-z]* ([0-9]{4})').year >= self.not_before.year:
-                        pages_to_parse.append(card['url'])
-
-        # Pour chaque année, on cherche les sous-pages de mois
-        month_pages_to_parse = []
-        for year_page in pages_to_parse:
-            for month_page in self.get_sub_pages_with_pager(
-                year_page,
-                'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
-                'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
-                None,
-                self.__HOST
-            ):
-                if Attrap.guess_date(month_page['name'], '([a-zûé]* [0-9]{4})').replace(day=1) >= self.not_before.replace(day=1):
-                    month_pages_to_parse.append(month_page['url'])
-
-        # On parse les pages contenant des RAA
-        elements = []
-        for page in month_pages_to_parse:
-            page_content = self.get_page(page, 'get').content
-            for element in self.get_raa_elements(page_content):
-                elements.append(element)
-
-        self.parse_raa(elements[::-1], keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
-                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = 'Recueils des actes administratifs ([0-9]{4})'
+    Attrap_prefdpt.white_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
Attrap_pref30.py (new file)

+from Attrap_prefdpt import Attrap_prefdpt
+
+
+class Attrap_pref30(Attrap_prefdpt):
+
+    # Configuration de la préfecture
+    hostname = 'https://www.gard.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
+    full_name = 'Préfecture du Gard'
+    short_code = 'pref30'
+    timezone = 'Europe/Paris'
+
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
Attrap_pref31.py

-import os
-import datetime
-
-from bs4 import BeautifulSoup
-from urllib.parse import unquote
-
-from Attrap import Attrap
+from Attrap_prefdpt import Attrap_prefdpt


-class Attrap_pref31(Attrap):
+class Attrap_pref31(Attrap_prefdpt):

-    # Config
-    __HOST = 'https://www.haute-garonne.gouv.fr'
-    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs/Recueil-des-Actes-Administratifs-Haute-Garonne'
-    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    # Configuration de la préfecture
+    hostname = 'https://www.haute-garonne.gouv.fr'
+    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs/Recueil-des-Actes-Administratifs-Haute-Garonne'
     full_name = 'Préfecture de la Haute-Garonne'
     short_code = 'pref31'
+    timezone = 'Europe/Paris'

-    def __init__(self, data_dir):
-        super().__init__(data_dir, self.__USER_AGENT)
-        self.set_sleep_time(30)
-
-    def get_raa(self, keywords):
-        # On cherche les pages de chaque mois
-        page_content = self.get_page(self.__RAA_PAGE, 'get').content
-        month_pages = self.get_sub_pages(
-            page_content,
-            '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
-            self.__HOST,
-            False
-        )[::-1]
-
-        pages_to_parse = []
-
-        # On filtre les pages de mois pour limiter le nombre de requêtes
-        for month_page in month_pages:
-            guessed_date = Attrap.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})')
-            if guessed_date >= self.not_before.replace(day=1):
-                pages_to_parse.append(month_page['url'])
-
-        elements = []
-        # On parse les pages des mois qu'on veut analyser
-        for element in self.get_raa_with_pager(
-            pages_to_parse,
-            ".fr-pagination__link.fr-pagination__link--next",
-            self.__HOST
-        ):
-            elements.append(element)
-
-        self.parse_raa(elements, keywords)
-        self.mailer()
-
-    def get_raa_elements(self, page_content):
-        elements = []
-        # On charge le parser
-        soup = BeautifulSoup(page_content, 'html.parser')
-        # On récupère chaque balise a
-        for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'):
-            if a.get('href') and a['href'].endswith('.pdf'):
-                if a['href'].startswith('/'):
-                    url = f"{self.__HOST}{a['href']}"
-                else:
-                    url = a['href']
-                url = unquote(url)
-                name = a.get_text().strip().capitalize()
-                date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
-                raa = Attrap.RAA(url, date, name)
-                elements.append(raa)
-        return elements
+    # Configuration des widgets à analyser
+    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'