Skip to content
Extraits de code Groupes Projets

Comparer les révisions

Les modifications sont affichées comme si la révision source était fusionnée avec la révision cible. En savoir plus sur la comparaison des révisions.

Source

Sélectionner le projet cible
No results found

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref33(Attrap_prefdpt):
    """Scraper configuration for the Gironde prefecture (pref33)."""

    # Prefecture configuration
    hostname = 'https://www.gironde.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de la Gironde'
    short_code = 'pref33'
    timezone = 'Europe/Paris'

    # Widgets to analyse. NOTE(review): these assignments mutate dicts that
    # live on Attrap_prefdpt itself, so they are shared by every subclass in
    # the same process — presumably each prefecture module runs in isolation;
    # confirm before importing several at once.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Recueils{0,1} des [Aa]ctes [Aa]dministratifs de l\'année ([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['follow_link_on_unrecognised_date'] = False
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref34(Attrap_prefdpt):
    """Scraper configuration for the Hérault prefecture (pref34)."""

    # Prefecture configuration
    hostname = 'https://www.herault.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs'
    full_name = 'Préfecture de l\'Hérault'
    short_code = 'pref34'
    timezone = 'Europe/Paris'

    # Widgets to analyse: the same year pattern ("Recueil des actes
    # administratifs YYYY" or "Année YYYY") applies to both card widgets.
    year_regex = '(?:(?:Recueil des actes administratifs)|(?:Année))[ -]([0-9]{4})'
    Attrap_prefdpt.white_card['regex']['year'] = year_regex
    Attrap_prefdpt.grey_card['regex']['year'] = year_regex
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref35(Attrap_prefdpt):
    """Scraper configuration for the Ille-et-Vilaine prefecture (pref35)."""

    # Prefecture configuration
    hostname = 'https://www.ille-et-vilaine.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture d\'Ille-et-Vilaine'
    short_code = 'pref35'
    timezone = 'Europe/Paris'

    # Widgets to analyse
    year_regex = 'Recueil des actes administratifs ([0-9]{4})'
    Attrap_prefdpt.white_card['regex']['year'] = year_regex

    # Add a drop-down (select) widget for the RAA archive list; entries are
    # matched by year.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex=year_regex,
            css_path='div.fr-select-group select#Archives-des-RAA-liste-docs.fr-select',
            type='year'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref38(Attrap_prefdpt):
    """Scraper configuration for the Isère prefecture (pref38)."""

    # Prefecture configuration
    hostname = 'https://www.isere.gouv.fr'
    raa_page = f'{hostname}/Publications/RAA-Recueil-des-actes-administratifs'
    full_name = 'Préfecture de l\'Isère'
    short_code = 'pref38'
    timezone = 'Europe/Paris'

    # Widgets to analyse: the year appears either after the full RAA title or
    # after "Année ".
    year_regex = '(?:(?:[Rr]ecueils{0,1} des [Aa]ctes [Aa]dministratifs de la [Pp]réfecture de l\'Isère[ -]*)|(?:Année ))([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['year'] = year_regex
    Attrap_prefdpt.white_card['regex']['year'] = year_regex
    # Skip the white card advertising the Journal officiel.
    Attrap_prefdpt.white_card['exclude'] = ['Vous recherchez "Le Journal officiel de la République française" ?']

    # Add a drop-down (select) widget; entries carry a full French date
    # ("12 janvier 2024", "1er mars 2024", ...).
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='([0-9]{1,2}[er]{0,1} [a-zéû]* [0-9]{4})',
            css_path='select#-liste-docs',
            type='year-month-day'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref39(Attrap_prefdpt):
    """Scraper configuration for the Jura prefecture (pref39)."""

    # Prefecture configuration
    hostname = "https://www.jura.gouv.fr"
    raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
    full_name = "Préfecture du Jura"
    short_code = "pref39"
    timezone = 'Europe/Paris'

    # Widgets to analyse: grey cards are titled "Année YYYY".
    Attrap_prefdpt.grey_card['regex']['year'] = 'Année ([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref42(Attrap_prefdpt):
    """Scraper configuration for the Loire prefecture (pref42)."""

    # Prefecture configuration
    hostname = 'https://www.loire.gouv.fr'
    raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de la Loire'
    short_code = 'pref42'
    timezone = 'Europe/Paris'

    # Widgets to analyse: white cards are titled with a bare year.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref44(Attrap_prefdpt):
    """Scraper configuration for the Loire-Atlantique prefecture (pref44)."""

    # Prefecture configuration
    hostname = 'https://www.loire-atlantique.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-RAA-en-Loire-Atlantique'
    full_name = 'Préfecture de la Loire-Atlantique'
    short_code = 'pref44'
    timezone = 'Europe/Paris'

    # Widgets to analyse: grey cards carry either a bare year or a
    # "<month name> YYYY" label.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    # NOTE(review): presumably appends the surrounding year to month labels
    # lacking one — confirm against Attrap_prefdpt.
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref49(Attrap_prefdpt):
    """Scraper configuration for the Maine-et-Loire prefecture (pref49)."""

    # Prefecture configuration
    hostname = 'https://www.maine-et-loire.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de Maine-et-Loire'
    short_code = 'pref49'
    timezone = 'Europe/Paris'

    # Widgets to analyse: the greedy '.*' prefix makes this capture the LAST
    # 4-digit run in the card title.
    Attrap_prefdpt.grey_card['regex']['year'] = '.*([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref50(Attrap_prefdpt):
    """Scraper configuration for the Manche prefecture (pref50)."""

    # Prefecture configuration
    hostname = 'https://www.manche.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de la Manche'
    short_code = 'pref50'
    timezone = 'Europe/Paris'

    # Widgets to analyse: grey cards are titled with a bare year.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref52(Attrap_prefdpt):
    """Scraper configuration for the Haute-Marne prefecture (pref52)."""

    # Prefecture configuration
    hostname = 'https://www.haute-marne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Haute-Marne'
    short_code = 'pref52'
    timezone = 'Europe/Paris'

    # Widgets to analyse: white cards are titled "Année YYYY".
    Attrap_prefdpt.white_card['regex']['year'] = 'Année ([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref54(Attrap_prefdpt):
    """Scraper configuration for the Meurthe-et-Moselle prefecture (pref54)."""

    # Prefecture configuration
    hostname = 'https://www.meurthe-et-moselle.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de Meurthe-et-Moselle'
    short_code = 'pref54'
    timezone = 'Europe/Paris'

    # Widgets to analyse: white cards are titled with a bare year.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'

    # Add a drop-down (select) widget; entries end with a full French date
    # introduced by "du" (e.g. "... du 1er janvier 2024").
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='.* du ([0-9]*(?:er|ER)? [A-Za-zéÉûÛ]* [0-9]*)',
            css_path='select#Liste-liste-docs',
            type='year-month-day'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref55(Attrap_prefdpt):
    """Scraper configuration for the Meuse prefecture (pref55)."""

    # Prefecture configuration
    hostname = 'https://www.meuse.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Meuse'
    short_code = 'pref55'
    timezone = 'Europe/Paris'

    # Configure the drop-down (select) widget; entries are labelled
    # "RAA année YYYY".
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='RAA année ([0-9]{4})',
            css_path='select#Liste-des-recueils-liste-docs',
            type='year'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref59(Attrap_prefdpt):
    """Scraper configuration for the Nord prefecture (pref59)."""

    # Prefecture configuration
    hostname = 'https://www.nord.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord'
    full_name = 'Préfecture du Nord'
    short_code = 'pref59'
    timezone = 'Europe/Paris'

    # Widgets to analyse: grey cards carry either a bare year or a
    # "<month name> YYYY" label.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    # NOTE(review): presumably appends the surrounding year to month labels
    # lacking one — confirm against Attrap_prefdpt.
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref61(Attrap_prefdpt):
    """Scraper configuration for the Orne prefecture (pref61)."""

    # Prefecture configuration
    hostname = 'https://www.orne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de l\'Orne'
    short_code = 'pref61'
    timezone = 'Europe/Paris'

    # Widgets to analyse: grey cards carry either the full RAA title with a
    # year, or a "<month name> YYYY" label; white cards only the latter.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Le Recueil des actes administratifs ([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
    Attrap_prefdpt.white_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.white_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref62(Attrap_prefdpt):
    """Scraper configuration for the Pas-de-Calais prefecture (pref62)."""

    # Prefecture configuration
    hostname = 'https://www.pas-de-calais.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture du Pas-de-Calais'
    short_code = 'pref62'
    timezone = 'Europe/Paris'

    # Widgets to analyse: white cards are titled with a bare year.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref63(Attrap_prefdpt):
    """Scraper configuration for the Puy-de-Dôme prefecture (pref63)."""

    # Prefecture configuration
    hostname = 'https://www.puy-de-dome.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-Puy-de-Dome'
    full_name = 'Préfecture du Puy-de-Dôme'
    short_code = 'pref63'
    timezone = 'Europe/Paris'

    # Widgets to analyse: grey cards are titled with a bare year.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref64(Attrap):
    """Scraper for the Pyrénées-Atlantiques prefecture (pref64).

    RAA are organised as year pages, each listing month pages whose
    contents are paginated.
    """

    # Configuration
    hostname = 'https://www.pyrenees-atlantiques.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture des Pyrénées-Atlantiques'
    short_code = 'pref64'
    timezone = 'Europe/Paris'

    def __init__(self, data_dir):
        super().__init__(data_dir, self.user_agent)
        self.set_sleep_time(30)

    def get_raa(self, keywords):
        """Collect, parse and report every RAA within the configured window."""
        card_link_selector = 'div.fr-card__body div.fr-card__content h2.fr-card__title a'

        # Year pages within the allowed range (guess_date yields year 9999
        # when no date can be extracted).
        year_urls = []
        root_content = self.get_page(self.raa_page, 'get').content
        for sub_page in self.get_sub_pages(root_content, card_link_selector, self.hostname, False):
            year = Attrap.guess_date(sub_page['name'], '.* ([0-9]{4})').year
            if self.not_before.year <= year < 9999:
                year_urls.append(sub_page['url'])

        # Month pages of each selected year page, filtered on the month.
        month_urls = []
        for year_url in year_urls:
            year_content = self.get_page(year_url, 'get').content
            for sub_page in self.get_sub_pages(year_content, card_link_selector, self.hostname, False):
                month_start = Attrap.guess_date(sub_page['name'], '(.*)').replace(day=1)
                if month_start >= self.not_before.replace(day=1):
                    month_urls.append(sub_page['url'])

        # Follow each month page's pager to gather the RAA themselves.
        elements = self.get_raa_with_pager(
            list(reversed(month_urls)),
            'a.fr-pagination__link--next.fr-pagination__link--lg-label',
            self.hostname
        )[::-1]
        self.parse_raa(elements, keywords)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract RAA entries (PDF card links) from one month page."""
        soup = BeautifulSoup(page_content, 'html.parser')
        elements = []
        for link in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'):
            href = link.get('href')
            if not (href and href.endswith('.pdf')):
                continue
            url = unquote(f"{self.hostname}{href}" if href.startswith('/') else href)
            name = link.get_text().strip()
            # The publication date is the last " - " separated field of the
            # link's title attribute (dd/mm/YYYY).
            date = datetime.datetime.strptime(link['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
            elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
        return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref65(Attrap):
    """Scraper for the Hautes-Pyrénées prefecture (pref65).

    The root page lists paginated year cards; each year page links the RAA
    PDFs directly.
    """

    # Configuration
    hostname = 'https://www.hautes-pyrenees.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-d-actes-administratifs'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture des Hautes-Pyrénées'
    short_code = 'pref65'
    timezone = 'Europe/Paris'

    def __init__(self, data_dir):
        super().__init__(data_dir, self.user_agent)
        self.set_sleep_time(30)

    def get_raa(self, keywords):
        """Collect, parse and report every RAA within the configured window."""
        # Walk the root page's pager to enumerate the year cards.
        year_pages = self.get_sub_pages_with_pager(
            self.raa_page,
            'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
            'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
            self.hostname
        )
        # Keep only the years inside the configured window.
        pages_to_parse = [
            page['url']
            for page in year_pages
            if Attrap.guess_date(page['name'].strip(), '.*([0-9]{4})').year >= self.not_before.year
        ]

        elements = []
        for page_url in pages_to_parse:
            content = self.get_page(page_url, 'get').content
            elements.extend(self.get_raa_elements(content))
        self.parse_raa(elements, keywords)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract RAA entries (PDF download links) from one year page."""
        soup = BeautifulSoup(page_content, 'html.parser')
        elements = []
        for link in soup.select('a.fr-link.fr-link--download'):
            href = link.get('href')
            if not (href and href.endswith('.pdf')):
                continue
            url = unquote(f"{self.hostname}{href}" if href.startswith('/') else href)
            span = link.find('span')
            # The RAA name is the text node preceding the details span; the
            # span itself ends with the publication date (dd/mm/YYYY).
            name = span.previous_sibling.replace('Télécharger ', '').strip()
            date = datetime.datetime.strptime(span.get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
            elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
        return elements
import os
import sys
import datetime
import logging
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
logger = logging.getLogger(__name__)
class Attrap_pref66(Attrap):
    """Scraper for the Pyrénées-Orientales prefecture (pref66).

    Year pages before 2024 list every RAA directly; from 2024 onwards the
    year page is paginated and must be crawled sub-page by sub-page.
    """

    # Configuration
    hostname = 'https://www.pyrenees-orientales.gouv.fr'
    raa_page = f'{hostname}/Publications/Le-recueil-des-actes-administratifs'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture des Pyrénées-Orientales'
    short_code = 'pref66'
    timezone = 'Europe/Paris'

    def __init__(self, data_dir):
        super().__init__(data_dir, self.user_agent)
        self.set_sleep_time(30)

    def get_raa(self, keywords):
        """Collect, parse and report every RAA within the configured window."""
        # Select the year pages to parse (guess_date yields year 9999 when no
        # date can be extracted).
        year_pages = []
        page_content = self.get_page(self.raa_page, 'get').content
        for year_page in self.get_sub_pages(
            page_content,
            '.fr-table table tr td h3 a.fr-link',
            self.hostname,
            False
        ):
            year = Attrap.guess_date(year_page['name'].strip(), '.* ([0-9]{4})').year
            if year < 9999 and year >= self.not_before.year:
                year_pages.append([year_page['url'], year])
        elements = []
        # The Pyrénées-Orientales prefecture is an odd one: before 2024, each
        # yearly page lists every RAA, but not always with its date, which
        # then has to be guessed from the RAA name. In 2024 that changed: the
        # 2024 page has a summary table with the publication dates, plus a
        # pager. Except the summary table turns out not to be exhaustive, so
        # every 2024+ sub-page has to be parsed anyway. Grrr.
        for url, year in year_pages:
            if year >= 2024:
                elements.extend(self.get_raa_elements_since_2024(url))
            else:
                elements.extend(self.get_raa_elements_before_2024(url))
        self.parse_raa(elements, keywords)
        self.mailer()

    def get_raa_elements_before_2024(self, page):
        """Parse one pre-2024 year page and return its RAA elements."""
        elements = []
        page_content = self.get_page(page, 'get').content
        soup = BeautifulSoup(page_content, 'html.parser')
        for a in soup.select('div.fr-table.fr-table--bordered.list a.fr-link.fr-link--download'):
            if a.get('href') and a['href'].endswith('.pdf'):
                date = None
                try:
                    # When the date is not displayed on screen, it is hidden
                    # in the link's "title" attribute.
                    if a.find('span'):
                        details = a.find('span').get_text().split(' - ')[-1].strip()
                    else:
                        details = a['title'].split(' - ')[-1].strip()
                    date = datetime.datetime.strptime(details, '%d/%m/%Y')
                except Exception as exc:
                    # BUGFIX: this log line used to reference an undefined
                    # name (text_raw), raising a NameError instead of logging;
                    # report the link text instead.
                    logger.error(f'Impossible de trouver de date pour le texte : {a.get_text().strip()}: {exc}')
                    sys.exit(1)
                if date >= self.not_before:
                    if a['href'].startswith('/'):
                        url = f"{self.hostname}{a['href']}"
                    else:
                        url = a['href']
                    url = unquote(url)
                    # The RAA name is the text node before the details span
                    # when present, otherwise the whole link text.
                    if a.find('span') and a.find('span').previous_sibling:
                        name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                    else:
                        name = a.get_text().replace('Télécharger ', '').strip()
                    # Pass the timezone, as every other RAA built by this
                    # scraper does (it was missing only here).
                    elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
        return elements

    def get_raa_elements_since_2024(self, root_page):
        """Parse one 2024+ (paginated) year page and return its RAA elements."""
        pages = self.get_sub_pages_with_pager(
            root_page,
            'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
            'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
            'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
            self.hostname
        )[::-1]
        elements = []
        for page in pages:
            if not page['url'].endswith('.pdf'):
                logger.warning(f"Attention, le lien vers {page['url']} n'est pas bon !")
            else:
                if page['url'].startswith('/'):
                    url = f"{self.hostname}{page['url']}"
                else:
                    url = page['url']
                url = unquote(url)
                name = page['name'].replace('Télécharger ', '').strip()
                date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
                elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
        return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref69(Attrap):
    """Scraper for the Rhône prefecture (pref69).

    The root page lists year cards; each year page is paginated and its
    sub-pages link the RAA PDFs directly.
    """

    # Configuration
    hostname = 'https://www.rhone.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture du Rhône'
    short_code = 'pref69'
    timezone = 'Europe/Paris'

    def __init__(self, data_dir):
        super().__init__(data_dir, self.user_agent)
        self.set_sleep_time(30)

    def get_raa(self, keywords):
        """Collect, parse and report every RAA within the configured window."""
        # Year pages within the allowed range (guess_date yields year 9999
        # when no date can be extracted).
        year_urls = []
        root_content = self.get_page(self.raa_page, 'get').content
        for card in self.get_sub_pages(
            root_content,
            'div.fr-card__body div.fr-card__content h2.fr-card__title a',
            self.hostname,
            False
        ):
            year = Attrap.guess_date(card['name'].strip(), '.* ([0-9]{4})').year
            if self.not_before.year <= year < 9999:
                year_urls.append(card['url'])

        # Walk each year page's pager, oldest first, and collect sub-pages.
        sub_page_urls = []
        for year_url in year_urls:
            paged = self.get_sub_pages_with_pager(
                year_url,
                'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
                'ul.fr-pagination__list li a.fr-pagination__link--next',
                None,
                self.hostname)[::-1]
            sub_page_urls.extend(sub_page['url'] for sub_page in paged)

        elements = []
        for sub_page_url in sub_page_urls:
            content = self.get_page(sub_page_url, 'get').content
            elements.extend(self.get_raa_elements(content)[::-1])
        self.parse_raa(elements, keywords)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract RAA entries (PDF download links) from one sub-page."""
        soup = BeautifulSoup(page_content, 'html.parser')
        elements = []
        for link in soup.select('a.fr-link.fr-link--download'):
            href = link.get('href')
            if not (href and href.endswith('.pdf')):
                continue
            url = unquote(f"{self.hostname}{href}" if href.startswith('/') else href)
            span = link.find('span')
            # The RAA name is the text node preceding the details span; the
            # span itself ends with the publication date (dd/mm/YYYY).
            name = span.previous_sibling.replace('Télécharger ', '').strip()
            date = datetime.datetime.strptime(span.get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
            elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
        return elements