Skip to content
Extraits de code Groupes Projets

Comparer les révisions

Les modifications sont affichées comme si la révision source était fusionnée avec la révision cible. En savoir plus sur la comparaison des révisions.

Source

Sélectionner le projet cible
No results found

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
Affichage de
avec 903 ajouts et 962 suppressions
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_ppparis(RAAspotter):
# Config
__HOST = 'https://www.prefecturedepolice.interieur.gouv.fr'
__RAA_PAGE = f'{__HOST}/actualites-et-presse/arretes/accueil-arretes'
__WAIT_ELEMENT = 'block-decree-list-block'
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
full_name = 'Préfecture de police de Paris'
short_code = 'ppparis'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
def get_raa(self, keywords):
self.print_output('RAAspotter_ppparis')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
page_content = self.get_session()
raa_elements = self.get_raa_elements(page_content)
self.parse_raa(raa_elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
for a in soup.find_all('a', href=True):
if a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = 'https://www.prefecturedepolice.interieur.gouv.fr'+a['href']
else:
url = a['href']
url = unquote(url)
name = a.find('span').get_text()
date = datetime.datetime.strptime(a.find('div', class_="field--type-datetime").get_text().strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
def get_session(self):
return super().get_session(self.__RAA_PAGE, self.__WAIT_ELEMENT)
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref06(RAAspotter):
# Config
__HOST = 'https://www.alpes-maritimes.gouv.fr'
__RAA_PAGE = {'2024':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-mensuels',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-speciaux',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2024/Recueils-specifiques'],
'2023':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-mensuels',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-speciaux',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2023/Recueils-specifiques'],
'2022':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-mensuels',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-speciaux',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2022/Recueils-specifiques'],
'2021':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-mensuels',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-speciaux',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2021/Recueils-specifiques'],
'2020':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-mensuels',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-speciaux',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2020/Recueils-specifiques'],
'2019':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-mensuels',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-speciaux',
f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA/Annee-2019/Recueils-specifiques']}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Alpes-Maritimes'
short_code = 'pref06'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(20)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref06')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
for page in self.__RAA_PAGE['2024']:
pages_to_parse.append(page)
if self.not_before.year <= 2023:
for page in self.__RAA_PAGE['2023']:
pages_to_parse.append(page)
if self.not_before.year <= 2022:
for page in self.__RAA_PAGE['2022']:
pages_to_parse.append(page)
if self.not_before.year <= 2021:
for page in self.__RAA_PAGE['2021']:
pages_to_parse.append(page)
if self.not_before.year <= 2020:
for page in self.__RAA_PAGE['2020']:
pages_to_parse.append(page)
if self.not_before.year <= 2019:
for page in self.__RAA_PAGE['2019']:
pages_to_parse.append(page)
elements = self.get_raa_with_pager(pages_to_parse, ".fr-pagination__link.fr-pagination__link--next", self.__HOST)
self.parse_raa(elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque élément fr-card__content, on cherche sa balise a, et si c'est un PDF on le parse
cards = soup.find_all('div', class_='fr-card__content')
for card in cards:
a = card.find('a')
if a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.get_text().strip()
date = datetime.datetime.strptime(card.find('p', class_='fr-card__detail').get_text().replace('Publié le ', '').strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref13(RAAspotter):
# Config
__HOST = 'https://www.bouches-du-rhone.gouv.fr'
__RAA_PAGE = [f'{__HOST}/Publications/RAA-et-Archives/RAA-2024',
f'{__HOST}/Publications/RAA-et-Archives/RAA-2023',
f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2022',
f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2021',
f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2020',
f'{__HOST}/Publications/RAA-et-Archives/Archives-RAA-des-Bouches-du-Rhone/RAA-2019']
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
full_name = 'Préfecture des Bouches-du-Rhône'
short_code = 'pref13'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref13')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
for raa_page in self.__RAA_PAGE:
page_content = self.get_page(raa_page, 'get').content
raa_elements = self.get_raa_elements(page_content)
self.parse_raa(raa_elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref34(RAAspotter):
# Config
__HOST = 'https://www.herault.gouv.fr'
__RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2024',
'2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023',
'2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022',
'2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021',
'2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020',
'2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/Archives/Recueil-des-actes-administratifs-2019'}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
full_name = 'Préfecture de l\'Hérault'
short_code = 'pref34'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref34')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
for raa_page in pages_to_parse:
page_content = self.get_page(raa_page, 'get').content
raa_elements = self.get_raa_elements(page_content)
self.parse_raa(raa_elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque balise a
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref35(RAAspotter):
# Config
__HOST = 'https://www.ille-et-vilaine.gouv.fr'
__RAA_PAGE = [f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2023',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2022',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2021',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2020',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs/Recueil-des-actes-administratifs-2019']
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
full_name = 'Préfecture d\'Ille-et-Vilaine'
short_code = 'pref35'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref35')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
for raa_page in self.__RAA_PAGE:
page_content = self.get_page(raa_page, 'get').content
raa_elements = self.get_raa_elements(page_content)
self.parse_raa(raa_elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
for a in soup.find_all('a', href=True, class_='fr-link--download'):
if a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys, re
import datetime
import logging
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
logger = logging.getLogger(__name__)
class RAAspotter_pref38(RAAspotter):
# Config
__HOST = 'https://www.isere.gouv.fr'
__RAA_PAGE = {'2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024',
'2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023',
'2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2022',
'2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2021',
'2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-recueils-des-actes-administratifs-de-la-prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-Prefecture-de-l-Isere-2020',
'2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2019'}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture de l\'Isère'
short_code = 'pref38'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(20)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref38')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
for raa_page in pages_to_parse:
page_content = self.get_page(raa_page, 'get').content
raa_elements = self.get_raa_elements(page_content, raa_page)
self.parse_raa(raa_elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content, raa_page):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère le select qui contient la liste des RAA
select_list = soup.select('select#-liste-docs')[0]
# On analyse chaque résultat
for option in select_list.find_all('option'):
if not option['value'] == "":
# On estime la date à partir du nom de fichier
guessed_date = RAAspotter.guess_date(option['title'], '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
# Si la date estimée correspond à la plage d'analyse, on demande au serveur les détails du RAA
if guessed_date >= self.not_before:
page_content = self.get_page(raa_page, 'post', {'-liste-docs':option['value']}).content
# On parse la page de détails pour obtenir les propriétés du RAA
soup = BeautifulSoup(page_content, 'html.parser')
a = soup.select('div.liste_deroulante a.fr-link.fr-link--download')[0]
# Si la page contient une balise a qui renvoie vers un pdf, c'est qu'on a obtenu les détails du RAA demandé, donc on le parse
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys, re
import datetime
import dateparser
import logging
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
logger = logging.getLogger(__name__)
class RAAspotter_pref59(RAAspotter):
# Config
__HOST = 'https://www.nord.gouv.fr'
__RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2024',
'2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2023',
'2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2022',
'2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2021',
'2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2020',
'2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord/2019'}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Nord'
short_code = 'pref59'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(20)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref59')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
for raa_page in pages_to_parse:
page_content = self.get_page(raa_page, 'get').content
sub_pages = self.get_sub_pages(page_content, "div.fr-card__body div.fr-card__content h2.fr-card__title a", self.__HOST, True)
for sub_page in sub_pages[::-1]:
sub_page_content = self.get_page(sub_page['url'], 'get').content
sub_raa_elements = self.get_raa_elements(sub_page_content)
self.parse_raa(sub_raa_elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque balise a
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref62(RAAspotter):
# Config
__HOST = 'https://www.pas-de-calais.gouv.fr'
__RAA_PAGE = {'2024':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs/2024-Recueils-des-actes-administratifs'],
'2023':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-des-actes-administratifs',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/2023-Recueils-speciaux-des-actes-administratifs'],
'2022':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-des-Actes-Administratifs',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/2022-Recueils-Speciaux-des-Actes-Administratifs'],
'2021':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-des-actes-administratifs',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/2021-Recueils-speciaux-des-actes-administratifs'],
'2020':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-des-actes-administratifs',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/2020-Recueils-speciaux-des-actes-administratifs'],
'2019':
[f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueil-des-actes-administratifs',
f'{__HOST}/Publications/Recueil-des-actes-administratifs/2019-Recueils-speciaux-des-actes-administratifs']}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Pas-de-Calais'
short_code = 'pref62'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(20)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref62')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
for page in self.__RAA_PAGE['2024']:
pages_to_parse.append(page)
if self.not_before.year <= 2023:
for page in self.__RAA_PAGE['2023']:
pages_to_parse.append(page)
if self.not_before.year <= 2022:
for page in self.__RAA_PAGE['2022']:
pages_to_parse.append(page)
if self.not_before.year <= 2021:
for page in self.__RAA_PAGE['2021']:
pages_to_parse.append(page)
if self.not_before.year <= 2020:
for page in self.__RAA_PAGE['2020']:
pages_to_parse.append(page)
if self.not_before.year <= 2019:
for page in self.__RAA_PAGE['2019']:
pages_to_parse.append(page)
for raa_page in pages_to_parse:
page_content = self.get_page(raa_page, 'get').content
raa_elements = self.get_raa_elements(page_content)
self.parse_raa(raa_elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère le div qui contient la liste des RAA
cards = soup.select('div.fr-downloads-group.fr-downloads-group--bordered')[0]
# On analyse chaque balise a dans ce div
for a in cards.find_all('a', href=True):
if a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements[::-1]
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref69(RAAspotter):
# Config
__HOST = 'https://www.rhone.gouv.fr'
__RAA_PAGE = {'2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024',
'2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023',
'2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022',
'2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021',
'2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020',
'2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019'}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Rhône'
short_code = 'pref69'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(20)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref69')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
sub_pages_to_parse = []
for raa_page in pages_to_parse:
sub_pages = self.get_sub_pages_with_pager(raa_page,
"div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link",
"ul.fr-pagination__list li a.fr-pagination__link--next",
self.__HOST)[::-1]
for sub_page in sub_pages:
sub_pages_to_parse.append(sub_page['url'])
elements = []
for sub_page_to_parse in sub_pages_to_parse:
page_content = self.get_page(sub_page_to_parse, 'get').content
for element in self.get_raa_elements(page_content)[::-1]:
elements.append(element)
self.parse_raa(elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque balise a
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref83(RAAspotter):
# Config
__HOST = 'https://www.var.gouv.fr'
__RAA_PAGE = {'2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2024',
'2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2023',
'2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2022',
'2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2021',
'2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2020',
'2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Recueil-des-actes-administratifs-2019'}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Var'
short_code = 'pref83'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref83')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
sub_pages_to_parse = []
# Pour chaque année, on cherche les sous-pages de mois
for raa_page in pages_to_parse:
sub_pages_to_parse.append(raa_page)
page_content = self.get_page(raa_page, 'get').content
month_pages = self.get_sub_pages(
page_content,
'.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
False
)[::-1]
for month_page in month_pages:
sub_pages_to_parse.append(month_page['url'])
# On parse les pages contenant des RAA
elements = self.get_raa_with_pager(sub_pages_to_parse[::-1], ".fr-pagination__link.fr-pagination__link--next", self.__HOST)
self.parse_raa(elements, keywords.split(','))
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque section contenant un RAA
for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.get_text().strip()
date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
import os, sys
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref976(RAAspotter):
# Config
__HOST = 'https://www.mayotte.gouv.fr'
__RAA_PAGE = {'default': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A',
'2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2024',
'2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2023',
'2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2022',
'2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2021',
'2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2020',
'2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-R.A.A/RAA-2019'}
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
full_name = 'Préfecture de Mayotte'
short_code = 'pref976'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
def get_raa(self, keywords):
self.print_output('RAAspotter_pref976')
self.print_output(f'Termes recherchés: {keywords}')
self.print_output('')
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
sub_pages_to_parse = [self.__RAA_PAGE['default']]
# Pour chaque année, on cherche les sous-pages de mois
for raa_page in pages_to_parse:
page_content = self.get_page(raa_page, 'get').content
month_pages = self.get_sub_pages(
page_content,
'.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
False
)[::-1]
# On regarde aussi si sur la page de l'année il n'y aurait pas un RAA mal catégorisé
for page_to_parse in self.find_raa_card(raa_page):
sub_pages_to_parse.append(page_to_parse)
# Pour chaque mois, on cherche les pages des RAA
for month_page in month_pages:
year = RAAspotter.guess_date(month_page['name'], '(.*)').year
for page_to_parse in self.find_raa_card(month_page['url'], year):
sub_pages_to_parse.append(page_to_parse)
# On parse les pages contenant des RAA
for page in sub_pages_to_parse:
page_content = self.get_page(page, 'get').content
raa_elements = self.get_raa_elements(page_content)
self.parse_raa(raa_elements, keywords.split(','))
self.mailer()
def find_raa_card(self, page, year=None):
pages = []
card_pages = self.get_sub_pages_with_pager(
page,
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
self.__HOST
)[::-1]
for card_page in card_pages:
# On filtre les pages de RAA ne correspondant pas à la période analysée
guessed_date = RAAspotter.guess_date(card_page['name'], 'n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
if year:
guessed_date = guessed_date.replace(year = year)
if guessed_date >= self.not_before:
pages.append(card_page['url'])
return pages
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque balise a
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
filename = url.split('/')[-1]
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements
Ce diff est replié.
Ce diff est replié.
#!/usr/bin/env bash
set -e
s3_key="${1}"
s3_secret="${2}"
s3_host="${3}"
s3_bucket="${4}"
dest="${5}"
root_path=$(dirname $(realpath "${BASH_SOURCE[0]}"))
administrations="ppparis
pref01
pref02
pref03
pref04
pref05
pref06
pref07
pref08
pref09
pref10
pref11
pref12
pref13
pref14
pref15
pref16
pref17
pref18
pref19
pref2a
pref2b
pref21
pref22
pref23
pref24
pref25
pref26
pref27
pref28
pref29
pref30
pref31
pref32
pref33
pref34
pref35
pref36
pref37
pref38
pref39
pref40
pref41
pref42
pref43
pref44
pref45
pref46
pref47
pref49
pref49
pref50
pref51
pref52
pref53
pref54
pref55
pref56
pref57
pref58
pref59
pref60
pref61
pref62
pref63
pref64
pref65
pref66
pref67
pref68
pref69
pref70
pref71
pref72
pref73
pref74
pref75
pref76
pref77
pref78
pref79
pref80
pref81
pref82
pref83
pref84
pref85
pref86
pref87
pref88
pref89
pref90
pref91
pref92
pref93
pref94
pref95
pref971
pref972
pref973
pref974
pref976
prefidf
prefpaca"
if test -z "$s3_key" || test -z "$s3_secret" || test -z "$s3_host" || test -z "$s3_bucket" || test -z "$dest"; then
echo "Usage: ${0} <s3_key> <s3_secret> <s3_host> <s3_bucket> <dest>"
exit 1
fi
for i in $administrations; do
${root_path}/download-from-s3.sh "${i}" "$s3_key" "$s3_secret" "$s3_host" "$s3_bucket" "$dest" || true
rm "${dest}/${i}.zip" || true
done
Ce diff est replié.
Ce diff est replié.
Ce diff est replié.
Ce diff est replié.
Ce diff est replié.
Ce diff est replié.