Skip to content
Extraits de code Groupes Projets

Comparer les révisions

Les modifications sont affichées comme si la révision source était fusionnée avec la révision cible. En savoir plus sur la comparaison des révisions.

Source

Sélectionner le projet cible
No results found

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
Affichage de avec 778 ajouts et 1031 suppressions
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref09(RAAspotter):
    """RAA scraper for the prefecture of Ariège (09).

    The RAAs are scattered over per-month sub-pages, so the main listing
    page is crawled first to discover them.
    """

    # Config
    __HOST = 'https://www.ariege.gouv.fr'
    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs' \
        '/Recueils-des-Actes-Administratifs-de-l-Ariege-a-partir-du-28' \
        '-avril-2015'
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
        'Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture de l\'Ariège'
    short_code = 'pref09'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Crawl the month sub-pages within the analysed period, parse
        their RAAs for the given comma-separated keywords, then send
        the report."""
        self.print_output('RAAspotter_pref09')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')

        pages_to_parse = []

        # The Ariège RAAs are scattered over per-month sub-pages, so the
        # main page is parsed to discover those sub-pages.
        sub_pages = self.get_sub_pages_with_pager(
            self.__RAA_PAGE,
            'div.fr-card__body div.fr-card__content h2.fr-card__title '
            'a.fr-card__link',
            'ul.fr-pagination__list li '
            'a.fr-pagination__link.fr-pagination__link--next',
            'div.fr-card__body div.fr-card__content div.fr-card__end '
            'p.fr-card__detail',
            self.__HOST
        )[::-1]

        # Sub-pages are filtered by date to limit the requests
        for sub_page in sub_pages:
            guessed_date = datetime.datetime.strptime(
                sub_page['details'].replace('Publié le ', '').strip(),
                '%d/%m/%Y'
            )
            # Bug fix: datetime objects are immutable, replace() returns
            # a NEW object. The original code discarded the result, so a
            # month page published mid-month could be skipped even though
            # the month overlaps the analysed period. Normalise to the
            # first day of the month before comparing.
            guessed_date = guessed_date.replace(day=1)
            if guessed_date >= self.not_before:
                pages_to_parse.append(sub_page['url'])

        # Parse the pages that contain RAAs
        for page in pages_to_parse:
            page_content = self.get_page(page, 'get').content
            raa_elements = self.get_raa_elements(page_content)
            self.parse_raa(raa_elements, keywords.split(','))
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        page_content -- raw HTML of a page listing downloadable RAAs.
        Returns a list of RAAspotter.RAA objects.
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Grab every a tag of the downloads group
        for a in soup.select(
            'div.fr-downloads-group.fr-downloads-group--bordered ul li a'
        ):
            if a.get('href') and a['href'].endswith('.pdf'):
                # Resolve relative links against the prefecture host
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                # The text before the span holds the document title
                name = a.find('span').previous_sibling.replace(
                    'Télécharger ',
                    ''
                ).strip()
                # The span ends with the publication date (dd/mm/yyyy)
                date = datetime.datetime.strptime(
                    a.find('span').get_text().split(' - ')[-1].strip(),
                    '%d/%m/%Y'
                )
                filename = url.split('/')[-1]
                raa = RAAspotter.RAA(url, date, name, filename)
                elements.append(raa)
        return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref34(RAAspotter):
    """RAA scraper for the prefecture of Hérault (34).

    One listing page per year; the 2019 page lives under the archives
    section of the site.
    """

    # Config
    __HOST = 'https://www.herault.gouv.fr'
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2024',
        '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2023',
        '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2022',
        '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2021',
        '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2020',
        '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/Archives/Recueil-des-actes-administratifs-2019'
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    full_name = 'Préfecture de l\'Hérault'
    short_code = 'pref34'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Fetch the year pages within the analysed period, parse their
        RAAs for the given comma-separated keywords, then send the
        report."""
        self.print_output('RAAspotter_pref34')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')

        # Derive the year pages from the configured map instead of the
        # original copy-pasted if-chain. Dict insertion order keeps the
        # most recent year first, exactly as before, and adding a new
        # year now only requires touching __RAA_PAGE.
        pages_to_parse = [
            url
            for year, url in self.__RAA_PAGE.items()
            if self.not_before.year <= int(year)
        ]

        for raa_page in pages_to_parse:
            page_content = self.get_page(raa_page, 'get').content
            raa_elements = self.get_raa_elements(page_content)
            self.parse_raa(raa_elements, keywords.split(','))
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        page_content -- raw HTML of a page listing downloadable RAAs.
        Returns a list of RAAspotter.RAA objects.
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Grab every download link
        for a in soup.select('a.fr-link.fr-link--download'):
            if a.get('href') and a['href'].endswith('.pdf'):
                # Resolve relative links against the prefecture host
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                # The text before the span holds the document title
                name = a.find('span').previous_sibling.replace(
                    'Télécharger ',
                    ''
                ).strip()
                # The span ends with the publication date (dd/mm/yyyy)
                date = datetime.datetime.strptime(
                    a.find('span').get_text().split(' - ')[-1].strip(),
                    '%d/%m/%Y'
                )
                filename = url.split('/')[-1]
                raa = RAAspotter.RAA(url, date, name, filename)
                elements.append(raa)
        return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref35(RAAspotter):
    """RAA scraper for the prefecture of Ille-et-Vilaine (35).

    One listing page per year; the 2019-2023 pages live under the
    archives section of the site.
    """

    # Config
    __HOST = 'https://www.ille-et-vilaine.gouv.fr'
    __RAA_PAGE = [
        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Recueil-des-'
        'actes-administratifs-2024',
        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-'
        'des-recueils-des-actes-administratifs/Recueil-des-actes-'
        'administratifs-2023',
        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-'
        'des-recueils-des-actes-administratifs/Recueil-des-actes-'
        'administratifs-2022',
        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-'
        'des-recueils-des-actes-administratifs/Recueil-des-actes-'
        'administratifs-2021',
        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-'
        'des-recueils-des-actes-administratifs/Recueil-des-actes-'
        'administratifs-2020',
        f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-'
        'des-recueils-des-actes-administratifs/Recueil-des-actes-'
        'administratifs-2019'
    ]
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    full_name = 'Préfecture d\'Ille-et-Vilaine'
    short_code = 'pref35'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Download every configured year page, parse its RAAs for the
        given comma-separated keywords, then send the report."""
        self.print_output('RAAspotter_pref35')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')
        keyword_list = keywords.split(',')
        for year_page in self.__RAA_PAGE:
            html = self.get_page(year_page, 'get').content
            self.parse_raa(self.get_raa_elements(html), keyword_list)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        Every download link is inspected; only links pointing to a PDF
        are kept. Returns a list of RAAspotter.RAA objects.
        """
        soup = BeautifulSoup(page_content, 'html.parser')
        elements = []
        # Each PDF download link becomes one RAA entry
        for link in soup.find_all('a', href=True, class_='fr-link--download'):
            href = link['href']
            if not href.endswith('.pdf'):
                continue
            # Resolve relative links against the prefecture host
            full_url = unquote(
                f"{self.__HOST}{href}" if href.startswith('/') else href
            )
            span = link.find('span')
            # The text before the span holds the document title
            title = span.previous_sibling.replace('Télécharger ', '').strip()
            # The span ends with the publication date (dd/mm/yyyy)
            published = datetime.datetime.strptime(
                span.get_text().split(' - ')[-1].strip(),
                '%d/%m/%Y'
            )
            elements.append(
                RAAspotter.RAA(
                    full_url,
                    published,
                    title,
                    full_url.split('/')[-1]
                )
            )
        return elements
import os
import datetime
import logging
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
logger = logging.getLogger(__name__)
class RAAspotter_pref38(RAAspotter):
    """RAA scraper for the prefecture of Isère (38).

    The year pages expose their RAAs through a <select> element; the
    details of each RAA are then obtained with a POST request that sends
    the selected option value back to the same page.
    """

    # Config
    __HOST = 'https://www.isere.gouv.fr'
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2024',
        '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-Isere-2023',
        '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Archives/Recueils-des-Actes-Administratifs-de-la-prefecture-de-l-'
        'Isere-2022',
        '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Archives/Archives-des-recueils-des-actes-administratifs-de-la-'
        'prefecture-de-l-Isere-2021/Recueils-des-Actes-Administratifs-de-la-'
        'prefecture-de-l-Isere-2021',
        '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Archives/Archives-des-recueils-des-actes-administratifs-de-la-'
        'prefecture-de-l-Isere-2020/Recueils-des-Actes-Administratifs-de-la-'
        'Prefecture-de-l-Isere-2020',
        '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Archives/Archives-des-Recueils-des-Actes-Administratifs-de-la-'
        'prefecture-de-l-Isere-2019/Archives-des-Recueils-des-Actes-'
        'Administratifs-de-la-prefecture-de-l-Isere-2019'
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
        'Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture de l\'Isère'
    short_code = 'pref38'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(20)

    def get_raa(self, keywords):
        """Fetch the year pages within the analysed period, parse their
        RAAs for the given comma-separated keywords, then send the
        report."""
        self.print_output('RAAspotter_pref38')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')
        pages_to_parse = []
        # Only request the year pages that may contain RAAs published
        # inside the analysed period.
        if self.not_before.year <= 2024:
            pages_to_parse.append(self.__RAA_PAGE['2024'])
        if self.not_before.year <= 2023:
            pages_to_parse.append(self.__RAA_PAGE['2023'])
        if self.not_before.year <= 2022:
            pages_to_parse.append(self.__RAA_PAGE['2022'])
        if self.not_before.year <= 2021:
            pages_to_parse.append(self.__RAA_PAGE['2021'])
        if self.not_before.year <= 2020:
            pages_to_parse.append(self.__RAA_PAGE['2020'])
        if self.not_before.year <= 2019:
            pages_to_parse.append(self.__RAA_PAGE['2019'])
        for raa_page in pages_to_parse:
            page_content = self.get_page(raa_page, 'get').content
            # The page URL is also passed along because the details of
            # each RAA are fetched with a POST to that same URL.
            raa_elements = self.get_raa_elements(page_content, raa_page)
            self.parse_raa(raa_elements, keywords.split(','))
        self.mailer()

    def get_raa_elements(self, page_content, raa_page):
        """Extract the RAAs listed in a year page.

        page_content -- raw HTML of the year page.
        raa_page -- URL of that same page, used for the POST requests
        that fetch each RAA's details.
        Returns a list of RAAspotter.RAA objects.
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # Grab the select element that lists the RAAs
        select_list = soup.select('select#-liste-docs')[0]
        # Inspect each entry
        for option in select_list.find_all('option'):
            if not option['value'] == "":
                # Guess the date from the document title
                guessed_date = RAAspotter.guess_date(
                    option['title'],
                    '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)'
                )
                # If the guessed date falls inside the analysed period,
                # ask the server for the details of this RAA
                if guessed_date >= self.not_before:
                    page_content = self.get_page(
                        raa_page,
                        'post',
                        {
                            '-liste-docs': option['value']
                        }
                    ).content
                    # Parse the details page to obtain the RAA
                    # properties
                    soup = BeautifulSoup(page_content, 'html.parser')
                    a = soup.select(
                        'div.liste_deroulante a.fr-link.fr-link--download'
                    )[0]
                    # If the page contains an a tag pointing to a pdf,
                    # we did get the details of the requested RAA, so
                    # parse it
                    if a.get('href') and a['href'].endswith('.pdf'):
                        if a['href'].startswith('/'):
                            url = f"{self.__HOST}{a['href']}"
                        else:
                            url = a['href']
                        url = unquote(url)
                        # The text before the span holds the title
                        name = a.find('span').previous_sibling.replace(
                            'Télécharger ',
                            ''
                        ).strip()
                        # The span ends with the date (dd/mm/yyyy)
                        date = datetime.datetime.strptime(
                            a.find('span').get_text().split(' - ')[-1].strip(),
                            '%d/%m/%Y'
                        )
                        filename = url.split('/')[-1]
                        raa = RAAspotter.RAA(url, date, name, filename)
                        elements.append(raa)
        return elements
import os
import datetime
import dateparser
import logging
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
logger = logging.getLogger(__name__)
class RAAspotter_pref59(RAAspotter):
    """RAA scraper for the prefecture of Nord (59).

    Each year page links to card sub-pages which hold the actual
    download links.
    """

    # Config
    __HOST = 'https://www.nord.gouv.fr'
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/RAA-du-departement-du-Nord/2024',
        '2023': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/RAA-du-departement-du-Nord/2023',
        '2022': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/RAA-du-departement-du-Nord/2022',
        '2021': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/RAA-du-departement-du-Nord/2021',
        '2020': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/RAA-du-departement-du-Nord/2020',
        '2019': f'{__HOST}/Publications/Recueils-des-actes-administratifs'
        '/RAA-du-departement-du-Nord/2019'
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
        'Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture du Nord'
    short_code = 'pref59'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(20)

    def get_raa(self, keywords):
        """Walk the year pages within the analysed period, visit their
        card sub-pages, parse the RAAs for the given comma-separated
        keywords, then send the report."""
        self.print_output('RAAspotter_pref59')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')
        keyword_list = keywords.split(',')
        # Iterate the configured years (most recent first, thanks to
        # dict insertion order) and skip those older than the period.
        for year, year_url in self.__RAA_PAGE.items():
            if int(year) < self.not_before.year:
                continue
            year_html = self.get_page(year_url, 'get').content
            card_pages = self.get_sub_pages(
                year_html,
                "div.fr-card__body div.fr-card__content h2.fr-card__title a",
                self.__HOST,
                True
            )
            # Visit the cards from oldest to newest
            for card in reversed(card_pages):
                card_html = self.get_page(card['url'], 'get').content
                self.parse_raa(self.get_raa_elements(card_html), keyword_list)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        page_content -- raw HTML of a page listing downloadable RAAs.
        Returns a list of RAAspotter.RAA objects.
        """
        soup = BeautifulSoup(page_content, 'html.parser')
        elements = []
        # Each PDF download link becomes one RAA entry
        for link in soup.select('a.fr-link.fr-link--download'):
            href = link.get('href')
            if not href or not href.endswith('.pdf'):
                continue
            # Resolve relative links against the prefecture host
            full_url = unquote(
                f"{self.__HOST}{href}" if href.startswith('/') else href
            )
            span = link.find('span')
            # The text before the span holds the document title
            title = span.previous_sibling.replace('Télécharger ', '').strip()
            # The span ends with the publication date (dd/mm/yyyy)
            published = datetime.datetime.strptime(
                span.get_text().split(' - ')[-1].strip(),
                '%d/%m/%Y'
            )
            elements.append(
                RAAspotter.RAA(
                    full_url,
                    published,
                    title,
                    full_url.split('/')[-1]
                )
            )
        return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref62(RAAspotter):
    """RAA scraper for the prefecture of Pas-de-Calais (62).

    Most years have two listing pages: the regular RAAs and the special
    ("spéciaux") RAAs.
    """

    # Config
    __HOST = 'https://www.pas-de-calais.gouv.fr'
    __RAA_PAGE = {
        '2024': [
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2024-Recueils-des-actes-administratifs'
        ],
        '2023': [
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2023-Recueils-des-actes-administratifs',
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2023-Recueils-speciaux-des-actes-administratifs'
        ],
        '2022': [
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2022-Recueils-des-Actes-Administratifs',
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2022-Recueils-Speciaux-des-Actes-Administratifs'
        ],
        '2021': [
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2021-Recueils-des-actes-administratifs',
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2021-Recueils-speciaux-des-actes-administratifs'
        ],
        '2020': [
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2020-Recueils-des-actes-administratifs',
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2020-Recueils-speciaux-des-actes-administratifs'
        ],
        '2019': [
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2019-Recueil-des-actes-administratifs',
            f'{__HOST}/Publications/Recueil-des-actes-administratifs'
            '/2019-Recueils-speciaux-des-actes-administratifs'
        ]
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
        'Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture du Pas-de-Calais'
    short_code = 'pref62'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(20)

    def get_raa(self, keywords):
        """Fetch every listing page within the analysed period, parse
        their RAAs for the given comma-separated keywords, then send
        the report."""
        self.print_output('RAAspotter_pref62')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')

        # Derive the pages from the configured map instead of the
        # original copy-pasted if-chain. Dict insertion order keeps the
        # most recent year first, exactly as before.
        pages_to_parse = [
            page
            for year, pages in self.__RAA_PAGE.items()
            if self.not_before.year <= int(year)
            for page in pages
        ]

        for raa_page in pages_to_parse:
            page_content = self.get_page(
                raa_page,
                'get'
            ).content
            raa_elements = self.get_raa_elements(page_content)
            self.parse_raa(raa_elements, keywords.split(','))
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        page_content -- raw HTML of a page listing downloadable RAAs.
        Returns a list of RAAspotter.RAA objects, oldest first (the page
        lists them newest first, hence the final reversal).
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Grab the div that holds the list of RAAs
        cards = soup.select(
            'div.fr-downloads-group.fr-downloads-group--bordered'
        )[0]

        # Inspect each a tag inside that div
        for a in cards.find_all('a', href=True):
            if a['href'].endswith('.pdf'):
                # Resolve relative links against the prefecture host
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                # The text before the span holds the document title
                name = a.find('span').previous_sibling.replace(
                    'Télécharger ',
                    ''
                ).strip()
                # The span ends with the publication date (dd/mm/yyyy)
                date = datetime.datetime.strptime(
                    a.find('span').get_text().split(' - ')[-1].strip(),
                    '%d/%m/%Y'
                )
                filename = url.split('/')[-1]
                raa = RAAspotter.RAA(url, date, name, filename)
                elements.append(raa)
        return elements[::-1]
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref65(RAAspotter):
    """RAA scraper for the prefecture of Hautes-Pyrénées (65).

    One flat listing page per year.
    """

    # Config
    __HOST = 'https://www.hautes-pyrenees.gouv.fr'
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/Recueil-d-actes-administratifs'
        '/RAA-2024',
        '2023': f'{__HOST}/Publications/Recueil-d-actes-administratifs'
        '/RAA-2023',
        '2022': f'{__HOST}/Publications/Recueil-d-actes-administratifs'
        '/RAA-2022',
        '2021': f'{__HOST}/Publications/Recueil-d-actes-administratifs'
        '/RAA-2021',
        '2020': f'{__HOST}/Publications/Recueil-d-actes-administratifs'
        '/RAA-2020',
        '2019': f'{__HOST}/Publications/Recueil-d-actes-administratifs'
        '/RAA-2019'
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
        'Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture des Hautes-Pyrénées'
    short_code = 'pref65'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Fetch the year pages within the analysed period, parse their
        RAAs for the given comma-separated keywords, then send the
        report."""
        self.print_output('RAAspotter_pref65')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')
        keyword_list = keywords.split(',')
        # Iterate the configured years (most recent first, thanks to
        # dict insertion order) and skip those older than the period.
        for year, year_url in self.__RAA_PAGE.items():
            if int(year) < self.not_before.year:
                continue
            html = self.get_page(year_url, 'get').content
            self.parse_raa(self.get_raa_elements(html), keyword_list)
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        page_content -- raw HTML of a page listing downloadable RAAs.
        Returns a list of RAAspotter.RAA objects.
        """
        soup = BeautifulSoup(page_content, 'html.parser')
        elements = []
        # Each PDF download link becomes one RAA entry
        for link in soup.select('a.fr-link.fr-link--download'):
            href = link.get('href')
            if not href or not href.endswith('.pdf'):
                continue
            # Resolve relative links against the prefecture host
            full_url = unquote(
                f"{self.__HOST}{href}" if href.startswith('/') else href
            )
            span = link.find('span')
            # The text before the span holds the document title
            title = span.previous_sibling.replace('Télécharger ', '').strip()
            # The span ends with the publication date (dd/mm/yyyy)
            published = datetime.datetime.strptime(
                span.get_text().split(' - ')[-1].strip(),
                '%d/%m/%Y'
            )
            elements.append(
                RAAspotter.RAA(
                    full_url,
                    published,
                    title,
                    full_url.split('/')[-1]
                )
            )
        return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref81(RAAspotter):
    """RAA scraper for the prefecture of Tarn (81).

    RAAs are organised as year pages -> month sub-pages -> card pages;
    some RAAs are also miscategorised directly on the year pages, so
    those are crawled as well.
    """

    # Config
    __HOST = 'https://www.tarn.gouv.fr'
    __RAA_PAGE = {
        'default': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
        'Administratifs/RAA',
        '2024': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
        'Administratifs/RAA/2024',
        '2023': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
        'Administratifs/RAA/2023',
        '2022': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
        'Administratifs/RAA/2022',
        '2021': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
        'Administratifs/RAA/2021',
        '2020': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
        'Administratifs/RAA/2020',
        '2019': f'{__HOST}/Publications/RAA-Recueil-des-Actes-'
        'Administratifs/RAA/2019',
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
        'Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture du Tarn'
    short_code = 'pref81'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Discover every RAA page within the analysed period (year
        pages, month sub-pages and card pages), parse the RAAs for the
        given comma-separated keywords, then send the report."""
        self.print_output('RAAspotter_pref81')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')
        pages_to_parse = []
        # Only request the year pages that may contain RAAs published
        # inside the analysed period.
        if self.not_before.year <= 2024:
            pages_to_parse.append(self.__RAA_PAGE['2024'])
        if self.not_before.year <= 2023:
            pages_to_parse.append(self.__RAA_PAGE['2023'])
        if self.not_before.year <= 2022:
            pages_to_parse.append(self.__RAA_PAGE['2022'])
        if self.not_before.year <= 2021:
            pages_to_parse.append(self.__RAA_PAGE['2021'])
        if self.not_before.year <= 2020:
            pages_to_parse.append(self.__RAA_PAGE['2020'])
        if self.not_before.year <= 2019:
            pages_to_parse.append(self.__RAA_PAGE['2019'])
        # The default page is always crawled too.
        sub_pages_to_parse = [self.__RAA_PAGE['default']]
        # For each year, look for the month sub-pages
        for raa_page in pages_to_parse:
            page_content = self.get_page(raa_page, 'get').content
            month_pages = self.get_sub_pages(
                page_content,
                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link '
                'div.fr-card__body div.fr-card__content '
                'h2.fr-card__title a',
                self.__HOST,
                False
            )[::-1]
            # Also check whether the year page itself carries a
            # miscategorised RAA card
            for page_to_parse in self.find_raa_card(raa_page):
                sub_pages_to_parse.append(page_to_parse)
            # For each month, look for the RAA card pages
            for month_page in month_pages:
                # The year is recovered from the month page name to help
                # date guessing on the cards.
                year = RAAspotter.guess_date(month_page['name'], '(.*)').year
                for page_to_parse in self.find_raa_card(
                    month_page['url'],
                    year
                ):
                    sub_pages_to_parse.append(page_to_parse)
                # Also add the month page itself in case it redirects
                # straight to a RAA
                sub_pages_to_parse.append(month_page['url'])
        # Parse the pages that contain RAAs
        for page in sub_pages_to_parse:
            page_content = self.get_page(page, 'get').content
            raa_elements = self.get_raa_elements(page_content)
            self.parse_raa(raa_elements, keywords.split(','))
        self.mailer()

    def find_raa_card(self, page, year=None):
        """Collect the card-page URLs listed on a page (following its
        pager) whose publication date is inside the analysed period.

        page -- URL of the listing page to crawl.
        year -- unused here; kept for the callers that pass it.
        Returns a list of URLs, oldest first.
        """
        pages = []
        card_pages = self.get_sub_pages_with_pager(
            page,
            'div.fr-card__body div.fr-card__content h2.fr-card__title '
            'a.fr-card__link',
            'ul.fr-pagination__list li '
            'a.fr-pagination__link.fr-pagination__link--next',
            'div.fr-card__body div.fr-card__content div.fr-card__end '
            'p.fr-card__detail',
            self.__HOST
        )[::-1]
        for card_page in card_pages:
            # Filter out the RAA pages that do not match the analysed
            # period
            guessed_date = datetime.datetime.strptime(
                card_page['details'].replace('Publié le ', '').strip(),
                '%d/%m/%Y'
            )
            if guessed_date >= self.not_before:
                pages.append(card_page['url'])
        return pages

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        page_content -- raw HTML of a page listing downloadable RAAs.
        Returns a list of RAAspotter.RAA objects.
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Grab every a tag of the downloads group
        for a in soup.select(
            'div.fr-downloads-group.fr-downloads-group--bordered ul li a'
        ):
            if a.get('href') and a['href'].endswith('.pdf'):
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                # The text before the span holds the document title
                name = a.find('span').previous_sibling.replace(
                    'Télécharger ',
                    ''
                ).strip()
                # The span ends with the publication date (dd/mm/yyyy)
                date = datetime.datetime.strptime(
                    a.find('span').get_text().split(' - ')[-1].strip(),
                    '%d/%m/%Y'
                )
                filename = url.split('/')[-1]
                raa = RAAspotter.RAA(url, date, name, filename)
                elements.append(raa)
        return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_pref83(RAAspotter):
    """RAA scraper for the prefecture of Var (83).

    Year pages link to month sub-pages; the RAA links themselves are
    card links, harvested through the pager helper.
    """

    # Config
    __HOST = 'https://www.var.gouv.fr'
    __RAA_PAGE = {
        '2024': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2024',
        '2023': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2023',
        '2022': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2022',
        '2021': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2021',
        '2020': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2020',
        '2019': f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs'
        '/Recueil-des-actes-administratifs-2019'
    }
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) ' \
        'Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture du Var'
    short_code = 'pref83'

    def __init__(self, data_dir):
        """Initialise the scraper and route its requests through Tor.

        data_dir -- directory where downloaded RAAs are stored.
        """
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)

    def get_raa(self, keywords):
        """Discover the year and month pages within the analysed period,
        harvest their RAAs through the pager helper, parse them for the
        given comma-separated keywords, then send the report."""
        self.print_output('RAAspotter_pref83')
        self.print_output(f'Termes recherchés: {keywords}')
        self.print_output('')
        pages_to_parse = []
        # Only request the year pages that may contain RAAs published
        # inside the analysed period.
        if self.not_before.year <= 2024:
            pages_to_parse.append(self.__RAA_PAGE['2024'])
        if self.not_before.year <= 2023:
            pages_to_parse.append(self.__RAA_PAGE['2023'])
        if self.not_before.year <= 2022:
            pages_to_parse.append(self.__RAA_PAGE['2022'])
        if self.not_before.year <= 2021:
            pages_to_parse.append(self.__RAA_PAGE['2021'])
        if self.not_before.year <= 2020:
            pages_to_parse.append(self.__RAA_PAGE['2020'])
        if self.not_before.year <= 2019:
            pages_to_parse.append(self.__RAA_PAGE['2019'])
        sub_pages_to_parse = []
        # For each year, look for the month sub-pages
        for raa_page in pages_to_parse:
            # The year page itself is parsed as well.
            sub_pages_to_parse.append(raa_page)
            page_content = self.get_page(raa_page, 'get').content
            month_pages = self.get_sub_pages(
                page_content,
                '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link '
                'div.fr-card__body '
                'div.fr-card__content h2.fr-card__title a',
                self.__HOST,
                False
            )[::-1]
            for month_page in month_pages:
                sub_pages_to_parse.append(month_page['url'])
        # Parse the pages that contain RAAs, following each page's
        # "next" pager link.
        elements = self.get_raa_with_pager(
            sub_pages_to_parse[::-1],
            ".fr-pagination__link.fr-pagination__link--next",
            self.__HOST
        )
        self.parse_raa(elements, keywords.split(','))
        self.mailer()

    def get_raa_elements(self, page_content):
        """Extract the RAAs advertised in an HTML listing page.

        page_content -- raw HTML of a page whose RAAs appear as card
        links. Returns a list of RAAspotter.RAA objects.
        """
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Grab each section holding a RAA
        cards = soup.select(
            'div.fr-card__body div.fr-card__content '
            'h2.fr-card__title a.fr-card__link.menu-item-link'
        )
        for a in cards:
            if a.get('href') and a['href'].endswith('.pdf'):
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                # The link text is the document title
                name = a.get_text().strip()
                # The title attribute ends with the date (dd/mm/yyyy)
                date = datetime.datetime.strptime(
                    a['title'].split(' - ')[-1].strip(),
                    '%d/%m/%Y'
                )
                filename = url.split('/')[-1]
                raa = RAAspotter.RAA(url, date, name, filename)
                elements.append(raa)
        return elements
Ce diff est replié.
Ce diff est replié.
Ce diff est replié.
#!/usr/bin/env bash
# Download a prefecture's RAA archive from an S3-compatible bucket and
# unpack it into the destination directory.
#
# Usage: $0 <pref> <s3_key> <s3_secret> <s3_host> <s3_bucket> <dest>
set -e

pref="${1}"
s3_key="${2}"
s3_secret="${3}"
s3_host="${4}"
s3_bucket="${5}"
dest="${6}"

if test -z "$pref" || test -z "$s3_key" || test -z "$s3_secret" || test -z "$s3_host" || test -z "$s3_bucket" || test -z "$dest"; then
	echo "Usage: ${0} <pref> <s3_key> <s3_secret> <s3_host> <s3_bucket> <dest>"
	exit 1
fi

dest=$(realpath "${dest}")
mkdir -p "${dest}/"
cd "${dest}/"

file="${pref}.zip"

echo "Downloading ${pref}..."

# AWS v2 signature: HMAC-SHA1 over the canonical request description,
# base64-encoded (typo fix: "ressource" -> "resource").
resource="/${s3_bucket}/${file}"
content_type="application/octet-stream"
date=$(date --utc -R)
signature=$(echo -en "GET\n\n${content_type}\n${date}\n${resource}" | openssl sha1 -hmac "${s3_secret}" -binary | base64)

# --fail makes curl exit non-zero on an HTTP error so that, combined
# with `set -e`, an S3 error body is never saved and unzipped as if it
# were the archive.
curl -X GET \
	--fail \
	--silent \
	-H "Date: ${date}" \
	-H "Content-Type: ${content_type}" \
	-H "Authorization: AWS ${s3_key}:${signature}" \
	"${s3_host}${resource}" \
	-o "${file}"

unzip -o "${file}" > /dev/null
rm "${file}"
Ce diff est replié.
Ce diff est replié.
Ce diff est replié.
Ce diff est replié.
beautifulsoup4>=4.12.3
dateparser>=1.2.0
ftfy>=6.2.0
Mastodon.py>=1.8.1
pdfminer.six>=20231228
ocrmypdf<16.4.0
pycodestyle>=2.11.1
pypdf>=4.2.0
pytz>=2024.2
PyVirtualDisplay>=3.0
requests>=2.31.0
selenium>=4.19.0
stem>=1.8.2