Skip to content
Extraits de code Groupes Projets

Comparer les révisions

Les modifications sont affichées comme si la révision source était fusionnée avec la révision cible. En savoir plus sur la comparaison des révisions.

Source

Sélectionner le projet cible
No results found

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
Affichage des modifications avec 290 ajouts et 856 suppressions
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref33(Attrap_prefdpt):
    """Scraper configuration for the Gironde prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.gironde.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de la Gironde'
    short_code = 'pref33'
    timezone = 'Europe/Paris'

    # Widget configuration consumed by the Attrap_prefdpt base scraper.
    # NOTE(review): these assignments mutate shared class-level state on
    # Attrap_prefdpt, so loading several prefecture modules in one process
    # could overwrite each other's config — presumably handled upstream; verify.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Recueils{0,1} des [Aa]ctes [Aa]dministratifs de l\'année ([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    # Do not follow grey-card links whose date cannot be recognised.
    Attrap_prefdpt.grey_card['follow_link_on_unrecognised_date'] = False
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref34(Attrap_prefdpt):
    """Scraper configuration for the Hérault prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.herault.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs'
    full_name = 'Préfecture de l\'Hérault'
    short_code = 'pref34'
    timezone = 'Europe/Paris'

    # Widget configuration: the same year-extraction pattern applies to
    # both white and grey cards on this site.
    year_regex = '(?:(?:Recueil des actes administratifs)|(?:Année))[ -]([0-9]{4})'
    Attrap_prefdpt.white_card['regex']['year'] = year_regex
    Attrap_prefdpt.grey_card['regex']['year'] = year_regex
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref35(Attrap_prefdpt):
    """Scraper configuration for the Ille-et-Vilaine prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.ille-et-vilaine.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture d\'Ille-et-Vilaine'
    short_code = 'pref35'
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for white cards, plus a dropdown
    # widget for the archive pages (same year pattern).
    year_regex = 'Recueil des actes administratifs ([0-9]{4})'
    Attrap_prefdpt.white_card['regex']['year'] = year_regex
    # Dropdown menu listing the archived yearly RAA collections.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex=year_regex,
            css_path='div.fr-select-group select#Archives-des-RAA-liste-docs.fr-select',
            type='year'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref38(Attrap_prefdpt):
    """Scraper configuration for the Isère prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.isere.gouv.fr'
    raa_page = f'{hostname}/Publications/RAA-Recueil-des-actes-administratifs'
    full_name = 'Préfecture de l\'Isère'
    short_code = 'pref38'
    timezone = 'Europe/Paris'

    # Widget configuration: the same year pattern applies to both grey and
    # white cards; one white-card title is explicitly excluded.
    year_regex = '(?:(?:[Rr]ecueils{0,1} des [Aa]ctes [Aa]dministratifs de la [Pp]réfecture de l\'Isère[ -]*)|(?:Année ))([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['year'] = year_regex
    Attrap_prefdpt.white_card['regex']['year'] = year_regex
    Attrap_prefdpt.white_card['exclude'] = ['Vous recherchez "Le Journal officiel de la République française" ?']
    # Dropdown menu whose entries carry a full day-month-year date.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='([0-9]{1,2}[er]{0,1} [a-zéû]* [0-9]{4})',
            css_path='select#-liste-docs',
            type='year-month-day'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref39(Attrap_prefdpt):
    """Scraper configuration for the Jura prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = "https://www.jura.gouv.fr"
    raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
    full_name = "Préfecture du Jura"
    short_code = "pref39"
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for grey cards.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Année ([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref42(Attrap_prefdpt):
    """Scraper configuration for the Loire prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.loire.gouv.fr'
    raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de la Loire'
    short_code = 'pref42'
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for white cards.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref44(Attrap_prefdpt):
    """Scraper configuration for the Loire-Atlantique prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.loire-atlantique.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-RAA-en-Loire-Atlantique'
    full_name = 'Préfecture de la Loire-Atlantique'
    short_code = 'pref44'
    timezone = 'Europe/Paris'

    # Widget configuration: grey cards carry either a bare year or a
    # "month year" label; month pages inherit the year from their parent page.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref49(Attrap_prefdpt):
    """Scraper configuration for the Maine-et-Loire prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.maine-et-loire.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de Maine-et-Loire'
    short_code = 'pref49'
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for grey cards.
    Attrap_prefdpt.grey_card['regex']['year'] = '.*([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref50(Attrap_prefdpt):
    """Scraper configuration for the Manche prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.manche.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de la Manche'
    short_code = 'pref50'
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for grey cards.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref52(Attrap_prefdpt):
    """Scraper configuration for the Haute-Marne prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.haute-marne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Haute-Marne'
    short_code = 'pref52'
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for white cards.
    Attrap_prefdpt.white_card['regex']['year'] = 'Année ([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref54(Attrap_prefdpt):
    """Scraper configuration for the Meurthe-et-Moselle prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.meurthe-et-moselle.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de Meurthe-et-Moselle'
    short_code = 'pref54'
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for white cards.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
    # Dropdown menu whose entries carry a full day-month-year date.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='.* du ([0-9]*(?:er|ER)? [A-Za-zéÉûÛ]* [0-9]*)',
            css_path='select#Liste-liste-docs',
            type='year-month-day'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref55(Attrap_prefdpt):
    """Scraper configuration for the Meuse prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.meuse.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Meuse'
    short_code = 'pref55'
    timezone = 'Europe/Paris'

    # Widget configuration: dropdown menu listing yearly RAA collections.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='RAA année ([0-9]{4})',
            css_path='select#Liste-des-recueils-liste-docs',
            type='year'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref59(Attrap_prefdpt):
    """Scraper configuration for the Nord prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.nord.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord'
    full_name = 'Préfecture du Nord'
    short_code = 'pref59'
    timezone = 'Europe/Paris'

    # Widget configuration: grey cards carry either a bare year or a
    # "month year" label; month pages inherit the year from their parent page.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref61(Attrap_prefdpt):
    """Scraper configuration for the Orne prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.orne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de l\'Orne'
    short_code = 'pref61'
    timezone = 'Europe/Paris'

    # Widget configuration: both grey and white cards carry "month year"
    # labels; month pages inherit the year from their parent page.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Le Recueil des actes administratifs ([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
    Attrap_prefdpt.white_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.white_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt


class Attrap_pref62(Attrap_prefdpt):
    """Scraper configuration for the Pas-de-Calais prefecture RAA pages."""

    # Prefecture configuration: site root and RAA index page.
    hostname = 'https://www.pas-de-calais.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture du Pas-de-Calais'
    short_code = 'pref62'
    timezone = 'Europe/Paris'

    # Widget configuration: year pattern for white cards.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
import os from Attrap_prefdpt import Attrap_prefdpt
import datetime
import re
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap class Attrap_pref63(Attrap_prefdpt):
# Configuration de la préfecture
class Attrap_pref63(Attrap): hostname = 'https://www.puy-de-dome.gouv.fr'
raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-Puy-de-Dome'
# Config
__HOST = 'https://www.puy-de-dome.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-Puy-de-Dome'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Puy-de-Dôme' full_name = 'Préfecture du Puy-de-Dôme'
short_code = 'pref63' short_code = 'pref63'
timezone = 'Europe/Paris'
def __init__(self, data_dir): # Configuration des widgets à analyser
super().__init__(data_dir, self.__USER_AGENT) Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
self.set_sleep_time(30)
def get_raa(self, keywords):
year_pages_to_parse = []
# On détermine quelles pages d'année parser
page_content = self.get_page(self.__RAA_PAGE, 'get').content
year_pages = self.get_sub_pages(
page_content,
'div.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
False
)
for year_page in year_pages:
if not year_page['name'].strip() == 'Archives':
year = 9999
try:
year = int(year_page['name'].strip())
except Exception as exc:
logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
year = 9999
if year >= self.not_before.year:
year_pages_to_parse.append(year_page['url'])
elements = []
# Pour chaque année, on parse les RAA
for year_page in year_pages_to_parse:
page_content = self.get_page(year_page, 'get').content
for element in self.get_raa_elements(page_content):
elements.append(element)
# On parse les RAA
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On récupère chaque balise a
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
elements.append(raa)
return elements
...@@ -10,24 +10,25 @@ from Attrap import Attrap ...@@ -10,24 +10,25 @@ from Attrap import Attrap
class Attrap_pref64(Attrap): class Attrap_pref64(Attrap):
# Config # Config
__HOST = 'https://www.pyrenees-atlantiques.gouv.fr' hostname = 'https://www.pyrenees-atlantiques.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs' raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Pyrénées-Atlantiques' full_name = 'Préfecture des Pyrénées-Atlantiques'
short_code = 'pref64' short_code = 'pref64'
timezone = 'Europe/Paris'
def __init__(self, data_dir): def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT) super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30) self.set_sleep_time(30)
def get_raa(self, keywords): def get_raa(self, keywords):
# On récupère les pages d'années # On récupère les pages d'années
year_pages = [] year_pages = []
page_content = self.get_page(self.__RAA_PAGE, 'get').content page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages( for year_page in self.get_sub_pages(
page_content, page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST, self.hostname,
False False
): ):
year = Attrap.guess_date(year_page['name'], '.* ([0-9]{4})').year year = Attrap.guess_date(year_page['name'], '.* ([0-9]{4})').year
...@@ -41,7 +42,7 @@ class Attrap_pref64(Attrap): ...@@ -41,7 +42,7 @@ class Attrap_pref64(Attrap):
for month_page in self.get_sub_pages( for month_page in self.get_sub_pages(
page_content, page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST, self.hostname,
False False
): ):
if Attrap.guess_date(month_page['name'], '(.*)').replace(day=1) >= self.not_before.replace(day=1): if Attrap.guess_date(month_page['name'], '(.*)').replace(day=1) >= self.not_before.replace(day=1):
...@@ -51,7 +52,7 @@ class Attrap_pref64(Attrap): ...@@ -51,7 +52,7 @@ class Attrap_pref64(Attrap):
elements = self.get_raa_with_pager( elements = self.get_raa_with_pager(
month_pages[::-1], month_pages[::-1],
'a.fr-pagination__link--next.fr-pagination__link--lg-label', 'a.fr-pagination__link--next.fr-pagination__link--lg-label',
self.__HOST self.hostname
)[::-1] )[::-1]
self.parse_raa(elements, keywords) self.parse_raa(elements, keywords)
...@@ -66,7 +67,7 @@ class Attrap_pref64(Attrap): ...@@ -66,7 +67,7 @@ class Attrap_pref64(Attrap):
for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'): for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'):
if a.get('href') and a['href'].endswith('.pdf'): if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'): if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}" url = f"{self.hostname}{a['href']}"
else: else:
url = a['href'] url = a['href']
...@@ -74,6 +75,6 @@ class Attrap_pref64(Attrap): ...@@ -74,6 +75,6 @@ class Attrap_pref64(Attrap):
name = a.get_text().strip() name = a.get_text().strip()
date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y') date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name) raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa) elements.append(raa)
return elements return elements
...@@ -10,25 +10,26 @@ from Attrap import Attrap ...@@ -10,25 +10,26 @@ from Attrap import Attrap
class Attrap_pref65(Attrap): class Attrap_pref65(Attrap):
# Config # Config
__HOST = 'https://www.hautes-pyrenees.gouv.fr' hostname = 'https://www.hautes-pyrenees.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-d-actes-administratifs' raa_page = f'{hostname}/Publications/Recueil-d-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Hautes-Pyrénées' full_name = 'Préfecture des Hautes-Pyrénées'
short_code = 'pref65' short_code = 'pref65'
timezone = 'Europe/Paris'
def __init__(self, data_dir): def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT) super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30) self.set_sleep_time(30)
def get_raa(self, keywords): def get_raa(self, keywords):
# On détermine quelles pages d'année parser # On détermine quelles pages d'année parser
pages_to_parse = [] pages_to_parse = []
year_pages = self.get_sub_pages_with_pager( year_pages = self.get_sub_pages_with_pager(
self.__RAA_PAGE, self.raa_page,
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', 'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label', 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail', 'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
self.__HOST self.hostname
) )
for year_page in year_pages: for year_page in year_pages:
if Attrap.guess_date(year_page['name'].strip(), '.*([0-9]{4})').year >= self.not_before.year: if Attrap.guess_date(year_page['name'].strip(), '.*([0-9]{4})').year >= self.not_before.year:
...@@ -52,7 +53,7 @@ class Attrap_pref65(Attrap): ...@@ -52,7 +53,7 @@ class Attrap_pref65(Attrap):
for a in soup.select('a.fr-link.fr-link--download'): for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'): if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'): if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}" url = f"{self.hostname}{a['href']}"
else: else:
url = a['href'] url = a['href']
...@@ -60,6 +61,6 @@ class Attrap_pref65(Attrap): ...@@ -60,6 +61,6 @@ class Attrap_pref65(Attrap):
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name) raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa) elements.append(raa)
return elements return elements
...@@ -14,24 +14,31 @@ logger = logging.getLogger(__name__) ...@@ -14,24 +14,31 @@ logger = logging.getLogger(__name__)
class Attrap_pref66(Attrap): class Attrap_pref66(Attrap):
# Config # Config
__HOST = 'https://www.pyrenees-orientales.gouv.fr' hostname = 'https://www.pyrenees-orientales.gouv.fr'
__RAA_PAGE = { raa_page = f'{hostname}/Publications/Le-recueil-des-actes-administratifs'
'2024': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2024', user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
'2023': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2023',
'2022': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2022',
'2021': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2021',
'2020': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2020',
'2019': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2019'
}
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Pyrénées-Orientales' full_name = 'Préfecture des Pyrénées-Orientales'
short_code = 'pref66' short_code = 'pref66'
timezone = 'Europe/Paris'
def __init__(self, data_dir): def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT) super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30) self.set_sleep_time(30)
def get_raa(self, keywords): def get_raa(self, keywords):
# On détermine quelles pages d'année parser
year_pages = []
page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages(
page_content,
'.fr-table table tr td h3 a.fr-link',
self.hostname,
False
):
year = Attrap.guess_date(year_page['name'].strip(), '.* ([0-9]{4})').year
if year < 9999 and year >= self.not_before.year:
year_pages.append([year_page['url'], year])
elements = [] elements = []
# La préfecture des Pyrénées-Orientales est une originale : avant 2024, # La préfecture des Pyrénées-Orientales est une originale : avant 2024,
...@@ -43,24 +50,16 @@ class Attrap_pref66(Attrap): ...@@ -43,24 +50,16 @@ class Attrap_pref66(Attrap):
# n'est pas exhaustif. On doit donc parser toutes les sous-pages de # n'est pas exhaustif. On doit donc parser toutes les sous-pages de
# 2024 puisqu'on ne peut se fier au tableau récapitulatif. # 2024 puisqu'on ne peut se fier au tableau récapitulatif.
# Grrr. # Grrr.
if self.not_before.year <= 2024: for year_page in year_pages:
for element in self.get_raa_elements_since_2024(self.__RAA_PAGE['2024']): url = year_page[0]
elements.append(element) year = year_page[1]
if self.not_before.year <= 2023:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2023']): if year >= 2024:
elements.append(element) for element in self.get_raa_elements_since_2024(url):
if self.not_before.year <= 2022: elements.append(element)
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2022']): else:
elements.append(element) for element in self.get_raa_elements_before_2024(url):
if self.not_before.year <= 2021: elements.append(element)
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2021']):
elements.append(element)
if self.not_before.year <= 2020:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2020']):
elements.append(element)
if self.not_before.year <= 2019:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2019']):
elements.append(element)
self.parse_raa(elements, keywords) self.parse_raa(elements, keywords)
self.mailer() self.mailer()
...@@ -89,7 +88,7 @@ class Attrap_pref66(Attrap): ...@@ -89,7 +88,7 @@ class Attrap_pref66(Attrap):
if date >= self.not_before: if date >= self.not_before:
url = '' url = ''
if a['href'].startswith('/'): if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}" url = f"{self.hostname}{a['href']}"
else: else:
url = a['href'] url = a['href']
...@@ -110,7 +109,7 @@ class Attrap_pref66(Attrap): ...@@ -110,7 +109,7 @@ class Attrap_pref66(Attrap):
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next', 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail', 'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
self.__HOST self.hostname
)[::-1] )[::-1]
pages_to_parse = [] pages_to_parse = []
...@@ -121,7 +120,7 @@ class Attrap_pref66(Attrap): ...@@ -121,7 +120,7 @@ class Attrap_pref66(Attrap):
logger.warning(f"Attention, le lien vers {page['url']} n'est pas bon !") logger.warning(f"Attention, le lien vers {page['url']} n'est pas bon !")
else: else:
if page['url'].startswith('/'): if page['url'].startswith('/'):
url = f"{self.__HOST}{page['url']}" url = f"{self.hostname}{page['url']}"
else: else:
url = page['url'] url = page['url']
...@@ -129,5 +128,5 @@ class Attrap_pref66(Attrap): ...@@ -129,5 +128,5 @@ class Attrap_pref66(Attrap):
name = page['name'].replace('Télécharger ', '').strip() name = page['name'].replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y') date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
elements.append(Attrap.RAA(url, date, name)) elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
return elements return elements
...@@ -10,47 +10,39 @@ from Attrap import Attrap ...@@ -10,47 +10,39 @@ from Attrap import Attrap
class Attrap_pref69(Attrap): class Attrap_pref69(Attrap):
# Config # Config
__HOST = 'https://www.rhone.gouv.fr' hostname = 'https://www.rhone.gouv.fr'
__RAA_PAGE = { raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA'
'2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024', user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
'2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023',
'2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022',
'2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021',
'2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020',
'2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019'
}
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Rhône' full_name = 'Préfecture du Rhône'
short_code = 'pref69' short_code = 'pref69'
timezone = 'Europe/Paris'
def __init__(self, data_dir): def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT) super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30) self.set_sleep_time(30)
def get_raa(self, keywords): def get_raa(self, keywords):
pages_to_parse = [] # On détermine quelles pages d'année parser
if self.not_before.year <= 2024: year_pages = []
pages_to_parse.append(self.__RAA_PAGE['2024']) page_content = self.get_page(self.raa_page, 'get').content
if self.not_before.year <= 2023: for year_page in self.get_sub_pages(
pages_to_parse.append(self.__RAA_PAGE['2023']) page_content,
if self.not_before.year <= 2022: 'div.fr-card__body div.fr-card__content h2.fr-card__title a',
pages_to_parse.append(self.__RAA_PAGE['2022']) self.hostname,
if self.not_before.year <= 2021: False
pages_to_parse.append(self.__RAA_PAGE['2021']) ):
if self.not_before.year <= 2020: year = Attrap.guess_date(year_page['name'].strip(), '.* ([0-9]{4})').year
pages_to_parse.append(self.__RAA_PAGE['2020']) if year < 9999 and year >= self.not_before.year:
if self.not_before.year <= 2019: year_pages.append(year_page['url'])
pages_to_parse.append(self.__RAA_PAGE['2019'])
sub_pages_to_parse = [] sub_pages_to_parse = []
for raa_page in year_pages:
for raa_page in pages_to_parse:
sub_pages = self.get_sub_pages_with_pager( sub_pages = self.get_sub_pages_with_pager(
raa_page, raa_page,
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link', 'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link--next', 'ul.fr-pagination__list li a.fr-pagination__link--next',
None, None,
self.__HOST)[::-1] self.hostname)[::-1]
for sub_page in sub_pages: for sub_page in sub_pages:
sub_pages_to_parse.append(sub_page['url']) sub_pages_to_parse.append(sub_page['url'])
...@@ -72,7 +64,7 @@ class Attrap_pref69(Attrap): ...@@ -72,7 +64,7 @@ class Attrap_pref69(Attrap):
for a in soup.select('a.fr-link.fr-link--download'): for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'): if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'): if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}" url = f"{self.hostname}{a['href']}"
else: else:
url = a['href'] url = a['href']
...@@ -80,6 +72,6 @@ class Attrap_pref69(Attrap): ...@@ -80,6 +72,6 @@ class Attrap_pref69(Attrap):
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip() name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y') date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name) raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa) elements.append(raa)
return elements return elements