Skip to content
Extraits de code Groupes Projets

Comparer les révisions

Les modifications sont affichées comme si la révision source était fusionnée avec la révision cible. En savoir plus sur la comparaison des révisions.

Source

Sélectionner le projet cible
No results found

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
Affichage de avec 290 ajouts et 856 suppressions
import os
import re
import datetime
import logging
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref33(Attrap_prefdpt):
logger = logging.getLogger(__name__)
class Attrap_pref33(Attrap):
# Config
__HOST = 'https://www.gironde.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.gironde.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
full_name = 'Préfecture de la Gironde'
short_code = 'pref33'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA to analyse, parse them and call the mailer.

    The root RAA page is always parsed. Year pages are kept when the year
    found in their title is not older than ``self.not_before``; inside
    those, month pages are kept when their guessed date is not older than
    the first day of the ``self.not_before`` month. The selected pages are
    walked with ``get_raa_with_pager`` and the result (reversed) is handed
    to ``parse_raa`` together with ``keywords``.
    """
    card_selector = '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a'

    # A RAA is sometimes miscategorised and ends up on the root page, so the
    # root page is always parsed.
    pages_to_parse = [self.__RAA_PAGE]

    # Work out which year pages have to be parsed.
    year_pages_to_parse = []
    page_content = self.get_page(self.__RAA_PAGE, 'get').content
    year_pages = self.get_sub_pages(page_content, card_selector, self.__HOST, False)
    for year_page in year_pages:
        try:
            year = int(re.search('.*([0-9]{4})', year_page['name'].strip(), re.IGNORECASE).group(1))
        except (AttributeError, TypeError, ValueError):
            # No usable year in the title: fall back to 9999 so that the
            # page still passes the comparison below and gets parsed.
            logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
            year = 9999
        if year >= self.not_before.year:
            year_pages_to_parse.append(year_page['url'])

    # For each selected year, look for the month sub-pages. Months are
    # compared against the first day of the not_before month so that the
    # month containing not_before is kept.
    first_day_of_month = self.not_before.replace(day=1)
    for year_page in year_pages_to_parse:
        page_content = self.get_page(year_page, 'get').content
        month_pages = self.get_sub_pages(page_content, card_selector, self.__HOST, False)[::-1]
        for month_page in month_pages:
            guessed_date = Attrap.guess_date(month_page['name'], '([a-zéû]* [0-9]{4})')
            if guessed_date >= first_day_of_month:
                pages_to_parse.append(month_page['url'])

    # Parse the selected pages, following the pager on each of them.
    elements = self.get_raa_with_pager(
        pages_to_parse,
        "ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label",
        self.__HOST
    )[::-1]

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA found in the cards of ``page_content``.

    A card is used only when it has both a title link and a detail
    paragraph, and when the link's ``href`` ends with ``.pdf``. The date is
    read from the detail text once 'Publié le' has been removed.

    Returns a list of ``Attrap.RAA`` objects, in page order.
    """
    found = []
    soup = BeautifulSoup(page_content, 'html.parser')

    for card in soup.select('div.fr-card.fr-card--horizontal div.fr-card__body div.fr-card__content'):
        # Link and publication date of the card
        links = card.select('h2.fr-card__title a.fr-card__link.menu-item-link')
        dates_raw = card.select('div.fr-card__end p.fr-card__detail')
        if not (links and links[0] and dates_raw and dates_raw[0]):
            continue

        link, detail = links[0], dates_raw[0]
        href = link.get('href')
        if not href or not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)
        title = link.get_text().strip()
        published = detail.get_text().replace('Publié le', '').strip()
        found.append(Attrap.RAA(url, datetime.datetime.strptime(published, '%d/%m/%Y'), title))

    return found
# Configuration des widgets à analyser
Attrap_prefdpt.grey_card['regex']['year'] = 'Recueils{0,1} des [Aa]ctes [Aa]dministratifs de l\'année ([0-9]{4})'
Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
Attrap_prefdpt.grey_card['follow_link_on_unrecognised_date'] = False
import os
import datetime
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref34(Attrap_prefdpt):
class Attrap_pref34(Attrap):
# Config
__HOST = 'https://www.herault.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueils-des-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.herault.gouv.fr'
raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs'
full_name = 'Préfecture de l\'Hérault'
short_code = 'pref34'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA of the relevant year pages, parse them and mail.

    Year pages are listed with ``get_sub_pages_with_pager`` and kept when
    the year guessed from their name is not older than
    ``self.not_before``. Each kept page is fetched and passed through
    ``get_raa_elements``.
    """
    year_pages = self.get_sub_pages_with_pager(
        self.__RAA_PAGE,
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
        'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
        self.__HOST
    )

    # Keep only the years that are not older than not_before
    pages_to_parse = [
        year_page['url']
        for year_page in year_pages
        if Attrap.guess_date(year_page['name'], '.*([0-9]{4})').year >= self.not_before.year
    ]

    elements = []
    for raa_page in pages_to_parse:
        html = self.get_page(raa_page, 'get').content
        elements.extend(self.get_raa_elements(html))

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA from the download links of ``page_content``.

    Only links whose ``href`` ends with ``.pdf`` are used. The name is the
    node preceding the link's first ``<span>`` with 'Télécharger ' removed;
    the date is the last ' - ' separated field of that span's text.

    Returns a list of ``Attrap.RAA`` objects, in page order.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    raa_list = []

    for link in soup.select('a.fr-link.fr-link--download'):
        href = link.get('href')
        if not href or not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)

        span = link.find('span')
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        date_text = span.get_text().split(' - ')[-1].strip()
        raa_list.append(Attrap.RAA(url, datetime.datetime.strptime(date_text, '%d/%m/%Y'), name))

    return raa_list
# Configuration des widgets à analyser
year_regex = '(?:(?:Recueil des actes administratifs)|(?:Année))[ -]([0-9]{4})'
Attrap_prefdpt.white_card['regex']['year'] = year_regex
Attrap_prefdpt.grey_card['regex']['year'] = year_regex
import os
import datetime
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref35(Attrap_prefdpt):
class Attrap_pref35(Attrap):
# Config
__HOST = 'https://www.ille-et-vilaine.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs'
__RAA_PAGE_ARCHIVES = f'{__HOST}/Publications/Recueil-des-actes-administratifs/Archives-des-recueils-des-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.ille-et-vilaine.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
full_name = 'Préfecture d\'Ille-et-Vilaine'
short_code = 'pref35'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
year_pages_to_parse = []
# La page de l'année en cours est normalement listée sur __RAA_PAGE
year_pages = self.get_sub_pages_with_pager(
self.__RAA_PAGE,
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
None,
self.__HOST
timezone = 'Europe/Paris'
# Configuration des widgets à analyser
year_regex = 'Recueil des actes administratifs ([0-9]{4})'
Attrap_prefdpt.white_card['regex']['year'] = year_regex
# On ajoute un widget de menu déroulant
Attrap_prefdpt.select_widgets.append(
Attrap_prefdpt.DptSelectWidget(
'menu_deroulant',
regex=year_regex,
css_path='div.fr-select-group select#Archives-des-RAA-liste-docs.fr-select',
type='year'
)
for year_page in year_pages:
year = Attrap.guess_date(year_page['name'], '.*([0-9]{4})').year
if year >= self.not_before.year:
year_pages_to_parse.append(year_page['url'])
# Les URL des pages des années précédentes sont dans un menu déroulant de __RAA_PAGE_ARCHIVES
page_content = self.get_page(self.__RAA_PAGE_ARCHIVES, 'get').content
for option in BeautifulSoup(page_content, 'html.parser').select('div.fr-select-group select.fr-select option'):
if not option['value'] == '':
year = Attrap.guess_date(option.get_text().strip(), '.*([0-9]{4})').year
if year >= self.not_before.year:
url = option['value']
year_pages_to_parse.append(f'{self.__HOST}/{url}')
elements = []
for raa_page in year_pages_to_parse:
page_content = self.get_page(raa_page, 'get').content
for element in self.get_raa_elements(page_content):
elements.append(element)
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA from the download links of ``page_content``.

    Every ``<a>`` carrying an ``href`` and the ``fr-link--download`` class
    is inspected; only those pointing to a ``.pdf`` are used. The name is
    the node preceding the link's first ``<span>`` with 'Télécharger '
    removed; the date is the last ' - ' separated field of the span text.

    Returns a list of ``Attrap.RAA`` objects, in page order.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    raa_list = []

    for link in soup.find_all('a', href=True, class_='fr-link--download'):
        href = link['href']
        if not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)

        span = link.find('span')
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        date_text = span.get_text().split(' - ')[-1].strip()
        raa_list.append(Attrap.RAA(url, datetime.datetime.strptime(date_text, '%d/%m/%Y'), name))

    return raa_list
)
import os
import datetime
import logging
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref38(Attrap_prefdpt):
logger = logging.getLogger(__name__)
class Attrap_pref38(Attrap):
# Config
__HOST = 'https://www.isere.gouv.fr'
__RAA_PAGE = [
f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs',
f'{__HOST}/Publications/RAA-Recueil-des-actes-administratifs/Archives'
]
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.isere.gouv.fr'
raa_page = f'{hostname}/Publications/RAA-Recueil-des-actes-administratifs'
full_name = 'Préfecture de l\'Isère'
short_code = 'pref38'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA of the relevant year pages, parse them and mail.

    Year pages are looked up on every URL of ``__RAA_PAGE``, both in the
    grey cards and in the white blocks reached through a pager. A page is
    kept when its guessed year is not older than ``self.not_before`` and is
    below 9999.
    """
    grey_selector = '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a'
    white_selector = 'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link'
    pager_selector = 'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label'

    pages_to_parse = []
    for page in self.__RAA_PAGE:
        page_content = self.get_page(page, 'get').content

        # Grey cards, in reverse listing order
        cards = self.get_sub_pages(page_content, grey_selector, self.__HOST, False)[::-1]
        for card in cards:
            year = Attrap.guess_date(card['name'].strip(), '.*([0-9]{4})').year
            if self.not_before.year <= year < 9999:
                pages_to_parse.append(card['url'])

        # White blocks, following the pager
        blocks = self.get_sub_pages_with_pager(page, white_selector, pager_selector, None, self.__HOST)
        for block in blocks:
            year = Attrap.guess_date(block['name'].strip(), '.*([0-9]{4})').year
            if self.not_before.year <= year < 9999:
                pages_to_parse.append(block['url'])

    elements = []
    for raa_page in pages_to_parse:
        html = self.get_page(raa_page, 'get').content
        elements.extend(self.get_raa_elements(html, raa_page))

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content, raa_page):
    """Build the list of RAA offered by the drop-down of ``page_content``.

    Each non-empty ``<option>`` of the first ``select#-liste-docs`` gets a
    date guessed from its ``title``. When that date is not older than
    ``self.not_before``, the option value is POSTed to ``raa_page`` and the
    first download link of the answer is read. It is used only when its
    ``href`` ends with ``.pdf``.

    Returns a list of ``Attrap.RAA`` objects, in option order.
    """
    results = []
    listing = BeautifulSoup(page_content, 'html.parser')

    # The select element holding the list of RAA
    select_list = listing.select('select#-liste-docs')[0]

    for option in select_list.find_all('option'):
        value = option['value']
        if value == "":
            continue

        # The date is estimated from the option title; the details are
        # requested from the server only when it is in the analysed range.
        guessed_date = Attrap.guess_date(option['title'], '.* n°[ 0-9]* du ([0-9]*(?:er)? [a-zéû]* [0-9]*)')
        if not guessed_date >= self.not_before:
            continue

        details_html = self.get_page(raa_page, 'post', {'-liste-docs': value}).content
        details = BeautifulSoup(details_html, 'html.parser')
        link = details.select('div.liste_deroulante a.fr-link.fr-link--download')[0]

        # A link to a PDF means the details of the requested RAA were returned
        href = link.get('href')
        if not href or not href.endswith('.pdf'):
            continue

        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)
        span = link.find('span')
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        date_text = span.get_text().split(' - ')[-1].strip()
        results.append(Attrap.RAA(url, datetime.datetime.strptime(date_text, '%d/%m/%Y'), name))

    return results
timezone = 'Europe/Paris'
# Configuration des widgets à analyser
year_regex = '(?:(?:[Rr]ecueils{0,1} des [Aa]ctes [Aa]dministratifs de la [Pp]réfecture de l\'Isère[ -]*)|(?:Année ))([0-9]{4})'
Attrap_prefdpt.grey_card['regex']['year'] = year_regex
Attrap_prefdpt.white_card['regex']['year'] = year_regex
Attrap_prefdpt.white_card['exclude'] = ['Vous recherchez "Le Journal officiel de la République française" ?']
# On ajoute un widget de menu déroulant
Attrap_prefdpt.select_widgets.append(
Attrap_prefdpt.DptSelectWidget(
'menu_deroulant',
regex='([0-9]{1,2}[er]{0,1} [a-zéû]* [0-9]{4})',
css_path='select#-liste-docs',
type='year-month-day'
)
)
import os
import datetime
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref39(Attrap_prefdpt):
class Attrap_pref39(Attrap):
# Config
__HOST = "https://www.jura.gouv.fr"
__RAA_PAGE = f'{__HOST}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = "https://www.jura.gouv.fr"
raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
full_name = "Préfecture du Jura"
short_code = "pref39"
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA of the relevant year pages, parse them and mail.

    Year pages are read from the cards of the RAA page and kept when the
    year guessed from their name is not older than ``self.not_before``.
    The RAA are then gathered with ``get_raa_with_pager``.
    """
    # Year pages listed on the main RAA page
    page_content = self.get_page(self.__RAA_PAGE, 'get').content
    cards = self.get_sub_pages(
        page_content,
        'div.fr-card__body div.fr-card__content h2.fr-card__title a',
        self.__HOST,
        False
    )
    year_pages = [
        card['url']
        for card in cards
        if Attrap.guess_date(card['name'], '.* ([0-9]{4})').year >= self.not_before.year
    ]

    # All the RAA, following the pagination of each year page
    elements = self.get_raa_with_pager(
        year_pages,
        'a.fr-pagination__link.fr-pagination__link--next',
        self.__HOST
    )

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA from the card links of ``page_content``.

    Only links whose ``href`` ends with ``.pdf`` are used. The name is the
    link text and the date is the last ' - ' separated field of the link's
    ``title`` attribute.

    Returns a list of ``Attrap.RAA`` objects, in page order.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    raa_list = []

    for link in soup.select('.fr-card__title a.fr-card__link.menu-item-link'):
        href = link.get('href')
        if not href or not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)
        name = link.get_text().strip()
        date_text = link['title'].split(' - ')[-1].strip()
        raa_list.append(Attrap.RAA(url, datetime.datetime.strptime(date_text, '%d/%m/%Y'), name))

    return raa_list
# Configuration des widgets à analyser
Attrap_prefdpt.grey_card['regex']['year'] = 'Année ([0-9]{4})'
import os
import datetime
import re
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref42(Attrap_prefdpt):
class Attrap_pref42(Attrap):
# Config
__HOST = 'https://www.loire.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.loire.gouv.fr'
raa_page = f'{hostname}/Publications/Publications-legales/Recueil-des-Actes-Administratifs'
full_name = 'Préfecture de la Loire'
short_code = 'pref42'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA of the relevant year pages, parse them and mail.

    Year pages are listed with ``get_sub_pages_with_pager`` and kept when
    the first four-digit number of their name is not older than
    ``self.not_before``. The RAA of each kept page are read with
    ``get_raa_elements`` (reversed per page) and handed to ``parse_raa``.
    """
    # The imports visible for this module do not provide `logging`, nor a
    # module-level `logger`, so the logger is obtained locally; otherwise
    # the warning below would itself fail with a NameError.
    import logging
    logger = logging.getLogger(__name__)

    # Work out which year pages have to be parsed.
    year_pages = self.get_sub_pages_with_pager(
        self.__RAA_PAGE,
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
        'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
        self.__HOST
    )

    year_pages_to_parse = []
    for year_page in year_pages:
        try:
            year = int(re.search('([0-9]{4})', year_page['name'], re.IGNORECASE).group(1))
        except (AttributeError, TypeError, ValueError):
            # No usable year in the name: fall back to 9999 so that the
            # page still passes the comparison below and gets parsed.
            logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
            year = 9999
        if year >= self.not_before.year:
            year_pages_to_parse.append(year_page['url'])

    # For each year, read the RAA (reversed within each page).
    elements = []
    for year_page in year_pages_to_parse:
        page_content = self.get_page(year_page, 'get').content
        elements.extend(self.get_raa_elements(page_content)[::-1])

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA from the download group of ``page_content``.

    Only links whose ``href`` ends with ``.pdf`` are used. The name is the
    node preceding the link's first ``<span>`` with 'Télécharger ' removed;
    the date is the last ' - ' separated field of that span's text.

    Returns a list of ``Attrap.RAA`` objects, in page order.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    raa_list = []

    for link in soup.select('div.fr-downloads-group.fr-downloads-group--bordered ul li a'):
        href = link.get('href')
        if not href or not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)

        span = link.find('span')
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        date_text = span.get_text().split(' - ')[-1].strip()
        raa_list.append(Attrap.RAA(url, datetime.datetime.strptime(date_text, '%d/%m/%Y'), name))

    return raa_list
# Configuration des widgets à analyser
Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
import os
import datetime
import logging
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref44(Attrap_prefdpt):
logger = logging.getLogger(__name__)
class Attrap_pref44(Attrap):
# Config
__HOST = 'https://www.loire-atlantique.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs-RAA-en-Loire-Atlantique'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.loire-atlantique.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-RAA-en-Loire-Atlantique'
full_name = 'Préfecture de la Loire-Atlantique'
short_code = 'pref44'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA to analyse, parse them and call the mailer.

    The root RAA page is always parsed. Year pages whose name, read as an
    integer, is not older than ``self.not_before`` are parsed too, along
    with all of their month sub-pages. The selected pages are walked with
    ``get_raa_with_pager`` and the result (reversed) is handed to
    ``parse_raa`` together with ``keywords``.
    """
    card_selector = '.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a'

    # A RAA is sometimes miscategorised and ends up on the root page, so the
    # root page is always parsed.
    pages_to_parse = [self.__RAA_PAGE]

    # Work out which year pages have to be parsed.
    year_pages_to_parse = []
    page_content = self.get_page(self.__RAA_PAGE, 'get').content
    year_pages = self.get_sub_pages(page_content, card_selector, self.__HOST, False)
    for year_page in year_pages:
        try:
            year = int(year_page['name'].strip())
        except (AttributeError, ValueError):
            # The name is not a plain year: fall back to 9999 so that the
            # page still passes the comparison below and gets parsed.
            logger.warning(f"Impossible de deviner l\'année de la page {year_page['name']}")
            year = 9999
        if year >= self.not_before.year:
            year_pages_to_parse.append(year_page['url'])
            # A RAA is sometimes miscategorised and ends up on the year
            # page itself, so that page is parsed as well.
            # NOTE(review): nesting under the `if` is reconstructed from a
            # source whose indentation was lost — confirm.
            pages_to_parse.append(year_page['url'])

    # For each selected year, add every month sub-page.
    for year_page in year_pages_to_parse:
        page_content = self.get_page(year_page, 'get').content
        month_pages = self.get_sub_pages(page_content, card_selector, self.__HOST, False)[::-1]
        for month_page in month_pages:
            pages_to_parse.append(month_page['url'])

    # Parse the selected pages, following the pager on each of them.
    elements = self.get_raa_with_pager(
        pages_to_parse,
        "ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label",
        self.__HOST
    )[::-1]

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA found in the cards of ``page_content``.

    A card is used only when it has both a title link and a detail
    paragraph, and when the link's ``href`` ends with ``.pdf``. The date is
    read from the detail text once 'Publié le' has been removed.

    Returns a list of ``Attrap.RAA`` objects, in page order.
    """
    found = []
    soup = BeautifulSoup(page_content, 'html.parser')

    for card in soup.select('div.fr-card.fr-card--horizontal div.fr-card__body div.fr-card__content'):
        # Link and publication date of the card
        links = card.select('h2.fr-card__title a.fr-card__link.menu-item-link')
        dates_raw = card.select('div.fr-card__end p.fr-card__detail')
        if not (links and links[0] and dates_raw and dates_raw[0]):
            continue

        link, detail = links[0], dates_raw[0]
        href = link.get('href')
        if not href or not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)
        title = link.get_text().strip()
        published = detail.get_text().replace('Publié le', '').strip()
        found.append(Attrap.RAA(url, datetime.datetime.strptime(published, '%d/%m/%Y'), title))

    return found
# Configuration des widgets à analyser
Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref49(Attrap_prefdpt):
    """Configuration of the RAA scraper for the Maine-et-Loire prefecture."""

    # Prefecture settings
    hostname = 'https://www.maine-et-loire.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs'
    full_name = 'Préfecture de Maine-et-Loire'
    short_code = 'pref49'
    timezone = 'Europe/Paris'

    # Widgets to analyse: regex applied to grey cards to find the year.
    # NOTE(review): this writes into the dict reached through the
    # Attrap_prefdpt base class, so it may be shared with other
    # subclasses — confirm.
    Attrap_prefdpt.grey_card['regex']['year'] = '.*([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref50(Attrap_prefdpt):
    """Configuration of the RAA scraper for the Manche prefecture."""

    # Prefecture settings
    hostname = 'https://www.manche.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de la Manche'
    short_code = 'pref50'
    timezone = 'Europe/Paris'

    # Widgets to analyse: regex applied to grey cards to find the year.
    # NOTE(review): this writes into the dict reached through the
    # Attrap_prefdpt base class, so it may be shared with other
    # subclasses — confirm.
    Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref52(Attrap_prefdpt):
    """Configuration of the RAA scraper for the Haute-Marne prefecture."""

    # Prefecture settings
    hostname = 'https://www.haute-marne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Haute-Marne'
    short_code = 'pref52'
    timezone = 'Europe/Paris'

    # Widgets to analyse: regex applied to white cards to find the year.
    # NOTE(review): this writes into the dict reached through the
    # Attrap_prefdpt base class, so it may be shared with other
    # subclasses — confirm.
    Attrap_prefdpt.white_card['regex']['year'] = 'Année ([0-9]{4})'
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref54(Attrap_prefdpt):
    """Configuration of the RAA scraper for the Meurthe-et-Moselle prefecture."""

    # Prefecture settings
    hostname = 'https://www.meurthe-et-moselle.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
    full_name = 'Préfecture de Meurthe-et-Moselle'
    short_code = 'pref54'
    timezone = 'Europe/Paris'

    # Widgets to analyse: regex applied to white cards to find the year.
    # NOTE(review): this and the append below modify objects reached
    # through the Attrap_prefdpt base class, so they may be shared with
    # other subclasses — confirm.
    Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'

    # Register a drop-down (select) widget
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='.* du ([0-9]*(?:er|ER)? [A-Za-zéÉûÛ]* [0-9]*)',
            css_path='select#Liste-liste-docs',
            type='year-month-day'
        )
    )
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref55(Attrap_prefdpt):
    """Configuration of the RAA scraper for the Meuse prefecture."""

    # Prefecture settings
    hostname = 'https://www.meuse.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de la Meuse'
    short_code = 'pref55'
    timezone = 'Europe/Paris'

    # Register the drop-down (select) widget.
    # NOTE(review): the list is reached through the Attrap_prefdpt base
    # class, so the widget may be shared with other subclasses — confirm.
    Attrap_prefdpt.select_widgets.append(
        Attrap_prefdpt.DptSelectWidget(
            'menu_deroulant',
            regex='RAA année ([0-9]{4})',
            css_path='select#Liste-des-recueils-liste-docs',
            type='year'
        )
    )
import os
import datetime
import dateparser
import logging
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref59(Attrap_prefdpt):
logger = logging.getLogger(__name__)
class Attrap_pref59(Attrap):
# Config
__HOST = 'https://www.nord.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.nord.gouv.fr'
raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/RAA-du-departement-du-Nord'
full_name = 'Préfecture du Nord'
short_code = 'pref59'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA of the relevant year pages, parse them and mail.

    Year pages are read from the cards of the RAA page and kept when the
    year guessed from their name is not older than ``self.not_before``.
    The sub-pages of each kept year are visited in reverse listing order
    and their RAA are read with ``get_raa_elements``.
    """
    card_selector = 'div.fr-card__body div.fr-card__content h2.fr-card__title a'

    # Year pages listed on the main RAA page
    page_content = self.get_page(self.__RAA_PAGE, 'get').content
    cards = self.get_sub_pages(page_content, card_selector, self.__HOST, False)
    year_pages = [
        card['url']
        for card in cards
        if Attrap.guess_date(card['name'], '([0-9]{4})').year >= self.not_before.year
    ]

    elements = []
    for raa_page in year_pages:
        page_content = self.get_page(raa_page, 'get').content
        sub_pages = self.get_sub_pages(page_content, card_selector, self.__HOST, True)
        for sub_page in sub_pages[::-1]:
            sub_page_content = self.get_page(sub_page['url'], 'get').content
            elements.extend(self.get_raa_elements(sub_page_content))

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA from the download links of ``page_content``.

    Only links whose ``href`` ends with ``.pdf`` are used. The name is the
    node preceding the link's first ``<span>`` with 'Télécharger ' removed;
    the date is the last ' - ' separated field of that span's text.

    Returns a list of ``Attrap.RAA`` objects, in page order.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    raa_list = []

    for link in soup.select('a.fr-link.fr-link--download'):
        href = link.get('href')
        if not href or not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)

        span = link.find('span')
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        date_text = span.get_text().split(' - ')[-1].strip()
        raa_list.append(Attrap.RAA(url, datetime.datetime.strptime(date_text, '%d/%m/%Y'), name))

    return raa_list
# Configuration des widgets à analyser
Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
Attrap_prefdpt.grey_card['add_year_to_months'] = True
from Attrap_prefdpt import Attrap_prefdpt
class Attrap_pref61(Attrap_prefdpt):
    """Configuration of the RAA scraper for the Orne prefecture."""

    # Prefecture settings
    hostname = 'https://www.orne.gouv.fr'
    raa_page = f'{hostname}/Publications/Recueil-des-Actes-Administratifs-RAA/Recueil-des-Actes-Administratifs-RAA'
    full_name = 'Préfecture de l\'Orne'
    short_code = 'pref61'
    timezone = 'Europe/Paris'

    # Widgets to analyse: year/month regexes for grey cards, month regex
    # for white cards, with 'add_year_to_months' enabled on both.
    # NOTE(review): these write into dicts reached through the
    # Attrap_prefdpt base class, so they may be shared with other
    # subclasses — confirm.
    Attrap_prefdpt.grey_card['regex']['year'] = 'Le Recueil des actes administratifs ([0-9]{4})'
    Attrap_prefdpt.grey_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.grey_card['add_year_to_months'] = True
    Attrap_prefdpt.white_card['regex']['month'] = '([A-Za-zéû]* [0-9]{4})'
    Attrap_prefdpt.white_card['add_year_to_months'] = True
import os
import datetime
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref62(Attrap):
class Attrap_pref62(Attrap_prefdpt):
# Config
__HOST = 'https://www.pas-de-calais.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
hostname = 'https://www.pas-de-calais.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
full_name = 'Préfecture du Pas-de-Calais'
short_code = 'pref62'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper.

    Forwards ``data_dir`` and the class user agent to the parent
    constructor, then calls ``set_sleep_time(30)``.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA of the relevant year pages, parse them and mail.

    Year pages are listed with ``get_sub_pages_with_pager`` and kept when
    the year guessed from their stripped name is not older than
    ``self.not_before``. Each kept page is fetched and passed through
    ``get_raa_elements``.
    """
    # Work out which year pages have to be parsed
    year_pages = self.get_sub_pages_with_pager(
        self.__RAA_PAGE,
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
        'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
        'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
        self.__HOST
    )
    pages_to_parse = [
        year_page['url']
        for year_page in year_pages
        if Attrap.guess_date(year_page['name'].strip(), '([0-9]{4}).*').year >= self.not_before.year
    ]

    elements = []
    for raa_page in pages_to_parse:
        html = self.get_page(raa_page, 'get').content
        elements.extend(self.get_raa_elements(html))

    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA from the first download group of the page.

    Every ``<a>`` with an ``href`` inside the first bordered download group
    is inspected; only those pointing to a ``.pdf`` are used. The name is
    the node preceding the link's first ``<span>`` with 'Télécharger '
    removed; the date is the last ' - ' separated field of the span text.

    Returns a list of ``Attrap.RAA`` objects, in reverse page order.
    """
    soup = BeautifulSoup(page_content, 'html.parser')

    # The div holding the list of RAA
    group = soup.select('div.fr-downloads-group.fr-downloads-group--bordered')[0]

    collected = []
    for link in group.find_all('a', href=True):
        href = link['href']
        if not href.endswith('.pdf'):
            continue

        # Site-relative links are prefixed with the host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)

        span = link.find('span')
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        date_text = span.get_text().split(' - ')[-1].strip()
        collected.append(Attrap.RAA(url, datetime.datetime.strptime(date_text, '%d/%m/%Y'), name))

    collected.reverse()
    return collected
# Configuration des widgets à analyser
Attrap_prefdpt.white_card['regex']['year'] = '([0-9]{4})'
import os
import datetime
import re
from Attrap_prefdpt import Attrap_prefdpt
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref63(Attrap_prefdpt):
class Attrap_pref63(Attrap):
# Config
__HOST = 'https://www.puy-de-dome.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-Puy-de-Dome'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
# Configuration de la préfecture
hostname = 'https://www.puy-de-dome.gouv.fr'
raa_page = f'{hostname}/Publications/Recueils-des-actes-administratifs/Recueils-des-actes-administratifs-Puy-de-Dome'
full_name = 'Préfecture du Puy-de-Dôme'
short_code = 'pref63'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
    """Initialise the scraper with its data directory and user agent.

    Args:
        data_dir: Forwarded unchanged to the parent constructor.
    """
    user_agent = self.__USER_AGENT
    super().__init__(data_dir, user_agent)
    # NOTE(review): presumably a delay in seconds between requests - confirm
    # against Attrap.set_sleep_time.
    self.set_sleep_time(30)
def get_raa(self, keywords):
    """Collect the RAA published on the relevant year pages and analyse them.

    Year pages whose name is not a plain year are still parsed (their year
    falls back to 9999) so that no RAA is missed; the 'Archives' page is
    always ignored.

    Args:
        keywords: Passed unchanged to ``parse_raa``.
    """
    # Function-scope import: this module's import block does not import
    # logging nor define a module-level logger, so the warning below used to
    # fail with NameError instead of being emitted.
    import logging

    # Work out which year pages have to be parsed
    page_content = self.get_page(self.__RAA_PAGE, 'get').content
    year_pages = self.get_sub_pages(
        page_content,
        'div.fr-card.fr-card--sm.fr-card--grey.fr-enlarge-link div.fr-card__body div.fr-card__content h2.fr-card__title a',
        self.__HOST,
        False
    )

    year_pages_to_parse = []
    for year_page in year_pages:
        name = year_page['name'].strip()
        if name == 'Archives':
            continue
        try:
            year = int(name)
        except ValueError:
            # int() on a string only raises ValueError; keep the page by
            # giving it a year that always passes the filter below.
            logging.getLogger(__name__).warning(f"Impossible de deviner l'année de la page {year_page['name']}")
            year = 9999

        if year >= self.not_before.year:
            year_pages_to_parse.append(year_page['url'])

    elements = []
    # For each selected year, extract the RAA of the page
    for year_page in year_pages_to_parse:
        page_content = self.get_page(year_page, 'get').content
        elements.extend(self.get_raa_elements(page_content))

    # Analyse the RAA
    self.parse_raa(elements, keywords)
    self.mailer()
def get_raa_elements(self, page_content):
    """Build the list of RAA found among the download links of a page.

    Args:
        page_content: Raw HTML of the page to analyse.

    Returns:
        A list of ``Attrap.RAA`` objects, in page order.
    """
    # Load the parser
    soup = BeautifulSoup(page_content, 'html.parser')

    found = []
    # Walk through every download link
    for link in soup.select('a.fr-link.fr-link--download'):
        href = link.get('href')
        # Only links that point to a PDF are RAA
        if not href or not href.endswith('.pdf'):
            continue

        # Relative links are made absolute with the prefecture host
        url = unquote(f"{self.__HOST}{href}" if href.startswith('/') else href)

        span = link.find('span')
        # The title is the text node located just before the <span>
        name = span.previous_sibling.replace('Télécharger ', '').strip()
        # The date is the last ' - ' separated field of the <span> text
        date_text = span.get_text().split(' - ')[-1].strip()
        date = datetime.datetime.strptime(date_text, '%d/%m/%Y')

        found.append(Attrap.RAA(url, date, name))
    return found
# Configuration des widgets à analyser
Attrap_prefdpt.grey_card['regex']['year'] = '([0-9]{4})'
......@@ -10,24 +10,25 @@ from Attrap import Attrap
class Attrap_pref64(Attrap):
# Config
__HOST = 'https://www.pyrenees-atlantiques.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
hostname = 'https://www.pyrenees-atlantiques.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Pyrénées-Atlantiques'
short_code = 'pref64'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
# On récupère les pages d'années
year_pages = []
page_content = self.get_page(self.__RAA_PAGE, 'get').content
page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages(
page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
self.hostname,
False
):
year = Attrap.guess_date(year_page['name'], '.* ([0-9]{4})').year
......@@ -41,7 +42,7 @@ class Attrap_pref64(Attrap):
for month_page in self.get_sub_pages(
page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
self.hostname,
False
):
if Attrap.guess_date(month_page['name'], '(.*)').replace(day=1) >= self.not_before.replace(day=1):
......@@ -51,7 +52,7 @@ class Attrap_pref64(Attrap):
elements = self.get_raa_with_pager(
month_pages[::-1],
'a.fr-pagination__link--next.fr-pagination__link--lg-label',
self.__HOST
self.hostname
)[::-1]
self.parse_raa(elements, keywords)
......@@ -66,7 +67,7 @@ class Attrap_pref64(Attrap):
for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link.menu-item-link'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
......@@ -74,6 +75,6 @@ class Attrap_pref64(Attrap):
name = a.get_text().strip()
date = datetime.datetime.strptime(a['title'].split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa)
return elements
......@@ -10,25 +10,26 @@ from Attrap import Attrap
class Attrap_pref65(Attrap):
# Config
__HOST = 'https://www.hautes-pyrenees.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-d-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
hostname = 'https://www.hautes-pyrenees.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-d-actes-administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Hautes-Pyrénées'
short_code = 'pref65'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
# On détermine quelles pages d'année parser
pages_to_parse = []
year_pages = self.get_sub_pages_with_pager(
self.__RAA_PAGE,
self.raa_page,
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next.fr-pagination__link--lg-label',
'div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
self.__HOST
self.hostname
)
for year_page in year_pages:
if Attrap.guess_date(year_page['name'].strip(), '.*([0-9]{4})').year >= self.not_before.year:
......@@ -52,7 +53,7 @@ class Attrap_pref65(Attrap):
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
......@@ -60,6 +61,6 @@ class Attrap_pref65(Attrap):
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa)
return elements
......@@ -14,24 +14,31 @@ logger = logging.getLogger(__name__)
class Attrap_pref66(Attrap):
# Config
__HOST = 'https://www.pyrenees-orientales.gouv.fr'
__RAA_PAGE = {
'2024': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2024',
'2023': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2023',
'2022': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2022',
'2021': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2021',
'2020': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2020',
'2019': f'{__HOST}/Publications/Le-recueil-des-actes-administratifs/Annee-2019'
}
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
hostname = 'https://www.pyrenees-orientales.gouv.fr'
raa_page = f'{hostname}/Publications/Le-recueil-des-actes-administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Pyrénées-Orientales'
short_code = 'pref66'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
# On détermine quelles pages d'année parser
year_pages = []
page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages(
page_content,
'.fr-table table tr td h3 a.fr-link',
self.hostname,
False
):
year = Attrap.guess_date(year_page['name'].strip(), '.* ([0-9]{4})').year
if year < 9999 and year >= self.not_before.year:
year_pages.append([year_page['url'], year])
elements = []
# La préfecture des Pyrénées-Orientales est une originale : avant 2024,
......@@ -43,24 +50,16 @@ class Attrap_pref66(Attrap):
# n'est pas exhaustif. On doit donc parser toutes les sous-pages de
# 2024 puisqu'on ne peut se fier au tableau récapitulatif.
# Grrr.
if self.not_before.year <= 2024:
for element in self.get_raa_elements_since_2024(self.__RAA_PAGE['2024']):
elements.append(element)
if self.not_before.year <= 2023:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2023']):
elements.append(element)
if self.not_before.year <= 2022:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2022']):
elements.append(element)
if self.not_before.year <= 2021:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2021']):
elements.append(element)
if self.not_before.year <= 2020:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2020']):
elements.append(element)
if self.not_before.year <= 2019:
for element in self.get_raa_elements_before_2024(self.__RAA_PAGE['2019']):
elements.append(element)
for year_page in year_pages:
url = year_page[0]
year = year_page[1]
if year >= 2024:
for element in self.get_raa_elements_since_2024(url):
elements.append(element)
else:
for element in self.get_raa_elements_before_2024(url):
elements.append(element)
self.parse_raa(elements, keywords)
self.mailer()
......@@ -89,7 +88,7 @@ class Attrap_pref66(Attrap):
if date >= self.not_before:
url = ''
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
......@@ -110,7 +109,7 @@ class Attrap_pref66(Attrap):
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link.fr-pagination__link--next',
'div.fr-card__body div.fr-card__content div.fr-card__end p.fr-card__detail',
self.__HOST
self.hostname
)[::-1]
pages_to_parse = []
......@@ -121,7 +120,7 @@ class Attrap_pref66(Attrap):
logger.warning(f"Attention, le lien vers {page['url']} n'est pas bon !")
else:
if page['url'].startswith('/'):
url = f"{self.__HOST}{page['url']}"
url = f"{self.hostname}{page['url']}"
else:
url = page['url']
......@@ -129,5 +128,5 @@ class Attrap_pref66(Attrap):
name = page['name'].replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(page['details'].replace('Publié le ', '').strip(), '%d/%m/%Y')
elements.append(Attrap.RAA(url, date, name))
elements.append(Attrap.RAA(url, date, name, timezone=self.timezone))
return elements
......@@ -10,47 +10,39 @@ from Attrap import Attrap
class Attrap_pref69(Attrap):
# Config
__HOST = 'https://www.rhone.gouv.fr'
__RAA_PAGE = {
'2024': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2024',
'2023': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2023',
'2022': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2022',
'2021': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2021',
'2020': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2020',
'2019': f'{__HOST}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA/Recueils-de-2019'
}
__USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
hostname = 'https://www.rhone.gouv.fr'
raa_page = f'{hostname}/Publications/Recueil-des-actes-administratifs-du-Rhone-RAA'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Rhône'
short_code = 'pref69'
timezone = 'Europe/Paris'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
super().__init__(data_dir, self.user_agent)
self.set_sleep_time(30)
def get_raa(self, keywords):
pages_to_parse = []
if self.not_before.year <= 2024:
pages_to_parse.append(self.__RAA_PAGE['2024'])
if self.not_before.year <= 2023:
pages_to_parse.append(self.__RAA_PAGE['2023'])
if self.not_before.year <= 2022:
pages_to_parse.append(self.__RAA_PAGE['2022'])
if self.not_before.year <= 2021:
pages_to_parse.append(self.__RAA_PAGE['2021'])
if self.not_before.year <= 2020:
pages_to_parse.append(self.__RAA_PAGE['2020'])
if self.not_before.year <= 2019:
pages_to_parse.append(self.__RAA_PAGE['2019'])
# On détermine quelles pages d'année parser
year_pages = []
page_content = self.get_page(self.raa_page, 'get').content
for year_page in self.get_sub_pages(
page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.hostname,
False
):
year = Attrap.guess_date(year_page['name'].strip(), '.* ([0-9]{4})').year
if year < 9999 and year >= self.not_before.year:
year_pages.append(year_page['url'])
sub_pages_to_parse = []
for raa_page in pages_to_parse:
for raa_page in year_pages:
sub_pages = self.get_sub_pages_with_pager(
raa_page,
'div.fr-card__body div.fr-card__content h2.fr-card__title a.fr-card__link',
'ul.fr-pagination__list li a.fr-pagination__link--next',
None,
self.__HOST)[::-1]
self.hostname)[::-1]
for sub_page in sub_pages:
sub_pages_to_parse.append(sub_page['url'])
......@@ -72,7 +64,7 @@ class Attrap_pref69(Attrap):
for a in soup.select('a.fr-link.fr-link--download'):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
url = f"{self.hostname}{a['href']}"
else:
url = a['href']
......@@ -80,6 +72,6 @@ class Attrap_pref69(Attrap):
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
raa = Attrap.RAA(url, date, name, timezone=self.timezone)
elements.append(raa)
return elements