Commit bf271b74 authored by kr1p

Add the Lot-et-Garonne prefecture

parent 4ce909ed
import datetime

from bs4 import BeautifulSoup
from urllib.parse import unquote

from Attrap import Attrap

class Attrap_pref47(Attrap):

    # Config
    __HOST = 'https://www.lot-et-garonne.gouv.fr'
    __RAA_PAGE = f'{__HOST}/Publications/Publications-legales/RAA'
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture de Lot-et-Garonne'
    short_code = 'pref47'
    def __init__(self, data_dir):
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)
        # Listing pages for years before this date use a pager
        self.not_before = datetime.datetime(2020, 1, 1)

    # Overridden from the parent class to handle CSS classes
    # containing whitespace
    def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
        pages = []
        page_content = self.get_page(page, 'get').content

        # Initialise the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # Look for the sub-pages
        sub_pages = soup.select(sub_page_element)
        sub_pages_details = None
        if details_element is not None:
            sub_pages_details = soup.select(details_element)
        i = 0
        for sub_page in sub_pages:
            if sub_page.get('href'):
                page = {
                    'url': f"{host}{sub_page['href']}",
                    'name': sub_page.get_text().strip(),
                    'details': ''
                }
                if details_element is not None:
                    page['details'] = sub_pages_details[i].get_text().strip()
                pages.append(page)
            i = i + 1

        # Look for a pager and, if one is found, follow it. Unlike the
        # parent implementation, pager_element must contain only the
        # value of the class attribute (not the element name), so that
        # classes containing whitespace can be matched.
        pager = soup.find_all('a', class_=pager_element)
        if pager and pager[0].get('href'):
            for sub_page in self.get_sub_pages_with_pager(
                f"{host}{pager[0]['href']}",
                sub_page_element,
                pager_element,
                details_element,
                host
            ):
                pages.append(sub_page)
        return pages
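
    # Illustration (not part of the scraper): when class_ is a string
    # containing whitespace, BeautifulSoup compares it against the full
    # class attribute value, so a pager link such as
    #   <a class="fr-pagination__link fr-pagination__link--next" href="...">
    # is matched by
    #   soup.find_all('a', class_='fr-pagination__link fr-pagination__link--next')
    # whereas a CSS selector would have to chain the classes with dots.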
    def get_raa(self, keywords):
        elements = []
        page_content = self.get_page(self.__RAA_PAGE, 'get').content
        soup = BeautifulSoup(page_content, 'html.parser')
        for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
            # Years without a pager
            if Attrap.guess_date(f'{self.__HOST}{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year:
                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                for sub_page in self.get_sub_pages(
                    page_content,
                    'div.fr-card__body div.fr-card__content h2.fr-card__title a',
                    self.__HOST,
                    True
                ):
                    sub_page_content = self.get_page(sub_page['url'], 'get').content
                    for element in self.get_raa_elements(sub_page_content):
                        elements.append(element)
            # RAAs from 2019 and earlier have pagers
            else:
                for sub_page in self.get_sub_pages_with_pager(
                    f"{self.__HOST}{a['href']}",
                    'div.fr-card__body div.fr-card__content h2.fr-card__title a',
                    'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label',
                    None,
                    self.__HOST
                ):
                    sub_page_content = self.get_page(sub_page['url'], 'get').content
                    for element in self.get_raa_elements(sub_page_content):
                        elements.append(element)

        # ocrmypdf bug on my Ubuntu 20.04 with
        # --invalidate-digital-signatures, even though it is documented at
        # https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
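        # Illustrative invocation (assuming an ocrmypdf version where the
        # documented flag is available; file names are placeholders):
        #   ocrmypdf --invalidate-digital-signatures input.pdf output.pdf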
        self.parse_raa(elements, keywords)
        self.mailer()
    def get_raa_elements(self, page_content):
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')

        # For each <a> tag, check whether it links to a PDF and, if so,
        # parse it. The unusual attribute filter below appears to mirror
        # malformed markup on the site, where the id attribute literally
        # contains 'class="fr-link'.
        for a in soup.find_all('a', {'id': 'class="fr-link'}):
            if a.get('href') and a['href'].endswith('.pdf'):
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']

                url = unquote(url)
                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')

                raa = Attrap.RAA(url, date, name)
                self.download_file(raa)
                elements.append(raa)
        return elements
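

if __name__ == '__main__':
    # Hypothetical smoke test (not part of the Attrap CLI): the data
    # directory and keyword list below are illustrative values only.
    pref = Attrap_pref47('data/pref47')
    pref.get_raa(['sécurité', 'vidéoprotection'])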