Commit eae5fa90 authored by kr1p, committed by Guillaume Seren

add pref12,pref40,pref46

parent 91b2f25f
@@ -147,7 +147,7 @@ class Attrap:
         self.tor_enabled = False
         self.tor_max_requests = 0
         self.tor_requests = 0
-        self.not_before = datetime.datetime(2024, 1, 1)
+        self.not_before = datetime.datetime(2015, 1, 1)
         self.smtp_configured = False
         self.mastodon = None
         self.mastodon_prefix = ''
@@ -462,7 +462,7 @@ class Attrap:
             except requests.exceptions.ConnectionError:
                 logger.warning(f'Erreur de connexion, temporisation...')
                 self.tor_get_new_id()
-                time.sleep(55)
+                time.sleep(60)
                 return self.get_page(url, method, data)
             except requests.exceptions.Timeout:
                 logger.warning(f'Timeout, on relance la requête...')
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
from Attrap import Attrap
class Attrap_pref12(Attrap):

    # Config
    __HOST = 'https://www.aveyron.gouv.fr'
    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs'
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = "Préfecture de l'Aveyron"
    short_code = 'pref12'

    def __init__(self, data_dir):
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)
        self.not_before = datetime.datetime(2020, 1, 1)  # before 2020 the RAAs sit on an "archives" page
    # Overridden to handle CSS classes that contain whitespace
    def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
        pages = []
        page_content = self.get_page(page, 'get').content
        # Initialise the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # Look for the sub-pages
        sub_pages = soup.select(sub_page_element)
        sub_pages_details = None
        if details_element is not None:
            sub_pages_details = soup.select(details_element)
        i = 0
        for sub_page in sub_pages:
            print(sub_page)
            if sub_page.get('href'):
                page = {
                    'url': f"{host}{sub_page['href']}",
                    'name': sub_page.get_text().strip(),
                    'details': ''
                }
                if details_element is not None:
                    page['details'] = sub_pages_details[i].get_text().strip()
                pages.append(page)
            i = i + 1
        # Look for a pager and, if found, follow it.
        # Changed here: pager_element must contain only the class string
        # (not the element selector), so that classes containing whitespace
        # are matched against the full class attribute value.
        pager = soup.find_all("a", class_=pager_element)
        print(pager)
        if pager and len(pager) > 0 and pager[0].get('href'):
            for sub_page in self.get_sub_pages_with_pager(
                f"{host}{pager[0]['href']}",
                sub_page_element,
                pager_element,
                details_element,
                host
            ):
                pages.append(sub_page)
        return pages
    def get_raa(self, keywords):
        elements = []
        page_content = self.get_page(self.__RAA_PAGE, 'get').content
        soup = BeautifulSoup(page_content, 'html.parser')
        # Select the grey cards
        for a in soup.select('div.fr-card--grey div.fr-card__body div.fr-card__content h2.fr-card__title a'):
            # Regular years
            if Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "Archives" not in f'{a.get_text().strip()}':
                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
                    sub_page_content = self.get_page(sub_page['url'], 'get').content
                    for element in self.get_raa_elements(sub_page_content):
                        elements.append(element)
            # Archives
            elif self.not_before.year < 2021 and "Archives" in f'{a.get_text().strip()}':
                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                for sub_page in self.get_sub_pages(page_content,
                                                   'div.fr-card__body div.fr-card__content h2.fr-card__title a',
                                                   self.__HOST,
                                                   True):
                    sub_page_content = self.get_page(sub_page['url'], 'get').content
                    # Parse the sub-page before selecting the year links
                    # (assumes relative hrefs, as elsewhere on the site)
                    sub_page_soup = BeautifulSoup(sub_page_content, 'html.parser')
                    for a in sub_page_soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
                        sub_sub_page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                        for element in self.get_raa_elements(sub_sub_page_content):
                            elements.append(element)
        # Select the "special" RAAs
        for div in soup.select("div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w"):
            for a in div.select("div.fr-card__body div.fr-card__content h2.fr-card__title a"):
                print(a)
                search_pattern = re.search('(?<=Publié le).*', f'{a.parent.parent.get_text()}')
                if search_pattern:
                    if Attrap.guess_date(search_pattern[0], '([0-9]{4}).*').year >= self.not_before.year:
                        page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                        for sub_page in self.get_sub_pages(page_content,
                                                           'div.fr-card__body div.fr-card__content h2.fr-card__title a',
                                                           self.__HOST,
                                                           True):
                            sub_page_content = self.get_page(sub_page['url'], 'get').content
                            for element in self.get_raa_elements(sub_page_content):
                                elements.append(element)
        # ocrmypdf bug on my Ubuntu 20.04 (to be tested on Arch soon)
        # with --invalidate-digital-signatures, even though it is in the docs
        # here: https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
        self.parse_raa(elements, keywords)
        self.mailer()
    def get_raa_elements(self, page_content):
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # For each <a> tag, check whether it points to a PDF and, if so,
        # parse it
        print(soup.find_all("a", {"id": 'class="fr-link'}))
        print(len(soup.find_all("a", {"id": 'class="fr-link'})))
        for a in soup.find_all("a", {"id": 'class="fr-link'}):
            if a.get('href') and a['href'].endswith('.pdf'):
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
                raa = Attrap.RAA(url, date, name)
                self.download_file(raa)
                elements.append(raa)
        print(elements)
        return elements
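The get_sub_pages_with_pager override above works by handing find_all() the full, space-separated class string of the pager link instead of a CSS selector. A minimal, self-contained illustration against contrived markup (the markup itself is an assumption; only the class names come from the code above):

from bs4 import BeautifulSoup

# Contrived markup: one pager link carrying three classes.
html = ('<a class="fr-pagination__link fr-pagination__link--next '
        'fr-pagination__link--lg-label" href="/page-2">Suivant</a>')
soup = BeautifulSoup(html, 'html.parser')

pager_classes = 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label'

# Passing the whole space-separated string to class_ matches the exact
# class attribute value, which is what the override relies on.
print(soup.find_all('a', class_=pager_classes))

# The CSS-selector equivalent would need each class chained with dots.
print(soup.select('a.' + '.'.join(pager_classes.split())))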
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref40(Attrap):

    # Config
    __HOST = 'https://www.landes.gouv.fr'
    __RAA_PAGE = f'{__HOST}/Publications/Publications-legales/Le-Recueil-des-Actes-Administratifs-RAA'
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture des Landes'
    short_code = 'pref40'

    def __init__(self, data_dir):
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)
        self.not_before = datetime.datetime(2016, 1, 1)  # before 2016 the RAAs sit on a "previous years" page
    # Overridden to handle CSS classes that contain whitespace
    def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
        pages = []
        page_content = self.get_page(page, 'get').content
        # Initialise the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # Look for the sub-pages
        sub_pages = soup.select(sub_page_element)
        sub_pages_details = None
        if details_element is not None:
            sub_pages_details = soup.select(details_element)
        i = 0
        for sub_page in sub_pages:
            print(sub_page)
            if sub_page.get('href'):
                page = {
                    'url': f"{host}{sub_page['href']}",
                    'name': sub_page.get_text().strip(),
                    'details': ''
                }
                if details_element is not None:
                    page['details'] = sub_pages_details[i].get_text().strip()
                pages.append(page)
            i = i + 1
        # Look for a pager and, if found, follow it.
        # Changed here: pager_element must contain only the class string
        # (not the element selector), so that classes containing whitespace
        # are matched against the full class attribute value.
        pager = soup.find_all("a", class_=pager_element)
        print(pager)
        if pager and len(pager) > 0 and pager[0].get('href'):
            for sub_page in self.get_sub_pages_with_pager(
                f"{host}{pager[0]['href']}",
                sub_page_element,
                pager_element,
                details_element,
                host
            ):
                pages.append(sub_page)
        return pages
    def get_raa(self, keywords):
        elements = []
        page_content = self.get_page(self.__RAA_PAGE, 'get').content
        soup = BeautifulSoup(page_content, 'html.parser')
        for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
            # Archived years
            if self.not_before.year < 2016 and "antérieures" in a.get_text().strip():
                print(f"{a.get_text().strip()}")
                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
                    # sub_page is a dict built by the pager helper, so filter on its name
                    if Attrap.guess_date(sub_page['name'], '([0-9]{4}).*').year >= self.not_before.year:
                        sub_page_content = self.get_page(sub_page['url'], 'get').content
                        for element in self.get_raa_elements(sub_page_content):
                            elements.append(element)
            # Non-archived years
            elif Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "antérieures" not in a.get_text().strip():
                print(f"{a.get_text().strip()}")
                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
                    sub_page_content = self.get_page(sub_page['url'], 'get').content
                    for element in self.get_raa_elements(sub_page_content):
                        elements.append(element)
        # ocrmypdf bug on my Ubuntu 20.04 (to be tested on Arch soon)
        # with --invalidate-digital-signatures, even though it is in the docs
        # here: https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
        self.parse_raa(elements, keywords)
        self.mailer()
    def get_raa_elements(self, page_content):
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # For each <a> tag, check whether it points to a PDF and, if so,
        # parse it
        for a in soup.find_all("a", {"id": 'class="fr-link'}):
            if a.get('href') and a['href'].endswith('.pdf'):
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
                raa = Attrap.RAA(url, date, name)
                self.download_file(raa)
                elements.append(raa)
        print(elements)
        return elements
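get_raa_elements in the scrapers above relies on the exact shape of the download links: an anchor whose id attribute literally reads class="fr-link (the prefecture pages appear to ship that mangled attribute), a leading "Télécharger …" text node, and a span whose text ends with the publication date. A standalone sketch against contrived markup mirroring that shape:

import datetime

from bs4 import BeautifulSoup

# Contrived markup: the link text is "Télécharger <name>" followed by a
# <span> whose text ends with "dd/mm/YYYY".
html = ('<a id=\'class="fr-link\' href="/contenu/telechargement/0001/raa.pdf">'
        'Télécharger RAA n°12 <span>PDF - 1,53 Mo - 02/05/2024</span></a>')
a = BeautifulSoup(html, 'html.parser').find('a', {'id': 'class="fr-link'})

name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(
    a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
print(name, date)  # RAA n°12 2024-05-02 00:00:00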
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
from Attrap import Attrap
class Attrap_pref46(Attrap):

    # Config
    __HOST = 'https://www.lot.gouv.fr'
    __RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs'
    __USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
    full_name = 'Préfecture du Lot'
    short_code = 'pref46'

    def __init__(self, data_dir):
        super().__init__(data_dir, self.__USER_AGENT)
        self.enable_tor(10)
        self.not_before = datetime.datetime(2020, 1, 1)  # before 2020 the RAAs sit on an "archives" page
    # Overridden to handle CSS classes that contain whitespace
    def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
        pages = []
        page_content = self.get_page(page, 'get').content
        # Initialise the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # Look for the sub-pages
        sub_pages = soup.select(sub_page_element)
        sub_pages_details = None
        if details_element is not None:
            sub_pages_details = soup.select(details_element)
        i = 0
        for sub_page in sub_pages:
            print(sub_page)
            if sub_page.get('href'):
                page = {
                    'url': f"{host}{sub_page['href']}",
                    'name': sub_page.get_text().strip(),
                    'details': ''
                }
                if details_element is not None:
                    page['details'] = sub_pages_details[i].get_text().strip()
                pages.append(page)
            i = i + 1
        # Look for a pager and, if found, follow it.
        # Changed here: pager_element must contain only the class string
        # (not the element selector), so that classes containing whitespace
        # are matched against the full class attribute value.
        pager = soup.find_all("a", class_=pager_element)
        print(pager)
        if pager and len(pager) > 0 and pager[0].get('href'):
            for sub_page in self.get_sub_pages_with_pager(
                f"{host}{pager[0]['href']}",
                sub_page_element,
                pager_element,
                details_element,
                host
            ):
                pages.append(sub_page)
        return pages
    def get_raa(self, keywords):
        elements = []
        page_content = self.get_page(self.__RAA_PAGE, 'get').content
        soup = BeautifulSoup(page_content, 'html.parser')
        # Select the grey cards
        for a in soup.select('div.fr-card--grey div.fr-card__body div.fr-card__content h2.fr-card__title a'):
            # Regular years
            if Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "Archives" not in f'{a.get_text().strip()}':
                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
                    sub_page_content = self.get_page(sub_page['url'], 'get').content
                    for element in self.get_raa_elements(sub_page_content):
                        elements.append(element)
            # Archives
            elif self.not_before.year < 2020 and "Archives" in f'{a.get_text().strip()}':
                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                for sub_page in self.get_sub_pages(page_content,
                                                   'div.fr-card__body div.fr-card__content h2.fr-card__title a',
                                                   self.__HOST,
                                                   True):
                    sub_page_content = self.get_page(sub_page['url'], 'get').content
                    for element in self.get_raa_elements(sub_page_content):
                        elements.append(element)
        # Select the "special" RAAs
        for div in soup.select("div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w"):
            for a in div.select("div.fr-card__body div.fr-card__content h2.fr-card__title a"):
                print(a)
                search_pattern = re.search('(?<=Publié le).*', f'{a.parent.parent.get_text()}')
                if search_pattern:
                    if Attrap.guess_date(search_pattern[0], '([0-9]{4}).*').year >= self.not_before.year:
                        page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                        for sub_page in self.get_sub_pages(page_content,
                                                           'div.fr-card__body div.fr-card__content h2.fr-card__title a',
                                                           self.__HOST,
                                                           True):
                            sub_page_content = self.get_page(sub_page['url'], 'get').content
                            for element in self.get_raa_elements(sub_page_content):
                                elements.append(element)
        # ocrmypdf bug on my Ubuntu 20.04 (to be tested on Arch soon)
        # with --invalidate-digital-signatures, even though it is in the docs
        # here: https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
        self.parse_raa(elements, keywords)
        self.mailer()
    def get_raa_elements(self, page_content):
        elements = []
        # Load the parser
        soup = BeautifulSoup(page_content, 'html.parser')
        # For each <a> tag, check whether it points to a PDF and, if so,
        # parse it
        print(soup.find_all("a", {"id": 'class="fr-link'}))
        print(len(soup.find_all("a", {"id": 'class="fr-link'})))
        for a in soup.find_all("a", {"id": 'class="fr-link'}):
            if a.get('href') and a['href'].endswith('.pdf'):
                if a['href'].startswith('/'):
                    url = f"{self.__HOST}{a['href']}"
                else:
                    url = a['href']
                url = unquote(url)
                name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
                raa = Attrap.RAA(url, date, name)
                self.download_file(raa)
                elements.append(raa)
        print(elements)
        return elements
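All three scrapers filter the year cards through Attrap.guess_date with the pattern '([0-9]{4}).*' and only use the .year of the result. A rough, hypothetical stand-in showing what that filter boils down to (guess_year is not the project's helper, just an illustration of the regex):

import re


def guess_year(text, pattern='([0-9]{4}).*'):
    # Hypothetical stand-in: pull the first four-digit group out of a card title.
    match = re.search(pattern, text)
    return int(match.group(1)) if match else None


print(guess_year('Recueil des actes administratifs 2023'))  # 2023
print(guess_year('Années antérieures'))                      # None (no year in the title)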
@@ -85,7 +85,7 @@ class Attrap_pref47(Attrap):
             #les raa de 2019 et années précédentes ont des pagers
             #else:
             #    page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
-            #    for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'a.fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
+            #    for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
             #        sub_page_content = self.get_page(sub_page['url'], 'get').content
             #        for element in self.get_raa_elements(sub_page_content):
             #            elements.append(element)
@@ -9,6 +9,8 @@ pref06:
 	bin/python3 cli.py pref06
 pref09:
 	bin/python3 cli.py pref09
+pref12:
+	bin/python3 cli.py pref12
 pref13:
 	bin/python3 cli.py pref13
 pref31:
@@ -21,10 +23,14 @@ pref35:
 	bin/python3 cli.py pref35
 pref38:
 	bin/python3 cli.py pref38
+pref40:
+	bin/python3 cli.py pref40
 pref42:
 	bin/python3 cli.py pref42
 pref44:
 	bin/python3 cli.py pref44
+pref46:
+	bin/python3 cli.py pref46
 pref47:
 	bin/python3 cli.py pref47
 pref59:
@@ -44,14 +44,17 @@ available_administrations = [
     'pref05',
     'pref06',
     'pref09',
+    'pref12',
     'pref13',
     'pref31',
     'pref33',
     'pref34',
     'pref35',
     'pref38',
+    'pref40',
     'pref42',
     'pref44',
+    'pref46',
     'pref47',
     'pref59',
     'pref62',
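With the three codes registered above, each scraper runs through cli.py, as the Makefile targets do (bin/python3 cli.py pref12, pref40 or pref46). It can also be driven directly; a minimal sketch, where the module path, data directory and keyword list are assumptions:

import datetime

from Attrap_pref46 import Attrap_pref46  # assumes the class lives in Attrap_pref46.py

scraper = Attrap_pref46('data/pref46')               # data directory is a placeholder
scraper.not_before = datetime.datetime(2023, 1, 1)   # optionally narrow the crawl window
scraper.get_raa(['vidéoprotection'])                 # keyword list is a placeholder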