Compare revisions

Changes are shown as if the source revision had been merged into the target revision.

Source

Sélectionner le projet cible
No results found

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
Commits on the source (4)
@@ -158,6 +158,11 @@ test_pref44:
     PREF: "pref44"
   extends: .default_pref
+test_pref47:
+  variables:
+    PREF: "pref47"
+  extends: .default_pref
 test_pref59:
   variables:
     PREF: "pref59"
......
@@ -140,7 +140,7 @@ class Attrap:
         self.tor_enabled = False
         self.tor_max_requests = 0
         self.tor_requests = 0
-        self.not_before = datetime.datetime(2024, 1, 1)
+        self.not_before = datetime.datetime(2015, 1, 1)
         self.smtp_configured = False
         self.mastodon = None
         self.mastodon_prefix = ''
@@ -392,7 +392,7 @@ class Attrap:
         if page.status_code == 429:
             logger.warning('Erreur 429 Too Many Requests reçue, temporisation...')
             self.tor_get_new_id()
-            time.sleep(55)
+            time.sleep(60)
             return self.get_page(url, method, data)
         if self.tor_enabled:
@@ -405,7 +405,7 @@ class Attrap:
         except requests.exceptions.ConnectionError:
             logger.warning(f'Erreur de connexion, temporisation...')
             self.tor_get_new_id()
-            time.sleep(55)
+            time.sleep(60)
             return self.get_page(url, method, data)
         except requests.exceptions.Timeout:
             logger.warning(f'Timeout, on relance la requête...')
@@ -415,16 +415,31 @@ class Attrap:
         self.user_agent = user_agent
         self.session.headers.update({'User-Agent': self.user_agent})

-    def download_file(self, raa):
+    def download_file(self, raa, overwrite=True):
         try:
             os.makedirs(
                 os.path.dirname(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'),
                 exist_ok=True
             )
-            file = self.get_page(raa.url, 'get')
-            f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
-            f.write(file.content)
-            f.close()
+            # content-length header was wrong (114) on a test file
+            # response = requests.head(raa.url)
+            # size = response.headers.get('content-length')  # actual length 9549465
+            # if size is not None:
+            #     print(size)
+            if overwrite and os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'):
+                print(f'file already present on disk, overwriting {self.data_dir}/raa/{raa.get_sha256()}.pdf for {raa}')
+                # if size != os.path.getsize(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf')
+                file = self.get_page(raa.url, 'get')
+                f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
+                f.write(file.content)
+                f.close()
+            elif not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'):
+                print(f'file not present on disk, downloading {self.data_dir}/raa/{raa.get_sha256()}.pdf for {raa}')
+                file = self.get_page(raa.url, 'get')
+                f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
+                f.write(file.content)
+                f.close()
         except (requests.exceptions.ConnectionError,
                 requests.exceptions.ChunkedEncodingError):
             logger.warning(f'ATTENTION: la connexion a été interrompue pendant le téléchargement de {raa.url}, nouvelle tentative...')
......
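The commented-out size check in download_file() hints at comparing the server's Content-Length header with the file already on disk before re-downloading. Below is a minimal sketch of that idea as a hypothetical helper that is not part of Attrap; as the comment notes, the header was plainly wrong on at least one test file, so it could not be trusted on its own.

import os
import requests

def should_redownload(url, path, timeout=10):
    """Return True when the PDF at `path` is missing or its size differs
    from the Content-Length advertised by the server (when available)."""
    if not os.path.isfile(path):
        return True
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
        size = response.headers.get('content-length')
    except requests.exceptions.RequestException:
        return False  # cannot check, keep the local copy
    if size is None or not size.isdigit():
        return False  # header missing or unusable (e.g. the bogus value 114 mentioned above)
    return int(size) != os.path.getsize(path)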
import os
import datetime
import time
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
from Attrap import Attrap
class Attrap_pref12(Attrap):
# Config
__HOST = 'https://www.aveyron.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-actes-administratifs'
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = "Préfecture de l'Aveyron"
short_code = 'pref12'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
self.not_before = datetime.datetime(2020, 1, 1)  # before 2020 the RAAs sit on an "archives" page
# Overridden to handle CSS classes that contain whitespace
def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
pages = []
page_content = self.get_page(page, 'get').content
# On initialise le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On recherche les sous-pages
sub_pages = soup.select(sub_page_element)
sub_pages_details = None
if details_element is not None:
sub_pages_details = soup.select(details_element)
i = 0
for sub_page in sub_pages:
print(sub_page)
if sub_page.get('href'):
page = {
'url': f"{host}{sub_page['href']}",
'name': sub_page.get_text().strip(),
'details': ''
}
if details_element is not None:
page['details'] = sub_pages_details[i].get_text().strip()
pages.append(page)
i = i + 1
# Look for a pager and, if one is found, follow it.
# Changed here: the pager_element parameter must contain only the
# class string, not the element selector, so that classes
# containing whitespace can be matched.
pager = soup.find_all("a",class_=pager_element)
print(pager)
if pager and len(pager)>0 and pager[0].get('href'):
for sub_page in self.get_sub_pages_with_pager(
f"{host}{pager[0]['href']}",
sub_page_element,
pager_element,
details_element,
host
):
pages.append(sub_page)
return pages
def get_raa(self, keywords):
elements = []
page_content = self.get_page(self.__RAA_PAGE, 'get').content
soup = BeautifulSoup(page_content, 'html.parser')
# Select the grey cards
print(f"not_before={self.not_before.year}")
for a in soup.select('div.fr-card--grey div.fr-card__body div.fr-card__content h2.fr-card__title a'):
#archives links
if self.not_before.year<2021 and "Archives" in f'{a.get_text().strip()}':
print(f"""########################\n
archives links\n
{a.get_text().strip()}
###########################""")
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages(page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
True):
print(f"retrieving {sub_page['url']}")
sub_page_content = self.get_page(sub_page['url'], 'get').content
subsoup = BeautifulSoup(sub_page_content, 'html.parser')
for a in subsoup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
sub_sub_page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for element in self.get_raa_elements(sub_sub_page_content):
print(f"appending {element}")
elements.append(element)
#regular links
if Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "Archives" not in f'{a.get_text().strip()}':
print(f"""########################\n
regular links\n
{a.get_text().strip()}
###########################""")
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
print(f"retrieving {sub_page['url']}")
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
print(f"appending {element}")
elements.append(element)
#selection des "spécials"
for div in soup.select("div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w"):
print("""########################\n
specials links\n
###########################""")
for a in div.select("div.fr-card__body div.fr-card__content h2.fr-card__title a"):
print(a)
search_pattern=re.search('(?<=Publié le).*',f'{a.parent.parent.get_text()}')
if search_pattern:
if Attrap.guess_date(search_pattern[0], '([0-9]{4}).*').year>=self.not_before.year:
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages(page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
True):
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
print(f"appending {element}")
elements.append(element)
for raa in elements:
print(f"downloading {raa}")
self.download_file(raa,overwrite=False)
time.sleep(14)
# ocrmypdf bug on my Ubuntu 20.04 (will test on Arch soon)
# with --invalidate-digital-signatures, even though it is documented
# here: https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
# parse
#print(soup.find_all("a",{"id":'class="fr-link'}))
#print(len(soup.find_all("a",{"id":'class="fr-link'})))
for a in soup.find_all("a",{"id":'class="fr-link'}):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
elements.append(raa)
print(elements)
return elements
# Debug call left at module level: importing this file runs the scraper directly
Attrap_pref12('test').get_raa('algorithmes')
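The overridden get_sub_pages_with_pager() exists because the "next page" link on these sites carries several space-separated classes, and BeautifulSoup's class_ filter can match that full class string directly; that is why only the class string, and not an element selector, is passed in. A small self-contained illustration on made-up markup, not taken from the prefecture pages:

from bs4 import BeautifulSoup

html = '<a class="fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label" href="/page/2">Suivant</a>'
soup = BeautifulSoup(html, 'html.parser')

# Matching the exact multi-class string, as the overridden method does:
print(soup.find_all('a', class_='fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label'))

# An order-independent alternative would be a CSS selector:
print(soup.select('a.fr-pagination__link.fr-pagination__link--next'))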
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref40(Attrap):
# Config
__HOST = 'https://www.landes.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Publications-legales/Le-Recueil-des-Actes-Administratifs-RAA'
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture des Landes'
short_code = 'pref40'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
self.not_before = datetime.datetime(2016, 1, 1)  # before 2016 the RAAs sit on an "earlier years" page
# Overridden to handle CSS classes that contain whitespace
def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
pages = []
page_content = self.get_page(page, 'get').content
# On initialise le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On recherche les sous-pages
sub_pages = soup.select(sub_page_element)
sub_pages_details = None
if details_element is not None:
sub_pages_details = soup.select(details_element)
i = 0
for sub_page in sub_pages:
print(sub_page)
if sub_page.get('href'):
page = {
'url': f"{host}{sub_page['href']}",
'name': sub_page.get_text().strip(),
'details': ''
}
if details_element is not None:
page['details'] = sub_pages_details[i].get_text().strip()
pages.append(page)
i = i + 1
# Look for a pager and, if one is found, follow it.
# Changed here: the pager_element parameter must contain only the
# class string, not the element selector, so that classes
# containing whitespace can be matched.
pager = soup.find_all("a",class_=pager_element)
print(pager)
if pager and len(pager)>0 and pager[0].get('href'):
for sub_page in self.get_sub_pages_with_pager(
f"{host}{pager[0]['href']}",
sub_page_element,
pager_element,
details_element,
host
):
pages.append(sub_page)
return pages
def get_raa(self, keywords):
elements = []
page_content = self.get_page(self.__RAA_PAGE, 'get').content
soup = BeautifulSoup(page_content, 'html.parser')
print(f"not_before={self.not_before.year}")
for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
# Archived years
if self.not_before.year<2016 and "antérieures" in a.get_text().strip():
print(f"{a.get_text().strip()}")
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
if Attrap.guess_date(f'{sub_page.get_text().strip()}', '([0-9]{4}).*').year>=self.not_before.year:
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
elements.append(element)
# Non-archived years
elif Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "antérieures" not in a.get_text().strip():
print(f"{a.get_text().strip()}")
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
elements.append(element)
for raa in elements:
print(f'downloading {raa}')
self.download_file(raa, overwrite=False)
# ocrmypdf bug on my Ubuntu 20.04 (will test on Arch soon)
# with --invalidate-digital-signatures, even though it is documented
# here: https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
# parse
for a in soup.find_all("a",{"id":'class="fr-link'}):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
elements.append(raa)
print(elements)
return elements
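Each of these scrapers filters the year cards through Attrap.guess_date() with the pattern '([0-9]{4}).*' and compares the result against not_before. A rough, hypothetical equivalent of that year filter using plain re is sketched below; the real guess_date lives in Attrap.py and is only assumed here to behave this way.

import datetime
import re

def year_from_title(title, default=9999):
    """Pull a four-digit year out of a card title such as
    'Recueil des actes administratifs 2023'; fall back to `default`."""
    match = re.search(r'([0-9]{4})', title)
    return int(match.group(1)) if match else default

not_before = datetime.datetime(2016, 1, 1)
print(year_from_title('RAA 2015') >= not_before.year)  # False: the card is skipped
print(year_from_title('RAA 2023') >= not_before.year)  # True: the card is crawled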
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
from Attrap import Attrap
class Attrap_pref46(Attrap):
# Config
__HOST = 'https://www.lot.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Recueil-des-Actes-Administratifs'
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Lot'
short_code = 'pref46'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
self.not_before = datetime.datetime(2020, 1, 1)  # before 2020 the RAAs sit on an "archives" page
# Overridden to handle CSS classes that contain whitespace
def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
pages = []
page_content = self.get_page(page, 'get').content
# On initialise le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On recherche les sous-pages
sub_pages = soup.select(sub_page_element)
sub_pages_details = None
if details_element is not None:
sub_pages_details = soup.select(details_element)
i = 0
for sub_page in sub_pages:
print(sub_page)
if sub_page.get('href'):
page = {
'url': f"{host}{sub_page['href']}",
'name': sub_page.get_text().strip(),
'details': ''
}
if details_element is not None:
page['details'] = sub_pages_details[i].get_text().strip()
pages.append(page)
i = i + 1
# Look for a pager and, if one is found, follow it.
# Changed here: the pager_element parameter must contain only the
# class string, not the element selector, so that classes
# containing whitespace can be matched.
pager = soup.find_all("a",class_=pager_element)
print(pager)
if pager and len(pager)>0 and pager[0].get('href'):
for sub_page in self.get_sub_pages_with_pager(
f"{host}{pager[0]['href']}",
sub_page_element,
pager_element,
details_element,
host
):
pages.append(sub_page)
return pages
def get_raa(self, keywords):
elements = []
page_content = self.get_page(self.__RAA_PAGE, 'get').content
soup = BeautifulSoup(page_content, 'html.parser')
# Select the grey cards
for a in soup.select('div.fr-card--grey div.fr-card__body div.fr-card__content h2.fr-card__title a'):
#regular
if Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "Archives" not in f'{a.get_text().strip()}':
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
elements.append(element)
#archives
elif self.not_before.year<2020 and "Archives" in f'{a.get_text().strip()}':
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages(page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
True):
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
elements.append(element)
#selection des "spécials"
for div in soup.select("div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w"):
for a in div.select("div.fr-card__body div.fr-card__content h2.fr-card__title a"):
print(a)
search_pattern=re.search('(?<=Publié le).*',f'{a.parent.parent.get_text()}')
if search_pattern:
if Attrap.guess_date(search_pattern[0], '([0-9]{4}).*').year>=self.not_before.year:
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages(page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
True):
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
elements.append(element)
# ocrmypdf bug on my Ubuntu 20.04 (will test on Arch soon)
# with --invalidate-digital-signatures, even though it is documented
# here: https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
# parse
print(soup.find_all("a",{"id":'class="fr-link'}))
print(len(soup.find_all("a",{"id":'class="fr-link'})))
for a in soup.find_all("a",{"id":'class="fr-link'}):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
self.download_file(raa)
elements.append(raa)
print(elements)
return elements
import os
import datetime
from bs4 import BeautifulSoup
from urllib.parse import unquote
from Attrap import Attrap
class Attrap_pref47(Attrap):
# Config
__HOST = 'https://www.lot-et-garonne.gouv.fr'
__RAA_PAGE = f'{__HOST}/Publications/Publications-legales/RAA'
__USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0'
full_name = 'Préfecture du Lot-et-Garonne'
short_code = 'pref47'
def __init__(self, data_dir):
super().__init__(data_dir, self.__USER_AGENT)
self.enable_tor(10)
self.not_before = datetime.datetime(2024, 1, 1)  # set to 2020 and uncomment lines 86 to 91 to download earlier years
# Overridden to handle CSS classes that contain whitespace
def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
pages = []
page_content = self.get_page(page, 'get').content
# On initialise le parser
soup = BeautifulSoup(page_content, 'html.parser')
# On recherche les sous-pages
sub_pages = soup.select(sub_page_element)
sub_pages_details = None
if details_element is not None:
sub_pages_details = soup.select(details_element)
i = 0
for sub_page in sub_pages:
print(sub_page)
if sub_page.get('href'):
page = {
'url': f"{host}{sub_page['href']}",
'name': sub_page.get_text().strip(),
'details': ''
}
if details_element is not None:
page['details'] = sub_pages_details[i].get_text().strip()
pages.append(page)
i = i + 1
# Look for a pager and, if one is found, follow it.
# Changed here: the pager_element parameter must contain only the
# class string, not the element selector, so that classes
# containing whitespace can be matched.
pager = soup.find_all("a",class_=pager_element)
print(pager)
if pager and len(pager)>0 and pager[0].get('href'):
for sub_page in self.get_sub_pages_with_pager(
f"{host}{pager[0]['href']}",
sub_page_element,
pager_element,
details_element,
host
):
pages.append(sub_page)
return pages
def get_raa(self, keywords):
elements = []
page_content = self.get_page(self.__RAA_PAGE, 'get').content
soup = BeautifulSoup(page_content, 'html.parser')
print(f"not_before={self.not_before.year}")
for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
# Years without a pager
if Attrap.guess_date(f'{self.__HOST}{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year:
page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
for sub_page in self.get_sub_pages(page_content,
'div.fr-card__body div.fr-card__content h2.fr-card__title a',
self.__HOST,
True):
sub_page_content = self.get_page(sub_page['url'], 'get').content
for element in self.get_raa_elements(sub_page_content):
elements.append(element)
# RAAs from 2019 and earlier have pagers
#else:
# page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
# for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
# sub_page_content = self.get_page(sub_page['url'], 'get').content
# for element in self.get_raa_elements(sub_page_content):
# elements.append(element)
for raa in elements:
print(f"downloading {raa}")
self.download_file(raa, overwrite=False)
# ocrmypdf bug on my Ubuntu 20.04 (will test on Arch soon)
# with --invalidate-digital-signatures, even though it is documented
# here: https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
self.parse_raa(elements, keywords)
self.mailer()
def get_raa_elements(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
# parse
print(soup.find_all("a",{"id":'class="fr-link'}))
print(len(soup.find_all("a",{"id":'class="fr-link'})))
for a in soup.find_all("a",{"id":'class="fr-link'}):
if a.get('href') and a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = f"{self.__HOST}{a['href']}"
else:
url = a['href']
url = unquote(url)
name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
raa = Attrap.RAA(url, date, name)
elements.append(raa)
print(elements)
return elements
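The four get_raa_elements() implementations all recover the document name and date from the download link's label, whose visible text ends in a span such as " - PDF - 05/04/2024". A standalone illustration on synthetic markup follows; the odd id value mirrors the attribute the scrapers match on and is not guaranteed by the real pages.

import datetime
from bs4 import BeautifulSoup

html = (
    '<a id=\'class="fr-link\' href="/doc/raa-special-12.pdf">'
    'Télécharger RAA spécial n°12<span> - PDF - 05/04/2024</span></a>'
)
soup = BeautifulSoup(html, 'html.parser')
a = soup.find_all('a', {'id': 'class="fr-link'})[0]  # same lookup as get_raa_elements()

name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
print(name, date)  # RAA spécial n°12 2024-04-05 00:00:00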
-make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref93 pref976
+make: ppparis pref04 pref05 pref06 pref09 pref13 pref31 pref33 pref34 pref35 pref38 pref42 pref44 pref47 pref59 pref62 pref63 pref64 pref65 pref66 pref69 pref80 pref81 pref83 pref87 pref93 pref976
 ppparis:
 	bin/python3 cli.py ppparis
 pref04:
@@ -9,6 +9,8 @@ pref06:
 	bin/python3 cli.py pref06
 pref09:
 	bin/python3 cli.py pref09
+pref12:
+	bin/python3 cli.py pref12
 pref13:
 	bin/python3 cli.py pref13
 pref31:
@@ -21,10 +23,16 @@ pref35:
 	bin/python3 cli.py pref35
 pref38:
 	bin/python3 cli.py pref38
+pref40:
+	bin/python3 cli.py pref40
 pref42:
 	bin/python3 cli.py pref42
 pref44:
 	bin/python3 cli.py pref44
+pref46:
+	bin/python3 cli.py pref46
+pref47:
+	bin/python3 cli.py pref47
 pref59:
 	bin/python3 cli.py pref59
 pref62:
......
@@ -42,14 +42,18 @@ available_administrations = [
     'pref05',
     'pref06',
     'pref09',
+    'pref12',
     'pref13',
     'pref31',
     'pref33',
     'pref34',
     'pref35',
     'pref38',
+    'pref40',
     'pref42',
     'pref44',
+    'pref46',
+    'pref47',
     'pref59',
     'pref62',
     'pref63',
......