Commit dc4f709e authored by kr1p

update with print and download at the end, still testing

parent 02697af5
Branch: main
@@ -392,7 +392,7 @@ class Attrap:
         if page.status_code == 429:
             logger.warning('Erreur 429 Too Many Requests reçue, temporisation...')
             self.tor_get_new_id()
-            time.sleep(55)
+            time.sleep(60)
             return self.get_page(url, method, data)
         if self.tor_enabled:
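This change only stretches the fixed back-off from 55 to 60 seconds. An alternative, not part of this commit, would be to honour the Retry-After header that servers often send with a 429; the sketch below assumes that header is present and numeric (the helper name and the 60-second fallback are invented):

import time

def backoff_seconds(response, default=60):
    # Use the server's Retry-After header when present; otherwise fall
    # back to a fixed delay like the one hard-coded above.
    retry_after = response.headers.get('Retry-After')
    if retry_after is not None:
        try:
            return max(int(retry_after), 1)
        except ValueError:
            pass  # Retry-After can also be an HTTP date; ignored in this sketch
    return default

Calling time.sleep(backoff_seconds(page)) would then replace the hard-coded time.sleep(60).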
@@ -415,16 +415,31 @@ class Attrap:
         self.user_agent = user_agent
         self.session.headers.update({'User-Agent': self.user_agent})
 
-    def download_file(self, raa):
+    def download_file(self, raa, overwrite=True):
         try:
             os.makedirs(
                 os.path.dirname(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'),
                 exist_ok=True
             )
-            file = self.get_page(raa.url, 'get')
-            f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
-            f.write(file.content)
-            f.close()
+            # content-length of 114 was wrong on a test file
+            # response = requests.head(raa.url)
+            # size = response.headers.get('content-length')  # length 9549465
+            # if size is not None:
+            #     print(size)
+            if overwrite and os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'):
+                print(f'file already present on disk, overwriting {self.data_dir}/raa/{raa.get_sha256()}.pdf for {raa}')
+                # if size != os.path.getsize(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf')
+                file = self.get_page(raa.url, 'get')
+                f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
+                f.write(file.content)
+                f.close()
+            elif not os.path.isfile(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf'):
+                print(f'file not present on disk, downloading {self.data_dir}/raa/{raa.get_sha256()}.pdf for {raa}')
+                file = self.get_page(raa.url, 'get')
+                f = open(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf', 'wb')
+                f.write(file.content)
+                f.close()
         except (requests.exceptions.ConnectionError,
                 requests.exceptions.ChunkedEncodingError):
             logger.warning(f'ATTENTION: la connexion a été interrompue pendant le téléchargement de {raa.url}, nouvelle tentative...')
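The commented-out lines above sketch a freshness check comparing the advertised Content-Length against the file already on disk, and note that the header was wrong (114 bytes) on one test file. A minimal standalone version of that idea, assuming the server reports an accurate length; the helper name is hypothetical:

import os
import requests

def needs_download(url, path):
    # Re-download when the file is missing or its on-disk size differs
    # from the server's advertised Content-Length.
    if not os.path.isfile(path):
        return True
    response = requests.head(url, allow_redirects=True)
    size = response.headers.get('content-length')
    if size is None:
        return True  # no length advertised: re-download to be safe
    return int(size) != os.path.getsize(path)

As the comment in the diff warns, this only works when the header can be trusted, which was not the case on the test file.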
...
 import os
 import datetime
+import time
 from bs4 import BeautifulSoup
 from urllib.parse import unquote
 import re
 from Attrap import Attrap
 
-class Attrap_pref46(Attrap):
+class Attrap_pref12(Attrap):
     # Config
     __HOST = 'https://www.aveyron.gouv.fr'
@@ -19,15 +19,13 @@ class Attrap_pref46(Attrap):
     def __init__(self, data_dir):
         super().__init__(data_dir, self.__USER_AGENT)
         self.enable_tor(10)
-        self.not_before = datetime.datetime(2020, 1, 1)# before 2020 there is an "archives" page
+        self.not_before = datetime.datetime(2020, 1, 1)  # before 2020 there is an "archives" page
 
     # overridden to handle CSS classes containing whitespace
     def get_sub_pages_with_pager(self, page, sub_page_element, pager_element, details_element, host):
         pages = []
         page_content = self.get_page(page, 'get').content
         # Initialise the parser
         soup = BeautifulSoup(page_content, 'html.parser')
         # Look for the sub-pages
         sub_pages = soup.select(sub_page_element)
         sub_pages_details = None
@@ -46,7 +44,6 @@ class Attrap_pref46(Attrap):
                 page['details'] = sub_pages_details[i].get_text().strip()
             pages.append(page)
             i = i + 1
         # Look for a pager and, if one is found, follow it
         # changed here: the pager_element parameter must contain
         # only the contents of the class attribute
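Background for the two comments above: BeautifulSoup can match a tag whose class attribute contains several space-separated classes by passing the exact attribute string to class_, which is why the override expects pager_element to hold just the class contents rather than a CSS selector. A minimal illustration (the markup is invented for the example):

from bs4 import BeautifulSoup

# Invented markup mimicking the DSFR pager on prefecture sites.
html = ('<a class="fr-pagination__link fr-pagination__link--next '
        'fr-pagination__link--lg-label" href="?page=2">Suivant</a>')
soup = BeautifulSoup(html, 'html.parser')

# Passing the full space-separated string to class_ matches the exact
# class attribute value, whitespace included.
link = soup.find('a', class_='fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label')
print(link['href'])  # -> ?page=2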
@@ -62,36 +59,51 @@ class Attrap_pref46(Attrap):
                 host
             ):
                 pages.append(sub_page)
         return pages
 
     def get_raa(self, keywords):
         elements = []
         page_content = self.get_page(self.__RAA_PAGE, 'get').content
         soup = BeautifulSoup(page_content, 'html.parser')
         # select the grey cards
+        print(f"not_before={self.not_before.year}")
         for a in soup.select('div.fr-card--grey div.fr-card__body div.fr-card__content h2.fr-card__title a'):
-            #regular
-            if Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "Archives" not in f'{a.get_text().strip()}':
-                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
-                for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
-                    sub_page_content = self.get_page(sub_page['url'], 'get').content
-                    for element in self.get_raa_elements(sub_page_content):
-                        elements.append(element)
-            #archives
-            elif self.not_before.year<2021 and "Archives" in f'{a.get_text().strip()}':
+            # archives links
+            if self.not_before.year<2021 and "Archives" in f'{a.get_text().strip()}':
+                print(f"""########################\n
+                archives links\n
+                {a.get_text().strip()}
+                ###########################""")
                 page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
                 for sub_page in self.get_sub_pages(page_content,
                                                    'div.fr-card__body div.fr-card__content h2.fr-card__title a',
                                                    self.__HOST,
                                                    True):
+                    print(f"retrieving {sub_page['url']}")
                     sub_page_content = self.get_page(sub_page['url'], 'get').content
-                    for a in sub_page_content.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
+                    subsoup = BeautifulSoup(sub_page_content, 'html.parser')
+                    for a in subsoup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
                         sub_sub_page_content = self.get_page(a['href'], 'get').content
                         for element in self.get_raa_elements(sub_sub_page_content):
+                            print(f"appending {element}")
                             elements.append(element)
+            # regular links
+            if Attrap.guess_date(f'{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year and "Archives" not in f'{a.get_text().strip()}':
+                print(f"""########################\n
+                regular links\n
+                {a.get_text().strip()}
+                ###########################""")
+                page_content = self.get_page(f"{self.__HOST}{a['href']}", 'get').content
+                for sub_page in self.get_sub_pages_with_pager(f"{self.__HOST}{a['href']}", 'div.fr-card__body div.fr-card__content h2.fr-card__title a', 'fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label', None, self.__HOST):
+                    print(f"retrieving {sub_page['url']}")
+                    sub_page_content = self.get_page(sub_page['url'], 'get').content
+                    for element in self.get_raa_elements(sub_page_content):
+                        print(f"appending {element}")
+                        elements.append(element)
 
         # select the "specials"
         for div in soup.select("div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w"):
+            print("""########################\n
+            specials links\n
+            ###########################""")
             for a in div.select("div.fr-card__body div.fr-card__content h2.fr-card__title a"):
                 print(a)
                 search_pattern=re.search('(?<=Publié le).*',f'{a.parent.parent.get_text()}')
@@ -104,7 +116,12 @@ class Attrap_pref46(Attrap):
                                      True):
                 sub_page_content = self.get_page(sub_page['url'], 'get').content
                 for element in self.get_raa_elements(sub_page_content):
+                    print(f"appending {element}")
                     elements.append(element)
+        for raa in elements:
+            print(f"downloading {raa}")
+            self.download_file(raa, overwrite=False)
+            time.sleep(14)
 
         # ocrmypdf bug on my Ubuntu 20.04 (will test with Arch soon)
         # on --invalidate-digital-signatures even though it is in the doc
         # here https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
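The new download loop pauses a flat 14 seconds between files. A small variant, not in this commit, adds jitter so requests do not hit the host on a fixed cadence (the bounds are assumptions):

import random
import time

def polite_pause(base=14, jitter=6):
    # Sleep base +/- jitter seconds, i.e. 8 to 20 s with these defaults.
    time.sleep(base + random.uniform(-jitter, jitter))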
@@ -118,21 +135,19 @@ class Attrap_pref46(Attrap):
         # For each a tag, check whether it is a PDF and, if so,
         # parse it
-        print(soup.find_all("a",{"id":'class="fr-link'}))
-        print(len(soup.find_all("a",{"id":'class="fr-link'})))
+        #print(soup.find_all("a",{"id":'class="fr-link'}))
+        #print(len(soup.find_all("a",{"id":'class="fr-link'})))
         for a in soup.find_all("a",{"id":'class="fr-link'}):
             if a.get('href') and a['href'].endswith('.pdf'):
                 if a['href'].startswith('/'):
                     url = f"{self.__HOST}{a['href']}"
                 else:
                     url = a['href']
                 url = unquote(url)
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
                 raa = Attrap.RAA(url, date, name)
-                self.download_file(raa)
                 elements.append(raa)
         print(elements)
         return elements
+
+Attrap_pref12('test').get_raa('algorithmes')
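The module-level call added at the bottom runs on every import of the file. A guard, shown here with the same test arguments, would keep it as an ad-hoc test driver only:

if __name__ == '__main__':
    Attrap_pref12('test').get_raa('algorithmes')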
@@ -69,6 +69,7 @@ class Attrap_pref40(Attrap):
         elements = []
         page_content = self.get_page(self.__RAA_PAGE, 'get').content
         soup = BeautifulSoup(page_content, 'html.parser')
+        print(f"not_before={self.not_before.year}")
         for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
             # archived years
             if self.not_before.year<2016 and "antérieures" in a.get_text().strip():
@@ -87,7 +88,9 @@ class Attrap_pref40(Attrap):
                 sub_page_content = self.get_page(sub_page['url'], 'get').content
                 for element in self.get_raa_elements(sub_page_content):
                     elements.append(element)
+        for raa in elements:
+            print(f'downloading {raa}')
+            self.download_file(raa, overwrite=False)
 
         # ocrmypdf bug on my Ubuntu 20.04 (will test with Arch soon)
         # on --invalidate-digital-signatures even though it is in the doc
         # here https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
@@ -112,7 +115,6 @@ class Attrap_pref40(Attrap):
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
                 raa = Attrap.RAA(url, date, name)
-                self.download_file(raa)
                 elements.append(raa)
         print(elements)
         return elements
...
@@ -71,6 +71,7 @@ class Attrap_pref47(Attrap):
         elements = []
         page_content = self.get_page(self.__RAA_PAGE, 'get').content
         soup = BeautifulSoup(page_content, 'html.parser')
+        print(f"not_before={self.not_before.year}")
         for a in soup.select('div.fr-card__body div.fr-card__content h2.fr-card__title a'):
             # years without a pager
             if Attrap.guess_date(f'{self.__HOST}{a.get_text().strip()}', '([0-9]{4}).*').year >= self.not_before.year:
@@ -89,7 +90,9 @@ class Attrap_pref47(Attrap):
             # sub_page_content = self.get_page(sub_page['url'], 'get').content
             # for element in self.get_raa_elements(sub_page_content):
             #     elements.append(element)
+        for raa in elements:
+            print(f"downloading {raa}")
+            self.download_file(raa, overwrite=False)
 
         # ocrmypdf bug on my Ubuntu 20.04 (will test with Arch soon)
         # on --invalidate-digital-signatures even though it is in the doc
         # here https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
@@ -116,7 +119,6 @@ class Attrap_pref47(Attrap):
                 name = a.find('span').previous_sibling.replace('Télécharger ', '').strip()
                 date = datetime.datetime.strptime(a.find('span').get_text().split(' - ')[-1].strip(), '%d/%m/%Y')
                 raa = Attrap.RAA(url, date, name)
-                self.download_file(raa)
                 elements.append(raa)
         print(elements)
         return elements
...