Skip to content
GitLab
Explorer
Connexion
Navigation principale
Rechercher ou aller à…
Projet
Attrap
Gestion
Activité
Membres
Labels
Programmation
Tickets
Tableaux des tickets
Jalons
Wiki
Code
Requêtes de fusion
Dépôt
Branches
Validations
Étiquettes
Graphe du dépôt
Comparer les révisions
Compilation
Pipelines
Jobs
Planifications de pipeline
Artéfacts
Déploiement
Releases
Registre de paquets
Registre de conteneur
Opération
Modules Terraform
Aide
Aide
Support
Documentation de GitLab
Comparer les forfaits GitLab
Forum de la communauté
Contribuer à GitLab
Donner votre avis
Conditions générales et politique de confidentialité
Raccourcis clavier
?
Extraits de code
Groupes
Projets
Afficher davantage de fils d'Ariane
Joseki
Attrap
Validations
dc4f709e
Valider
dc4f709e
rédigé
Il y a 9 mois
par
kr1p
Parcourir les fichiers
Options
Téléchargements
Correctifs
Plain Diff
update with print and download at the end, still testing
parent
02697af5
Branches
main
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
Modifications
4
Masquer les modifications d'espaces
En ligne
Côte à côte
Affichage de
4 fichiers modifiés
Attrap.py
+21
-6
21 ajouts, 6 suppressions
Attrap.py
Attrap_pref12.py
+38
-23
38 ajouts, 23 suppressions
Attrap_pref12.py
Attrap_pref40.py
+4
-2
4 ajouts, 2 suppressions
Attrap_pref40.py
Attrap_pref47.py
+4
-2
4 ajouts, 2 suppressions
Attrap_pref47.py
avec
67 ajouts
et
33 suppressions
Attrap.py
+
21
−
6
Voir le fichier @
dc4f709e
...
@@ -392,7 +392,7 @@ class Attrap:
...
@@ -392,7 +392,7 @@ class Attrap:
if
page
.
status_code
==
429
:
if
page
.
status_code
==
429
:
logger
.
warning
(
'
Erreur 429 Too Many Requests reçue, temporisation...
'
)
logger
.
warning
(
'
Erreur 429 Too Many Requests reçue, temporisation...
'
)
self
.
tor_get_new_id
()
self
.
tor_get_new_id
()
time
.
sleep
(
55
)
time
.
sleep
(
60
)
return
self
.
get_page
(
url
,
method
,
data
)
return
self
.
get_page
(
url
,
method
,
data
)
if
self
.
tor_enabled
:
if
self
.
tor_enabled
:
...
@@ -415,16 +415,31 @@ class Attrap:
...
@@ -415,16 +415,31 @@ class Attrap:
self
.
user_agent
=
user_agent
self
.
user_agent
=
user_agent
self
.
session
.
headers
.
update
({
'
User-Agent
'
:
self
.
user_agent
})
self
.
session
.
headers
.
update
({
'
User-Agent
'
:
self
.
user_agent
})
def
download_file
(
self
,
raa
):
def
download_file
(
self
,
raa
,
overwrite
=
True
):
try
:
try
:
os
.
makedirs
(
os
.
makedirs
(
os
.
path
.
dirname
(
f
'
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf
'
),
os
.
path
.
dirname
(
f
'
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf
'
),
exist_ok
=
True
exist_ok
=
True
)
)
file
=
self
.
get_page
(
raa
.
url
,
'
get
'
)
# content-length 114 erronée sur test fichier
f
=
open
(
f
'
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf
'
,
'
wb
'
)
# response = requests.head(raa.url)
f
.
write
(
file
.
content
)
# size = response.headers.get('content-length')#length 9549465
f
.
close
()
# if size is not None:
# print(size)
if
overwrite
and
os
.
path
.
isfile
(
f
'
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf
'
):
print
(
f
'
file already present on disk, overwriting
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf for
{
raa
}
'
)
#if size!=os.path.getsize(f'{self.data_dir}/raa/{raa.get_sha256()}.pdf')
file
=
self
.
get_page
(
raa
.
url
,
'
get
'
)
f
=
open
(
f
'
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf
'
,
'
wb
'
)
f
.
write
(
file
.
content
)
f
.
close
()
elif
not
os
.
path
.
isfile
(
f
'
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf
'
):
print
(
f
'
file not present on disk, downloading
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf for
{
raa
}
'
)
file
=
self
.
get_page
(
raa
.
url
,
'
get
'
)
f
=
open
(
f
'
{
self
.
data_dir
}
/raa/
{
raa
.
get_sha256
()
}
.pdf
'
,
'
wb
'
)
f
.
write
(
file
.
content
)
f
.
close
()
except
(
requests
.
exceptions
.
ConnectionError
,
except
(
requests
.
exceptions
.
ConnectionError
,
requests
.
exceptions
.
ChunkedEncodingError
):
requests
.
exceptions
.
ChunkedEncodingError
):
logger
.
warning
(
f
'
ATTENTION: la connexion a été interrompue pendant le téléchargement de
{
raa
.
url
}
, nouvelle tentative...
'
)
logger
.
warning
(
f
'
ATTENTION: la connexion a été interrompue pendant le téléchargement de
{
raa
.
url
}
, nouvelle tentative...
'
)
...
...
Ce diff est replié.
Cliquez pour l'agrandir.
Attrap_pref12.py
+
38
−
23
Voir le fichier @
dc4f709e
import
os
import
os
import
datetime
import
datetime
import
time
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
urllib.parse
import
unquote
from
urllib.parse
import
unquote
import
re
import
re
from
Attrap
import
Attrap
from
Attrap
import
Attrap
class
Attrap_pref
46
(
Attrap
):
class
Attrap_pref
12
(
Attrap
):
# Config
# Config
__HOST
=
'
https://www.aveyron.gouv.fr
'
__HOST
=
'
https://www.aveyron.gouv.fr
'
...
@@ -19,15 +19,13 @@ class Attrap_pref46(Attrap):
...
@@ -19,15 +19,13 @@ class Attrap_pref46(Attrap):
def
__init__
(
self
,
data_dir
):
def
__init__
(
self
,
data_dir
):
super
().
__init__
(
data_dir
,
self
.
__USER_AGENT
)
super
().
__init__
(
data_dir
,
self
.
__USER_AGENT
)
self
.
enable_tor
(
10
)
self
.
enable_tor
(
10
)
self
.
not_before
=
datetime
.
datetime
(
2020
,
1
,
1
)
# avant 2020 page "archives"
self
.
not_before
=
datetime
.
datetime
(
2020
,
1
,
1
)
# avant 2020 page "archives"
#supercharge pour classes css avec whitespaces
#supercharge pour classes css avec whitespaces
def
get_sub_pages_with_pager
(
self
,
page
,
sub_page_element
,
pager_element
,
details_element
,
host
):
def
get_sub_pages_with_pager
(
self
,
page
,
sub_page_element
,
pager_element
,
details_element
,
host
):
pages
=
[]
pages
=
[]
page_content
=
self
.
get_page
(
page
,
'
get
'
).
content
page_content
=
self
.
get_page
(
page
,
'
get
'
).
content
# On initialise le parser
# On initialise le parser
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
# On recherche les sous-pages
# On recherche les sous-pages
sub_pages
=
soup
.
select
(
sub_page_element
)
sub_pages
=
soup
.
select
(
sub_page_element
)
sub_pages_details
=
None
sub_pages_details
=
None
...
@@ -46,7 +44,6 @@ class Attrap_pref46(Attrap):
...
@@ -46,7 +44,6 @@ class Attrap_pref46(Attrap):
page
[
'
details
'
]
=
sub_pages_details
[
i
].
get_text
().
strip
()
page
[
'
details
'
]
=
sub_pages_details
[
i
].
get_text
().
strip
()
pages
.
append
(
page
)
pages
.
append
(
page
)
i
=
i
+
1
i
=
i
+
1
# On recherche un pager, et si on le trouve on le suit
# On recherche un pager, et si on le trouve on le suit
# modif ici, le parametre pager_element ne doit contenir
# modif ici, le parametre pager_element ne doit contenir
# que le contenu de la classe
# que le contenu de la classe
...
@@ -62,36 +59,51 @@ class Attrap_pref46(Attrap):
...
@@ -62,36 +59,51 @@ class Attrap_pref46(Attrap):
host
host
):
):
pages
.
append
(
sub_page
)
pages
.
append
(
sub_page
)
return
pages
return
pages
def
get_raa
(
self
,
keywords
):
def
get_raa
(
self
,
keywords
):
elements
=
[]
elements
=
[]
page_content
=
self
.
get_page
(
self
.
__RAA_PAGE
,
'
get
'
).
content
page_content
=
self
.
get_page
(
self
.
__RAA_PAGE
,
'
get
'
).
content
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
#selection des grey cards
#selection des grey cards
print
(
f
"
not_before=
{
self
.
not_before
.
year
}
"
)
for
a
in
soup
.
select
(
'
div.fr-card--grey div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
for
a
in
soup
.
select
(
'
div.fr-card--grey div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
#regular
#archives links
if
Attrap
.
guess_date
(
f
'
{
a
.
get_text
().
strip
()
}
'
,
'
([0-9]{4}).*
'
).
year
>=
self
.
not_before
.
year
and
"
Archives
"
not
in
f
'
{
a
.
get_text
().
strip
()
}
'
:
if
self
.
not_before
.
year
<
2021
and
"
Archives
"
in
f
'
{
a
.
get_text
().
strip
()
}
'
:
page_content
=
self
.
get_page
(
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
,
'
get
'
).
content
print
(
f
"""
########################
\n
for
sub_page
in
self
.
get_sub_pages_with_pager
(
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
,
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
,
'
fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label
'
,
None
,
self
.
__HOST
):
archives links
\n
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
{
a
.
get_text
().
strip
()
}
for
element
in
self
.
get_raa_elements
(
sub_page_content
):
###########################
"""
)
elements
.
append
(
element
)
#archives
elif
self
.
not_before
.
year
<
2021
and
"
Archives
"
in
f
'
{
a
.
get_text
().
strip
()
}
'
:
page_content
=
self
.
get_page
(
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
,
'
get
'
).
content
page_content
=
self
.
get_page
(
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
,
'
get
'
).
content
for
sub_page
in
self
.
get_sub_pages
(
page_content
,
for
sub_page
in
self
.
get_sub_pages
(
page_content
,
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
,
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
,
self
.
__HOST
,
self
.
__HOST
,
True
):
True
):
print
(
f
"
retrieving
{
sub_page
[
'
url
'
]
}
"
)
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
for
a
in
sub_page_content
.
select
(
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
subsoup
=
BeautifulSoup
(
sub_page_content
,
'
html.parser
'
)
for
a
in
subsoup
.
select
(
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
sub_sub_page_content
=
self
.
get_page
(
a
[
'
url
'
],
'
get
'
).
content
sub_sub_page_content
=
self
.
get_page
(
a
[
'
url
'
],
'
get
'
).
content
for
element
in
self
.
get_raa_elements
(
sub_sub_page_content
):
for
element
in
self
.
get_raa_elements
(
sub_sub_page_content
):
print
(
f
"
appending
{
element
}
"
)
elements
.
append
(
element
)
elements
.
append
(
element
)
#regular links
if
Attrap
.
guess_date
(
f
'
{
a
.
get_text
().
strip
()
}
'
,
'
([0-9]{4}).*
'
).
year
>=
self
.
not_before
.
year
and
"
Archives
"
not
in
f
'
{
a
.
get_text
().
strip
()
}
'
:
print
(
f
"""
########################
\n
regular links
\n
{
a
.
get_text
().
strip
()
}
###########################
"""
)
page_content
=
self
.
get_page
(
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
,
'
get
'
).
content
for
sub_page
in
self
.
get_sub_pages_with_pager
(
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
,
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
,
'
fr-pagination__link fr-pagination__link--next fr-pagination__link--lg-label
'
,
None
,
self
.
__HOST
):
print
(
f
"
retrieving
{
sub_page
[
'
url
'
]
}
"
)
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
for
element
in
self
.
get_raa_elements
(
sub_page_content
):
print
(
f
"
appending
{
element
}
"
)
elements
.
append
(
element
)
#selection des "spécials"
#selection des "spécials"
for
div
in
soup
.
select
(
"
div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w
"
):
for
div
in
soup
.
select
(
"
div.fr-card.fr-card--horizontal.fr-card--sm.fr-enlarge-link.fr-mb-3w
"
):
print
(
"""
########################
\n
specials links
\n
###########################
"""
)
for
a
in
div
.
select
(
"
div.fr-card__body div.fr-card__content h2.fr-card__title a
"
):
for
a
in
div
.
select
(
"
div.fr-card__body div.fr-card__content h2.fr-card__title a
"
):
print
(
a
)
print
(
a
)
search_pattern
=
re
.
search
(
'
(?<=Publié le).*
'
,
f
'
{
a
.
parent
.
parent
.
get_text
()
}
'
)
search_pattern
=
re
.
search
(
'
(?<=Publié le).*
'
,
f
'
{
a
.
parent
.
parent
.
get_text
()
}
'
)
...
@@ -104,7 +116,12 @@ class Attrap_pref46(Attrap):
...
@@ -104,7 +116,12 @@ class Attrap_pref46(Attrap):
True
):
True
):
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
for
element
in
self
.
get_raa_elements
(
sub_page_content
):
for
element
in
self
.
get_raa_elements
(
sub_page_content
):
print
(
f
"
appending
{
element
}
"
)
elements
.
append
(
element
)
elements
.
append
(
element
)
for
raa
in
elements
:
print
(
f
"
downloading
{
raa
}
"
)
self
.
download_file
(
raa
,
overwrite
=
False
)
time
.
sleep
(
14
)
#bug sur ocrmypdf sur mon ubuntu 20.04 (test avec arch prochainement)
#bug sur ocrmypdf sur mon ubuntu 20.04 (test avec arch prochainement)
#sur --invalidate-digital-signatures bien que dans la doc
#sur --invalidate-digital-signatures bien que dans la doc
#ici https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
#ici https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
...
@@ -118,21 +135,19 @@ class Attrap_pref46(Attrap):
...
@@ -118,21 +135,19 @@ class Attrap_pref46(Attrap):
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le
# parse
# parse
print
(
soup
.
find_all
(
"
a
"
,{
"
id
"
:
'
class=
"
fr-link
'
}))
#
print(soup.find_all("a",{"id":'class="fr-link'}))
print
(
len
(
soup
.
find_all
(
"
a
"
,{
"
id
"
:
'
class=
"
fr-link
'
})))
#
print(len(soup.find_all("a",{"id":'class="fr-link'})))
for
a
in
soup
.
find_all
(
"
a
"
,{
"
id
"
:
'
class=
"
fr-link
'
}):
for
a
in
soup
.
find_all
(
"
a
"
,{
"
id
"
:
'
class=
"
fr-link
'
}):
if
a
.
get
(
'
href
'
)
and
a
[
'
href
'
].
endswith
(
'
.pdf
'
):
if
a
.
get
(
'
href
'
)
and
a
[
'
href
'
].
endswith
(
'
.pdf
'
):
if
a
[
'
href
'
].
startswith
(
'
/
'
):
if
a
[
'
href
'
].
startswith
(
'
/
'
):
url
=
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
url
=
f
"
{
self
.
__HOST
}{
a
[
'
href
'
]
}
"
else
:
else
:
url
=
a
[
'
href
'
]
url
=
a
[
'
href
'
]
url
=
unquote
(
url
)
url
=
unquote
(
url
)
name
=
a
.
find
(
'
span
'
).
previous_sibling
.
replace
(
'
Télécharger
'
,
''
).
strip
()
name
=
a
.
find
(
'
span
'
).
previous_sibling
.
replace
(
'
Télécharger
'
,
''
).
strip
()
date
=
datetime
.
datetime
.
strptime
(
a
.
find
(
'
span
'
).
get_text
().
split
(
'
-
'
)[
-
1
].
strip
(),
'
%d/%m/%Y
'
)
date
=
datetime
.
datetime
.
strptime
(
a
.
find
(
'
span
'
).
get_text
().
split
(
'
-
'
)[
-
1
].
strip
(),
'
%d/%m/%Y
'
)
raa
=
Attrap
.
RAA
(
url
,
date
,
name
)
raa
=
Attrap
.
RAA
(
url
,
date
,
name
)
self
.
download_file
(
raa
)
elements
.
append
(
raa
)
elements
.
append
(
raa
)
print
(
elements
)
print
(
elements
)
return
elements
return
elements
Attrap_pref12
(
'
test
'
).
get_raa
(
'
algorithmes
'
)
Ce diff est replié.
Cliquez pour l'agrandir.
Attrap_pref40.py
+
4
−
2
Voir le fichier @
dc4f709e
...
@@ -69,6 +69,7 @@ class Attrap_pref40(Attrap):
...
@@ -69,6 +69,7 @@ class Attrap_pref40(Attrap):
elements
=
[]
elements
=
[]
page_content
=
self
.
get_page
(
self
.
__RAA_PAGE
,
'
get
'
).
content
page_content
=
self
.
get_page
(
self
.
__RAA_PAGE
,
'
get
'
).
content
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
print
(
f
"
not_before=
{
self
.
not_before
.
year
}
"
)
for
a
in
soup
.
select
(
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
for
a
in
soup
.
select
(
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
# Annee archivees
# Annee archivees
if
self
.
not_before
.
year
<
2016
and
"
antérieures
"
in
a
.
get_text
().
strip
():
if
self
.
not_before
.
year
<
2016
and
"
antérieures
"
in
a
.
get_text
().
strip
():
...
@@ -87,7 +88,9 @@ class Attrap_pref40(Attrap):
...
@@ -87,7 +88,9 @@ class Attrap_pref40(Attrap):
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
sub_page_content
=
self
.
get_page
(
sub_page
[
'
url
'
],
'
get
'
).
content
for
element
in
self
.
get_raa_elements
(
sub_page_content
):
for
element
in
self
.
get_raa_elements
(
sub_page_content
):
elements
.
append
(
element
)
elements
.
append
(
element
)
for
raa
in
elements
:
print
(
f
'
downloading
{
raa
}
'
)
self
.
download_file
(
raa
,
overwriting
=
False
)
#bug sur ocrmypdf sur mon ubuntu 20.04 (test avec arch prochainement)
#bug sur ocrmypdf sur mon ubuntu 20.04 (test avec arch prochainement)
#sur --invalidate-digital-signatures bien que dans la doc
#sur --invalidate-digital-signatures bien que dans la doc
#ici https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
#ici https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
...
@@ -112,7 +115,6 @@ class Attrap_pref40(Attrap):
...
@@ -112,7 +115,6 @@ class Attrap_pref40(Attrap):
name
=
a
.
find
(
'
span
'
).
previous_sibling
.
replace
(
'
Télécharger
'
,
''
).
strip
()
name
=
a
.
find
(
'
span
'
).
previous_sibling
.
replace
(
'
Télécharger
'
,
''
).
strip
()
date
=
datetime
.
datetime
.
strptime
(
a
.
find
(
'
span
'
).
get_text
().
split
(
'
-
'
)[
-
1
].
strip
(),
'
%d/%m/%Y
'
)
date
=
datetime
.
datetime
.
strptime
(
a
.
find
(
'
span
'
).
get_text
().
split
(
'
-
'
)[
-
1
].
strip
(),
'
%d/%m/%Y
'
)
raa
=
Attrap
.
RAA
(
url
,
date
,
name
)
raa
=
Attrap
.
RAA
(
url
,
date
,
name
)
self
.
download_file
(
raa
)
elements
.
append
(
raa
)
elements
.
append
(
raa
)
print
(
elements
)
print
(
elements
)
return
elements
return
elements
...
...
Ce diff est replié.
Cliquez pour l'agrandir.
Attrap_pref47.py
+
4
−
2
Voir le fichier @
dc4f709e
...
@@ -71,6 +71,7 @@ class Attrap_pref47(Attrap):
...
@@ -71,6 +71,7 @@ class Attrap_pref47(Attrap):
elements
=
[]
elements
=
[]
page_content
=
self
.
get_page
(
self
.
__RAA_PAGE
,
'
get
'
).
content
page_content
=
self
.
get_page
(
self
.
__RAA_PAGE
,
'
get
'
).
content
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
soup
=
BeautifulSoup
(
page_content
,
'
html.parser
'
)
print
(
f
"
not_before=
{
self
.
not_before
.
year
}
"
)
for
a
in
soup
.
select
(
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
for
a
in
soup
.
select
(
'
div.fr-card__body div.fr-card__content h2.fr-card__title a
'
):
# Annees sans pager
# Annees sans pager
if
Attrap
.
guess_date
(
f
'
{
self
.
__HOST
}{
a
.
get_text
().
strip
()
}
'
,
'
([0-9]{4}).*
'
).
year
>=
self
.
not_before
.
year
:
if
Attrap
.
guess_date
(
f
'
{
self
.
__HOST
}{
a
.
get_text
().
strip
()
}
'
,
'
([0-9]{4}).*
'
).
year
>=
self
.
not_before
.
year
:
...
@@ -89,7 +90,9 @@ class Attrap_pref47(Attrap):
...
@@ -89,7 +90,9 @@ class Attrap_pref47(Attrap):
# sub_page_content = self.get_page(sub_page['url'], 'get').content
# sub_page_content = self.get_page(sub_page['url'], 'get').content
# for element in self.get_raa_elements(sub_page_content):
# for element in self.get_raa_elements(sub_page_content):
# elements.append(element)
# elements.append(element)
for
raa
in
elements
:
print
(
f
"
downloading
{
raa
}
"
)
self
.
download_file
(
raa
,
overwriting
=
False
)
#bug sur ocrmypdf sur mon ubuntu 20.04 (test avec arch prochainement)
#bug sur ocrmypdf sur mon ubuntu 20.04 (test avec arch prochainement)
#sur --invalidate-digital-signatures bien que dans la doc
#sur --invalidate-digital-signatures bien que dans la doc
#ici https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
#ici https://ocrmypdf.readthedocs.io/en/latest/pdfsecurity.html
...
@@ -116,7 +119,6 @@ class Attrap_pref47(Attrap):
...
@@ -116,7 +119,6 @@ class Attrap_pref47(Attrap):
name
=
a
.
find
(
'
span
'
).
previous_sibling
.
replace
(
'
Télécharger
'
,
''
).
strip
()
name
=
a
.
find
(
'
span
'
).
previous_sibling
.
replace
(
'
Télécharger
'
,
''
).
strip
()
date
=
datetime
.
datetime
.
strptime
(
a
.
find
(
'
span
'
).
get_text
().
split
(
'
-
'
)[
-
1
].
strip
(),
'
%d/%m/%Y
'
)
date
=
datetime
.
datetime
.
strptime
(
a
.
find
(
'
span
'
).
get_text
().
split
(
'
-
'
)[
-
1
].
strip
(),
'
%d/%m/%Y
'
)
raa
=
Attrap
.
RAA
(
url
,
date
,
name
)
raa
=
Attrap
.
RAA
(
url
,
date
,
name
)
self
.
download_file
(
raa
)
elements
.
append
(
raa
)
elements
.
append
(
raa
)
print
(
elements
)
print
(
elements
)
return
elements
return
elements
...
...
Ce diff est replié.
Cliquez pour l'agrandir.
Aperçu
0%
Chargement en cours
Veuillez réessayer
ou
joindre un nouveau fichier
.
Annuler
You are about to add
0
people
to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Enregistrer le commentaire
Annuler
Veuillez vous
inscrire
ou vous
se connecter
pour commenter