Newer
Older
from bs4 import BeautifulSoup
from urllib.parse import unquote
from RAAspotter import RAAspotter
class RAAspotter_ppparis(RAAspotter):
def get_raa(self, page_content):
elements = []
# On charge le parser
soup = BeautifulSoup(page_content, 'html.parser')
# Pour chaque balise a, on regarde si c'est un PDF, et si oui on le parse
for a in soup.find_all('a', href=True):
if a['href'].endswith('.pdf'):
if a['href'].startswith('/'):
url = 'https://www.prefecturedepolice.interieur.gouv.fr'+a['href']
else:
url = a['href']
name = a.find('span').get_text()
date = a.find('div', class_="field--type-datetime").get_text()
filename = unquote(url.split('/')[-1])
raa = RAAspotter.RAA(url, date, name, filename)
elements.append(raa)
return elements