Skip to content
Extraits de code Groupes Projets

Comparer les révisions

Les modifications sont affichées comme si la révision source était fusionnée avec la révision cible. En savoir plus sur la comparaison des révisions.

Source

Sélectionner le projet source
Aucun résultat trouvé

Cible

Sélectionner le projet cible
  • la-quadrature-du-net/Attrap
  • foggyfrog/Attrap
  • skhwiz/Attrap
  • precambrien/Attrap
  • ketsapiwiq/Attrap
  • Joseki/Attrap
  • kr1p/attrap-pref-12
  • kr1p/attrap-pref-46
  • kr1p/attrap-pi
  • Guinness/Attrap
  • astroidgritty/attrap-pref-84
  • davinov/Attrap
  • maettellite/attrap-pref-01
  • m242/Attrap
  • multi/Attrap
  • mverdeil/Attrap
  • olpo/Attrap
17 résultats
Afficher les modifications
Validations sur la source (3)
...@@ -15,7 +15,7 @@ class Attrap_pref75(Attrap): ...@@ -15,7 +15,7 @@ class Attrap_pref75(Attrap):
# Config # Config
hostname = 'https://www.prefectures-regions.gouv.fr' hostname = 'https://www.prefectures-regions.gouv.fr'
raa_page = f'{hostname}/ile-de-france/tags/view/Ile-de-France/Documents+et+publications/Recueil+des+actes+administratifs' raa_page = f'{hostname}/ile-de-france/tags/view/Ile-de-France/Documents+et+publications/Recueil+des+actes+administratifs'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/115.0' user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0'
full_name = 'Préfecture de Paris' full_name = 'Préfecture de Paris'
short_code = 'pref75' short_code = 'pref75'
timezone = 'Europe/Paris' timezone = 'Europe/Paris'
...@@ -23,17 +23,19 @@ class Attrap_pref75(Attrap): ...@@ -23,17 +23,19 @@ class Attrap_pref75(Attrap):
def __init__(self, data_dir): def __init__(self, data_dir):
super().__init__(data_dir, self.user_agent) super().__init__(data_dir, self.user_agent)
self.enable_tor(10) self.enable_tor(10)
self.set_sleep_time(10)
def get_raa(self, keywords): def get_raa(self, keywords):
year_pages_to_parse = [] year_pages_to_parse = []
# On détermine quelles pages d'année parser # On détermine quelles pages d'année parser
page_content = self.get_page(self.raa_page, 'get').content page_content = self.get_session(self.raa_page, 'main', 6)
year_pages = self.get_sub_pages( year_pages = self.get_sub_pages(
page_content, page_content,
'article.news-list-item header h2.news-list-title a', 'article.news-list-item header h2.news-list-title a',
self.hostname, self.hostname,
False False,
selenium=True
) )
for year_page in year_pages: for year_page in year_pages:
year_date = Attrap.guess_date(year_page['name'].strip(), '(?:.*Paris.*)([0-9]{4})').replace(day=1, month=1) year_date = Attrap.guess_date(year_page['name'].strip(), '(?:.*Paris.*)([0-9]{4})').replace(day=1, month=1)
...@@ -42,13 +44,14 @@ class Attrap_pref75(Attrap): ...@@ -42,13 +44,14 @@ class Attrap_pref75(Attrap):
pages_to_parse = [] pages_to_parse = []
for year_page in year_pages_to_parse: for year_page in year_pages_to_parse:
page_content = self.get_page(year_page, 'get').content page_content = self.get_session(year_page, 'main', 6)
year = BeautifulSoup(page_content, 'html.parser').select('div.breadcrumb div.container p span.active')[0].get_text().split('-')[-1].strip() year = BeautifulSoup(page_content, 'html.parser').select('div.breadcrumb div.container p span.active')[0].get_text().split('-')[-1].strip()
month_pages = self.get_sub_pages( month_pages = self.get_sub_pages(
page_content, page_content,
'div.sommaire-bloc div.sommaire-content ol li a', 'div.sommaire-bloc div.sommaire-content ol li a',
self.hostname, self.hostname,
False False,
selenium=True
)[::-1] )[::-1]
for month_page in month_pages: for month_page in month_pages:
month_date = Attrap.guess_date(f"{month_page['name']} {year}", "(.*)").replace(day=1) month_date = Attrap.guess_date(f"{month_page['name']} {year}", "(.*)").replace(day=1)
...@@ -57,7 +60,7 @@ class Attrap_pref75(Attrap): ...@@ -57,7 +60,7 @@ class Attrap_pref75(Attrap):
elements = [] elements = []
for page in pages_to_parse[::-1]: for page in pages_to_parse[::-1]:
page_content = self.get_page(page, 'get').content page_content = self.get_session(page, 'main', 6)
for element in self.get_raa_elements(page_content): for element in self.get_raa_elements(page_content):
elements.append(element) elements.append(element)
......
...@@ -20,6 +20,7 @@ class Attrap_prefidf(Attrap): ...@@ -20,6 +20,7 @@ class Attrap_prefidf(Attrap):
def __init__(self, data_dir): def __init__(self, data_dir):
super().__init__(data_dir, self.user_agent) super().__init__(data_dir, self.user_agent)
self.enable_tor(10) self.enable_tor(10)
self.set_sleep_time(10)
def get_raa(self, keywords): def get_raa(self, keywords):
year_pages_to_parse = [] year_pages_to_parse = []
......
...@@ -20,6 +20,7 @@ class Attrap_prefpaca(Attrap): ...@@ -20,6 +20,7 @@ class Attrap_prefpaca(Attrap):
def __init__(self, data_dir): def __init__(self, data_dir):
super().__init__(data_dir, self.user_agent) super().__init__(data_dir, self.user_agent)
self.enable_tor(10) self.enable_tor(10)
self.set_sleep_time(10)
def get_raa(self, keywords): def get_raa(self, keywords):
# On récupère une session avec Selenium # On récupère une session avec Selenium
......