Commit ed8a5782 authored by jaster

Add new importer for nosdeputes/nossenateurs representatives

Add a new importer to download representatives from nosdeputes.fr and
nossenateurs.fr.

This importer is strongly based on the code from
`https://github.com/LaboratoireCitoyen/FranceData`, which serves as the source
of the FranceData importer, with the difference that it also scrapes the data
from the previous legislative cycles.
parent f19b8b1a
Pipeline #1160 passed in 17 minutes and 47 seconds
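For reference, the new bin/utils/nosdeputes_importer script added below writes a single gzipped JSON array with one object per representative, which the deploy pipeline then pipes into the existing francedata_import_representatives command. The following is a minimal sketch of reading that export, assuming the nosdeputes.json.gz file name used in the pipeline; chambre and photo_url are the fields the spider sets itself.

#!/usr/bin/env python2
# Sketch only: inspect the gzipped JSON array produced by the importer.
import gzip
import json

# File name taken from the download pipeline below; adjust as needed.
with gzip.open('nosdeputes.json.gz', 'rb') as f:
    reps = json.load(f)

for rep in reps:
    # 'chambre' ('AN' or 'SEN') and 'photo_url' are added by the spider.
    print rep['chambre'], rep['photo_url']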
@@ -16,6 +16,15 @@ function francedata_download_pipe() {
    [ -n "$CLEAN" ] && rm -rf $1
}
function nosdeputes_download_pipe() {
    [ -n "$CLEAN" ] && rm -rf $1
    # Scrape only when the archive is not already present on disk.
    [ -f "$1" ] || bin/utils/nosdeputes_importer $3 $1 || exit 1
    export DJANGO_SETTINGS_MODULE=memopol.settings
    # Feed the decompressed JSON to the import command ($2).
    gunzip -c ${OPENSHIFT__DIR}$1 | $2
    [ -n "$CLEAN" ] && rm -rf $1
}
function refresh_scores() {
    export DJANGO_SETTINGS_MODULE=memopol.settings
    memopol refresh_scores
@@ -8,3 +8,7 @@ parltrack_download_pipe ep_meps_current.json.xz parltrack_import_representatives
sleep 10
francedata_download_pipe parlementaires.json.gz francedata_import_representatives
sleep 10
nosdeputes_download_pipe nosdeputes.json.gz francedata_import_representatives parl
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
# Strongly based on https://github.com/LaboratoireCitoyen/FranceData
import gzip, json, os, sys, re
from urllib import quote
import scrapy
from scrapy.dupefilters import RFPDupeFilter
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.utils.serialize import ScrapyJSONEncoder
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.spiders import CrawlSpider
def get_photo_url(rep_url, kind):
    # Build the photo URL from a representative's profile URL by inserting
    # '<kind>/photo' before the slug, e.g. .../depute/photo/<slug>.
    res = rep_url.rsplit('/', 1)
    res.insert(-1, kind + '/photo')
    return '/'.join(res)
class URLScreenFilter(RFPDupeFilter):
    # Deduplicate requests on their exact URL; requests tagged with a
    # '#nodedupe' fragment are always let through.
    urls = set()

    def request_seen(self, request):
        if not request.url.endswith('#nodedupe') and request.url in self.urls:
            return True
        else:
            self.urls.add(request.url)
            return False
class FrancedataPipeline(object):
    # Stream scraped representatives into a gzipped JSON array, skipping
    # items whose URL has already been written.
    has_items = False
    urls = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls(crawler.settings.get('OUTPUT_FILE'))
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def __init__(self, outfile):
        self.json = gzip.open(outfile, 'wb')

    def spider_opened(self, spider):
        self.json.write('[')
        try:
            spider.set_pipeline(self)
        except:
            # Not every spider exposes set_pipeline(); ignore those that don't.
            pass

    def process_item(self, item, spider):
        if 'url' in item:
            if item['url'] in self.urls:
                raise DropItem()
            else:
                self.urls.add(item['url'])
        if self.has_items:
            self.json.write(',\n')
        json.dump(item, self.json, cls=ScrapyJSONEncoder)
        self.has_items = True
        return item

    def spider_closed(self, spider):
        self.json.write(']')
        self.json.close()
class ParlSpider(CrawlSpider):
    name = "parlspider"

    geocode_url = 'http://api-adresse.data.gouv.fr/search/?q=%s'
    nd_photo_url = 'http://www.nosdeputes.fr/depute/photo/%s'
    ns_photo_url = 'http://www.nossenateurs.fr/senateur/photo/%s'

    # The current sites and the archived 2007-2012 / 2012-2017 sites are all
    # crawled, so representatives from previous legislatures are imported too.
    allowed_domains = [
        "2007-2012.nosdeputes.fr",
        "2007-2012.nossenateurs.fr",
        "2012-2017.nosdeputes.fr",
        "2012-2017.nossenateurs.fr",
        "www.nosdeputes.fr",
        "www.nossenateurs.fr",
        "api-adresse.data.gouv.fr"
    ]
    start_urls = [
        "http://2007-2012.nosdeputes.fr/deputes/json",
        "http://2007-2012.nossenateurs.fr/senateurs/json",
        "http://2012-2017.nosdeputes.fr/deputes/json",
        "http://2012-2017.nossenateurs.fr/senateurs/json",
        "http://www.nosdeputes.fr/deputes/json",
        "http://www.nossenateurs.fr/senateurs/json"
    ]

    def parse(self, response):
        # The listing endpoints return {'deputes': [...]} or {'senateurs': [...]};
        # follow each member's detail API URL.
        reps = json.loads(response.body_as_unicode())
        if 'deputes' in reps:
            reps = reps['deputes']
        elif 'senateurs' in reps:
            reps = reps['senateurs']
        for rep in reps:
            if 'depute' in rep:
                rep = rep['depute']
                yield Request(url=rep['url_nosdeputes_api'],
                              callback=self.parse_parlementaire)
            elif 'senateur' in rep:
                rep = rep['senateur']
                yield Request(url=rep['url_nossenateurs_api'],
                              callback=self.parse_parlementaire)
    def parse_parlementaire(self, response):
        rep = json.loads(response.body_as_unicode())
        if 'depute' in rep:
            rep = rep['depute']
            rep['chambre'] = 'AN'
            rep['photo_url'] = get_photo_url(rep['url_nosdeputes'],
                                             'depute')
        elif 'senateur' in rep:
            rep = rep['senateur']
            rep['chambre'] = 'SEN'
            rep['photo_url'] = get_photo_url(rep['url_nossenateurs'],
                                             'senateur')

        # Pull phone/fax numbers out of the free-text addresses and queue a
        # geocoding request for each constituency address (the chambers' own
        # addresses are skipped).
        reqs = []
        for ad in rep['adresses']:
            adresse = ad['adresse']
            pattern = ur'Télé(phone|copie)\s*:\s*(\d[0-9 ]+\d)'
            for telm in re.finditer(pattern, adresse):
                if telm.group(1) == 'phone':
                    ad['tel'] = telm.group(2)
                else:
                    ad['fax'] = telm.group(2)
            lad = adresse.lower()
            if (not lad.startswith(u'assemblée nationale') and
                    not lad.startswith(u'sénat')):
                trimmed = re.sub(pattern, '', adresse)
                req = Request(url=self.get_geocode_url(trimmed),
                              callback=self.parse_geocode)
                req.meta['rep'] = rep
                req.meta['adresse'] = ad
                reqs.append(req)

        # Geocoding requests are chained; the representative is only yielded
        # once the last one has been handled (see parse_geocode).
        if len(reqs) > 0:
            req = reqs.pop()
            req.meta['requests'] = reqs
            yield req
        else:
            yield rep
    def get_geocode_url(self, q):
        return self.geocode_url % quote(q.encode('utf-8'))

    def parse_geocode(self, response):
        rep = response.meta['rep']
        adresse = response.meta['adresse']
        reqs = response.meta['requests']
        geo = json.loads(response.body_as_unicode())
        if 'features' in geo and len(geo['features']) > 0:
            # Keep the best geocoding match for this address.
            adresse['geo'] = geo['features'][0]
        if len(reqs) > 0:
            req = reqs.pop()
            req.meta['requests'] = reqs
            yield req
        else:
            yield rep
commands = {
    'parl': {'class': ParlSpider, 'output': 'parlementaires.json.gz'},
}


def crawl(spider, outfile, **spargs):
    # Crawl into a temporary file and move it into place only once the crawl
    # has finished, so a failed run never clobbers a previous export.
    tmpfile = '%s.tmp' % outfile
    if os.path.exists(tmpfile):
        os.remove(tmpfile)
    process = CrawlerProcess({
        'BOT_NAME': 'francedata',
        'LOG_LEVEL': 'INFO',
        'TELNETCONSOLE_ENABLED': False,
        'OUTPUT_FILE': tmpfile,
        'DUPEFILTER_CLASS': '__main__.URLScreenFilter',
        'ITEM_PIPELINES': {
            '__main__.FrancedataPipeline': 500
        },
    })
    process.crawl(spider, **spargs)
    process.start()
    if os.path.exists(outfile):
        os.remove(outfile)
    os.rename(tmpfile, outfile)
def update():
    if len(sys.argv) < 2:
        print 'Usage: %s command [outfile]' % sys.argv[0]
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd not in commands:
        print 'Unknown command "%s"' % cmd
        sys.exit(1)
    # Look the command up before building the default output path, which
    # depends on the command's 'output' entry.
    command = commands[cmd]
    if len(sys.argv) > 2:
        outfile = sys.argv[2]
    else:
        outfile = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                               command['output'])
    crawl(command['class'], outfile)


if __name__ == "__main__":
    update()
@@ -35,6 +35,7 @@ setup(name='memopol',
'Whoosh==2.7.4',
'alabaster==0.7.10',
'django-nested-admin==3.0.17',
'Scrapy==1.4.0',
],
extras_require={
# Full version hardcode for testing dependencies so that