diff --git a/bin/lib.sh b/bin/lib.sh
index af0cfbffea109f01801e04925e63a3712399e062..41c128520bd8b37c282634cd452df85f686a2a90 100644
--- a/bin/lib.sh
+++ b/bin/lib.sh
@@ -16,6 +16,15 @@ function francedata_download_pipe() {
     [ -n "$CLEAN" ] && rm -rf $1
 }
 
+function nosdeputes_download_pipe() {
+    [ -n "$CLEAN" ] && rm -rf $1
+    [ -f "$1" ] || bin/utils/nosdeputes_importer $3 $1 || exit 1
+
+    export DJANGO_SETTINGS_MODULE=memopol.settings
+    gunzip -c ${OPENSHIFT_DATA_DIR}$1 | $2
+    [ -n "$CLEAN" ] && rm -rf $1
+}
+
 function refresh_scores() {
     export DJANGO_SETTINGS_MODULE=memopol.settings
     memopol refresh_scores
diff --git a/bin/update_representatives b/bin/update_representatives
index 5955f3f510cba33c735d9680b2be405553313bae..a2ef36a49ed93ab0c6d0d9627148ed905ddbb8b5 100755
--- a/bin/update_representatives
+++ b/bin/update_representatives
@@ -8,3 +8,7 @@
 parltrack_download_pipe ep_meps_current.json.xz parltrack_import_representatives
 sleep 10
 francedata_download_pipe parlementaires.json.gz francedata_import_representatives
+
+sleep 10
+
+nosdeputes_download_pipe nosdeputes.json.gz francedata_import_representatives parl
diff --git a/bin/utils/nosdeputes_importer b/bin/utils/nosdeputes_importer
new file mode 100755
index 0000000000000000000000000000000000000000..bc5011e629a19ce0fd4a9ed9e268afe57cbbc1db
--- /dev/null
+++ b/bin/utils/nosdeputes_importer
@@ -0,0 +1,228 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+#
+# Strongly based on https://github.com/LaboratoireCitoyen/FranceData
+
+import gzip, json, os, sys, re
+from urllib import quote
+
+import scrapy
+from scrapy.dupefilters import RFPDupeFilter
+from scrapy import signals
+from scrapy.exceptions import DropItem
+from scrapy.utils.serialize import ScrapyJSONEncoder
+from scrapy.crawler import CrawlerProcess
+from scrapy import Request
+from scrapy.spiders import CrawlSpider
+
+def get_photo_url(rep_url, kind):
+    res = rep_url.rsplit('/', 1)
+    res.insert(-1, kind + '/photo')
+    return '/'.join(res)
+
+class URLScreenFilter(RFPDupeFilter):
+    urls = set()
+
+    def request_seen(self, request):
+        if not request.url.endswith('#nodedupe') and request.url in self.urls:
+            return True
+        else:
+            self.urls.add(request.url)
+            return False
+
+class FrancedataPipeline(object):
+    has_items = False
+    urls = set()
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        pipeline = cls(crawler.settings.get('OUTPUT_FILE'))
+        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
+        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
+        return pipeline
+
+    def __init__(self, outfile):
+        self.json = gzip.open(outfile, 'wb')
+
+    def spider_opened(self, spider):
+        self.json.write('[')
+
+        try:
+            spider.set_pipeline(self)
+        except:
+            pass
+
+    def process_item(self, item, spider):
+        if 'url' in item:
+            if item['url'] in self.urls:
+                raise DropItem()
+            else:
+                self.urls.add(item['url'])
+
+        if self.has_items:
+            self.json.write(',\n')
+
+        json.dump(item, self.json, cls=ScrapyJSONEncoder)
+        self.has_items = True
+
+        return item
+
+    def spider_closed(self, spider):
+        self.json.write(']')
+        self.json.close()
+
+class ParlSpider(CrawlSpider):
+    name = "parlspider"
+    geocode_url = 'http://api-adresse.data.gouv.fr/search/?q=%s'
+    nd_photo_url = 'http://www.nosdeputes.fr/depute/photo/%s'
+    ns_photo_url = 'http://www.nossenateurs.fr/senateur/photo/%s'
+
+    allowed_domains = [
+        "2007-2012.nosdeputes.fr",
+        "2007-2012.nossenateurs.fr",
+        "2012-2017.nosdeputes.fr",
+        "2012-2017.nossenateurs.fr",
+        "www.nosdeputes.fr",
+        "www.nossenateurs.fr",
"api-adresse.data.gouv.fr" + ] + + start_urls = [ + "http://2007-2012.nosdeputes.fr/deputes/json", + "http://2007-2012.nossenateurs.fr/senateurs/json", + "http://2012-2017.nosdeputes.fr/deputes/json", + "http://2012-2017.nossenateurs.fr/senateurs/json", + "http://www.nosdeputes.fr/deputes/json", + "http://www.nossenateurs.fr/senateurs/json" + ] + + def parse(self, response): + reps = json.loads(response.body_as_unicode()) + + if 'deputes' in reps: + reps = reps['deputes'] + elif 'senateurs' in reps: + reps = reps['senateurs'] + + for rep in reps: + if 'depute' in rep: + rep = rep['depute'] + yield Request(url=rep['url_nosdeputes_api'], + callback=self.parse_parlementaire) + elif 'senateur' in rep: + rep = rep['senateur'] + yield Request(url=rep['url_nossenateurs_api'], + callback=self.parse_parlementaire) + + def parse_parlementaire(self, response): + rep = json.loads(response.body_as_unicode()) + if 'depute' in rep: + rep = rep['depute'] + rep['chambre'] = 'AN' + rep['photo_url'] = get_photo_url(rep['url_nosdeputes'], + 'depute') + elif 'senateur' in rep: + rep = rep['senateur'] + rep['chambre'] = 'SEN' + rep['photo_url'] = get_photo_url(rep['url_nossenateurs'], + 'senateur') + + reqs = [] + + for ad in rep['adresses']: + adresse = ad['adresse'] + + pattern = ur'Télé(phone|copie)\s*:\s*(\d[0-9 ]+\d)' + for telm in re.finditer(pattern, adresse): + if telm.group(1) == 'phone': + ad['tel'] = telm.group(2) + else: + ad['fax'] = telm.group(2) + + lad = adresse.lower() + if (not lad.startswith(u'assemblée nationale') and + not lad.startswith(u'sénat')): + trimmed = re.sub(pattern, '', adresse) + req = Request(url=self.get_geocode_url(trimmed), + callback=self.parse_geocode) + + req.meta['rep'] = rep + req.meta['adresse'] = ad + reqs.append(req) + + if len(reqs) > 0: + req = reqs.pop() + req.meta['requests'] = reqs + yield req + else: + yield rep + + def get_geocode_url(self, q): + return self.geocode_url % quote(q.encode('utf-8')) + + def parse_geocode(self, response): + rep = response.meta['rep'] + adresse = response.meta['adresse'] + reqs = response.meta['requests'] + + geo = json.loads(response.body_as_unicode()) + if 'features' in geo and len(geo['features']) > 0: + adresse['geo'] = geo['features'][0] + + if len(reqs) > 0: + req = reqs.pop() + req.meta['requests'] = reqs + yield req + else: + yield rep + + +commands = { + 'parl': { 'class': ParlSpider, 'output': 'parlementaires.json.gz' }, +} +def crawl(spider, outfile, **spargs): + tmpfile = ('%s.tmp' % outfile) + + if os.path.exists(tmpfile): + os.remove(tmpfile) + + process = CrawlerProcess({ + 'BOT_NAME': 'francedata', + 'LOG_LEVEL': 'INFO', + 'TELNETCONSOLE_ENABLED': False, + 'OUTPUT_FILE': tmpfile, + 'DUPEFILTER_CLASS': '__main__.URLScreenFilter', + 'ITEM_PIPELINES': { + '__main__.FrancedataPipeline': 500 + }, + }) + + process.crawl(spider, **spargs) + process.start() + + if os.path.exists(outfile): + os.remove(outfile) + + os.rename(tmpfile, outfile) + +def update(): + if len(sys.argv) < 2: + print 'Usage: %s command [outfile]' + sys.exit(1) + + cmd = sys.argv[1] + if len(sys.argv) > 2: + outfile = sys.argv[2] + else: + outfile = os.path.join(os.path.dirname(os.path.dirname(__file__)), + command['output']) + + if cmd not in commands: + print 'Unknown command "%s"' % cmd + sys.exit(1) + + command = commands[cmd] + crawl(command['class'], outfile) + +if __name__ == "__main__": + update() diff --git a/setup.py b/setup.py index d79bcc72a30e7b15f8b0df1476abdd4472b4b86f..1d737bde3f9e37b03c09c2ed44ff13df3b384096 100644 --- a/setup.py +++ 
@@ -35,6 +35,7 @@ setup(name='memopol',
         'Whoosh==2.7.4',
         'alabaster==0.7.10',
         'django-nested-admin==3.0.17',
+        'Scrapy==1.4.0',
     ],
     extras_require={
         # Full version hardcode for testing dependencies so that
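
Taken together, the new line in bin/update_representatives calls nosdeputes_download_pipe with the output file as $1, the import command as $2, and the importer sub-command as $3. A minimal sketch of what that expands to, assuming bin/lib.sh is sourced and OPENSHIFT_DATA_DIR (with its usual trailing slash) points at the directory the scrape lands in; names come from this diff, the step-by-step form is illustrative:

    # nosdeputes_download_pipe nosdeputes.json.gz francedata_import_representatives parl
    bin/utils/nosdeputes_importer parl nosdeputes.json.gz   # $3 $1: crawl nosdeputes.fr/nossenateurs.fr into gzipped JSON
    export DJANGO_SETTINGS_MODULE=memopol.settings
    gunzip -c "${OPENSHIFT_DATA_DIR}"nosdeputes.json.gz | francedata_import_representatives   # decompress and pipe into $2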