From bf1a24df32b625edd26269867d57854808364be3 Mon Sep 17 00:00:00 2001 From: Nicolas Joyard <joyard.nicolas@gmail.com> Date: Sat, 4 Jun 2016 11:49:29 +0200 Subject: [PATCH] Improve position import (dateless + wrong mep names) --- .../contrib/import_data.py | 39 +++++++++++++++++++ .../contrib/import_positions.py | 29 ++++++++++++-- 2 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 representatives_positions/contrib/import_data.py diff --git a/representatives_positions/contrib/import_data.py b/representatives_positions/contrib/import_data.py new file mode 100644 index 00000000..8fcd5411 --- /dev/null +++ b/representatives_positions/contrib/import_data.py @@ -0,0 +1,39 @@ +# coding: utf-8 +# flake8: noqa + +# This dict is used to find dates from URLs for dateless positions +position_dates = { + "http://www.dailymotion.com/video/x8pi7a_marielle-de-sarnez-l-europe-et-les_news#from=embed&start=1": "2009-03-18", + "http://tempsreel.nouvelobs.com/election-presidentielle-2012/20111215.OBS6873/interview-eva-joly-legalisons-le-partage-sur-internet.html": "2011-12-17", + "http://patricklehyaricpe.wordpress.com/2010/03/10/le-parlement-eurpeen-a-battu-swift-il-doit-battre-acta/": "2010-03-10", + "http://www.sanchezschmid.eu/uploads/PDF/lettre-MT2S-1Trim.pdf": "2010-01-01", + "http://www.tokia-saifi.eu/index.php?option=com_content&view=article&id=201:halte-a-la-contrefacon-et-au-piratage&catid=65:commerce-international&Itemid=96&lang=fr": "2010-01-01", + "http://www.tokia-saifi.eu/index.php?option=com_content&view=article&id=283%3Ale-piratage-en-ligne-reprime-&catid=34%3Aactualites&lang=fr": "2010-01-01", + "http://www.tokia-saifi.eu/index.php?option=com_content&view=article&id=296%3Alacta-un-bouclier-protecteur-pour-lindustrie-europeenne-&catid=34%3Aactualites&lang=fr": "2010-01-01", + 
"http://www.tokia-saifi.eu/index.php?option=com_content&view=article&id=230%3Aqlacta-un-accord-commercial-international-essentiel-pour-lutter-contre-la-contrefacon-a-lechelle-internationaleq-tokia-saifi-ump-ppe-f&catid=35%3Ainterventions-en-seance-pleniere&Itemid=59&lang=fr": "2010-01-01", + "http://www.marietjeschaake.eu/12/05/2011/une-%C2%AB-diplomatie-culturelle-%C2%BB-pour-promouvoir-les-valeurs-europeennes-2/?lang=fr": "2011-05-12", + "http://www.eurocitoyenne.fr/content/acta-un-rejet-de-cet-accord-par-le-parlement-europeen-est-possible-sil-y-une-mobilisation-co": "2010-01-01", + "http://www.eurocitoyenne.fr/content/arretons-la-piraterie-en-public": "2010-01-01", + "http://www.eurocitoyenne.fr/content/le-guide-juridique-la-protection-des-donnees-personnelles-sans-vie-privee-pas-de-liberte": "2010-01-01", +} + +# This dict maps full names from old memopol to [1st, last] names in new memopol +# Necessary because we sometimes have different accents/hyphenation/separation, or additional/missing name parts. 
+rep_names = { + u"Alexander Graf LAMBSDORFF": [u"Alexander Graf", u"Graf LAMBSDORFF"], + u"Carlos José ITURGAIZ ANGULO": [u"Carlos", u"ITURGAIZ"], + + u"Cristian Silviu BUŞOI": [u"Cristian-Silviu", u"BUŞOI"], + u"Eider GARDIAZÁBAL RUBIAL": [u"Eider", u"GARDIAZABAL RUBIAL"], + u"Filiz Hakaeva HYUSMENOVA": [u"Filiz", u"HYUSMENOVA"], + u"Glenis WILLMOTT": [u"Dame Glenis", u"WILLMOTT"], + u"Iliana Malinova IOTOVA": [u"Iliana", u"IOTOVA"], + u"Janusz Władysław ZEMKE": [u"Janusz", u"ZEMKE"], + u"Marielle de SARNEZ": [u"Marielle", u"de SARNEZ"], + u"Monica Luisa MACOVEI": [u"Monica", u"MACOVEI"], + u"Róża Gräfin von THUN UND HOHENSTEIN": [u"Róża Gräfin", u"von THUN UND HOHENSTEIN"], + u"Santiago FISAS AYXELA": [u"Santiago", u"FISAS AYXELÀ"], + u"Sophia in 't VELD": [u"Sophia", u"in 't VELD"], + u"Vasilica Viorica DĂNCILĂ": [u"Viorica", u"DĂNCILĂ"], + u"Wim van de CAMP": [u"Wim", u"van de CAMP"] +} diff --git a/representatives_positions/contrib/import_positions.py b/representatives_positions/contrib/import_positions.py index 95c1936b..ea233ea2 100644 --- a/representatives_positions/contrib/import_positions.py +++ b/representatives_positions/contrib/import_positions.py @@ -9,6 +9,7 @@ import re from representatives_positions.models import Position from representatives.models import Representative +from .import_data import position_dates, rep_names logger = logging.getLogger(__name__) @@ -21,6 +22,7 @@ class PositionImporter: key = '%s %s' % (first_name, last_name) rep = self.rep_cache.get(key, None) + # Find rep if rep is None: try: rep = Representative.objects.get(first_name=first_name, @@ -29,13 +31,31 @@ class PositionImporter: except Representative.DoesNotExist: rep = None + # Not found => try to use an alternate name + if rep is None: + newname = rep_names.get(key, None) + if newname is not None: + try: + rep = Representative.objects.get(first_name=newname[0], + last_name=newname[1]) + self.rep_cache[key] = rep + except Representative.DoesNotExist: + rep = 
None + return rep def import_row(self, row): if len(row['date']) == 0: - logger.warn('Cannot import dateless position for %s %s on URL %s' % - (row['first_name'], row['last_name'], row['url'])) - return False + if len(row['url']) == 0: + row['date'] = '2010-01-01' + row['url'] = '/' + else: + row['date'] = position_dates.get(row['url'], None) + + if row['date'] is None: + logger.warn('Dateless position for %s %s on URL %s' % + (row['first_name'], row['last_name'], row['url'])) + return False rep = self.get_rep(row['first_name'], row['last_name']) if rep is None: @@ -104,4 +124,5 @@ def main(stream=None): else: imported = imported + 1 - logger.info('%d rows imported, %d rows rejected', imported, len(rejected)) + logger.info('%d rows imported or already present, %d rows rejected', + imported, len(rejected)) -- GitLab