# import_representatives.py
# coding: utf-8

import logging
import sys
from datetime import datetime

import django.dispatch
import ijson
import django
from django.apps import apps
from django.db import transaction
from django.utils import timezone
from django.utils.text import slugify
from django.core.exceptions import MultipleObjectsReturned

from representatives.models import (Address, Constituency, Country, Email,
                                    Group, Mandate, Phone, Representative,
                                    WebSite, Chamber)

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Sent once per MEP at the start of ParltrackImporter.manage_mep, with the raw
# MEP dict passed as ``representative_data``.  When a receiver returns
# ``False`` the importer skips that MEP.
representative_pre_import = django.dispatch.Signal(
    providing_args=['representative_data'])


def _parse_date(date):
    return datetime.strptime(date, "%Y-%m-%dT00:%H:00").date()


class GenericImporter(object):
    '''
    Shared bookkeeping for importers: remembers when a run started, refreshes
    the ``updated`` stamp of rows seen during the run, and can purge the rows
    that were not seen.
    '''

    def pre_import(self):
        '''Remember the moment the import run starts.'''
        self.import_start_datetime = timezone.now()

    def post_import(self):
        '''
        Delete, model by model, every row whose ``updated`` value is older
        than the start of the current run.
        '''
        cutoff = self.import_start_datetime
        purgeable = (Representative, Group, Constituency,
                     Mandate, Address, Phone, Email, WebSite)
        for model in purgeable:
            stale_rows = model.objects.filter(updated__lt=cutoff)
            stale_rows.delete()

    def touch_model(self, model, **data):
        '''
        Fetch the ``model`` row matching ``data``, creating it when missing.

        A row that already existed and whose ``updated`` value predates the
        current run is saved again so that its ``updated`` field is bumped.
        Returns the ``(instance, created)`` pair.
        '''
        instance, created = model.objects.get_or_create(**data)

        needs_refresh = (not created and
                         instance.updated < self.import_start_datetime)
        if needs_refresh:
            # Saving is what refreshes the updated field.
            instance.save()

        return instance, created


class ParltrackImporter(GenericImporter):
    '''
    Importer that turns MEP records from a Parltrack JSON dump into
    Representative rows together with their mandates and contact details.
    '''
    # Location of the xz-compressed Parltrack dump of current MEPs.
    # NOTE(review): neither ``url`` nor ``check_etag`` is read in the code
    # visible here; presumably an external download step uses them -- confirm.
    url = 'http://parltrack.euwiki.org/dumps/ep_meps_current.json.xz'
    check_etag = True

    def parse_date(self, date):
        '''Method-level alias of the module helper ``_parse_date``.'''
        parsed = _parse_date(date)
        return parsed

    def __init__(self):
        '''
        Build the country-name -> primary-key cache and make sure the
        European Parliament chamber, constituency and group rows exist.
        '''
        ep_name = 'European Parliament'
        ep_abbreviation = 'EP'

        country_pks = {}
        for country in Country.objects.all():
            country_pks[country.name] = country.pk
        self.cache = {'countries': country_pks}

        # get_or_create returns (instance, created); only the instance is kept.
        self.ep_chamber = Chamber.objects.get_or_create(
            name=ep_name, abbreviation=ep_abbreviation)[0]
        self.ep_constituency = Constituency.objects.get_or_create(
            name=ep_name)[0]
        self.ep_group = Group.objects.get_or_create(
            name=ep_name, kind='chamber', abbreviation=ep_abbreviation,
            chamber=self.ep_chamber)[0]

    @transaction.atomic
    def manage_mep(self, mep_json):
        '''
        Import a MEP as a representative from the JSON dict fetched from
        Parltrack.

        The ``representative_pre_import`` signal is sent first; when any
        receiver returns ``False`` the MEP is skipped and ``None`` is
        returned.  Otherwise the representative is looked up by slug (full
        name plus birth date), instantiated when missing, filled in with its
        details, mandates and contacts, and returned.

        ``mep_json`` is modified in place when it has no "Birth" entry: a
        placeholder birth record is inserted.
        '''
        # Some versions of memopol will connect to this and skip inactive meps.
        responses = representative_pre_import.send(
            sender=self, representative_data=mep_json)

        # Resolve the display name once, after the receivers have run.  The
        # skip message used to read Name["full"] unconditionally and raised
        # KeyError for records that only carry "sur" and "family", although
        # the slug computation below already supported them.
        name = mep_json["Name"]
        if 'full' in name:
            full_name = name["full"]
        else:
            full_name = name["sur"] + " " + name["family"]

        for receiver, response in responses:
            if response is False:
                logger.debug('Skipping MEP %s', full_name)
                return

        # Issue 185. We must have a Birth date for our mep, to allow import
        # and slugifying stuff.
        if "Birth" not in mep_json:
            mep_json["Birth"] = {"date": "9999-01-01T00:00:00", "place": ""}

        slug = slugify('%s-%s' % (
            full_name, _parse_date(mep_json["Birth"]["date"])))

        # ``changed`` starts out True only for a representative that is not
        # in the database yet, which forces the first save.
        changed = False
        try:
            representative = Representative.objects.get(slug=slug)
        except Representative.DoesNotExist:
            representative = Representative(slug=slug)
            changed = True

        # Save representative attributes
        self.import_representative_details(representative, mep_json, changed)

        self.add_mandates(representative, mep_json)

        self.add_contacts(representative, mep_json)

        logger.debug('Imported MEP %s', unicode(representative))

        return representative

    def import_representative_details(self, representative, mep_json, changed):
        '''
        Copy the personal details found in ``mep_json`` onto
        ``representative`` and save it when something differs.

        ``changed`` is the caller's initial "needs saving" flag (true for a
        representative that was just instantiated).  Each attribute is only
        assigned when its value differs, and ``representative.save()`` is
        called once at the end if the flag ended up true.

        The full name falls back to "<sur> <family>" when the record has no
        "full" entry, mirroring the slug computation in ``manage_mep``; it
        used to raise ``KeyError`` for such records.
        '''
        def assign(field, value):
            # Set ``field`` only when it differs; tell whether it was set.
            if getattr(representative, field) != value:
                setattr(representative, field, value)
                return True
            return False

        changed = assign('active', mep_json['active']) or changed

        birth = mep_json.get("Birth")
        if birth:
            birth_date = _parse_date(birth["date"])
            changed = assign('birth_date', birth_date) or changed
            if "place" in birth:
                changed = assign('birth_place', birth["place"]) or changed

        name = mep_json["Name"]
        changed = assign('first_name', name["sur"]) or changed

        last_name = name["family"]

        if 'full' in name:
            full_name = name["full"]
        else:
            full_name = name["sur"] + " " + name["family"]
        changed = assign('full_name', full_name) or changed

        changed = assign('photo', mep_json["Photo"]) or changed

        # Hand-maintained last names for MEPs whose family name carries a
        # prefix, a title or a middle initial.  Names with non-ASCII letters
        # are listed in both their byte-string and unicode spellings.
        fix_last_name_with_prefix = {
            "Esther de LANGE": "de LANGE",
            "Patricia van der KAMMEN": "van der KAMMEN",
            "Judith A. MERKIES": "MERKIES",
            "Heinz K. BECKER": "BECKER",
            "Cornelis de JONG": "de JONG",
            "Peter van DALEN": "van DALEN",
            "Sophia in 't VELD": "in 't VELD",
            "Marielle de SARNEZ": "de SARNEZ",
            "Anne E. JENSEN": "JENSEN",
            "Wim van de CAMP": "van de CAMP",
            "Lambert van NISTELROOIJ": "van NISTELROOIJ",
            "Johannes Cornelis van BAALEN": "van BAALEN",
            "Ioannis A. TSOUKALAS": "TSOUKALAS",
            "Pilar del CASTILLO VERA": "del CASTILLO VERA",
            "Luis de GRANDES PASCUAL": "de GRANDES PASCUAL",
            "Philippe de VILLIERS": "de VILLIERS",
            "Daniël van der STOEP": "van der STOEP",
            "William (The Earl of) DARTMOUTH": "(The Earl of) Dartmouth",
            "Bairbre de BRÚN": u'de Br\xfan',
            "Karl von WOGAU": u'von WOGAU',
            "Ieke van den BURG": u'van den BURG',
            "Manuel António dos SANTOS": u'dos SANTOS',
            "Paul van BUITENEN": u'van BUITENEN',
            "Elly de GROEN-KOUWENHOVEN": u'de GROEN-KOUWENHOVEN',
            "Margrietus van den BERG": u'van den BERG',
            u'Dani\xebl van der STOEP': u'van der STOEP',
            "Alexander Graf LAMBSDORFF": u'Graf LAMBSDORFF',
            u'Bairbre de BR\xdaN': u'de BR\xdaN',
            'Luigi de MAGISTRIS': 'de MAGISTRIS',
        }

        fixed_last_name = fix_last_name_with_prefix.get(
            representative.full_name)
        if fixed_last_name:
            last_name = fixed_last_name
        elif last_name == "J.A.J. STASSEN":
            last_name = "STASSEN"
        changed = assign('last_name', last_name) or changed

        # Any missing or unrecognised gender value is stored as 0.
        gender_convertion_dict = {u"F": 1, u"M": 2}
        gender = gender_convertion_dict.get(mep_json.get('Gender'), 0)
        changed = assign('gender', gender) or changed

        cv = "\n".join(mep_json.get("CV", []))
        changed = assign('cv', cv) or changed

        if changed:
            representative.save()

    def add_mandates(self, representative, mep_json):
        '''
        Record the mandates listed in ``mep_json`` for ``representative``.

        The "Committees", "Delegations", "Groups", "Constituencies" and
        "Staff" lists are walked in that order.  For each entry the matching
        Group is touched (created or refreshed) and a Mandate linking the
        representative, the group and a constituency is created when it does
        not exist yet.  Each "Constituencies" entry yields two mandates: one
        for the country group with the local party as constituency, and one
        for the European Parliament group.
        '''
        def create_mandate(mandate_data, representative, group, constituency):
            # A missing or empty "start"/"end" now yields None; the dates
            # used to stay unbound in that case, which raised
            # UnboundLocalError in the get_or_create call below.
            # NOTE(review): assumes Mandate.begin_date / end_date accept
            # None -- confirm against the model definition.
            start = mandate_data.get("start")
            end = mandate_data.get("end")
            begin_date = _parse_date(start) if start else None
            end_date = _parse_date(end) if end else None

            mandate, created = Mandate.objects.get_or_create(
                representative=representative,
                group=group,
                constituency=constituency,
                role=mandate_data.get('role', ''),
                begin_date=begin_date,
                end_date=end_date
            )

            if created:
                logger.debug('Created mandate %s with %s', mandate.pk,
                             mandate_data)

        # Committee
        for mandate_data in mep_json.get('Committees', []):
            if mandate_data.get("committee_id"):
                group, _ = self.touch_model(
                    model=Group,
                    abbreviation=mandate_data['committee_id'],
                    kind='committee',
                    name=mandate_data['Organization'],
                    chamber=self.ep_chamber)

                create_mandate(mandate_data, representative, group,
                               self.ep_constituency)

        # Delegations
        for mandate_data in mep_json.get('Delegations', []):
            group, _ = self.touch_model(
                model=Group,
                kind='delegation',
                name=mandate_data['Organization'],
                chamber=self.ep_chamber)

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

        # Group
        convert = {
            "S&D": "SD",
            "NA": "NI",
            "ID": "IND/DEM",
            "PPE": "EPP",
            "Verts/ALE": "Greens/EFA"}
        for mandate_data in mep_json.get('Groups', []):
            groupid = mandate_data.get('groupid')
            if not groupid:
                continue

            # When several ids are given only the first one is used.
            abbreviation = groupid[0] if isinstance(groupid, list) else groupid
            abbreviation = convert.get(abbreviation, abbreviation)

            group, _ = self.touch_model(
                model=Group,
                abbreviation=abbreviation,
                kind='group',
                name=mandate_data['Organization'],
                chamber=self.ep_chamber)

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

        # Countries
        for mandate_data in mep_json.get('Constituencies', []):
            if not mandate_data:
                continue

            _country = Country.objects.get(name=mandate_data['country'])

            group, _ = self.touch_model(
                model=Group,
                abbreviation=_country.code,
                kind='country',
                name=_country.name)

            # An empty party or a lone dash is stored as 'unknown'.
            party = mandate_data['party']
            local_party = party if party and party != '-' else 'unknown'

            country_id = (self.cache['countries'].get(mandate_data['country'])
                          if 'country' in mandate_data else None)

            save_constituency = False
            try:
                constituency = Constituency.objects.get(name=local_party)
            except Constituency.DoesNotExist:
                constituency = Constituency(name=local_party)
                save_constituency = True
            except MultipleObjectsReturned:
                # There is more than one constituency with that name.
                # We must filter them by country.
                constituency = Constituency.objects.get(
                    name=local_party, country_id=country_id)

            if constituency.country_id != country_id:
                constituency.country_id = country_id
                save_constituency = True

            if save_constituency:
                constituency.save()

            create_mandate(mandate_data, representative, group, constituency)

            create_mandate(mandate_data, representative, self.ep_group,
                           self.ep_constituency)

        # Organisations
        for mandate_data in mep_json.get('Staff', []):
            group, _ = self.touch_model(
                model=Group,
                abbreviation='',
                kind='organization',
                name=mandate_data['Organization'])

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

    def add_contacts(self, representative, mep_json):
        '''
        Record the contact details listed in ``mep_json`` for
        ``representative``.

        Handles, in order: the Brussels and Strasbourg office addresses with
        their phone numbers ("Addresses"), the e-mail addresses ("Mail"), the
        European Parliament profile page built from "UserID", the personal
        homepages ("Homepage") and the first "Twitter" and "Facebook" entry.
        Rows are created or refreshed through ``touch_model``; the EP page is
        only saved when it is new or its URL differs.
        '''
        # Addresses
        address = mep_json.get('Addresses', None)
        if address:
            belgium = Country.objects.get(name="Belgium")
            france = Country.objects.get(name="France")

            # city -> (country, street, number, postcode, name)
            offices = {
                'Brussels': (
                    belgium, u"rue Wiertz / Wiertzstraat", '60', '1047',
                    "Brussels European Parliament"),
                'Strasbourg': (
                    france, u"Av. du Président Robert Schuman - CS 91024",
                    '1', '67070', "Strasbourg European Parliament"),
            }

            for city in address:
                if city not in offices:
                    continue
                country, street, number, postcode, name = offices[city]

                # The first three characters of "Office" are stored as the
                # floor, the remainder as the office number.
                office = address[city]['Address']['Office']
                address_model, _ = self.touch_model(
                    model=Address,
                    representative=representative, country=country,
                    city=city,
                    floor=office[:3],
                    office_number=office[3:],
                    street=street, number=number, postcode=postcode,
                    kind='official', name=name)

                self.touch_model(
                    model=Phone,
                    representative=representative, address=address_model,
                    kind='office phone',
                    number=address[city].get('Phone', ''))

        # Emails
        mails = mep_json.get('Mail', None)
        if mails:
            if isinstance(mails, (str, type(u''))):
                # A single address given as a bare string: wrap it.  It used
                # to go through list(), which split it into characters and
                # produced one Email row per character.
                mails = [mails]
            elif not isinstance(mails, list):
                mails = list(mails)

            for mail in mails:
                self.touch_model(
                    model=Email,
                    representative=representative,
                    kind=('official' if '@europarl.europa.eu' in mail
                          else 'other'),
                    email=mail)

        # EP page
        changed = False
        try:
            site = WebSite.objects.get(kind='EP',
                                       representative=representative)
        except WebSite.DoesNotExist:
            site = WebSite(kind='EP', representative=representative)
            changed = True

        uid = mep_json['UserID']
        url = 'http://www.europarl.europa.eu/meps/en/%s/_home.html' % uid
        if site.url != url:
            site.url = url
            changed = True

        if changed:
            site.save()

        # WebSite
        websites = mep_json.get('Homepage', [])
        for url in websites:
            self.touch_model(model=WebSite,
                             url=url,
                             representative=representative
                             )

        if mep_json.get('Twitter', None):
            self.touch_model(model=WebSite,
                             representative=representative,
                             kind='twitter',
                             url=mep_json.get('Twitter')[0]
                             )

        if mep_json.get('Facebook', None):
            self.touch_model(model=WebSite,
                             representative=representative,
                             kind='facebook',
                             url=mep_json.get('Facebook')[0]
                             )


def main(stream=None):
    '''
    Import every MEP found in a JSON array read from ``stream``, or from
    standard input when ``stream`` is not given.

    A failure on one MEP is logged with its traceback and the run moves on
    to the next one.
    '''
    if not apps.ready:
        django.setup()

    importer = ParltrackImporter()
    importer.pre_import()

    source = stream or sys.stdin
    records = ijson.items(source, 'item')
    for data in records:
        try:
            importer.manage_mep(data)
        except Exception:
            logger.exception('error trying to import rep %s', str(data))

    # NOTE: importer.post_import() is deliberately not called for now: it's
    # a bit dangerous, if a json file was corrupt it would drop valid data!