import_representatives.py 16.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
# coding: utf-8

import logging
import sys
from datetime import datetime

import django.dispatch
import ijson
import django
from django.apps import apps
from django.db import transaction
from django.utils import timezone
13
from django.utils.text import slugify
14 15 16

from representatives.models import (Address, Constituency, Country, Email,
                                    Group, Mandate, Phone, Representative,
17
                                    WebSite, Chamber)
18 19 20

logger = logging.getLogger(__name__)

# Signal sent by ParltrackImporter.manage_mep() before a representative is
# imported.  It is sent with one keyword argument, ``representative_data``
# (the raw parltrack dict of the MEP).  A receiver returning ``False`` makes
# the importer skip that MEP.
#
# The ``providing_args`` argument of Signal was purely documentational; it is
# deprecated since Django 3.1 and removed in Django 4.0, so the argument list
# is kept in this comment instead.
representative_pre_import = django.dispatch.Signal()
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64


def _parse_date(date):
    return datetime.strptime(date, "%Y-%m-%dT00:%H:00").date()


class GenericImporter(object):
    '''
    Bookkeeping shared by importers: remembers when an import run started
    so that rows not refreshed during the run can be told apart from the
    ones that were.
    '''

    def pre_import(self):
        '''Record the start time of the import run.'''
        started_at = timezone.now()
        self.import_start_datetime = started_at

    def post_import(self):
        '''
        Delete, for each imported model, the rows whose ``updated`` value
        is older than the start of the run (i.e. rows the run did not touch).
        '''
        run_start = self.import_start_datetime
        for model in (Representative, Group, Constituency,
                      Mandate, Address, Phone, Email, WebSite):
            model.objects.filter(updated__lt=run_start).delete()

    def touch_model(self, model, **data):
        '''
        Look up or create a ``model`` row matching ``data``.

        An already existing row whose ``updated`` value predates the start
        of the run is saved again so that its ``updated`` field is
        refreshed.  Returns the ``(instance, created)`` pair obtained from
        ``get_or_create``.
        '''
        instance, created = model.objects.get_or_create(**data)

        # A freshly created row is already up to date; only pre-existing
        # rows that were not touched during this run need a save.
        is_stale = (not created
                    and instance.updated < self.import_start_datetime)
        if is_stale:
            instance.save()     # Updates updated field

        return (instance, created)


class ParltrackImporter(GenericImporter):
    '''
    Importer that turns MEP records from a parltrack JSON dump into
    representatives, mandates and contact details.
    '''
    # Location of the parltrack dump of current MEPs.
    # NOTE(review): not read by any code visible in this file; presumably
    # used by the code that downloads the dump - confirm.
    url = 'http://parltrack.euwiki.org/dumps/ep_meps_current.json.xz'
    # NOTE(review): not read by any code visible in this file; presumably
    # tells the downloader to compare ETags before fetching - confirm.
    check_etag = True

    def parse_date(self, date):
        '''
        Return the date held in a parltrack timestamp string.

        Instance-level wrapper that delegates to the module helper
        ``_parse_date``.
        '''
        parsed = _parse_date(date)
        return parsed

    def __init__(self):
        '''
        Load the country cache and make sure the European Parliament
        chamber, constituency and chamber-level group exist.
        '''
        # Country name -> primary key, read once when the importer is built.
        country_pks = {}
        for country in Country.objects.all():
            country_pks[country.name] = country.pk
        self.cache = {
            'countries': country_pks,
        }

        chamber, _created = Chamber.objects.get_or_create(
            name='European Parliament', abbreviation='EP')
        self.ep_chamber = chamber

        constituency, _created = Constituency.objects.get_or_create(
            name='European Parliament')
        self.ep_constituency = constituency

        group, _created = Group.objects.get_or_create(
            name='European Parliament', kind='chamber', abbreviation='EP',
            chamber=self.ep_chamber)
        self.ep_group = group
75 76 77 78 79 80 81 82

    @transaction.atomic
    def manage_mep(self, mep_json):
        '''
        Import a mep as a representative from the json dict fetched from
        parltrack.

        The whole import of one MEP runs in a single transaction.

        ``mep_json`` is modified in place: when it has no "Birth" entry a
        placeholder one (date 9999-01-01, empty place) is added.

        Returns the saved representative, or ``None`` when a receiver of
        ``representative_pre_import`` returned ``False`` to skip this MEP.
        '''

        # Some versions of memopol will connect to this and skip inactive meps.
        responses = representative_pre_import.send(
            sender=self, representative_data=mep_json)

        # "full" is optional in the Name dict: fall back on first name plus
        # family name.  The same value is used for the log message and the
        # slug, so a record without "full" can be skipped without a KeyError.
        name = mep_json["Name"]
        if 'full' in name:
            full_name = name["full"]
        else:
            full_name = name["sur"] + " " + name["family"]

        for receiver, response in responses:
            if response is False:
                logger.debug('Skipping MEP %s', full_name)
                return

        # Issue 185. We must have a Birth date for our mep, to allow import
        # and slugifying stuff.
        if "Birth" not in mep_json:
            mep_json["Birth"] = {"date": "9999-01-01T00:00:00", "place": ""}

        # The slug (name + birth date) is the key used to find an already
        # imported representative.
        slug = slugify('%s-%s' % (
            full_name,
            _parse_date(mep_json["Birth"]["date"])
        ))

        try:
            representative = Representative.objects.get(slug=slug)
            changed = False
        except Representative.DoesNotExist:
            # New representative: force a save even if no attribute differs.
            representative = Representative(slug=slug)
            changed = True

        # Save representative attributes
        self.import_representative_details(representative, mep_json, changed)

        self.add_mandates(representative, mep_json)

        self.add_contacts(representative, mep_json)

        # Hand the object itself to the logger: formatting only happens when
        # debug logging is enabled, and no Python-2-only ``unicode`` builtin
        # is needed.
        logger.debug('Imported MEP %s', representative)

        return representative

121 122 123 124
    def import_representative_details(self, representative, mep_json, changed):
        '''
        Copy the personal details found in ``mep_json`` onto
        ``representative`` and save it when something differs.

        ``changed`` is the initial "needs saving" flag; pass ``True`` to
        force a save (e.g. for an instance that is not stored yet).

        Fields set: active, birth_date and birth_place (only when a "Birth"
        entry is present), first_name, full_name, photo, last_name, gender
        and cv.

        Raises ``KeyError`` when "active", "Photo" or the "sur"/"family"
        name parts are missing from ``mep_json``.
        '''
        name = mep_json["Name"]
        # "full" is optional (manage_mep builds the slug with the same
        # fallback); indexing it unconditionally raised KeyError here.
        if "full" in name:
            full_name = name["full"]
        else:
            full_name = name["sur"] + " " + name["family"]

        # (attribute, new value) pairs, applied in order at the end.
        updates = [("active", mep_json["active"])]

        birth = mep_json.get("Birth")
        if birth:
            updates.append(("birth_date", _parse_date(birth["date"])))
            if "place" in birth:
                updates.append(("birth_place", birth["place"]))

        updates.append(("first_name", name["sur"]))
        updates.append(("full_name", full_name))
        updates.append(("photo", mep_json["Photo"]))

        # Full name -> last name, for names whose family part cannot be
        # taken as is (prefixes such as "de"/"van", middle initials).
        # A few non-ASCII names are listed twice, once as a plain string
        # literal and once as a unicode literal.
        fix_last_name_with_prefix = {
            "Esther de LANGE": "de LANGE",
            "Patricia van der KAMMEN": "van der KAMMEN",
            "Judith A. MERKIES": "MERKIES",
            "Heinz K. BECKER": "BECKER",
            "Cornelis de JONG": "de JONG",
            "Peter van DALEN": "van DALEN",
            "Sophia in 't VELD": "in 't VELD",
            "Marielle de SARNEZ": "de SARNEZ",
            "Anne E. JENSEN": "JENSEN",
            "Wim van de CAMP": "van de CAMP",
            "Lambert van NISTELROOIJ": "van NISTELROOIJ",
            "Johannes Cornelis van BAALEN": "van BAALEN",
            "Ioannis A. TSOUKALAS": "TSOUKALAS",
            "Pilar del CASTILLO VERA": "del CASTILLO VERA",
            "Luis de GRANDES PASCUAL": "de GRANDES PASCUAL",
            "Philippe de VILLIERS": "de VILLIERS",
            "Daniël van der STOEP": "van der STOEP",
            "William (The Earl of) DARTMOUTH": "(The Earl of) Dartmouth",
            "Bairbre de BRÚN": u'de Br\xfan',
            "Karl von WOGAU": u'von WOGAU',
            "Ieke van den BURG": u'van den BURG',
            "Manuel António dos SANTOS": u'dos SANTOS',
            "Paul van BUITENEN": u'van BUITENEN',
            "Elly de GROEN-KOUWENHOVEN": u'de GROEN-KOUWENHOVEN',
            "Margrietus van den BERG": u'van den BERG',
            u'Dani\xebl van der STOEP': u'van der STOEP',
            "Alexander Graf LAMBSDORFF": u'Graf LAMBSDORFF',
            u'Bairbre de BR\xdaN': u'de BR\xdaN',
            'Luigi de MAGISTRIS': 'de MAGISTRIS',
        }

        last_name = name["family"]
        fixed_last_name = fix_last_name_with_prefix.get(full_name)
        if fixed_last_name:
            last_name = fixed_last_name
        elif last_name == "J.A.J. STASSEN":
            last_name = "STASSEN"
        updates.append(("last_name", last_name))

        # Gender codes: "F" -> 1, "M" -> 2, anything else or missing -> 0.
        gender_by_code = {u"F": 1, u"M": 2}
        updates.append(("gender", gender_by_code.get(mep_json.get('Gender'), 0)))

        # The CV entries are stored as one newline-separated text.
        updates.append(("cv", "\n".join(mep_json.get("CV", []))))

        for field, value in updates:
            if getattr(representative, field) != value:
                setattr(representative, field, value)
                changed = True

        # Only hit the database when something actually differs.
        if changed:
            representative.save()
208 209

    def add_mandates(self, representative, mep_json):
        '''
        Create the mandates of ``representative`` described in ``mep_json``.

        The "Committees", "Delegations", "Groups", "Constituencies" and
        "Staff" lists are read.  For each entry the matching Group is looked
        up or created and a Mandate linking the representative to it is
        created if it does not exist yet.  Each "Constituencies" entry
        additionally yields a mandate in the European Parliament group.
        '''
        def create_mandate(mandate_data, representative, group, constituency):
            # Either bound may be missing from the data.  Default to None so
            # that a missing "start"/"end" no longer raises
            # UnboundLocalError.
            # NOTE(review): assumes Mandate.begin_date / end_date accept
            # NULL - confirm against the model.
            begin_date = None
            end_date = None
            if mandate_data.get("start"):
                begin_date = _parse_date(mandate_data.get("start"))
            if mandate_data.get("end"):
                end_date = _parse_date(mandate_data.get("end"))

            mandate, created = Mandate.objects.get_or_create(
                representative=representative,
                group=group,
                constituency=constituency,
                role=mandate_data.get('role', ''),
                begin_date=begin_date,
                end_date=end_date
            )

            if created:
                logger.debug('Created mandate %s with %s', mandate.pk,
                             mandate_data)

        # Committee (entries without a committee_id are ignored)
        for mandate_data in mep_json.get('Committees', []):
            if mandate_data.get("committee_id"):
                group, _ = self.touch_model(
                    model=Group,
                    abbreviation=mandate_data['committee_id'],
                    kind='committee',
                    name=mandate_data['Organization'],
                    chamber=self.ep_chamber)

                create_mandate(mandate_data, representative, group,
                               self.ep_constituency)

        # Delegations
        for mandate_data in mep_json.get('Delegations', []):
            group, _ = self.touch_model(
                model=Group,
                kind='delegation',
                name=mandate_data['Organization'],
                chamber=self.ep_chamber)

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

        # Group
        # Abbreviations found in the data -> abbreviations stored.
        convert = {
            "S&D": "SD",
            "NA": "NI",
            "ID": "IND/DEM",
            "PPE": "EPP",
            "Verts/ALE": "Greens/EFA"}
        for mandate_data in mep_json.get('Groups', []):
            group_id = mandate_data.get('groupid')
            if not group_id:
                continue

            # groupid is either a single abbreviation or a list of them, in
            # which case the first one is used.
            if isinstance(group_id, list):
                abbreviation = group_id[0]
            else:
                abbreviation = group_id

            abbreviation = convert.get(abbreviation, abbreviation)
            group, _ = self.touch_model(
                model=Group,
                abbreviation=abbreviation,
                kind='group',
                name=mandate_data['Organization'],
                chamber=self.ep_chamber)

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

        # Countries
        for mandate_data in mep_json.get('Constituencies', []):
            if not mandate_data:
                continue

            _country = Country.objects.get(name=mandate_data['country'])

            group, _ = self.touch_model(
                model=Group,
                abbreviation=_country.code,
                kind='country',
                name=_country.name)

            # An empty party or the '-' placeholder is stored as 'unknown'.
            party = mandate_data['party']
            local_party = party if party and party != '-' else 'unknown'

            # 'country' is known to be present at this point (it was indexed
            # above), so the cache can be queried directly.
            country_id = self.cache['countries'].get(mandate_data['country'])

            save_constituency = False
            try:
                constituency = Constituency.objects.get(name=local_party)
            except Constituency.DoesNotExist:
                constituency = Constituency(name=local_party)
                save_constituency = True
            except Constituency.MultipleObjectsReturned:
                # There is more than one constituency with that name.
                # We must filter them by country.
                constituency = Constituency.objects.get(
                    name=local_party, country_id=country_id)

            if constituency.country_id != country_id:
                constituency.country_id = country_id
                save_constituency = True

            if save_constituency:
                constituency.save()

            # One mandate in the country group for the local party ...
            create_mandate(mandate_data, representative, group, constituency)

            # ... and one in the European Parliament itself.
            create_mandate(mandate_data, representative, self.ep_group,
                           self.ep_constituency)

        # Organisations
        for mandate_data in mep_json.get('Staff', []):
            group, _ = self.touch_model(
                model=Group,
                abbreviation='',
                kind='organization',
                name=mandate_data['Organization'])

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)
333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382

    def add_contacts(self, representative, mep_json):
        '''
        Create or refresh the contact details of ``representative`` found
        in ``mep_json``: official addresses and their phone numbers
        ("Addresses"), e-mail addresses ("Mail"), the European Parliament
        page (built from "UserID") and the web sites listed under
        "Homepage", "Twitter" and "Facebook".

        Raises ``KeyError`` when "UserID" is missing.
        '''
        # Addresses
        if mep_json.get('Addresses', None):
            address = mep_json.get('Addresses')

            belgium = Country.objects.get(name="Belgium")
            france = Country.objects.get(name="France")

            # Fixed postal details of the two seats handled here; entries
            # for any other city are ignored.
            # city -> (country, street, number, postcode, address name)
            seats = {
                'Brussels': (belgium, u"rue Wiertz / Wiertzstraat",
                             '60', '1047',
                             "Brussels European Parliament"),
                'Strasbourg': (france,
                               u"Av. du Président Robert Schuman - CS 91024",
                               '1', '67070',
                               "Strasbourg European Parliament"),
            }

            for city in address:
                seat = seats.get(city)
                if seat is None:
                    continue
                country, street, number, postcode, name = seat

                # The office code is split into its first three characters
                # (stored as floor) and the remainder (office number).
                office = address[city]['Address']['Office']
                address_model, _ = self.touch_model(
                    model=Address,
                    representative=representative, country=country,
                    city=city,
                    floor=office[:3],
                    office_number=office[3:],
                    street=street, number=number, postcode=postcode,
                    kind='official', name=name)

                self.touch_model(
                    model=Phone,
                    representative=representative, address=address_model,
                    kind='office phone',
                    number=address[city].get('Phone', ''))

        # Emails
        if mep_json.get('Mail', None):
            mails = mep_json.get('Mail')
            if not isinstance(mails, (list, tuple)):
                # A single address is given as a plain string: wrap it.
                # (list(mails) would split it into single characters and
                # create one Email row per character.)
                mails = [mails]

            for mail in mails:
                self.touch_model(
                    model=Email,
                    representative=representative,
                    kind=('official' if '@europarl.europa.eu' in mail
                          else 'other'),
                    email=mail)

        # EP page
        changed = False
        try:
            site = WebSite.objects.get(kind='EP',
                                       representative=representative)
        except WebSite.DoesNotExist:
            site = WebSite(kind='EP', representative=representative)
            changed = True

        uid = mep_json['UserID']
        url = 'http://www.europarl.europa.eu/meps/en/%s/_home.html' % uid
        if site.url != url:
            site.url = url
            changed = True

        if changed:
            site.save()

        # WebSite
        websites = mep_json.get('Homepage', [])
        for url in websites:
            self.touch_model(model=WebSite,
                             url=url,
                             representative=representative
                             )

        # Only the first entry of the Twitter / Facebook values is stored.
        if mep_json.get('Twitter', None):
            self.touch_model(model=WebSite,
                             representative=representative,
                             kind='twitter',
                             url=mep_json.get('Twitter')[0]
                             )

        if mep_json.get('Facebook', None):
            self.touch_model(model=WebSite,
                             representative=representative,
                             kind='facebook',
                             url=mep_json.get('Facebook')[0]
                             )


def main(stream=None):
    '''
    Import every MEP found in a JSON array read from ``stream``
    (``sys.stdin`` when no stream is given).

    Django is set up first when its app registry is not ready.  A failure
    while importing one MEP is logged with its traceback and does not stop
    the run.
    '''
    if not apps.ready:
        django.setup()

    importer = ParltrackImporter()
    GenericImporter.pre_import(importer)

    source = stream or sys.stdin
    for mep_data in ijson.items(source, 'item'):
        try:
            importer.manage_mep(mep_data)
        except Exception:
            logger.exception('error trying to import rep %s', str(mep_data))

    # Disabled for now because it is dangerous: with a corrupt json file it
    # would drop valid data!
    # importer.post_import()