import_representatives.py 15.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# coding: utf-8

import logging
import sys
from datetime import datetime

import django.dispatch
import ijson
import django
from django.apps import apps
from django.db import transaction
from django.utils import timezone
13
from django.utils.text import slugify
14
15
16

from representatives.models import (Address, Constituency, Country, Email,
                                    Group, Mandate, Phone, Representative,
17
                                    WebSite, Chamber)
18
19
20

logger = logging.getLogger(__name__)

Jamesie Pic's avatar
Jamesie Pic committed
21
22
# Signal sent by ParltrackImporter.manage_mep() before a representative is
# imported; the raw record is passed as ``representative_data``. If any
# receiver returns False, that representative is skipped.
representative_pre_import = django.dispatch.Signal(
    providing_args=['representative_data'])
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64


def _parse_date(date):
    '''
    Convert a timestamp string such as "2014-07-01T00:00:00" into a date.

    The time of day is parsed and then discarded, so only the calendar day
    is returned.

    Raises ValueError when the string does not match
    "YYYY-MM-DDTHH:MM:SS".
    '''
    # Parse hour, minute and second with their own directives so that
    # timestamps carrying a real time of day are accepted as well as the
    # usual midnight values.
    return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S").date()


class GenericImporter(object):
    '''
    Base class for importers that track which rows an import run touched
    through the ``updated`` field of the models.
    '''

    def pre_import(self):
        '''
        Remember when the import run started.
        '''
        self.import_start_datetime = timezone.now()

    def post_import(self):
        '''
        Delete every row whose ``updated`` value is older than the start
        of the import run, i.e. rows the run did not touch.
        '''
        started = self.import_start_datetime
        untouched_candidates = (
            Representative, Group, Constituency,
            Mandate, Address, Phone, Email, WebSite,
        )
        for model in untouched_candidates:
            model.objects.filter(updated__lt=started).delete()

    def touch_model(self, model, **data):
        '''
        Look up the row of ``model`` matching ``data``, creating it when it
        does not exist yet.

        An existing row that was last updated before the import started is
        saved again so that its ``updated`` field is refreshed.

        Returns the (instance, created) tuple from get_or_create.
        '''
        instance, created = model.objects.get_or_create(**data)

        stale = (not created
                 and instance.updated < self.import_start_datetime)
        if stale:
            instance.save()     # Refreshes the updated field

        return (instance, created)


class ParltrackImporter(GenericImporter):
    url = 'http://parltrack.euwiki.org/dumps/ep_meps_current.json.xz'
    check_etag = True

    def parse_date(self, date):
        '''
        Convert a date string from the feed into a date by delegating to
        the module level _parse_date helper.
        '''
        parsed = _parse_date(date)
        return parsed

    def __init__(self):
        '''
        Build the country lookup cache and fetch (creating them on first
        use) the chamber, constituency and group rows named
        'European Parliament'.
        '''
        # Country name -> primary key, loaded once for the whole run.
        country_pks = dict(
            (country.name, country.pk) for country in Country.objects.all())
        self.cache = {'countries': country_pks}

        chamber, _ = Chamber.objects.get_or_create(
            name='European Parliament', abbreviation='EP')
        self.ep_chamber = chamber

        constituency, _ = Constituency.objects.get_or_create(
            name='European Parliament')
        self.ep_constituency = constituency

        group, _ = Group.objects.get_or_create(
            name='European Parliament', kind='chamber', abbreviation='EP',
            chamber=chamber)
        self.ep_group = group
75
76
77
78
79
80
81
82

    @transaction.atomic
    def manage_mep(self, mep_json):
        '''
        Import one MEP, given as the json dict fetched from parltrack, as
        a representative together with its mandates and contacts.

        Returns the representative, or None when a receiver of the
        representative_pre_import signal answered False.
        '''
        # Some versions of memopol will connect to this and skip inactive
        # meps: any receiver answering False vetoes the import.
        for _receiver, verdict in representative_pre_import.send(
                sender=self, representative_data=mep_json):
            if verdict is False:
                logger.debug(
                    'Skipping MEP %s', mep_json['Name']['full'])
                return

        names = mep_json["Name"]
        if 'full' in names:
            display_name = names["full"]
        else:
            display_name = names["sur"] + " " + names["family"]
        slug = slugify(display_name)

        try:
            representative = Representative.objects.get(slug=slug)
        except Representative.DoesNotExist:
            # Unknown slug: start a fresh, unsaved representative and make
            # sure it gets saved by the details import.
            representative = Representative(slug=slug)
            is_new = True
        else:
            is_new = False

        # Save representative attributes
        self.import_representative_details(representative, mep_json, is_new)
        self.add_mandates(representative, mep_json)
        self.add_contacts(representative, mep_json)

        logger.debug('Imported MEP %s', unicode(representative))

        return representative

115
116
117
118
    def import_representative_details(self, representative, mep_json, changed):
        '''
        Copy the personal details of a parltrack MEP record onto
        ``representative`` and save it when something differs.

        ``changed`` tells whether the representative already needs saving
        (for instance because it was just instantiated); the representative
        is saved when it is true or when any field had to be updated.
        '''
        def sync(field, value):
            # Overwrite the attribute only when it differs and report
            # whether an overwrite happened.
            if getattr(representative, field) != value:
                setattr(representative, field, value)
                return True
            return False

        dirty = changed
        dirty = sync('active', mep_json['active']) or dirty

        birth = mep_json.get("Birth")
        if birth:
            dirty = sync('birth_date', _parse_date(birth["date"])) or dirty
            if "place" in birth:
                dirty = sync('birth_place', birth["place"]) or dirty

        names = mep_json["Name"]
        dirty = sync('first_name', names["sur"]) or dirty
        last_name = names["family"]
        dirty = sync('full_name', names["full"]) or dirty
        dirty = sync('photo', mep_json["Photo"]) or dirty

        # Full name -> last name overrides for names whose family part
        # carries a prefix or an initial.
        fix_last_name_with_prefix = {
            "Esther de LANGE": "de LANGE",
            "Patricia van der KAMMEN": "van der KAMMEN",
            "Judith A. MERKIES": "MERKIES",
            "Heinz K. BECKER": "BECKER",
            "Cornelis de JONG": "de JONG",
            "Peter van DALEN": "van DALEN",
            "Sophia in 't VELD": "in 't VELD",
            "Marielle de SARNEZ": "de SARNEZ",
            "Anne E. JENSEN": "JENSEN",
            "Wim van de CAMP": "van de CAMP",
            "Lambert van NISTELROOIJ": "van NISTELROOIJ",
            "Johannes Cornelis van BAALEN": "van BAALEN",
            "Ioannis A. TSOUKALAS": "TSOUKALAS",
            "Pilar del CASTILLO VERA": "del CASTILLO VERA",
            "Luis de GRANDES PASCUAL": "de GRANDES PASCUAL",
            "Philippe de VILLIERS": "de VILLIERS",
            "Daniël van der STOEP": "van der STOEP",
            "William (The Earl of) DARTMOUTH": "(The Earl of) Dartmouth",
            "Bairbre de BRÚN": u'de Br\xfan',
            "Karl von WOGAU": u'von WOGAU',
            "Ieke van den BURG": u'van den BURG',
            "Manuel António dos SANTOS": u'dos SANTOS',
            "Paul van BUITENEN": u'van BUITENEN',
            "Elly de GROEN-KOUWENHOVEN": u'de GROEN-KOUWENHOVEN',
            "Margrietus van den BERG": u'van den BERG',
            u'Dani\xebl van der STOEP': u'van der STOEP',
            "Alexander Graf LAMBSDORFF": u'Graf LAMBSDORFF',
            u'Bairbre de BR\xdaN': u'de BR\xdaN',
            'Luigi de MAGISTRIS': 'de MAGISTRIS',
        }

        # The lookup uses the full name as just stored on the
        # representative.
        override = fix_last_name_with_prefix.get(representative.full_name)
        if override:
            last_name = override
        elif last_name == "J.A.J. STASSEN":
            last_name = "STASSEN"
        dirty = sync('last_name', last_name) or dirty

        # Gender is stored as an integer code; anything else than F or M
        # (including a missing key) maps to 0.
        gender_codes = {u"F": 1, u"M": 2}
        if 'Gender' in mep_json:
            gender = gender_codes.get(mep_json['Gender'], 0)
        else:
            gender = 0
        dirty = sync('gender', gender) or dirty

        # The CV entries are stored as one newline separated text.
        dirty = sync('cv', "\n".join(mep_json.get("CV", []))) or dirty

        if dirty:
            representative.save()
202
203

    def add_mandates(self, representative, mep_json):
        '''
        Create the mandates of ``representative`` from a parltrack MEP
        record.

        A mandate is looked up or created for every entry of the
        'Committees' (entries with a committee_id only), 'Delegations',
        'Groups' (entries with a groupid only), 'Constituencies' and
        'Staff' lists of ``mep_json``. Every constituency entry yields two
        mandates: one in the country group with the local party as
        constituency, and one in the European Parliament group.

        Raises Country.DoesNotExist when a constituency entry names a
        country that is not in the database.
        '''
        def create_mandate(mandate_data, representative, group, constituency):
            # Look up or create the mandate described by mandate_data.
            # Both dates are optional in the feed: default them to None so
            # that an entry without 'start' or 'end' no longer fails with
            # UnboundLocalError.
            # NOTE(review): assumes Mandate.begin_date / end_date accept
            # NULL - confirm against the model definition.
            start = mandate_data.get("start")
            end = mandate_data.get("end")
            begin_date = _parse_date(start) if start else None
            end_date = _parse_date(end) if end else None

            mandate, created = Mandate.objects.get_or_create(
                representative=representative,
                group=group,
                constituency=constituency,
                role=mandate_data.get('role', ''),
                begin_date=begin_date,
                end_date=end_date
            )

            if created:
                logger.debug('Created mandate %s with %s', mandate.pk,
                             mandate_data)

        # Committee
        for mandate_data in mep_json.get('Committees', []):
            if not mandate_data.get("committee_id"):
                continue

            group, _ = self.touch_model(
                model=Group,
                abbreviation=mandate_data['committee_id'],
                kind='committee',
                name=mandate_data['Organization'],
                chamber=self.ep_chamber)

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

        # Delegations
        for mandate_data in mep_json.get('Delegations', []):
            group, _ = self.touch_model(
                model=Group,
                kind='delegation',
                name=mandate_data['Organization'],
                chamber=self.ep_chamber)

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

        # Group
        # Abbreviations found in the feed that are stored under another
        # abbreviation.
        convert = {
            "S&D": "SD",
            "NA": "NI",
            "ID": "IND/DEM",
            "PPE": "EPP",
            "Verts/ALE": "Greens/EFA"}
        for mandate_data in mep_json.get('Groups', []):
            group_id = mandate_data.get('groupid')
            if not group_id:
                continue

            # groupid is either one abbreviation or a list of them, in
            # which case the first one is used.
            if isinstance(group_id, list):
                abbreviation = group_id[0]
            else:
                abbreviation = group_id
            abbreviation = convert.get(abbreviation, abbreviation)

            group, _ = self.touch_model(
                model=Group,
                abbreviation=abbreviation,
                kind='group',
                name=mandate_data['Organization'],
                chamber=self.ep_chamber)

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)

        # Countries
        for mandate_data in mep_json.get('Constituencies', []):
            if not mandate_data:
                continue

            country_name = mandate_data['country']
            country = Country.objects.get(name=country_name)

            group, _ = self.touch_model(
                model=Group,
                abbreviation=country.code,
                kind='country',
                name=country.name)

            # An empty party or a '-' placeholder is stored as 'unknown'.
            party = mandate_data['party']
            local_party = party if party and party != '-' else 'unknown'

            # 'country' is known to be present here since it was read
            # above; the id comes from the cache built in __init__.
            country_id = self.cache['countries'].get(country_name)

            # The local party is stored as a constituency; create it or
            # re-attach it to the country when needed.
            save_constituency = False
            try:
                constituency = Constituency.objects.get(name=local_party)
            except Constituency.DoesNotExist:
                constituency = Constituency(name=local_party)
                save_constituency = True

            if constituency.country_id != country_id:
                constituency.country_id = country_id
                save_constituency = True

            if save_constituency:
                constituency.save()

            create_mandate(mandate_data, representative, group, constituency)

            create_mandate(mandate_data, representative, self.ep_group,
                           self.ep_constituency)

        # Organisations
        for mandate_data in mep_json.get('Staff', []):
            group, _ = self.touch_model(
                model=Group,
                abbreviation='',
                kind='organization',
                name=mandate_data['Organization'])

            create_mandate(mandate_data, representative, group,
                           self.ep_constituency)
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371

    def add_contacts(self, representative, mep_json):
        '''
        Create the contact details of ``representative`` from a parltrack
        MEP record.

        Handles, in this order: the Brussels and Strasbourg office
        addresses with their phone, the e-mail addresses, the European
        Parliament profile page, the homepages and the Twitter and
        Facebook pages.

        Raises KeyError when ``mep_json`` has no 'UserID', and
        Country.DoesNotExist when addresses are present but Belgium or
        France is missing from the database.
        '''
        # Addresses
        addresses = mep_json.get('Addresses')
        if addresses:
            belgium = Country.objects.get(name="Belgium")
            france = Country.objects.get(name="France")

            # Postal details of the two parliament seats; only the office
            # and the phone come from the record. Other cities are ignored.
            seats = {
                'Brussels': {
                    'country': belgium,
                    'street': u"rue Wiertz / Wiertzstraat",
                    'number': '60',
                    'postcode': '1047',
                    'name': "Brussels European Parliament",
                },
                'Strasbourg': {
                    'country': france,
                    'street': u"Av. du Président Robert Schuman - CS 91024",
                    'number': '1',
                    'postcode': '67070',
                    'name': "Strasbourg European Parliament",
                },
            }

            for city in addresses:
                seat = seats.get(city)
                if seat is None:
                    continue

                # The first three characters of the office are stored as
                # the floor, the remainder as the office number.
                office = addresses[city]['Address']['Office']
                address_model, _ = self.touch_model(
                    model=Address,
                    representative=representative,
                    city=city,
                    floor=office[:3],
                    office_number=office[3:],
                    kind='official',
                    **seat)

                self.touch_model(
                    model=Phone,
                    representative=representative,
                    address=address_model,
                    kind='office phone',
                    number=addresses[city].get('Phone', ''))

        # Emails
        mails = mep_json.get('Mail')
        if mails:
            if isinstance(mails, (str, type(u''))):
                # A lone address given as a plain string must be wrapped:
                # list() on a string would split it into characters and
                # create one Email per character.
                mails = [mails]
            elif not isinstance(mails, list):
                mails = list(mails)

            for mail in mails:
                self.touch_model(
                    model=Email,
                    representative=representative,
                    kind=('official' if '@europarl.europa.eu' in mail
                          else 'other'),
                    email=mail)

        # EP page
        changed = False
        try:
            site = WebSite.objects.get(kind='EP',
                                       representative=representative)
        except WebSite.DoesNotExist:
            site = WebSite(kind='EP', representative=representative)
            changed = True

        uid = mep_json['UserID']
        url = 'http://www.europarl.europa.eu/meps/en/%s/_home.html' % uid
        if site.url != url:
            site.url = url
            changed = True

        if changed:
            site.save()

        # WebSite
        for homepage in mep_json.get('Homepage', []):
            self.touch_model(model=WebSite,
                             url=homepage,
                             representative=representative)

        # Only the first Twitter / Facebook entry is stored.
        if mep_json.get('Twitter', None):
            self.touch_model(model=WebSite,
                             representative=representative,
                             kind='twitter',
                             url=mep_json.get('Twitter')[0])

        if mep_json.get('Facebook', None):
            self.touch_model(model=WebSite,
                             representative=representative,
                             kind='facebook',
                             url=mep_json.get('Facebook')[0])


def main(stream=None):
    '''
    Import every MEP found in a parltrack json stream.

    ``stream`` is the file-like object to read the json array from;
    standard input is used when it is omitted.
    '''
    if not apps.ready:
        django.setup()

    parltrack = ParltrackImporter()
    GenericImporter.pre_import(parltrack)

    source = stream or sys.stdin
    for mep in ijson.items(source, 'item'):
        parltrack.manage_mep(mep)
    # The cleanup of untouched rows stays disabled for now: it is a bit
    # dangerous, a corrupt json file would drop valid data !
    # parltrack.post_import()