Commit 094808d7 authored by njoyard, committed by GitHub

Merge pull request #29 from political-memory/add-document

Add Document model
parents 2c7a1dce 3944388c
...@@ -2,14 +2,22 @@ ...@@ -2,14 +2,22 @@
from django.contrib import admin from django.contrib import admin
from .models import Dossier, Proposal, Vote from .models import Dossier, Document, Proposal, Vote
class DossierAdmin(admin.ModelAdmin):
    """Admin changelist settings for dossiers."""

    # The searchable columns are also the ones listed, after the primary key.
    search_fields = ('reference', 'title')
    list_display = ('id',) + search_fields
class DocumentAdmin(admin.ModelAdmin):
    """Admin changelist settings for documents attached to a dossier."""

    list_display = ('dossier_reference', 'kind', 'title', 'link')
    # The serialized Document rows only carry title, kind, link, dossier and
    # chamber (plus timestamps): there is no ``reference`` field to search
    # on, and naming a missing field makes the admin search fail. Search the
    # document's own ``link`` instead.
    # NOTE(review): confirm against the Document model definition.
    search_fields = ('dossier__reference', 'title', 'link')
    # dossier_reference() reads obj.dossier for every listed row; fetch the
    # dossier in the same query rather than once per row.
    list_select_related = ('dossier',)

    def dossier_reference(self, obj):
        """Return the reference of the dossier owning this document."""
        return obj.dossier.reference
    # Let the changelist sort on this computed column.
    dossier_reference.admin_order_field = 'dossier__reference'
class ProposalAdmin(admin.ModelAdmin): class ProposalAdmin(admin.ModelAdmin):
list_display = ( list_display = (
'reference', 'reference',
...@@ -49,5 +57,6 @@ class VoteAdmin(admin.ModelAdmin): ...@@ -49,5 +57,6 @@ class VoteAdmin(admin.ModelAdmin):
return obj.proposal.reference return obj.proposal.reference
# Register each model with its dedicated ModelAdmin, in declaration order.
for model, model_admin in (
        (Dossier, DossierAdmin),
        (Document, DocumentAdmin),
        (Proposal, ProposalAdmin),
        (Vote, VoteAdmin),
):
    admin.site.register(model, model_admin)
...@@ -51,7 +51,8 @@ class DossierViewSet(viewsets.ReadOnlyModelViewSet): ...@@ -51,7 +51,8 @@ class DossierViewSet(viewsets.ReadOnlyModelViewSet):
def retrieve(self, request, pk=None):
    """
    Serve a single dossier with the detail serializer.

    The queryset is extended to prefetch the dossier's proposals and
    documents before delegating to the parent implementation.
    """
    self.serializer_class = DossierDetailSerializer

    related = ('proposals', 'documents')
    self.queryset = self.queryset.prefetch_related(*related)

    return super(DossierViewSet, self).retrieve(request, pk)
......
...@@ -3,27 +3,48 @@ ...@@ -3,27 +3,48 @@
import sys import sys
import ijson import ijson
import logging import logging
import re
import django import django
from django.apps import apps from django.apps import apps
from django.db import transaction
from representatives_votes.models import Dossier from representatives.contrib.francedata.import_representatives import \
ensure_chambers
from representatives.models import Chamber
from representatives_votes.models import Document, Dossier
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Ordered (pattern, format) pairs: the first pattern that matches wins and
# its captured groups are rendered through the paired format string.
_REFERENCE_PATTERNS = (
    # .../dossier-legislatif/<ref>.html  ->  "<ref>"
    (re.compile(r'/dossier-legislatif/([^./]+)\.html'), '%s'),
    # .../<digits>/dossiers/<ref>.asp    ->  "<digits>/<ref>"
    (re.compile(r'/(\d+)/dossiers/([^./]+)\.asp'), '%s/%s'),
    # .../dossiers/<ref>.asp (no numeric segment before it)  ->  "<ref>"
    (re.compile(r'/dossiers/([^./]+)\.asp'), '%s'),
)


def extract_reference(url):
    '''
    Derive a short dossier reference from a dossier page URL.

    Returns the reference string built from the first matching URL layout,
    or None when the URL is empty/None or matches none of the known layouts.
    '''
    # A missing or empty URL cannot hold a reference; bail out instead of
    # letting the regex engine raise TypeError on None.
    if not url:
        return None

    for pattern, fmt in _REFERENCE_PATTERNS:
        m = pattern.search(url)
        if m:
            return fmt % m.groups()

    return None
def find_dossier(data): def find_dossier(data):
''' '''
Find dossier with reference matching either 'url_an' or 'url_sen', Find dossier with reference matching either 'ref_an' or 'ref_sen',
create it if not found. Ensure its reference and source are 'url_an' if create it if not found. Ensure its reference is 'ref_an' if both fields
both fields are present. are present.
''' '''
changed = False changed = False
dossier = None dossier = None
reffield = None reffield = None
for field in [k for k in ('url_an', 'url_sen') if k in data]: for field in [k for k in ('ref_an', 'ref_sen') if k in data]:
try: try:
dossier = Dossier.objects.get(reference=data[field]) dossier = Dossier.objects.get(reference=data[field])
reffield = field reffield = field
...@@ -32,50 +53,85 @@ def find_dossier(data): ...@@ -32,50 +53,85 @@ def find_dossier(data):
pass pass
if dossier is None: if dossier is None:
reffield = 'url_an' if 'url_an' in data else 'url_sen' reffield = 'ref_an' if 'ref_an' in data else 'ref_sen'
dossier = Dossier(reference=data[reffield]) dossier = Dossier(reference=data[reffield])
logger.debug('Created dossier %s' % data[reffield]) logger.debug('Created dossier %s' % data[reffield])
changed = True changed = True
if 'url_an' in data and reffield != 'url_an': if 'ref_an' in data and reffield != 'ref_an':
logger.debug('Changed dossier reference to %s' % data['url_an']) logger.debug('Changed dossier reference to %s' % data['ref_an'])
dossier.reference = data['url_an'] dossier.reference = data['ref_an']
changed = True changed = True
return dossier, changed return dossier, changed
def handle_document(dossier, chamber, url):
    '''
    Ensure the 'procedure-file' document of `dossier` for `chamber` exists
    and links to `url`.

    The document is created when missing and saved only when it was created
    or its link had to change.
    '''
    lookup = dict(chamber=chamber, dossier=dossier, kind='procedure-file')

    try:
        doc = Document.objects.get(**lookup)
    except Document.DoesNotExist:
        doc = Document(**lookup)
        logger.debug('Created %s document for dossier %s',
                     chamber.abbreviation, dossier.title)
        dirty = True
    else:
        dirty = False

    if doc.link != url:
        # Log the previous link before overwriting it.
        logger.debug('Changing %s url from %s to %s',
                     chamber.abbreviation, doc.link, url)
        doc.link = url
        dirty = True

    if dirty:
        doc.save()
def parse_dossier_data(data, an, sen):
    '''
    Import one dossier record: create or update the Dossier, then its
    per-chamber 'procedure-file' documents.

    `data` is one parsed record; it may hold 'url_an' and/or 'url_sen' and is
    read for 'chambre' and 'titre'. The references extracted from the URLs
    are stored back into `data` as 'ref_an' / 'ref_sen' because
    find_dossier() looks them up there. `an` and `sen` are the chambers the
    respective documents are attached to.

    The record is skipped with a warning when it has no URL at all or when a
    reference cannot be extracted from one of its URLs.
    '''
    has_url = False
    for url_key, ref_key in (('url_an', 'ref_an'), ('url_sen', 'ref_sen')):
        if url_key not in data:
            continue
        has_url = True

        ref = extract_reference(data[url_key])
        if ref is None:
            logger.warning('No reference for dossier %s', data[url_key])
            return
        data[ref_key] = ref

    if not has_url:
        # Without any reference find_dossier() would fail on a missing key
        # and abort the whole import; skip just this record instead.
        logger.warning('No URL for dossier %s', data.get('titre'))
        return

    dossier, changed = find_dossier(data)

    thisref = data['ref_an' if data['chambre'] == 'AN' else 'ref_sen']

    # Only the record whose own reference is the dossier's reference may set
    # the dossier title.
    title = data['titre']
    if dossier.reference == thisref and dossier.title != title:
        logger.debug('Changed dossier title to %s', title)
        dossier.title = title
        changed = True

    # Save the dossier and its documents together or not at all.
    with transaction.atomic():
        if changed:
            logger.debug('Saved dossier %s', dossier.reference)
            dossier.save()

        if 'url_an' in data:
            handle_document(dossier, an, data['url_an'])

        if 'url_sen' in data:
            handle_document(dossier, sen, data['url_sen'])
def main(stream=None):
    '''
    Import every dossier record found in `stream` (standard input when no
    stream is given).
    '''
    if not apps.ready:
        django.setup()

    # NOTE(review): ensure_chambers is imported from the representatives
    # importer; presumably it creates the chambers fetched below - confirm.
    ensure_chambers()
    an, sen = [Chamber.objects.get(abbreviation=code)
               for code in ('AN', 'SEN')]

    source = stream or sys.stdin
    for record in ijson.items(source, 'item'):
        parse_dossier_data(record, an, sen)
...@@ -48,23 +48,16 @@ def _get_unique_title(proposal_pk, candidate): ...@@ -48,23 +48,16 @@ def _get_unique_title(proposal_pk, candidate):
class ScrutinImporter: class ScrutinImporter:
dossiers_ref = None dossiers = {}
dossiers_ext = None
def get_dossier(self, url): def get_dossier(self, url):
if self.dossiers_ref is None: if url not in self.dossiers:
self.dossiers_ref = { try:
d[0]: d[1] for d in Dossier.objects.values_list('reference', self.dossiers[url] = Dossier.objects.get(documents__link=url)
'pk') except Dossier.DoesNotExist:
} return None
if self.dossiers_ext is None:
self.dossiers_ext = {
d[0]: d[1] for d in Dossier.objects.exclude(ext_link='')
.values_list('ext_link', 'pk')
}
return self.dossiers_ref.get(url, self.dossiers_ext.get(url, None)) return self.dossiers[url]
def parse_scrutin_data(self, data): def parse_scrutin_data(self, data):
ref = data['url'] ref = data['url']
...@@ -91,7 +84,7 @@ class ScrutinImporter: ...@@ -91,7 +84,7 @@ class ScrutinImporter:
values = dict( values = dict(
title=_get_unique_title(proposal.pk, data["objet"]), title=_get_unique_title(proposal.pk, data["objet"]),
datetime=_parse_date(data["date"]), datetime=_parse_date(data["date"]),
dossier_id=dossier, dossier_id=dossier.pk,
kind='dossier' kind='dossier'
) )
......
[ [
{
"fields" : {
"abbreviation": "AN",
"country": 1095,
"name": "Assembl\u00e9e nationale"
},
"model": "representatives.chamber",
"pk": 2
},
{
"fields": {
"abbreviation": "SEN",
"country": 1095,
"name": "S\u00e9nat"
},
"model": "representatives.chamber",
"pk": 3
},
{ {
"fields": { "fields": {
"updated": "2016-02-14T13:16:31.417Z",
"reference": "http://www.assemblee-nationale.fr/14/dossiers/liberte_maires_rythmes_scolaires_premier_degre.asp",
"title": "Education : libre choix des maires concernant les rythmes scolaires dans le premier degr\u00e9",
"text": "", "text": "",
"created": "2016-02-14T13:16:31.417Z", "updated": "2016-07-07T20:23:24.303Z",
"link": "http://www.assemblee-nationale.fr/14/dossiers/liberte_maires_rythmes_scolaires_premier_degre.asp", "title": "Education : libre choix des maires concernant les rythmes scolaires dans le premier degr\u00e9",
"ext_link": "" "reference": "14/liberte_maires_rythmes_scolaires_premier_degre",
"created": "2016-07-07T20:23:24.302Z"
}, },
"model": "representatives_votes.dossier", "model": "representatives_votes.dossier",
"pk": 1 "pk": 1
}, },
{ {
"fields": { "fields": {
"updated": "2016-02-14T13:16:31.428Z",
"reference": "http://www.assemblee-nationale.fr/14/dossiers/action_publique_territoriale_metropoles.asp",
"title": "Collectivit\u00e9s territoriales : action publique territoriale et m\u00e9tropoles",
"text": "", "text": "",
"created": "2016-02-14T13:16:31.428Z", "updated": "2016-07-07T20:23:24.365Z",
"link": "http://www.assemblee-nationale.fr/14/dossiers/action_publique_territoriale_metropoles.asp", "title": "Collectivit\u00e9s territoriales : action publique territoriale et m\u00e9tropoles",
"ext_link": "http://www.senat.fr/dossier-legislatif/pjl12-495.html" "reference": "14/action_publique_territoriale_metropoles",
"created": "2016-07-07T20:23:24.332Z"
}, },
"model": "representatives_votes.dossier", "model": "representatives_votes.dossier",
"pk": 2 "pk": 2
}, },
{ {
"fields": { "fields": {
"updated": "2016-02-21T14:34:35.721Z",
"reference": "http://www.senat.fr/dossier-legislatif/ppl13-799.html",
"title": "Protection de l'enfant",
"text": "", "text": "",
"created": "2016-02-21T14:34:35.721Z", "updated": "2016-07-07T20:23:24.410Z",
"link": "http://www.senat.fr/dossier-legislatif/ppl13-799.html", "title": "Protection de l'enfant",
"ext_link": "" "reference": "ppl13-799",
"created": "2016-07-07T20:23:24.410Z"
}, },
"model": "representatives_votes.dossier", "model": "representatives_votes.dossier",
"pk": 3 "pk": 3
},
{
"fields": {
"updated": "2016-07-07T20:23:24.307Z",
"title": "",
"dossier": 1,
"created": "2016-07-07T20:23:24.307Z",
"kind": "procedure-file",
"chamber": 2,
"link": "http://www.assemblee-nationale.fr/14/dossiers/liberte_maires_rythmes_scolaires_premier_degre.asp"
},
"model": "representatives_votes.document",
"pk": 1
},
{
"fields": {
"updated": "2016-07-07T20:23:24.335Z",
"title": "",
"dossier": 2,
"created": "2016-07-07T20:23:24.335Z",
"kind": "procedure-file",
"chamber": 3,
"link": "http://www.senat.fr/dossier-legislatif/pjl12-495.html"
},
"model": "representatives_votes.document",
"pk": 2
},
{
"fields": {
"updated": "2016-07-07T20:23:24.371Z",
"title": "",
"dossier": 2,
"created": "2016-07-07T20:23:24.371Z",
"kind": "procedure-file",
"chamber": 2,
"link": "http://www.assemblee-nationale.fr/14/dossiers/action_publique_territoriale_metropoles.asp"
},
"model": "representatives_votes.document",
"pk": 3
},
{
"fields": {
"updated": "2016-07-07T20:23:24.415Z",
"title": "",
"dossier": 3,
"created": "2016-07-07T20:23:24.415Z",
"kind": "procedure-file",
"chamber": 3,
"link": "http://www.senat.fr/dossier-legislatif/ppl13-799.html"
},
"model": "representatives_votes.document",
"pk": 4
} }
] ]
...@@ -6,8 +6,10 @@ import urllib2 ...@@ -6,8 +6,10 @@ import urllib2
import ijson import ijson
import django import django
from django.apps import apps from django.apps import apps
from django.db import transaction
from representatives_votes.models import Dossier from representatives.models import Chamber
from representatives_votes.models import Dossier, Document
from .import_votes import Command from .import_votes import Command
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -17,38 +19,51 @@ URL = 'http://parltrack.euwiki.org/dumps/ep_dossiers.json.xz' ...@@ -17,38 +19,51 @@ URL = 'http://parltrack.euwiki.org/dumps/ep_dossiers.json.xz'
LOCAL_PATH = 'ep_dossiers.json.xz' LOCAL_PATH = 'ep_dossiers.json.xz'
def parse_dossier_data(data): def parse_dossier_data(data, ep):
"""Parse data from parltarck dossier export (1 dossier) Update dossier """Parse data from parltarck dossier export (1 dossier) Update dossier
if it existed before, this function goal is to import and update a if it existed before, this function goal is to import and update a
dossier, not to import all parltrack data dossier, not to import all parltrack data
""" """
changed = False changed = False
doc_changed = False
ref = data['procedure']['reference'] ref = data['procedure']['reference']
logger.debug('Processing dossier %s', ref) logger.debug('Processing dossier %s', ref)
try: with transaction.atomic():
dossier = Dossier.objects.get(reference=ref) try:
except Dossier.DoesNotExist: dossier = Dossier.objects.get(reference=ref)
dossier = Dossier(reference=ref) except Dossier.DoesNotExist:
logger.debug('Dossier did not exist') dossier = Dossier(reference=ref)
changed = True logger.debug('Dossier did not exist')
changed = True
if dossier.title != data['procedure']['title']:
logger.debug('Title changed from "%s" to "%s"', dossier.title, if dossier.title != data['procedure']['title']:
data['procedure']['title']) logger.debug('Title changed from "%s" to "%s"', dossier.title,
dossier.title = data['procedure']['title'] data['procedure']['title'])
changed = True dossier.title = data['procedure']['title']
changed = True
source = data['meta']['source'].replace('&l=en', '')
if dossier.link != source: if changed:
logger.debug('Source changed from "%s" to "%s"', dossier.link, source) logger.info('Updated dossier %s', ref)
dossier.link = source dossier.save()
changed = True
source = data['meta']['source'].replace('&l=en', '')
if changed: try:
logger.info('Updated dossier %s', ref) doc = Document.objects.get(dossier=dossier, kind='procedure-file')
dossier.save() except Document.DoesNotExist:
doc = Document(dossier=dossier, kind='procedure-file', chamber=ep)
logger.debug('Document for dossier %s did not exist', ref)
doc_changed = True
if doc.link != source:
logger.debug('Link changed from %s to %s', doc.link, source)
doc.link = source
doc_changed = True
if doc_changed:
logger.info('Updated document %s for dossier %s', doc.link, ref)
doc.save()
if 'votes' in data.keys() and 'epref' in data['votes']: if 'votes' in data.keys() and 'epref' in data['votes']:
command = Command() command = Command()
...@@ -68,13 +83,15 @@ def import_single(stream): ...@@ -68,13 +83,15 @@ def import_single(stream):
if not apps.ready: if not apps.ready:
django.setup() django.setup()
ep = Chamber.objects.get(abbreviation='EP')
for data in ijson.items(stream, ''): for data in ijson.items(stream, ''):
parse_dossier_data(data) parse_dossier_data(data, ep)
def main(stream=None):
    """
    Import every dossier record found in `stream` (standard input when no
    stream is given), attaching documents to the 'EP' chamber.
    """
    if not apps.ready:
        django.setup()

    ep = Chamber.objects.get(abbreviation='EP')

    source = stream or sys.stdin
    for record in ijson.items(source, 'item'):
        parse_dossier_data(record, ep)
[ [
{ {
"fields": { "fields": {
"updated": "2015-12-13T10:11:31.369Z",
"reference": "2012/2002(INI)",
"title": "Agenda for change: the future of EU development policy",
"text": "", "text": "",
"created": "2015-12-13T10:11:31.369Z", "updated": "2016-07-08T05:17:40.580Z",
"link": "http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?reference=2012/2002(INI)" "title": "Agenda for change: the future of EU development policy",
"reference": "2012/2002(INI)",
"created": "2016-07-08T05:17:40.580Z"
}, },
"model": "representatives_votes.dossier", "model": "representatives_votes.dossier",
"pk": 1 "pk": 1
}, },
{ {
"fields": { "fields": {