Commit 5bdee8f1 authored by Nicolas Joyard
Browse files

Move dossier links to new Document model

parent 2c7a1dce
...@@ -2,14 +2,22 @@ ...@@ -2,14 +2,22 @@
from django.contrib import admin from django.contrib import admin
from .models import Dossier, Proposal, Vote from .models import Dossier, Document, Proposal, Vote
class DossierAdmin(admin.ModelAdmin): class DossierAdmin(admin.ModelAdmin):
list_display = ('id', 'reference', 'title', 'link') list_display = ('id', 'reference', 'title')
search_fields = ('reference', 'title') search_fields = ('reference', 'title')
class DocumentAdmin(admin.ModelAdmin):
    """Admin configuration for Document rows attached to a dossier."""

    list_display = ('dossier_reference', 'kind', 'title', 'link')
    # Document has no 'reference' field of its own (only the related
    # dossier has one), so only valid lookups are listed here; an unknown
    # field name makes the admin search query fail.
    search_fields = ('dossier__reference', 'title')

    def dossier_reference(self, obj):
        """Return the reference of the dossier this document belongs to."""
        return obj.dossier.reference
class ProposalAdmin(admin.ModelAdmin): class ProposalAdmin(admin.ModelAdmin):
list_display = ( list_display = (
'reference', 'reference',
...@@ -49,5 +57,6 @@ class VoteAdmin(admin.ModelAdmin): ...@@ -49,5 +57,6 @@ class VoteAdmin(admin.ModelAdmin):
return obj.proposal.reference return obj.proposal.reference
admin.site.register(Dossier, DossierAdmin) admin.site.register(Dossier, DossierAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Proposal, ProposalAdmin) admin.site.register(Proposal, ProposalAdmin)
admin.site.register(Vote, VoteAdmin) admin.site.register(Vote, VoteAdmin)
...@@ -3,27 +3,48 @@ ...@@ -3,27 +3,48 @@
import sys import sys
import ijson import ijson
import logging import logging
import re
import django import django
from django.apps import apps from django.apps import apps
from django.db import transaction
from representatives_votes.models import Dossier from representatives.contrib.francedata.import_representatives import \
ensure_chambers
from representatives.models import Chamber
from representatives_votes.models import Document, Dossier
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# URL shapes recognised by extract_reference, tried in this order; the first
# pattern that matches wins.
_REFERENCE_PATTERNS = (
    re.compile(r'/dossier-legislatif/([^./]+)\.html'),
    re.compile(r'/(\d+)/dossiers/([^./]+)\.asp'),
    re.compile(r'/dossiers/([^./]+)\.asp'),
)


def extract_reference(url):
    """
    Derive a dossier reference from a dossier URL.

    Three URL shapes are recognised, in order:
    ``/dossier-legislatif/<name>.html`` gives ``<name>``,
    ``/<digits>/dossiers/<name>.asp`` gives ``<digits>/<name>``, and
    ``/dossiers/<name>.asp`` gives ``<name>``.

    Returns None when the URL matches none of them.
    """
    for pattern in _REFERENCE_PATTERNS:
        found = pattern.search(url)
        if found is None:
            continue

        parts = found.groups()
        # Only the numbered '/<digits>/dossiers/' form captures two groups.
        if len(parts) == 2:
            return '%s/%s' % parts
        return parts[0]

    return None
def find_dossier(data): def find_dossier(data):
''' '''
Find dossier with reference matching either 'url_an' or 'url_sen', Find dossier with reference matching either 'ref_an' or 'ref_sen',
create it if not found. Ensure its reference and source are 'url_an' if create it if not found. Ensure its reference is 'ref_an' if both fields
both fields are present. are present.
''' '''
changed = False changed = False
dossier = None dossier = None
reffield = None reffield = None
for field in [k for k in ('url_an', 'url_sen') if k in data]: for field in [k for k in ('ref_an', 'ref_sen') if k in data]:
try: try:
dossier = Dossier.objects.get(reference=data[field]) dossier = Dossier.objects.get(reference=data[field])
reffield = field reffield = field
...@@ -32,50 +53,85 @@ def find_dossier(data): ...@@ -32,50 +53,85 @@ def find_dossier(data):
pass pass
if dossier is None: if dossier is None:
reffield = 'url_an' if 'url_an' in data else 'url_sen' reffield = 'ref_an' if 'ref_an' in data else 'ref_sen'
dossier = Dossier(reference=data[reffield]) dossier = Dossier(reference=data[reffield])
logger.debug('Created dossier %s' % data[reffield]) logger.debug('Created dossier %s' % data[reffield])
changed = True changed = True
if 'url_an' in data and reffield != 'url_an': if 'ref_an' in data and reffield != 'ref_an':
logger.debug('Changed dossier reference to %s' % data['url_an']) logger.debug('Changed dossier reference to %s' % data['ref_an'])
dossier.reference = data['url_an'] dossier.reference = data['ref_an']
changed = True changed = True
return dossier, changed return dossier, changed
def parse_dossier_data(data): def handle_document(dossier, chamber, url):
dossier, changed = find_dossier(data) doc_changed = False
try:
doc = Document.objects.get(chamber=chamber, dossier=dossier,
kind='procedure-file')
except Document.DoesNotExist:
doc = Document(chamber=chamber, dossier=dossier, kind='procedure-file')
logger.debug('Created %s document for dossier %s' %
(chamber.abbreviation, dossier.title))
doc_changed = True
if doc.link != url:
logger.debug('Changing %s url from %s to %s' %
(chamber.abbreviation, doc.link, url))
doc.link = url
doc_changed = True
if doc_changed:
doc.save()
def parse_dossier_data(data, an, sen):
if 'url_an' in data:
ref_an = extract_reference(data['url_an'])
if ref_an is None:
logger.warn('No reference for dossier %s' % data['url_an'])
return
else:
data['ref_an'] = ref_an
if 'url_sen' in data:
ref_sen = extract_reference(data['url_sen'])
if ref_sen is None:
logger.warn('No reference for dossier %s' % data['url_sen'])
return
else:
data['ref_sen'] = ref_sen
thisurl = data['url_an' if data['chambre'] == 'AN' else 'url_sen'] dossier, changed = find_dossier(data)
if dossier.reference != dossier.link: thisref = data['ref_an' if data['chambre'] == 'AN' else 'ref_sen']
logger.debug('Changed dossier link to %s' % dossier.reference)
dossier.link = dossier.reference
changed = True
title = data['titre'] title = data['titre']
if dossier.reference == thisurl and dossier.title != title: if dossier.reference == thisref and dossier.title != title:
logger.debug('Changed dossier title to %s' % title) logger.debug('Changed dossier title to %s' % title)
dossier.title = title dossier.title = title
changed = True changed = True
if 'url_an' in data and 'url_sen' in data: with transaction.atomic():
ext_link = data['url_sen'] if changed:
if dossier.ext_link != ext_link: logger.debug('Saved dossier %s' % dossier.reference)
logger.debug('Changed dossier ext. link to %s' % ext_link) dossier.save()
dossier.ext_link = ext_link
changed = True if 'url_an' in data:
handle_document(dossier, an, data['url_an'])
if changed: if 'url_sen' in data:
logger.debug('Saved dossier %s' % dossier.reference) handle_document(dossier, sen, data['url_sen'])
dossier.save()
def main(stream=None): def main(stream=None):
if not apps.ready: if not apps.ready:
django.setup() django.setup()
ensure_chambers()
an = Chamber.objects.get(abbreviation='AN')
sen = Chamber.objects.get(abbreviation='SEN')
for data in ijson.items(stream or sys.stdin, 'item'): for data in ijson.items(stream or sys.stdin, 'item'):
parse_dossier_data(data) parse_dossier_data(data, an, sen)
...@@ -48,23 +48,16 @@ def _get_unique_title(proposal_pk, candidate): ...@@ -48,23 +48,16 @@ def _get_unique_title(proposal_pk, candidate):
class ScrutinImporter: class ScrutinImporter:
dossiers_ref = None dossiers = {}
dossiers_ext = None
def get_dossier(self, url): def get_dossier(self, url):
if self.dossiers_ref is None: if url not in self.dossiers:
self.dossiers_ref = { try:
d[0]: d[1] for d in Dossier.objects.values_list('reference', self.dossiers[url] = Dossier.objects.get(documents__link=url)
'pk') except Dossier.DoesNotExist:
} return None
if self.dossiers_ext is None:
self.dossiers_ext = {
d[0]: d[1] for d in Dossier.objects.exclude(ext_link='')
.values_list('ext_link', 'pk')
}
return self.dossiers_ref.get(url, self.dossiers_ext.get(url, None)) return self.dossiers[url]
def parse_scrutin_data(self, data): def parse_scrutin_data(self, data):
ref = data['url'] ref = data['url']
...@@ -91,7 +84,7 @@ class ScrutinImporter: ...@@ -91,7 +84,7 @@ class ScrutinImporter:
values = dict( values = dict(
title=_get_unique_title(proposal.pk, data["objet"]), title=_get_unique_title(proposal.pk, data["objet"]),
datetime=_parse_date(data["date"]), datetime=_parse_date(data["date"]),
dossier_id=dossier, dossier_id=dossier.pk,
kind='dossier' kind='dossier'
) )
......
...@@ -6,8 +6,10 @@ import urllib2 ...@@ -6,8 +6,10 @@ import urllib2
import ijson import ijson
import django import django
from django.apps import apps from django.apps import apps
from django.db import transaction
from representatives_votes.models import Dossier from representatives.models import Chamber
from representatives_votes.models import Dossier, Document
from .import_votes import Command from .import_votes import Command
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -17,38 +19,51 @@ URL = 'http://parltrack.euwiki.org/dumps/ep_dossiers.json.xz' ...@@ -17,38 +19,51 @@ URL = 'http://parltrack.euwiki.org/dumps/ep_dossiers.json.xz'
LOCAL_PATH = 'ep_dossiers.json.xz' LOCAL_PATH = 'ep_dossiers.json.xz'
def parse_dossier_data(data): def parse_dossier_data(data, ep):
"""Parse data from parltarck dossier export (1 dossier) Update dossier """Parse data from parltarck dossier export (1 dossier) Update dossier
if it existed before, this function goal is to import and update a if it existed before, this function goal is to import and update a
dossier, not to import all parltrack data dossier, not to import all parltrack data
""" """
changed = False changed = False
doc_changed = False
ref = data['procedure']['reference'] ref = data['procedure']['reference']
logger.debug('Processing dossier %s', ref) logger.debug('Processing dossier %s', ref)
try: with transaction.atomic():
dossier = Dossier.objects.get(reference=ref) try:
except Dossier.DoesNotExist: dossier = Dossier.objects.get(reference=ref)
dossier = Dossier(reference=ref) except Dossier.DoesNotExist:
logger.debug('Dossier did not exist') dossier = Dossier(reference=ref)
changed = True logger.debug('Dossier did not exist')
changed = True
if dossier.title != data['procedure']['title']:
logger.debug('Title changed from "%s" to "%s"', dossier.title, if dossier.title != data['procedure']['title']:
data['procedure']['title']) logger.debug('Title changed from "%s" to "%s"', dossier.title,
dossier.title = data['procedure']['title'] data['procedure']['title'])
changed = True dossier.title = data['procedure']['title']
changed = True
source = data['meta']['source'].replace('&l=en', '')
if dossier.link != source: if changed:
logger.debug('Source changed from "%s" to "%s"', dossier.link, source) logger.info('Updated dossier %s', ref)
dossier.link = source dossier.save()
changed = True
source = data['meta']['source'].replace('&l=en', '')
if changed: try:
logger.info('Updated dossier %s', ref) doc = Document.objects.get(dossier=dossier, kind='procedure-file')
dossier.save() except Document.DoesNotExist:
doc = Document(dossier=dossier, kind='procedure-file', chamber=ep)
logger.debug('Document for dossier %s did not exist', ref)
doc_changed = True
if doc.link != source:
logger.debug('Link changed from %s to %s', doc.link, source)
doc.link = source
doc_changed = True
if doc_changed:
logger.info('Updated document %s for dossier %s', doc.link, ref)
doc.save()
if 'votes' in data.keys() and 'epref' in data['votes']: if 'votes' in data.keys() and 'epref' in data['votes']:
command = Command() command = Command()
...@@ -68,13 +83,15 @@ def import_single(stream): ...@@ -68,13 +83,15 @@ def import_single(stream):
if not apps.ready: if not apps.ready:
django.setup() django.setup()
ep = Chamber.objects.get(abbreviation='EP')
for data in ijson.items(stream, ''): for data in ijson.items(stream, ''):
parse_dossier_data(data) parse_dossier_data(data, ep)
def main(stream=None): def main(stream=None):
if not apps.ready: if not apps.ready:
django.setup() django.setup()
ep = Chamber.objects.get(abbreviation='EP')
for data in ijson.items(stream or sys.stdin, 'item'): for data in ijson.items(stream or sys.stdin, 'item'):
parse_dossier_data(data) parse_dossier_data(data, ep)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import logging
from django.db import migrations, models
def migrate_dossier_links(apps, schema_editor):
    """
    Copy the legacy Dossier ``link`` / ``ext_link`` URLs into Document rows.

    One 'procedure-file' Document is queued for every dossier URL whose
    host matches a known chamber, then all of them are inserted with a
    single bulk_create:

    - European Parliament: ``link`` containing 'europarl.europa.eu';
    - Assemblee nationale: ``link`` or ``ext_link`` containing
      'assemblee-nationale.fr';
    - Senat: ``link`` or ``ext_link`` containing 'senat.fr'.

    The French section is skipped when either the 'AN' or the 'SEN'
    chamber is missing; documents already collected are still saved.
    A missing 'EP' chamber is not handled here: Chamber.DoesNotExist
    propagates to the migration runner.
    """
    logger = logging.getLogger('migrate_dossier_links')

    # Get historical models
    Chamber = apps.get_model("representatives", "Chamber")
    Dossier = apps.get_model("representatives_votes", "Dossier")
    Document = apps.get_model("representatives_votes", "Document")

    docs = []

    def collect(chamber, host, field):
        # Queue one document per dossier whose `field` URL contains `host`,
        # using (and logging) the URL actually read from that field.
        lookup = {field + '__icontains': host}
        for dossier in Dossier.objects.filter(**lookup):
            url = getattr(dossier, field)
            logger.info('Create document %s for dossier %s' % (
                url, dossier.reference))
            docs.append(Document(chamber=chamber, dossier=dossier,
                                 link=url, kind='procedure-file'))

    # EP dossiers
    ep_chamber = Chamber.objects.get(abbreviation='EP')
    collect(ep_chamber, 'europarl.europa.eu', 'link')

    # France dossiers
    try:
        an_chamber = Chamber.objects.get(abbreviation='AN')
        sen_chamber = Chamber.objects.get(abbreviation='SEN')
    except Chamber.DoesNotExist:
        # Do not return here: the documents collected so far must still be
        # written before the link fields are removed.
        logger.warning('AN/SEN chambers not found, skipping French dossiers')
    else:
        french_chambers = (
            (an_chamber, 'assemblee-nationale.fr'),
            (sen_chamber, 'senat.fr'),
        )
        for chamber, host in french_chambers:
            # Each chamber is matched against its own host on both fields.
            for field in ('link', 'ext_link'):
                collect(chamber, host, field)

    # Create all documents
    logger.info('Saving %s documents...' % len(docs))
    Document.objects.bulk_create(docs)
# Schema and data migration: creates the Document table, copies the existing
# dossier links into it, then drops the two legacy link columns on Dossier.
class Migration(migrations.Migration):
# Both apps must be at these migrations first: Document has foreign keys to
# representatives.Chamber and representatives_votes.Dossier.
dependencies = [
('representatives', '0019_remove_fingerprints'),
('representatives_votes', '0011_remove_fingerprints'),
]
# Order matters: the data step reads Dossier.link / Dossier.ext_link, so it
# has to run after Document exists and before those fields are removed.
operations = [
migrations.CreateModel(
name='Document',
fields=[
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
('created', models.DateTimeField(auto_now_add=True)),
('updated', models.DateTimeField(auto_now=True)),
('title', models.CharField(max_length=1000)),
('kind', models.CharField(default=b'', max_length=255, blank=True)),
('link', models.URLField(max_length=1000)),
('chamber', models.ForeignKey(to='representatives.Chamber')),
('dossier', models.ForeignKey(related_name='documents', to='representatives_votes.Dossier')),
],
options={
'abstract': False,
},
),
# NOTE(review): no reverse_code is passed, so per the Django RunPython
# documentation this migration cannot be unapplied - confirm that is intended.
migrations.RunPython(migrate_dossier_links),
migrations.RemoveField(
model_name='dossier',
name='link',
),
migrations.RemoveField(
model_name='dossier',
name='ext_link',
),
]
# coding: utf-8 # coding: utf-8
from django.db import models from django.db import models
from representatives.models import Representative, TimeStampedModel from representatives.models import Chamber, Representative, TimeStampedModel
class Dossier(TimeStampedModel): class Dossier(TimeStampedModel):
title = models.CharField(max_length=1000) title = models.CharField(max_length=1000)
reference = models.CharField(max_length=200, unique=True) reference = models.CharField(max_length=200, unique=True)
text = models.TextField(blank=True, default='') text = models.TextField(blank=True, default='')
link = models.URLField()
ext_link = models.URLField(blank=True, default='')
class Meta: class Meta:
unique_together = (('title', 'reference')) unique_together = (('title', 'reference'))
...@@ -18,6 +16,14 @@ class Dossier(TimeStampedModel): ...@@ -18,6 +16,14 @@ class Dossier(TimeStampedModel):
return unicode(self.title) return unicode(self.title)
# A linked document attached to a dossier and associated with a chamber.
class Document(TimeStampedModel):
# Owning dossier; the reverse accessor on Dossier is ``documents``.
dossier = models.ForeignKey(Dossier, related_name='documents')
# Chamber the document is associated with.
chamber = models.ForeignKey(Chamber)
# Title, up to 1000 characters (declared without blank=True).
title = models.CharField(max_length=1000)
# Optional kind label, defaults to the empty string (e.g. 'procedure-file').
kind = models.CharField(max_length=255, blank=True, default='')
# URL of the document, up to 1000 characters.
link = models.URLField(max_length=1000)
class Proposal(TimeStampedModel): class Proposal(TimeStampedModel):
dossier = models.ForeignKey(Dossier, related_name='proposals') dossier = models.ForeignKey(Dossier, related_name='proposals')
title = models.CharField(max_length=1000, unique=True) title = models.CharField(max_length=1000, unique=True)
......
...@@ -13,7 +13,7 @@ setup( ...@@ -13,7 +13,7 @@ setup(
keywords='django government parliament votes', keywords='django government parliament votes',
install_requires=[ install_requires=[
'django>1.8,<1.9', 'django>1.8,<1.9',
'django-representatives>=0.0.27', 'django-representatives>=0.0.29',
'py-dateutil>=2,<3', 'py-dateutil>=2,<3',
'ijson>=2,<3', 'ijson>=2,<3',
'pytz', # Always use up-to-date TZ data 'pytz', # Always use up-to-date TZ data
......
Supports Markdown
0% or