Commit 5bdee8f1 authored by Nicolas Joyard

Move dossier links to new Document model

parent 2c7a1dce
......@@ -2,14 +2,22 @@
from django.contrib import admin
from .models import Dossier, Proposal, Vote
from .models import Dossier, Document, Proposal, Vote
class DossierAdmin(admin.ModelAdmin):
    """Admin changelist configuration for dossiers."""

    # Only one list_display assignment is kept: the earlier one (which also
    # listed 'link') was immediately overwritten by this one and had no effect.
    list_display = ('id', 'reference', 'title')
    search_fields = ('reference', 'title')
class DocumentAdmin(admin.ModelAdmin):
    """Admin changelist configuration for dossier documents."""

    list_display = ('dossier_reference', 'kind', 'title', 'link')
    # Document has no 'reference' field of its own; the reference lives on
    # the related dossier, so only the related lookup is searchable.
    search_fields = ('dossier__reference', 'title')
    # dossier_reference() dereferences obj.dossier for every row; fetch the
    # dossier in the changelist query instead of once per row.
    list_select_related = ('dossier',)

    def dossier_reference(self, obj):
        """Return the reference of the dossier the document belongs to."""
        return obj.dossier.reference
class ProposalAdmin(admin.ModelAdmin):
list_display = (
'reference',
......@@ -49,5 +57,6 @@ class VoteAdmin(admin.ModelAdmin):
return obj.proposal.reference
# Register every votes model with the admin site, each with its own
# ModelAdmin configuration.
admin.site.register(Dossier, DossierAdmin)
admin.site.register(Document, DocumentAdmin)
admin.site.register(Proposal, ProposalAdmin)
admin.site.register(Vote, VoteAdmin)
......@@ -3,27 +3,48 @@
import sys
import ijson
import logging
import re
import django
from django.apps import apps
from django.db import transaction
from representatives_votes.models import Dossier
from representatives.contrib.francedata.import_representatives import \
ensure_chambers
from representatives.models import Chamber
from representatives_votes.models import Document, Dossier
logger = logging.getLogger(__name__)
def extract_reference(url):
    """Extract a dossier reference from a dossier URL.

    Three URL shapes are recognised, tried in this order:

    * ``.../dossier-legislatif/<name>.html``  -> ``<name>``
    * ``.../<digits>/dossiers/<name>.asp``    -> ``<digits>/<name>``
    * ``.../dossiers/<name>.asp``             -> ``<name>``

    Returns the reference string, or None when ``url`` is empty/None or
    matches none of the shapes.
    """
    # A missing or null URL cannot carry a reference; bail out instead of
    # letting re.search() raise TypeError on None.
    if not url:
        return None

    # Order matters: the numbered '/<digits>/dossiers/' form must be tried
    # before the bare '/dossiers/' form, which would also match it.
    patterns = (
        r'/dossier-legislatif/([^./]+)\.html',
        r'/(\d+)/dossiers/([^./]+)\.asp',
        r'/dossiers/([^./]+)\.asp',
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            # One captured group yields it unchanged; two are joined as
            # '<digits>/<name>'.
            return '/'.join(match.groups())

    return None
def find_dossier(data):
'''
Find dossier with reference matching either 'url_an' or 'url_sen',
create it if not found. Ensure its reference and source are 'url_an' if
both fields are present.
Find dossier with reference matching either 'ref_an' or 'ref_sen',
create it if not found. Ensure its reference is 'ref_an' if both fields
are present.
'''
changed = False
dossier = None
reffield = None
for field in [k for k in ('url_an', 'url_sen') if k in data]:
for field in [k for k in ('ref_an', 'ref_sen') if k in data]:
try:
dossier = Dossier.objects.get(reference=data[field])
reffield = field
......@@ -32,50 +53,85 @@ def find_dossier(data):
pass
if dossier is None:
reffield = 'url_an' if 'url_an' in data else 'url_sen'
reffield = 'ref_an' if 'ref_an' in data else 'ref_sen'
dossier = Dossier(reference=data[reffield])
logger.debug('Created dossier %s' % data[reffield])
changed = True
if 'url_an' in data and reffield != 'url_an':
logger.debug('Changed dossier reference to %s' % data['url_an'])
dossier.reference = data['url_an']
if 'ref_an' in data and reffield != 'ref_an':
logger.debug('Changed dossier reference to %s' % data['ref_an'])
dossier.reference = data['ref_an']
changed = True
return dossier, changed
def parse_dossier_data(data):
dossier, changed = find_dossier(data)
def handle_document(dossier, chamber, url):
    """Make sure `dossier` has a 'procedure-file' document for `chamber`
    whose link is `url`.

    The document is looked up by (chamber, dossier, kind); it is built when
    missing, its link is updated when it differs from `url`, and it is saved
    only if it was just built or its link changed.
    """
    criteria = {
        'chamber': chamber,
        'dossier': dossier,
        'kind': 'procedure-file',
    }

    try:
        document = Document.objects.get(**criteria)
    except Document.DoesNotExist:
        document = Document(**criteria)
        logger.debug('Created %s document for dossier %s' %
                     (chamber.abbreviation, dossier.title))
        is_new = True
    else:
        is_new = False

    link_differs = document.link != url
    if link_differs:
        logger.debug('Changing %s url from %s to %s' %
                     (chamber.abbreviation, document.link, url))
        document.link = url

    # Skip the write entirely when nothing changed.
    if is_new or link_differs:
        document.save()
def parse_dossier_data(data, an, sen):
    """Import one dossier entry and its per-chamber procedure-file documents.

    `data` is one parsed JSON item; the keys read here are 'url_an' and/or
    'url_sen', 'chambre' and 'titre'. As a side effect, 'ref_an' / 'ref_sen'
    are added to `data` for each URL present, because find_dossier() looks
    dossiers up by those keys.

    `an` and `sen` are the chambers passed on to handle_document() for the
    'url_an' and 'url_sen' links respectively.

    The entry is skipped (with a warning) as soon as one of its URLs yields
    no reference.
    """
    for url_key, ref_key in (('url_an', 'ref_an'), ('url_sen', 'ref_sen')):
        if url_key not in data:
            continue
        reference = extract_reference(data[url_key])
        if reference is None:
            logger.warning('No reference for dossier %s' % data[url_key])
            return
        data[ref_key] = reference

    dossier, changed = find_dossier(data)

    # The title is only taken from the entry whose own reference is the one
    # the dossier carries.
    thisref = data['ref_an' if data['chambre'] == 'AN' else 'ref_sen']
    title = data['titre']
    if dossier.reference == thisref and dossier.title != title:
        logger.debug('Changed dossier title to %s' % title)
        dossier.title = title
        changed = True

    # Links are no longer stored on the dossier itself: each chamber URL is
    # kept as a document. The dossier is saved first so that the documents
    # can reference it, and everything is committed together.
    with transaction.atomic():
        if changed:
            logger.debug('Saved dossier %s' % dossier.reference)
            dossier.save()

        if 'url_an' in data:
            handle_document(dossier, an, data['url_an'])

        if 'url_sen' in data:
            handle_document(dossier, sen, data['url_sen'])
def main(stream=None):
    """Import every dossier item found in a JSON stream.

    `stream` is a file-like object holding a JSON array; standard input is
    read when it is omitted.
    """
    if not apps.ready:
        django.setup()

    # NOTE(review): ensure_chambers() presumably creates the AN and SEN
    # chambers fetched just below -- confirm against its definition.
    ensure_chambers()
    an = Chamber.objects.get(abbreviation='AN')
    sen = Chamber.objects.get(abbreviation='SEN')

    # Chambers are fetched once and handed to every call; the former
    # one-argument call no longer matches parse_dossier_data's signature.
    for data in ijson.items(stream or sys.stdin, 'item'):
        parse_dossier_data(data, an, sen)
......@@ -48,23 +48,16 @@ def _get_unique_title(proposal_pk, candidate):
class ScrutinImporter:
dossiers_ref = None
dossiers_ext = None
dossiers = {}
def get_dossier(self, url):
    """Return the Dossier owning a document whose link is `url`, or None.

    Hits are memoised in `self.dossiers`, keyed by URL. Misses are not
    cached, so a dossier imported later is still found on the next call.
    """
    # Dossiers are matched through their documents' links; the former
    # reference / ext_link lookup tables are gone because Dossier no longer
    # carries link fields, and callers need the instance rather than a pk.
    if url not in self.dossiers:
        try:
            self.dossiers[url] = Dossier.objects.get(documents__link=url)
        except Dossier.DoesNotExist:
            return None

    return self.dossiers[url]
def parse_scrutin_data(self, data):
ref = data['url']
......@@ -91,7 +84,7 @@ class ScrutinImporter:
values = dict(
title=_get_unique_title(proposal.pk, data["objet"]),
datetime=_parse_date(data["date"]),
dossier_id=dossier,
dossier_id=dossier.pk,
kind='dossier'
)
......
......@@ -6,8 +6,10 @@ import urllib2
import ijson
import django
from django.apps import apps
from django.db import transaction
from representatives_votes.models import Dossier
from representatives.models import Chamber
from representatives_votes.models import Dossier, Document
from .import_votes import Command
logger = logging.getLogger(__name__)
......@@ -17,38 +19,51 @@ URL = 'http://parltrack.euwiki.org/dumps/ep_dossiers.json.xz'
LOCAL_PATH = 'ep_dossiers.json.xz'
def parse_dossier_data(data):
def parse_dossier_data(data, ep):
"""Parse data from parltarck dossier export (1 dossier) Update dossier
if it existed before, this function goal is to import and update a
dossier, not to import all parltrack data
"""
changed = False
doc_changed = False
ref = data['procedure']['reference']
logger.debug('Processing dossier %s', ref)
try:
dossier = Dossier.objects.get(reference=ref)
except Dossier.DoesNotExist:
dossier = Dossier(reference=ref)
logger.debug('Dossier did not exist')
changed = True
if dossier.title != data['procedure']['title']:
logger.debug('Title changed from "%s" to "%s"', dossier.title,
data['procedure']['title'])
dossier.title = data['procedure']['title']
changed = True
source = data['meta']['source'].replace('&l=en', '')
if dossier.link != source:
logger.debug('Source changed from "%s" to "%s"', dossier.link, source)
dossier.link = source
changed = True
if changed:
logger.info('Updated dossier %s', ref)
dossier.save()
with transaction.atomic():
try:
dossier = Dossier.objects.get(reference=ref)
except Dossier.DoesNotExist:
dossier = Dossier(reference=ref)
logger.debug('Dossier did not exist')
changed = True
if dossier.title != data['procedure']['title']:
logger.debug('Title changed from "%s" to "%s"', dossier.title,
data['procedure']['title'])
dossier.title = data['procedure']['title']
changed = True
if changed:
logger.info('Updated dossier %s', ref)
dossier.save()
source = data['meta']['source'].replace('&l=en', '')
try:
doc = Document.objects.get(dossier=dossier, kind='procedure-file')
except Document.DoesNotExist:
doc = Document(dossier=dossier, kind='procedure-file', chamber=ep)
logger.debug('Document for dossier %s did not exist', ref)
doc_changed = True
if doc.link != source:
logger.debug('Link changed from %s to %s', doc.link, source)
doc.link = source
doc_changed = True
if doc_changed:
logger.info('Updated document %s for dossier %s', doc.link, ref)
doc.save()
if 'votes' in data.keys() and 'epref' in data['votes']:
command = Command()
......@@ -68,13 +83,15 @@ def import_single(stream):
if not apps.ready:
django.setup()
ep = Chamber.objects.get(abbreviation='EP')
for data in ijson.items(stream, ''):
parse_dossier_data(data)
parse_dossier_data(data, ep)
def main(stream=None):
    """Import every dossier item found in a JSON stream.

    `stream` is a file-like object holding a JSON array; standard input is
    read when it is omitted.
    """
    if not apps.ready:
        django.setup()

    # Fetched once and handed to every call; the former one-argument call no
    # longer matches parse_dossier_data's signature.
    ep = Chamber.objects.get(abbreviation='EP')

    for data in ijson.items(stream or sys.stdin, 'item'):
        parse_dossier_data(data, ep)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import logging
from django.db import migrations, models
def migrate_dossier_links(apps, schema_editor):
    """Copy dossier `link` / `ext_link` values into Document rows.

    For each known chamber, every dossier whose link contains that chamber's
    domain gets one 'procedure-file' document pointing at that link. All
    documents are inserted with a single bulk_create at the end.

    Chambers missing from the database are skipped individually, so a
    missing chamber neither aborts the migration nor discards the documents
    collected for the other chambers.
    """
    logger = logging.getLogger('migrate_dossier_links')

    # Use the historical models, not the current ones.
    Chamber = apps.get_model("representatives", "Chamber")
    Dossier = apps.get_model("representatives_votes", "Dossier")
    Document = apps.get_model("representatives_votes", "Document")

    # (chamber abbreviation, link domain, dossier fields to scan).
    # Only the France chambers are scanned on ext_link as well as link.
    sources = (
        ('EP', 'europarl.europa.eu', ('link',)),
        ('AN', 'assemblee-nationale.fr', ('link', 'ext_link')),
        ('SEN', 'senat.fr', ('link', 'ext_link')),
    )

    docs = []
    for abbreviation, domain, fields in sources:
        try:
            chamber = Chamber.objects.get(abbreviation=abbreviation)
        except Chamber.DoesNotExist:
            logger.info('No %s chamber, skipping its dossier links' %
                        abbreviation)
            continue

        for field in fields:
            lookup = {'%s__icontains' % field: domain}
            for dossier in Dossier.objects.filter(**lookup):
                # Log and store the value of the field that matched, not
                # always dossier.link.
                url = getattr(dossier, field)
                logger.info('Create document %s for dossier %s' %
                            (url, dossier.reference))
                docs.append(Document(chamber=chamber, dossier=dossier,
                                     link=url, kind='procedure-file'))

    # Create all documents
    logger.info('Saving %s documents...' % len(docs))
    Document.objects.bulk_create(docs)
class Migration(migrations.Migration):
    """Introduce the Document model and move dossier links onto it."""

    dependencies = [
        ('representatives', '0019_remove_fingerprints'),
        ('representatives_votes', '0011_remove_fingerprints'),
    ]

    # Order matters: the Document table must exist before links are copied
    # into it, and the copy must run before the source columns are dropped.
    operations = [
        migrations.CreateModel(
            name='Document',
            fields=[
                ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
                ('created', models.DateTimeField(auto_now_add=True)),
                ('updated', models.DateTimeField(auto_now=True)),
                ('title', models.CharField(max_length=1000)),
                ('kind', models.CharField(default=b'', max_length=255, blank=True)),
                ('link', models.URLField(max_length=1000)),
                ('chamber', models.ForeignKey(to='representatives.Chamber')),
                ('dossier', models.ForeignKey(related_name='documents', to='representatives_votes.Dossier')),
            ],
            options={
                'abstract': False,
            },
        ),
        # NOTE(review): no reverse function is passed, so this data step
        # cannot be unapplied.
        migrations.RunPython(migrate_dossier_links),
        migrations.RemoveField(
            model_name='dossier',
            name='link',
        ),
        migrations.RemoveField(
            model_name='dossier',
            name='ext_link',
        ),
    ]
# coding: utf-8
from django.db import models
from representatives.models import Representative, TimeStampedModel
from representatives.models import Chamber, Representative, TimeStampedModel
class Dossier(TimeStampedModel):
title = models.CharField(max_length=1000)
reference = models.CharField(max_length=200, unique=True)
text = models.TextField(blank=True, default='')
link = models.URLField()
ext_link = models.URLField(blank=True, default='')
class Meta:
unique_together = (('title', 'reference'))
......@@ -18,6 +16,14 @@ class Dossier(TimeStampedModel):
return unicode(self.title)
class Document(TimeStampedModel):
    """A linked document attached to a dossier for a given chamber."""

    # Owning dossier; reachable from the dossier side as `dossier.documents`.
    dossier = models.ForeignKey(Dossier, related_name='documents')
    # Chamber the document comes from.
    chamber = models.ForeignKey(Chamber)
    title = models.CharField(max_length=1000)
    # Free-form document type; optional, defaults to the empty string.
    kind = models.CharField(max_length=255, blank=True, default='')
    # URL of the document, up to 1000 characters.
    link = models.URLField(max_length=1000)
class Proposal(TimeStampedModel):
dossier = models.ForeignKey(Dossier, related_name='proposals')
title = models.CharField(max_length=1000, unique=True)
......
......@@ -13,7 +13,7 @@ setup(
keywords='django government parliament votes',
install_requires=[
'django>1.8,<1.9',
'django-representatives>=0.0.27',
'django-representatives>=0.0.29',
'py-dateutil>=2,<3',
'ijson>=2,<3',
'pytz', # Always use up-to-date TZ data
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment