From 7eb6eeab0443b9d4dab7eb7174f43a2290406102 Mon Sep 17 00:00:00 2001
From: Michael Witrant
Date: Sat, 24 Sep 2011 12:31:52 +0200
Subject: [PATCH] parse annexes and languages

---
Review notes (this area is not part of the commit message):

The script previously printed one [name, language] pair per link. With this
change it builds an Answer struct (name, files, annexes, languages) per
capitalized name, keyed in the `answers` hash, and prints one summary line
per answer followed by the base names of its files and annexes.

The file name is URL-unescaped with CGI::unescape before it is split on "_".
The last part is taken as the language. If a part starts with "annex", that
part and everything after it are dropped from the name and the URL is stored
under `annexes` instead of `files`.

`require 'iconv'` is added, but the only Iconv call is commented out.

NOTE(review): this patch was recovered from a copy whose line breaks were
lost. Indentation and the position of blank context lines were inferred from
the hunk line counts; confirm against blob 924bab2 before applying.

 consultation_ipred/parser.rb | 39 +++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/consultation_ipred/parser.rb b/consultation_ipred/parser.rb
index 924bab2..d54fe19 100644
--- a/consultation_ipred/parser.rb
+++ b/consultation_ipred/parser.rb
@@ -1,6 +1,8 @@
 require "rubygems"
 require "bundler/setup"
 
+require 'cgi'
+require 'iconv'
 
 files = %w(organisations.links public_authorities.links)
 
@@ -8,12 +10,47 @@ file = files.first
 links = File.read(file).split("\n")
 
 
+class Answer < Struct.new(:name, :files, :annexes, :languages)
+  def initialize(*args)
+    super
+    self.annexes ||= []
+    self.files ||= []
+    self.languages ||= []
+  end
+end
+
+answers = {}
+
 links.each do |url|
   name = File.basename(URI.parse(url).path, ".pdf")
+  name = CGI::unescape(name)
+  #name = Iconv.iconv("utf-8", "iso8859-15", name).first
   names = name.split("_")
   language = names.pop
+  annex = names.index { |part| part =~ /^annex/ }
+  if annex
+    names[annex..-1] = []
+  end
+
   name = names.map(&:capitalize).join(" ")
-  p [name, language]
+  #name = "#{name} (#{language})"
+
+  answer = answers[name] ||= Answer.new(name)
+  if annex
+    answer.annexes << url
+  else
+    answer.files << url
+  end
+  answer.languages << language
+  answer.languages.uniq!
 end
 
 
+answers.sort.each do |name, answer|
+  puts "#{answer.name} (#{answer.languages.join(",")})"
+  (answer.files + answer.annexes).each do |url|
+    # p answer
+    name = File.basename(URI.parse(url).path)
+    puts " " + name
+  end
+end
-- 
GitLab