Commit ebdbb823 authored by Michael Witrant
Browse files

moved method to a class

parent a8c7e554
...@@ -8,139 +8,148 @@ require 'nokogiri' ...@@ -8,139 +8,148 @@ require 'nokogiri'
require 'erb' require 'erb'
require 'ostruct' require 'ostruct'
options = {} class AmendmentExtractor
def debug(value) def debug(value)
if $DEBUG if $DEBUG
case value case value
when String when String
output = value output = value
else else
output = value.inspect output = value.inspect
end
STDERR.puts output
end end
STDERR.puts output
end end
end
extra_args = cli '--xml-dump' => lambda { |path| options[:xml_dump_path] = path }, def extract(opendocument_path, options = {})
'-d --debug' => lambda { $DEBUG = true }, xml = nil
'-1 --one' => lambda { options[:parse_only_one] = true }, Zippy.open(opendocument_path) do |zip|
'-n --number' => lambda { |num| options[:parse_only_num] = num } xml = zip['content.xml']
end
opendocument_path = extra_args.first doc = Nokogiri::XML::Document.parse(xml)
raise "usage: #$0 <OpenDocument file>" unless opendocument_path
xml = nil if options[:xml_dump_path]
Zippy.open(opendocument_path) do |zip| File.open(options[:xml_dump_path], "w") { |f| f.write doc.to_xml(indent: 2) }
xml = zip['content.xml'] end
end
doc = Nokogiri::XML::Document.parse(xml) styles = {}
doc.css("style|style").each do |node|
name = node["style:name"]
style = {}
text_properties = node.css("style|text-properties").first
if text_properties
style[:bold] = (text_properties["fo:font-weight"] == "bold")
end
styles[name] = style
end
if options[:xml_dump_path] text = doc.xpath('//office:text').first
File.open(options[:xml_dump_path], "w") { |f| f.write doc.to_xml(indent: 2) } raise "no office:text found" unless text
end
styles = {} amend_start = nil
doc.css("style|style").each do |node| amend_nodes = []
name = node["style:name"]
style = {} text.children.each_with_index do |node, i|
if node.search("[text()='<Amend>']").size > 0
text_properties = node.css("style|text-properties").first amend_start = i
if text_properties elsif node.search("[text()='</Amend>']").size > 0
style[:bold] = (text_properties["fo:font-weight"] == "bold") if amend_start.nil?
end raise "amend end before amend start"
end
styles[name] = style amend_end = i
end amend_nodes << text.children.slice(amend_start..amend_end)
amend_start = nil
end
end
text = doc.xpath('//office:text').first debug amendments_found: amend_nodes.length
raise "no office:text found" unless text
amendments = []
amend_nodes.each do |nodes|
amend_text = nodes.map(&:text).join
debug amend_text: amend_text unless options[:parse_only_num]
amend_doc = Nokogiri::XML::Document.parse(amend_text)
num_am = amend_doc.xpath("//NumAm").first.text
next if options[:parse_only_num] and num_am != options[:parse_only_num].to_s
doc_amend = amend_doc.xpath("//DocAmend").first.text
article = amend_doc.xpath("//Article").first.text
amendment = {
num: num_am,
doc: doc_amend,
article: article,
}
debug amendment
tables = nodes.css('table|table')
raise "amendment table not found" if tables.size == 0
raise "too many tables" if tables.size > 1
table = tables.first
text_table = table.css("table|table-row").map do |row|
row.css("table|table-cell").map do |cell|
cell.css("text|p").map do |paragraph|
paragraph.children.map do |element|
text = element.text
if element.is_a? Nokogiri::XML::Element and element.name == 'span'
style_name = element["text:style-name"]
if style_name and styles[style_name][:bold]
text = "'''#{text}'''"
end
end
text
end.join
end.join("\n")
end
end
debug text_table: text_table
header_index = text_table.index(["Text proposed by the Commission", "Amendment"])
raise "first row not found in table of amendment #{num_am}" unless header_index
changes = text_table[(header_index + 1)..-1]
raise "amendment changes not found" if changes.size == 0
debug changes: changes
amendment[:changes] = changes
amendments << amendment
break if options[:parse_only_one]
end
amend_start = nil template = ERB.new File.read('template.erb'), nil, '-'
amend_nodes = []
text.children.each_with_index do |node, i| result = []
if node.search("[text()='<Amend>']").size > 0 amendments.each do |amendment|
amend_start = i amendment_binding = OpenStruct.new(amendment).instance_eval { binding }
elsif node.search("[text()='</Amend>']").size > 0 output = template.result(amendment_binding)
if amend_start.nil? result << output
raise "amend end before amend start"
end end
amend_end = i result.join("\n")
amend_nodes << text.children.slice(amend_start..amend_end)
amend_start = nil
end end
end end
debug amendments_found: amend_nodes.length if $0 == __FILE__
options = {}
amendments = [] extra_args = cli '--xml-dump' => lambda { |path| options[:xml_dump_path] = path },
'-d --debug' => lambda { $DEBUG = true },
'-1 --one' => lambda { options[:parse_only_one] = true },
'-n --number' => lambda { |num| options[:parse_only_num] = num }
amend_nodes.each do |nodes| opendocument_path = extra_args.first
amend_text = nodes.map(&:text).join raise "usage: #$0 <OpenDocument file>" unless opendocument_path
debug amend_text: amend_text unless options[:parse_only_num]
amend_doc = Nokogiri::XML::Document.parse(amend_text)
num_am = amend_doc.xpath("//NumAm").first.text
next if options[:parse_only_num] and num_am != options[:parse_only_num] puts AmendmentExtractor.new.extract(opendocument_path, options)
doc_amend = amend_doc.xpath("//DocAmend").first.text
article = amend_doc.xpath("//Article").first.text
amendment = {
num: num_am,
doc: doc_amend,
article: article,
}
debug amendment
tables = nodes.css('table|table')
raise "amendment table not found" if tables.size == 0
raise "too many tables" if tables.size > 1
table = tables.first
text_table = table.css("table|table-row").map do |row|
row.css("table|table-cell").map do |cell|
cell.css("text|p").map do |paragraph|
paragraph.children.map do |element|
text = element.text
if element.is_a? Nokogiri::XML::Element and element.name == 'span'
style_name = element["text:style-name"]
if style_name and styles[style_name][:bold]
text = "'''#{text}'''"
end
end
text
end.join
end.join("\n")
end
end
debug text_table: text_table
header_index = text_table.index(["Text proposed by the Commission", "Amendment"])
raise "first row not found in table of amendment #{num_am}" unless header_index
changes = text_table[(header_index + 1)..-1]
raise "amendment changes not found" if changes.size == 0
debug changes: changes
amendment[:changes] = changes
amendments << amendment
break if options[:parse_only_one]
end
template = ERB.new File.read('template.erb'), nil, '-'
amendments.each do |amendment|
amendment_binding = OpenStruct.new(amendment).instance_eval { binding }
output = template.result(amendment_binding)
puts output
end end
...@@ -5,13 +5,15 @@ require 'bundler/setup' ...@@ -5,13 +5,15 @@ require 'bundler/setup'
require 'sinatra' require 'sinatra'
require 'haml' require 'haml'
require './extract_amendments'
get '/' do get '/' do
haml :index haml :index
end end
post '/extract' do post '/extract' do
haml :extract, locals: {result: %x(ruby extract_amendments.rb #{params['file'][:tempfile].path})} result = AmendmentExtractor.new.extract(params['file'][:tempfile].path)
haml :extract, locals: {result: result}
end end
__END__ __END__
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment