Commit 7410766c authored by Michael Witrant

added step debug

parent ebdbb823
......@@ -22,17 +22,23 @@ class AmendmentExtractor
end
def extract(opendocument_path, options = {})
debug "extracting content from document"
xml = nil
Zippy.open(opendocument_path) do |zip|
xml = zip['content.xml']
end
debug "parsing document xml"
doc = Nokogiri::XML::Document.parse(xml)
if options[:xml_dump_path]
debug "dumping xml"
File.open(options[:xml_dump_path], "w") { |f| f.write doc.to_xml(indent: 2) }
end
debug "parsing styles"
styles = {}
doc.css("style|style").each do |node|
name = node["style:name"]
......@@ -46,9 +52,13 @@ class AmendmentExtractor
styles[name] = style
end
debug "extracting document text"
text = doc.xpath('//office:text').first
raise "no office:text found" unless text
debug "extracting amendment nodes"
amend_start = nil
amend_nodes = []
......@@ -67,6 +77,8 @@ class AmendmentExtractor
debug amendments_found: amend_nodes.length
debug "extracting info from amendments"
amendments = []
amend_nodes.each do |nodes|
......@@ -78,6 +90,9 @@ class AmendmentExtractor
num_am = amend_doc.xpath("//NumAm").first.text
next if options[:parse_only_num] and num_am != options[:parse_only_num].to_s
debug "parsing amendment #{num_am}"
doc_amend = amend_doc.xpath("//DocAmend").first.text
article = amend_doc.xpath("//Article").first.text
......@@ -88,18 +103,23 @@ class AmendmentExtractor
article: article,
}
debug amendment
debug "parsing amendment table"
tables = nodes.css('table|table')
raise "amendment table not found" if tables.size == 0
raise "too many tables" if tables.size > 1
table = tables.first
debug "converting table nodes to text"
text_table = table.css("table|table-row").map do |row|
row.css("table|table-cell").map do |cell|
cell.css("text|p").map do |paragraph|
paragraph.children.map do |element|
text = element.text
# add mediawiki triple quote if the text is bold in the document
if element.is_a? Nokogiri::XML::Element and element.name == 'span'
style_name = element["text:style-name"]
if style_name and styles[style_name][:bold]
......@@ -114,9 +134,13 @@ class AmendmentExtractor
end
debug text_table: text_table
debug "locating header"
header_index = text_table.index(["Text proposed by the Commission", "Amendment"])
raise "first row not found in table of amendment #{num_am}" unless header_index
raise "header not found in table of amendment #{num_am}" unless header_index
debug "extracting changes from table"
changes = text_table[(header_index + 1)..-1]
raise "amendment changes not found" if changes.size == 0
......@@ -128,6 +152,7 @@ class AmendmentExtractor
break if options[:parse_only_one]
end
debug "rendering amendments"
template = ERB.new File.read('template.erb'), nil, '-'
result = []
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment