extract_amendments.rb 4.54 KB
Newer Older
Michael Witrant's avatar
Michael Witrant committed
1
2
3
4
5
#!/usr/bin/env ruby

require 'rubygems'
require 'bundler/setup'
require 'zippy'
Michael Witrant's avatar
Michael Witrant committed
6
require 'clik'
Michael Witrant's avatar
Michael Witrant committed
7
require 'nokogiri'
8
9
require 'erb'
require 'ostruct'
Michael Witrant's avatar
Michael Witrant committed
10

Michael Witrant's avatar
Michael Witrant committed
11
12
13
14
15
16
17
18
19
20
# Extracts amendments from an OpenDocument text file and renders them through
# the ERB template `template.erb`.
#
# The document body (`office:text`) is scanned for runs of sibling nodes
# delimited by the literal marker texts "<Amend>" and "</Amend>". The text of
# each run is re-parsed as XML to read the NumAm, DocAmend and Article
# elements, and the single table inside the run is converted to rows of
# wiki-style text.
class AmendmentExtractor
  # Number of leading table rows dropped before the remaining rows are kept
  # as the amendment's changes.
  # NOTE(review): presumably these are title/header rows - confirm against
  # the source documents.
  HEADER_ROWS = 2

  # Writes a debugging message to STDERR when Ruby runs in debug mode
  # ($DEBUG is set); otherwise does nothing.
  #
  # @param value [Object] strings are printed verbatim, anything else is
  #   printed through #inspect
  # @return [nil]
  def debug(value)
    return unless $DEBUG

    STDERR.puts(value.is_a?(String) ? value : value.inspect)
  end

  # Extracts the amendments found in an OpenDocument file and renders them.
  #
  # @param opendocument_path [String] path of the OpenDocument file to read
  # @param options [Hash] optional settings:
  #   :xml_dump_path  - when set, the parsed content.xml is written (indented)
  #                     to this path
  #   :parse_only_num - when set, only amendments whose NumAm text equals
  #                     this value (compared as a string) are processed
  #   :parse_only_one - when truthy, processing stops after the first
  #                     amendment that was kept
  # @return [String] the result of rendering `template.erb`
  # @raise [RuntimeError] when the document has no office:text element, an
  #   end marker precedes any start marker, an amendment lacks one of its
  #   NumAm/DocAmend/Article elements, does not contain exactly one table,
  #   or its table has no rows after the first HEADER_ROWS rows
  def extract(opendocument_path, options = {})
    debug "extracting content from document"
    xml = nil
    Zippy.open(opendocument_path) do |zip|
      xml = zip['content.xml']
    end

    debug "parsing document xml"
    doc = Nokogiri::XML::Document.parse(xml)

    if options[:xml_dump_path]
      debug "dumping xml"
      File.open(options[:xml_dump_path], "w") { |f| f.write doc.to_xml(indent: 2) }
    end

    debug "parsing styles"
    styles = parse_styles(doc)

    debug "extracting document text"
    body = doc.xpath('//office:text').first
    raise "no office:text found" unless body

    debug "extracting amendment nodes"
    amend_nodes = amendment_node_sets(body)
    debug amendments_found: amend_nodes.length

    debug "extracting info from amendments"
    amendments = []
    amend_nodes.each do |nodes|
      amendment = parse_amendment(nodes, styles, options)
      next unless amendment # filtered out by :parse_only_num

      amendments << amendment
      break if options[:parse_only_one]
    end

    debug "rendering amendments"
    render(amendments)
  end

  private

  # Builds a { style name => { bold: true/false } } map from the document's
  # style definitions. A style without text properties maps to an empty hash.
  def parse_styles(doc)
    doc.css("style|style").each_with_object({}) do |node, styles|
      style = {}
      text_properties = node.css("style|text-properties").first
      style[:bold] = (text_properties["fo:font-weight"] == "bold") if text_properties
      styles[node["style:name"]] = style
    end
  end

  # Returns one node set per "<Amend>" ... "</Amend>" run among the children
  # of the document body, markers included.
  def amendment_node_sets(body)
    children = body.children
    amend_start = nil
    node_sets = []

    children.each_with_index do |node, index|
      if !node.search("[text()='<Amend>']").empty?
        amend_start = index
      elsif !node.search("[text()='</Amend>']").empty?
        raise "amend end before amend start" if amend_start.nil?

        node_sets << children.slice(amend_start..index)
        amend_start = nil
      end
    end

    node_sets
  end

  # Builds the amendment described by one run of nodes, or returns nil when
  # the :parse_only_num option excludes it.
  def parse_amendment(nodes, styles, options)
    amend_text = nodes.map(&:text).join
    debug amend_text: amend_text unless options[:parse_only_num]

    amend_doc = Nokogiri::XML::Document.parse(amend_text)
    num_am = required_text(amend_doc, "NumAm")
    return nil if options[:parse_only_num] && num_am != options[:parse_only_num].to_s

    debug "parsing amendment #{num_am}"
    amendment = OpenStruct.new
    amendment.num = num_am
    amendment.doc = required_text(amend_doc, "DocAmend")
    amendment.article = required_text(amend_doc, "Article")
    debug amendment

    debug "parsing amendment table"
    tables = nodes.css('table|table')
    raise "amendment table not found" if tables.size == 0
    raise "too many tables" if tables.size > 1

    debug "converting table nodes to text"
    text_table = table_to_text(tables.first, styles)
    debug text_table: text_table

    debug "extracting changes from table"
    changes = changes_from(text_table)
    debug changes: changes
    amendment.changes = changes

    amendment
  end

  # Returns the text of the first element named +tag+ in +amend_doc+, failing
  # with an explicit message rather than a NoMethodError on nil.
  def required_text(amend_doc, tag)
    node = amend_doc.xpath("//#{tag}").first
    raise "amendment element <#{tag}> not found" unless node

    node.text
  end

  # Converts a table node into an array of rows, each row being an array of
  # cell strings; paragraphs inside a cell are joined with newlines.
  def table_to_text(table, styles)
    table.css("table|table-row").map do |row|
      row.css("table|table-cell").map do |cell|
        cell.css("text|p").map { |paragraph| paragraph_to_wikitext(paragraph, styles) }.join("\n")
      end
    end
  end

  # Concatenates the text of a paragraph's children, wrapping the text of
  # bold spans in the mediawiki triple quote.
  def paragraph_to_wikitext(paragraph, styles)
    paragraph.children.map do |element|
      content = element.text
      bold_span?(element, styles) ? "'''#{content}'''" : content
    end.join
  end

  # True when +element+ is a span element whose style is recorded as bold.
  def bold_span?(element, styles)
    return false unless element.is_a?(Nokogiri::XML::Element) && element.name == 'span'

    bold_style?(styles, element["text:style-name"])
  end

  # True when +style_name+ names a style recorded as bold in +styles+.
  # A missing name, or a name absent from +styles+, is treated as not bold
  # instead of raising.
  def bold_style?(styles, style_name)
    return false unless style_name

    style = styles[style_name]
    style ? !!style[:bold] : false
  end

  # Returns the table rows that follow the first HEADER_ROWS rows.
  # Array#drop yields [] for short tables (where a range slice would yield
  # nil), so the explicit error below is raised for them as well.
  def changes_from(text_table)
    changes = text_table.drop(HEADER_ROWS)
    raise "amendment changes not found" if changes.empty?

    changes
  end

  # Renders `template.erb` (a relative path, so it is resolved against the
  # current working directory) with +amendments+ exposed to the template as
  # `amendments`.
  def render(amendments)
    template = build_template(File.read('template.erb'))
    erb_binding = OpenStruct.new(amendments: amendments).instance_eval { binding }
    template.result(erb_binding)
  end

  # Creates an ERB template using the '-' trim mode. The keyword form is used
  # when this ERB supports it, because the positional form is deprecated; the
  # positional form is kept as a fallback for older ERB versions.
  def build_template(source)
    if ERB.instance_method(:initialize).parameters.include?([:key, :trim_mode])
      ERB.new(source, trim_mode: '-')
    else
      ERB.new(source, nil, '-')
    end
  end
end

Michael Witrant's avatar
Michael Witrant committed
157
158
# Command-line entry point: runs only when this file is the program being
# executed, not when it is loaded by another script.
if __FILE__ == $0
  options = {}

  # Each switch is mapped to a handler that records the choice in `options`
  # (or turns on Ruby's debug flag). The value returned by `cli` is used as
  # the list of remaining command-line arguments.
  extra_args = cli(
    '--xml-dump'  => ->(path) { options[:xml_dump_path] = path },
    '-d --debug'  => -> { $DEBUG = true },
    '-1 --one'    => -> { options[:parse_only_one] = true },
    '-n --number' => ->(num) { options[:parse_only_num] = num }
  )

  # The first remaining argument is the document to process; it is mandatory.
  opendocument_path = extra_args.first || raise("usage: #{$0} <OpenDocument file>")

  puts AmendmentExtractor.new.extract(opendocument_path, options)
end