extract_amendments.rb 5.98 KB
Newer Older
Michael Witrant's avatar
Michael Witrant committed
1 2 3 4 5
#!/usr/bin/env ruby

require 'rubygems'
require 'bundler/setup'
require 'zippy'
Michael Witrant's avatar
Michael Witrant committed
6
require 'clik'
Michael Witrant's avatar
Michael Witrant committed
7
require 'nokogiri'
8 9
require 'erb'
require 'ostruct'
Michael Witrant's avatar
Michael Witrant committed
10
require 'active_support/core_ext/object'
Michael Witrant's avatar
Michael Witrant committed
11

Michael Witrant's avatar
Michael Witrant committed
12 13 14 15 16 17 18 19 20 21
# Extracts amendments from the content.xml of an OpenDocument file and renders
# them through an ERB template.
#
# Amendments are delimited in the document body by literal marker texts
# ('<Amend>' or '<RepeatBlock-Amend><Amend>' ... '</Amend>'). The concatenated
# text of the nodes between two markers is itself parsed as XML to read the
# <NumAm>, <DocAmend> and <Article> fields.
class AmendmentExtractor
  # Prints a diagnostic to standard error when Ruby's global $DEBUG flag is set.
  #
  # @param value [Object] a String is printed verbatim; any other object is
  #   printed through #inspect
  # @return [nil]
  def debug(value)
    return unless $DEBUG

    output = value.is_a?(String) ? value : value.inspect
    $stderr.puts output
  end

  # Looks up the font weight recorded for a named style.
  #
  # @param style_name [String] value of a style:name / text:style-name attribute
  # @return [Symbol] :bold when the style is known and flagged bold, :normal
  #   otherwise (including when no styles have been parsed yet)
  def style_weight(style_name)
    style = @styles && @styles[style_name]
    (style && style[:bold]) ? :bold : :normal
  end

  # Extracts every amendment from an OpenDocument file and renders them.
  #
  # @param opendocument_path [String] path of the OpenDocument archive
  # @param options [Hash] optional settings:
  #   :xml_dump_path  - when set, the parsed content.xml is written to this path
  #   :parse_only_num - when set, only the amendment whose <NumAm> text equals
  #                     this value (compared as a string) is parsed
  #   :parse_only_one - when truthy, stop after the first parsed amendment
  #   :template       - ERB template source; defaults to the contents of
  #                     'template.erb' in the current directory
  # @return [String] the rendered template; the template sees the parsed
  #   amendments as +amendments+
  # @raise [RuntimeError] when office:text is missing, an end marker precedes
  #   its start marker, a required amendment field is missing, or an amendment
  #   does not contain exactly one table with change rows
  def extract(opendocument_path, options = {})
    debug "extracting content from document"
    xml = read_content_xml(opendocument_path)

    debug "parsing document xml"
    doc = Nokogiri::XML::Document.parse(xml)

    if options[:xml_dump_path]
      debug "dumping xml"
      File.open(options[:xml_dump_path], "w") { |f| f.write doc.to_xml(indent: 2) }
    end

    debug "parsing styles"
    @styles = parse_styles(doc)

    debug "extracting document text"
    text = doc.xpath('//office:text').first
    raise "no office:text found" unless text

    debug "extracting amendment nodes"
    amend_nodes = amendment_node_sets(text)
    debug amendments_found: amend_nodes.length

    debug "extracting info from amendments"
    amendments = parse_amendments(amend_nodes, options)

    debug "rendering amendments"
    render_amendments(amendments, options)
  end

  private

  # Reads the raw content.xml entry out of the OpenDocument archive.
  def read_content_xml(opendocument_path)
    xml = nil
    Zippy.open(opendocument_path) { |zip| xml = zip['content.xml'] }
    xml
  end

  # Builds a { style name => { bold: true/false } } map from the style:style nodes.
  # Styles without text properties map to an empty hash.
  def parse_styles(doc)
    doc.css("style|style").each_with_object({}) do |node, styles|
      style = {}
      text_properties = node.css("style|text-properties").first
      style[:bold] = (text_properties["fo:font-weight"] == "bold") if text_properties
      styles[node["style:name"]] = style
    end
  end

  # Tells whether the node contains an element whose text is exactly +marker+.
  def marker?(node, marker)
    node.search("[text()='#{marker}']").size > 0
  end

  # Slices the children of office:text into one node set per amendment,
  # each running from its start marker to its end marker (both included).
  def amendment_node_sets(text)
    children = text.children
    amend_start = nil
    node_sets = []

    children.each_with_index do |node, i|
      if marker?(node, '<Amend>') || marker?(node, '<RepeatBlock-Amend><Amend>')
        amend_start = i
      elsif marker?(node, '</Amend>')
        raise "amend end before amend start (#{node.path})" if amend_start.nil?

        node_sets << children.slice(amend_start..i)
        amend_start = nil
      end
    end

    node_sets
  end

  # Parses each amendment node set, honouring :parse_only_num and :parse_only_one.
  def parse_amendments(amend_nodes, options)
    amendments = []

    amend_nodes.each do |nodes|
      amendment = parse_amendment(nodes, options)
      next unless amendment

      amendments << amendment
      break if options[:parse_only_one]
    end

    amendments
  end

  # Parses one amendment; returns nil when it is filtered out by :parse_only_num.
  def parse_amendment(nodes, options)
    amend_text = nodes.map(&:text).join
    debug amend_text: amend_text unless options[:parse_only_num]

    amend_doc = Nokogiri::XML::Document.parse(amend_text)
    num_am = required_text(amend_doc, "NumAm")
    return nil if options[:parse_only_num] && num_am != options[:parse_only_num].to_s

    debug "parsing amendment #{num_am}"
    amendment = OpenStruct.new
    amendment.num = num_am
    amendment.doc = required_text(amend_doc, "DocAmend")
    amendment.article = required_text(amend_doc, "Article")
    debug amendment

    amendment.changes = extract_changes(nodes)
    attach_justification(amendment, nodes)
    amendment
  end

  # Returns the text of the first <tag> element, failing with a descriptive
  # error (rather than a NoMethodError on nil) when the element is absent.
  def required_text(amend_doc, tag)
    node = amend_doc.xpath("//#{tag}").first
    raise "amendment is missing <#{tag}>" unless node

    node.text
  end

  # Converts the amendment's single table to text and returns its change rows.
  def extract_changes(nodes)
    debug "parsing amendment table"
    tables = nodes.css('table|table')
    raise "amendment table not found" if tables.size == 0
    raise "too many tables" if tables.size > 1

    debug "converting table nodes to text"
    text_table = table_to_text(tables.first)
    debug text_table: text_table

    debug "extracting changes from table"
    # The first two rows are skipped. drop(2) yields [] for short tables, so
    # the error below is raised instead of a NoMethodError on nil.
    changes = text_table.drop(2)
    raise "amendment changes not found" if changes.empty?

    debug changes: changes
    changes
  end

  # Maps a table to rows of cells; each cell is its paragraphs joined by newlines.
  def table_to_text(table)
    table.css("table|table-row").map do |row|
      row.css("table|table-cell").map do |cell|
        cell.css("text|p").map { |paragraph| paragraph_to_text(paragraph) }.join("\n")
      end
    end
  end

  # Renders a paragraph as text, wrapping bold runs in ''' ... '''.
  def paragraph_to_text(paragraph)
    style_name = paragraph["text:style-name"]
    paragraph_style = style_name ? style_weight(style_name) : :normal

    parts = paragraph.children.map do |element|
      content = element.text
      style = paragraph_style

      # A non-blank child element may override the paragraph's style.
      if content.present? && element.is_a?(Nokogiri::XML::Element)
        element_style_name = element["text:style-name"]
        style = style_weight(element_style_name) if element_style_name
      end

      [style, content]
    end

    merge_styled_parts(parts).map do |style, content|
      style == :bold ? "'''#{content}'''" : content
    end.join
  end

  # Merges adjacent [style, text] pairs that share the same style so that a
  # run of bold fragments yields a single bold span.
  def merge_styled_parts(parts)
    parts.each_with_object([]) do |(style, content), merged|
      if !merged.empty? && merged.last[0] == style
        merged.last[1] += content
      else
        merged << [style, content]
      end
    end
  end

  # Stores the text of the node following the '<TitreJust>' marker node, if any.
  def attach_justification(amendment, nodes)
    title_parent = nodes.detect { |node| marker?(node, '<TitreJust>') }
    return unless title_parent

    justification_node = nodes[nodes.index(title_parent) + 1]
    unless justification_node
      debug "justification title found but no node follows it"
      return
    end

    justification_text = justification_node.text
    debug justification_text: justification_text
    amendment.justification = justification_text
  end

  # Renders the amendments through the ERB template.
  def render_amendments(amendments, options)
    template_text = options[:template] || File.read('template.erb')
    erb_binding = OpenStruct.new(amendments: amendments).instance_eval { binding }
    build_template(template_text).result(erb_binding)
  end

  # Builds the ERB template with '-' trim mode.
  # The former safe level 4 and binding taint are not applied: current Ruby
  # has neither $SAFE levels nor Object#taint.
  def build_template(template_text)
    if ERB.instance_method(:initialize).parameters.include?([:key, :trim_mode])
      ERB.new(template_text, trim_mode: '-')
    else
      # Older ERB only accepts the trim mode as the third positional argument.
      ERB.new(template_text, nil, '-')
    end
  end
end

Michael Witrant's avatar
Michael Witrant committed
199 200
# Command-line entry point: runs only when this file is executed directly.
if $PROGRAM_NAME == __FILE__
  options = {}

  # Each switch stores its setting in +options+; `cli` returns the remaining
  # (non-option) arguments.
  extra_args = cli '--xml-dump'   => lambda { |path| options[:xml_dump_path] = path },
                  '-d --debug'    => lambda { $DEBUG = true },
                  '-1 --one'      => lambda { options[:parse_only_one] = true },
                  '-n --number'   => lambda { |num| options[:parse_only_num] = num },
                  '-t --template' => lambda { |file| options[:template] = File.read(file) }

  opendocument_path = extra_args.first
  # A missing argument is a usage error: print the message and exit non-zero
  # without an exception backtrace.
  abort "usage: #{$PROGRAM_NAME} <OpenDocument file>" unless opendocument_path

  puts AmendmentExtractor.new.extract(opendocument_path, options)
end