Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
LQDN Adminsys
amendments
Commits
7410766c
Commit
7410766c
authored
Mar 31, 2013
by
Michael Witrant
Browse files
added step debug
parent
ebdbb823
Changes
1
Show whitespace changes
Inline
Side-by-side
extract_amendments.rb
View file @
7410766c
...
@@ -22,17 +22,23 @@ class AmendmentExtractor
...
@@ -22,17 +22,23 @@ class AmendmentExtractor
end
end
def
extract
(
opendocument_path
,
options
=
{})
def
extract
(
opendocument_path
,
options
=
{})
debug
"extracting content from document"
xml
=
nil
xml
=
nil
Zippy
.
open
(
opendocument_path
)
do
|
zip
|
Zippy
.
open
(
opendocument_path
)
do
|
zip
|
xml
=
zip
[
'content.xml'
]
xml
=
zip
[
'content.xml'
]
end
end
debug
"parsing document xml"
doc
=
Nokogiri
::
XML
::
Document
.
parse
(
xml
)
doc
=
Nokogiri
::
XML
::
Document
.
parse
(
xml
)
if
options
[
:xml_dump_path
]
if
options
[
:xml_dump_path
]
debug
"dumping xml"
File
.
open
(
options
[
:xml_dump_path
],
"w"
)
{
|
f
|
f
.
write
doc
.
to_xml
(
indent:
2
)
}
File
.
open
(
options
[
:xml_dump_path
],
"w"
)
{
|
f
|
f
.
write
doc
.
to_xml
(
indent:
2
)
}
end
end
debug
"parsing styles"
styles
=
{}
styles
=
{}
doc
.
css
(
"style|style"
).
each
do
|
node
|
doc
.
css
(
"style|style"
).
each
do
|
node
|
name
=
node
[
"style:name"
]
name
=
node
[
"style:name"
]
...
@@ -46,9 +52,13 @@ class AmendmentExtractor
...
@@ -46,9 +52,13 @@ class AmendmentExtractor
styles
[
name
]
=
style
styles
[
name
]
=
style
end
end
debug
"extracting document text"
text
=
doc
.
xpath
(
'//office:text'
).
first
text
=
doc
.
xpath
(
'//office:text'
).
first
raise
"no office:text found"
unless
text
raise
"no office:text found"
unless
text
debug
"extracting amendment nodes"
amend_start
=
nil
amend_start
=
nil
amend_nodes
=
[]
amend_nodes
=
[]
...
@@ -67,6 +77,8 @@ class AmendmentExtractor
...
@@ -67,6 +77,8 @@ class AmendmentExtractor
debug
amendments_found:
amend_nodes
.
length
debug
amendments_found:
amend_nodes
.
length
debug
"extracting info from amendments"
amendments
=
[]
amendments
=
[]
amend_nodes
.
each
do
|
nodes
|
amend_nodes
.
each
do
|
nodes
|
...
@@ -79,6 +91,9 @@ class AmendmentExtractor
...
@@ -79,6 +91,9 @@ class AmendmentExtractor
next
if
options
[
:parse_only_num
]
and
num_am
!=
options
[
:parse_only_num
].
to_s
next
if
options
[
:parse_only_num
]
and
num_am
!=
options
[
:parse_only_num
].
to_s
debug
"parsing amendment
#{
num_am
}
"
doc_amend
=
amend_doc
.
xpath
(
"//DocAmend"
).
first
.
text
doc_amend
=
amend_doc
.
xpath
(
"//DocAmend"
).
first
.
text
article
=
amend_doc
.
xpath
(
"//Article"
).
first
.
text
article
=
amend_doc
.
xpath
(
"//Article"
).
first
.
text
...
@@ -89,17 +104,22 @@ class AmendmentExtractor
...
@@ -89,17 +104,22 @@ class AmendmentExtractor
}
}
debug
amendment
debug
amendment
debug
"parsing amendment table"
tables
=
nodes
.
css
(
'table|table'
)
tables
=
nodes
.
css
(
'table|table'
)
raise
"amendment table not found"
if
tables
.
size
==
0
raise
"amendment table not found"
if
tables
.
size
==
0
raise
"too many tables"
if
tables
.
size
>
1
raise
"too many tables"
if
tables
.
size
>
1
table
=
tables
.
first
table
=
tables
.
first
debug
"converting table nodes to text"
text_table
=
table
.
css
(
"table|table-row"
).
map
do
|
row
|
text_table
=
table
.
css
(
"table|table-row"
).
map
do
|
row
|
row
.
css
(
"table|table-cell"
).
map
do
|
cell
|
row
.
css
(
"table|table-cell"
).
map
do
|
cell
|
cell
.
css
(
"text|p"
).
map
do
|
paragraph
|
cell
.
css
(
"text|p"
).
map
do
|
paragraph
|
paragraph
.
children
.
map
do
|
element
|
paragraph
.
children
.
map
do
|
element
|
text
=
element
.
text
text
=
element
.
text
# add mediawiki triple quote if the text is bold in the document
if
element
.
is_a?
Nokogiri
::
XML
::
Element
and
element
.
name
==
'span'
if
element
.
is_a?
Nokogiri
::
XML
::
Element
and
element
.
name
==
'span'
style_name
=
element
[
"text:style-name"
]
style_name
=
element
[
"text:style-name"
]
if
style_name
and
styles
[
style_name
][
:bold
]
if
style_name
and
styles
[
style_name
][
:bold
]
...
@@ -114,9 +134,13 @@ class AmendmentExtractor
...
@@ -114,9 +134,13 @@ class AmendmentExtractor
end
end
debug
text_table:
text_table
debug
text_table:
text_table
debug
"locating header"
header_index
=
text_table
.
index
([
"Text proposed by the Commission"
,
"Amendment"
])
header_index
=
text_table
.
index
([
"Text proposed by the Commission"
,
"Amendment"
])
raise
"first row not found in table of amendment
#{
num_am
}
"
unless
header_index
raise
"header not found in table of amendment
#{
num_am
}
"
unless
header_index
debug
"extracting changes from table"
changes
=
text_table
[(
header_index
+
1
)
..-
1
]
changes
=
text_table
[(
header_index
+
1
)
..-
1
]
raise
"amendment changes not found"
if
changes
.
size
==
0
raise
"amendment changes not found"
if
changes
.
size
==
0
...
@@ -128,6 +152,7 @@ class AmendmentExtractor
...
@@ -128,6 +152,7 @@ class AmendmentExtractor
break
if
options
[
:parse_only_one
]
break
if
options
[
:parse_only_one
]
end
end
debug
"rendering amendments"
template
=
ERB
.
new
File
.
read
(
'template.erb'
),
nil
,
'-'
template
=
ERB
.
new
File
.
read
(
'template.erb'
),
nil
,
'-'
result
=
[]
result
=
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment