Commit e6c73c9c authored by Michael Witrant

parse generated files

parent 242ea49d
...@@ -2,24 +2,14 @@ ...@@ -2,24 +2,14 @@
require "rubygems" require "rubygems"
require "bundler/setup" require "bundler/setup"
require "nokogiri" files = %w(organisations.links public_authorities.links)
files = %w(organisations.html public_authorities.html)
file = files.first file = files.first
doc = Nokogiri::HTML(File.read(file))
links = doc.css("a").map do |link|
href =link["href"]
href if href =~ /\.pdf$/i
end.compact.uniq { |url| URI.parse(url).path }
parsed_names = [] links = File.read(file).split("\n")
links.each do |url| links.each do |url|
name = File.basename(URI.parse(url).path, ".pdf") name = File.basename(URI.parse(url).path, ".pdf")
next if parsed_names.include?(name)
parsed_names << name
names = name.split("_") names = name.split("_")
language = names.pop language = names.pop
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment