Skip to content

Commit

Permalink
Changes dependency of HTML Importer from Mechanize to just Nokogiri
Browse files Browse the repository at this point in the history
  • Loading branch information
athityakumar committed Sep 2, 2017
1 parent fd08213 commit 27a7cd3
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 33 deletions.
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ group :optional do
gem 'dbd-sqlite3'
gem 'dbi'
gem 'jsonpath'
gem 'mechanize'
gem 'mongo'
gem 'nokogiri'
gem 'redis'
gem 'roo', '~> 2.7.0'
gem 'rsruby'
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ Imports a **Daru::DataFrame** from a **.xlsx** file.
Imports an **Array** of **Daru::DataFrame**s from a **.html** file or website.

- **Docs**: [rubydoc.info](http://www.rubydoc.info/github/athityakumar/daru-io/master/Daru/IO/Importers/HTML)
- **Gem Dependencies**: `mechanize` gem
- **Gem Dependencies**: `nokogiri` gem
- **Usage**:
```ruby
#! Partially require just HTML Importer
Expand Down
37 changes: 22 additions & 15 deletions lib/daru/io/importers/html.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ class HTML < Base

# Checks for required gem dependencies of HTML Importer
def initialize
optional_gem 'mechanize'
require 'open-uri'
optional_gem 'nokogiri'
end

# Reads from an HTML file / website
Expand All @@ -29,7 +30,7 @@ def initialize
# @example Reading from a website URL
# instance = Daru::IO::Importers::HTML.read('http://www.moneycontrol.com/')
def read(path)
@file_data = Mechanize.new.get(path)
@file_data = Nokogiri.parse(open(path).read)
self
end

Expand Down Expand Up @@ -72,25 +73,23 @@ def read(path)
# # 3 ITC 315.85 6.75 621.12
# # 4 HDFC 1598.85 50.95 553.91
def call(match: nil, order: nil, index: nil, name: nil)
@match = match
@options = {name: name, order: order, index: index}
@match = match
@options = {name: name, index: index, order: order}

@file_data
.search('table').map { |table| parse_table table }
.keep_if { |table| search table }
.search('table')
.map { |table| parse_table(table) }
.compact
.map { |table| decide_values table, @options }
.map { |table| table_to_dataframe table }
.keep_if { |table| satisfy_dimension(table) && search(table) }
.map { |table| decide_values(table, @options) }
.map { |table| table_to_dataframe(table) }
end

private

# Allows user to override the scraped order / index / data
def decide_values(scraped_val={}, user_val={})
%I[data index name order].each do |key|
user_val[key] ||= scraped_val[key]
end
user_val
def decide_values(scraped_val, user_val)
scraped_val.merge(user_val) { |_key, scraped, user| user || scraped }
end

# Splits headers (all th tags) into order and index. Wherein,
Expand Down Expand Up @@ -121,15 +120,23 @@ def scrape_tag(table, tag)
[arr, size]
end

# Checks whether a scraped table is compatible with the user-supplied
# dimensions stored in @options.
#
# A table is rejected when the user gave an :order whose size differs from
# the number of cells in the table's first data row, or gave an :index whose
# size differs from the number of data rows. Options that are nil are not
# checked.
#
# @param table [Hash] scraped table; only table[:data] (an Array of rows)
#   is read here
# @return [Boolean] true when the table fits every user-supplied dimension
def satisfy_dimension(table)
  rows  = table[:data]
  order = @options[:order]
  index = @options[:index]

  if order
    # A table without any data rows has no first row; count it as zero
    # columns rather than calling #size on nil.
    first_row    = rows.first
    column_count = first_row ? first_row.size : 0
    return false if column_count != order.size
  end

  return false if index && rows.size != index.size
  true
end

def search(table)
@match.nil? ? true : (table.to_s.include? @match)
@match.nil? ? true : table.to_s.include?(@match)
end

def table_to_dataframe(table)
Daru::DataFrame.rows table[:data],
Daru::DataFrame.rows(
table[:data],
index: table[:index],
order: table[:order],
name: table[:name]
)
end
end
end
Expand Down
30 changes: 15 additions & 15 deletions spec/daru/io/importers/html_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
let(:df_index) { 0 }

context 'in wiki info table' do
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/wiki_table_info.html" }
let(:order) { %w[FName LName Age] }
let(:index) { %w[One Two Three Four Five Six Seven] }
let(:name) { 'Wikipedia Information Table' }
let(:path) { 'spec/fixtures/html/wiki_table_info.html' }
let(:order) { %w[FName LName Age] }
let(:index) { %w[One Two Three Four Five Six Seven] }
let(:name) { 'Wikipedia Information Table' }

context 'returns default dataframe' do
it_behaves_like 'exact daru dataframe',
Expand Down Expand Up @@ -40,7 +40,7 @@
end

context 'in wiki climate data' do
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/wiki_climate.html" }
let(:path) { 'spec/fixtures/html/wiki_climate.html' }

context 'returns default dataframe' do
it_behaves_like 'exact daru dataframe',
Expand All @@ -58,9 +58,9 @@
end

context 'with valid html table markups' do
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/valid_markup.html" }
let(:index) { %w[W X Y Z] }
let(:name) { 'Small HTML table with index' }
let(:path) { 'spec/fixtures/html/valid_markup.html' }
let(:index) { %w[W X Y Z] }
let(:name) { 'Small HTML table with index' }

context 'returns user-modified dataframe' do
let(:opts) { {index: index, name: name} }
Expand All @@ -76,9 +76,9 @@
end

context 'in year-wise passengers figure' do
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/macau.html" }
let(:match) { '2001' }
let(:name) { 'Year-wise Passengers Figure' }
let(:path) { 'spec/fixtures/html/macau.html' }
let(:match) { '2001' }
let(:name) { 'Year-wise Passengers Figure' }

context 'returns matching dataframes with index' do
let(:opts) { {match: match, name: name} }
Expand Down Expand Up @@ -108,9 +108,9 @@
end

context 'in share market data' do
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/moneycontrol.html" }
let(:match) { 'Sun Pharma' }
let(:index) { %w[Alpha Beta Gamma Delta Misc] }
let(:path) { 'spec/fixtures/html/moneycontrol.html' }
let(:match) { 'Sun Pharma' }
let(:index) { %w[Alpha Beta Gamma Delta Misc] }
let(:name) { 'Share Market Analysis' }

context 'returns matching dataframes' do
Expand Down Expand Up @@ -149,7 +149,7 @@
end

context 'in election results data' do
let(:path) { "file://#{Dir.pwd}/spec/fixtures/html/eciresults.html" }
let(:path) { 'spec/fixtures/html/eciresults.html' }

context 'returns default dataframes' do
it_behaves_like 'exact daru dataframe',
Expand Down
2 changes: 1 addition & 1 deletion spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
require 'redis'
require 'dbi'
require 'jsonpath'
require 'mechanize'
require 'nokogiri'
require 'mongo'
require 'spreadsheet'
require 'sqlite3'
Expand Down

0 comments on commit 27a7cd3

Please sign in to comment.