-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.rb
58 lines (51 loc) · 1.51 KB
/
crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
require 'rest_client'
require 'nokogiri'
require 'pry'
# --- Crawl configuration and shared state -----------------------------------

# Page fetched first by the script body. NOTE(review): the literal "url" looks
# like a placeholder -- confirm the real start URL before running.
url = "url"

# Only links whose text contains this string are fetched by recursive_method.
$host = "host.com"

# Keywords that make write_to_file emit a page under "Products: " / "Services: ".
$pro_sol_fea = %w[Products Solutions Features]
$services = %w[Services]

# Every sub-link discovered so far; used to avoid queueing the same link twice.
$all_links = []

# When true, a pry session is opened whenever fetching/parsing a link fails.
$debugger = false

# Output file shared by every write_to_file call (truncated on start).
$file = File.new("data.txt", "w")
# Crawls +links+ depth-first, fetching only links that contain $host.
#
# For every fetched page the page text is handed to write_to_file, and the
# page's not-yet-seen anchor targets are appended to $all_links and visited
# before the next sibling link (same visiting order as a recursive walk, but
# driven by an explicit stack so a long chain of pages cannot exhaust the
# Ruby call stack).
#
# Failures while fetching or parsing a single link are reported on stdout and
# the crawl continues with the next link. Only StandardError is rescued, so
# Ctrl-C (Interrupt) and exit requests still stop the script.
#
# @param links [Array<String>] URLs to visit, in order.
# @return [Array<String>] the +links+ argument, unchanged.
def recursive_method(links)
  # Stack of links still to visit; reversed so that pop yields them in order.
  pending = links.to_a.reverse

  until pending.empty?
    href = pending.pop
    puts href
    next unless href.include?($host)

    begin
      page = Nokogiri::HTML(RestClient.get(href))
      # Pass the page text (not the document object) so keyword matching in
      # write_to_file works the same way as for the entry page.
      write_to_file(href, page.text)

      # node["href"] is nil for anchors without an href; drop those instead of
      # letting one bare <a> discard every link on the page.
      hrefs = page.xpath("//a").map { |anchor| anchor["href"] }.compact.uniq
      sub_links = hrefs - $all_links
      puts "Sub Links: #{sub_links.size}"
      $all_links += sub_links

      # Visit this page's new links before the remaining siblings.
      pending.concat(sub_links.reverse)
    rescue StandardError => e
      puts e
      puts "Invalid link: #{href}"
      binding.pry if $debugger
    end
  end

  links
end
# Appends +data+ to $file once for each keyword group it matches.
#
# A page mentioning any of $pro_sol_fea is written under a "Products: "
# heading; a page mentioning any of $services is written under "Services: ".
# A page matching both groups is written twice; a page matching neither is
# not written at all.
#
# @param url  [String] address the data came from (written as the first line).
# @param data [String] page content to search for keywords and to store.
def write_to_file(url, data)
  sections = { "Products: \n" => $pro_sol_fea, "Services: \n" => $services }

  sections.each do |heading, keywords|
    next unless keywords.any? { |keyword| data.include?(keyword) }

    [url + "\n", heading, data, "\n"].each { |chunk| $file.write(chunk) }
  end
end
# Script body: fetch the start page, record it, then crawl every link on it.
begin
  raw_data = RestClient.get(url)
  parsed_data = Nokogiri::HTML(raw_data)
  write_to_file(url, parsed_data.text)

  # node["href"] is nil for anchors without an href; skip those rather than
  # crashing, and visit each distinct target only once.
  parent_links = parsed_data.xpath("//a").map { |link| link["href"] }.compact.uniq

  # Register the start page's links as already seen so the crawl does not
  # queue them again and so they are included in the final count.
  $all_links |= parent_links

  recursive_method(parent_links)
ensure
  # Flush and release data.txt even if the crawl aborts.
  $file.close
end

puts "Total links: #{$all_links.size}"

# Drop into an interactive session only when debugging is switched on,
# matching how recursive_method uses the $debugger flag.
binding.pry if $debugger