This repository has been archived by the owner on May 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.php
111 lines (79 loc) · 2.56 KB
/
index.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
<?php
# Crawl seed: set this to the URL of a page that contains links.
$start = '';
# $already_crawled collects every link that has been discovered so far;
# $crawling is the working list that follow_links() appends to and shifts from.
$crawling = $already_crawled = [];
/**
 * Downloads a page and describes it as one JSON object followed by a comma.
 *
 * The object carries the text of the first <title> element, the content of the
 * "description" and "keywords" <meta> tags (the last one wins when a tag is
 * repeated) and the URL itself. Line feeds are stripped from the three text
 * fields. When the page cannot be fetched or is empty, the three text fields
 * are empty strings and only the URL is filled in.
 *
 * @param string $url Address of the page to describe.
 * @return string Text of the form
 *                { "Title": "...", "Description": "...", "Keywords": "...", "URL": "..."},
 */
function get_details($url)
{
    # The HTTP context option is called "header"; under the old "headers" key
    # the User-Agent line was ignored and never sent.
    $options = array('http' => array('method' => "GET", 'header' => "User-Agent: ringwormGO-Spidey/0.1\r\n"));
    $context = stream_context_create($options);

    $title = "";
    $description = "";
    $keywords = "";

    # PHP 8 throws a ValueError for an empty path, so do not attempt the fetch.
    # The @ keeps a failed download best-effort; the result is checked below.
    $html = ((string) $url === "") ? false : @file_get_contents($url, false, $context);

    # loadHTML() throws a ValueError on an empty source in PHP 8, so only parse
    # when something was actually downloaded.
    if ($html !== false && $html !== "") {
        $doc = new DOMDocument();
        # Collect markup errors internally instead of emitting warnings, then
        # restore whatever error mode the caller had.
        $previous = libxml_use_internal_errors(true);
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        $first_title = $doc->getElementsByTagName("title")->item(0);
        if ($first_title !== null) {
            $title = (string) $first_title->nodeValue;
        }

        foreach ($doc->getElementsByTagName("meta") as $meta) {
            $name = strtolower($meta->getAttribute("name"));
            if ($name === "description") {
                $description = $meta->getAttribute("content");
            }
            if ($name === "keywords") {
                $keywords = $meta->getAttribute("content");
            }
        }
    }

    # json_encode() quotes and escapes each value, so quotes, backslashes and
    # control characters in page text can no longer break the record.
    $encode = function ($value) {
        $json = json_encode((string) $value, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
        # Encoding fails on invalid UTF-8; fall back to an empty JSON string.
        return $json === false ? '""' : $json;
    };

    return '{ "Title": '.$encode(str_replace("\n", "", $title))
        .', "Description": '.$encode(str_replace("\n", "", $description))
        .', "Keywords": '.$encode(str_replace("\n", "", $keywords))
        .', "URL": '.$encode($url).'},';
}
/**
 * Removes "." and ".." segments from an absolute URL path (one starting with "/").
 *
 * @param string $path Path component without query or fragment.
 * @return string The same path with the dot segments collapsed.
 */
function remove_dot_segments($path)
{
    $segments = explode("/", $path);
    $last = count($segments) - 1;
    $kept = [];
    foreach ($segments as $i => $segment) {
        if ($segment === "..") {
            # Never climb above the root, i.e. the leading empty segment.
            if (count($kept) > 1) {
                array_pop($kept);
            }
        } else if ($segment !== ".") {
            $kept[] = $segment;
            continue;
        }
        # A trailing "." or ".." names a directory, so keep the final slash.
        if ($i === $last) {
            $kept[] = "";
        }
    }
    return implode("/", $kept);
}

/**
 * Turns an href found on the page $base_url into an absolute URL.
 *
 * @param string $base_url URL of the page the link was found on.
 * @param string $href     Raw value of the anchor's href attribute.
 * @return string|null Absolute URL, or null when the link should not be crawled
 *                     (empty, fragment-only, or a scheme other than http/https
 *                     such as javascript:, mailto: or tel:).
 */
function resolve_link($base_url, $href)
{
    $href = trim($href);

    # A fragment never changes which document is fetched, so drop it; a link
    # that was only a fragment points back at the page being parsed.
    $hash = strpos($href, "#");
    if ($hash !== false) {
        $href = substr($href, 0, $hash);
    }
    if ($href === "") {
        return null;
    }

    # Links that carry their own scheme are either crawlable as they are
    # (http/https) or not crawlable at all.
    if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $match)) {
        $scheme = strtolower($match[1]);
        return ($scheme === "http" || $scheme === "https") ? $href : null;
    }

    $base = parse_url($base_url);
    if ($base === false) {
        return null;
    }

    # Protocol-relative link: reuse the scheme of the current page.
    if (substr($href, 0, 2) === "//") {
        return (isset($base["scheme"]) ? $base["scheme"] : "http").":".$href;
    }

    # Scheme, host and port of the current page; the port must be kept or links
    # on a non-default port would resolve to the wrong server.
    $origin = "";
    if (isset($base["scheme"])) {
        $origin .= $base["scheme"]."://";
    }
    if (isset($base["host"])) {
        $origin .= $base["host"];
        if (isset($base["port"])) {
            $origin .= ":".$base["port"];
        }
    }

    # Keep the query out of the dot-segment handling.
    $query = "";
    $question = strpos($href, "?");
    if ($question !== false) {
        $query = substr($href, $question);
        $href = substr($href, 0, $question);
    }

    $base_path = (isset($base["path"]) && $base["path"] !== "") ? $base["path"] : "/";
    if ($href === "") {
        # Query-only link: same document, different query string.
        $path = $base_path;
    } else if ($href[0] === "/") {
        $path = $href;
    } else {
        # Relative link: resolve against the directory of the current page.
        $slash = strrpos($base_path, "/");
        $path = ($slash === false ? "" : substr($base_path, 0, $slash + 1)).$href;
    }

    return $origin.remove_dot_segments($path).$query;
}

/**
 * Downloads a page and returns the raw href value of every <a> element on it.
 *
 * @param string   $url     Page to download.
 * @param resource $context Stream context used for the request.
 * @return array List of href strings; empty when the page cannot be fetched.
 */
function fetch_hrefs($url, $context)
{
    # PHP 8 throws a ValueError for an empty path, so do not attempt the fetch.
    if ((string) $url === "") {
        return [];
    }
    # The @ keeps a failed download best-effort; the result is checked below.
    $html = @file_get_contents($url, false, $context);
    # loadHTML() throws a ValueError on an empty source in PHP 8.
    if ($html === false || $html === "") {
        return [];
    }

    $doc = new DOMDocument();
    # Collect markup errors internally instead of emitting warnings, then
    # restore whatever error mode the caller had.
    $previous = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    $hrefs = [];
    foreach ($doc->getElementsByTagName("a") as $anchor) {
        $hrefs[] = $anchor->getAttribute("href");
    }
    return $hrefs;
}

/**
 * Crawls outward from $url.
 *
 * Every link that has not been seen before is appended to the global
 * $already_crawled and $crawling lists and its get_details() record is echoed
 * on its own line. Pages are then taken from the front of $crawling one at a
 * time until the list is empty, so the function returns once no undiscovered
 * link is left and leaves $crawling empty.
 *
 * @param string $url Page to start from.
 * @return void
 */
function follow_links($url) {
    global $already_crawled;
    global $crawling;
    # `global` yields null when the caller never initialised the lists.
    if (!is_array($already_crawled)) {
        $already_crawled = [];
    }
    if (!is_array($crawling)) {
        $crawling = [];
    }

    # The HTTP context option is called "header"; under the old "headers" key
    # the User-Agent line was ignored and never sent.
    $options = ['http' => ['method' => "GET", 'header' => "User-Agent: ringwormGO-Spidey/0.1\r\n"]];
    $context = stream_context_create($options);

    # Keyed lookups keep the duplicate check constant-time per link.
    $seen = array_fill_keys($already_crawled, true);
    # Pages whose links were already extracted, so a link back to the start
    # page does not trigger a second download of it.
    $followed = [];

    # Iterating over the queue replaces the old self-recursion, which restarted
    # at the first known URL on every call and therefore never terminated.
    $page = $url;
    while ($page !== null) {
        if (!isset($followed[$page])) {
            $followed[$page] = true;
            foreach (fetch_hrefs($page, $context) as $href) {
                $l = resolve_link($page, $href);
                if ($l === null || isset($seen[$l])) {
                    continue;
                }
                $seen[$l] = true;
                $already_crawled[] = $l;
                $crawling[] = $l;
                echo get_details($l)."\n";
            }
        }
        # array_shift() returns null once the queue is empty, ending the loop.
        $page = array_shift($crawling);
    }
}
# Crawl from the configured start page, then dump every link that was found.
# An unset or empty $start is skipped: fetching an empty path is a fatal
# ValueError on PHP 8, and there is nothing to crawl anyway.
if (isset($start) && $start !== "") {
    follow_links($start);
} else {
    # php://stderr keeps the hint out of the crawl output on stdout.
    file_put_contents("php://stderr", "Set \$start to the URL of a page with links before running the crawler.\n");
}
print_r($already_crawled);