crawler.cpp
#include <bits/stdc++.h>
#include <unordered_map>
#include <curl/curl.h> // libcurl, for downloading web pages
#include <stdio.h>
#include <regex>       // regular expressions, for extracting and validating links
#include <thread>
#include <mutex>       // to avoid race conditions between crawler threads
#include <chrono>      // time-related utilities (used for the politeness delay)
using namespace std;
mutex m;
// download the HTML of a URL into the given file
void get_page(const char* url, const char* file_name)
{
    CURL* easyhandle = curl_easy_init();  // initialize the curl easy handle
    if (!easyhandle) return;              // bail out if libcurl failed to initialize
    FILE* file = fopen(file_name, "w");   // open the output file in write mode
    if (!file) { curl_easy_cleanup(easyhandle); return; } // could not open the file
    curl_easy_setopt(easyhandle, CURLOPT_URL, url);        // set the URL for the HTTP request
    curl_easy_setopt(easyhandle, CURLOPT_WRITEDATA, file); // response body is written to this FILE*
    curl_easy_perform(easyhandle);  // perform the actual HTTP request
    curl_easy_cleanup(easyhandle);  // free the handle and any associated resources
    fclose(file);
}
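// Illustrative sketch (not used above): the disk round-trip can be avoided by
// capturing the response straight into a std::string via CURLOPT_WRITEFUNCTION.
// The helper names below are hypothetical; the libcurl options are standard.
static size_t write_to_string(char* ptr, size_t size, size_t nmemb, void* userdata)
{
    string* out = static_cast<string*>(userdata);
    out->append(ptr, size * nmemb); // libcurl delivers the body in chunks
    return size * nmemb;            // returning fewer bytes signals an error to libcurl
}
string get_page_as_string(const char* url)
{
    string body;
    CURL* handle = curl_easy_init();
    if (!handle) return body;
    curl_easy_setopt(handle, CURLOPT_URL, url);
    curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, write_to_string);
    curl_easy_setopt(handle, CURLOPT_WRITEDATA, &body); // userdata passed to the callback
    curl_easy_perform(handle);
    curl_easy_cleanup(handle);
    return body;
}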
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------
// extract all hyperlinks from the saved HTML file
vector<string> extract_hyperlinks(string html_file_name)
{
    string html; // the HTML read from the file
    ifstream read(html_file_name);
    char ch;
    while (read.get(ch))  // get() fails at EOF, so no separate eof() check is needed
        html.push_back(ch);
    read.close();
    // hl_regex matches <a href="..."> tags and captures the URL; icase makes it case-insensitive
    static const regex hl_regex("<a href=\"(.*?)\">", regex_constants::icase);
    vector<string> links; // all links read from the file
    // copy every first capture group (the href value) into the vector
    copy(sregex_token_iterator(html.begin(), html.end(), hl_regex, 1),
         sregex_token_iterator(), back_inserter(links));
    return links;
}
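// Illustrative variant (an assumption, not used above): the pattern in
// extract_hyperlinks only matches double-quoted hrefs with no other attributes
// before href. A broader regex that also accepts single quotes and leading
// attributes could look like this:
vector<string> extract_hyperlinks_broad(const string& html)
{
    static const regex hl_regex_broad(
        "<a\\s+[^>]*?href\\s*=\\s*['\"]([^'\"]*)['\"]", regex_constants::icase);
    vector<string> links;
    copy(sregex_token_iterator(html.begin(), html.end(), hl_regex_broad, 1),
         sregex_token_iterator(), back_inserter(links));
    return links; // would also capture links such as <a class="nav" href='/about'>
}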
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
// validate the links, keeping only well-formed absolute URLs
void cleanUp(vector<string>& all_links)
{
    vector<string> final_links; // will hold only the valid links
    for (size_t i = 0; i < all_links.size(); i++)
    {
        string one_link = all_links[i]; // pick one link
        string cleaned_link;            // the link with trailing attribute junk removed
        for (size_t j = 0; j < one_link.length(); j++)
        {
            if (one_link[j] == ' ' || one_link[j] == '"') break; // stop at a space or a double quote
            cleaned_link.push_back(one_link[j]);                 // append character by character
        }
        // after trimming, keep the link only if it looks like an absolute http/https URL
        if (regex_match(cleaned_link, regex("((http|https)://)(www\\.)?[a-zA-Z0-9@:%._\\+~#?&//=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%._\\+~#?&//=]*)"))) {
            final_links.push_back(cleaned_link);
        }
    }
    all_links = final_links; // replace the original vector with the cleaned one
}
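// Worked example (hypothetical values): given
//   { "https://example.com/x\" id=\"top", "/relative/path" }
// the inner loop truncates the first entry at the '"' to "https://example.com/x",
// which matches the URL regex and is kept; "/relative/path" fails regex_match
// (no http/https scheme) and is dropped.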
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
unordered_map<string, bool> visited; // marks URLs already seen during the DFS (average O(1) lookup)
void dfs_crawler(const char* url, const char* file_path, int depth, int bot_id)
{
    if (depth == 4) return; // stop at the depth limit of 4
    string s = url;
    m.lock();               // serialize access to visited and to the shared output file
    if (visited[s]) { m.unlock(); return; } // skip a URL another thread already claimed
    visited[s] = true;      // mark the current URL as visited
    clock_t startTime = clock();
    get_page(url, file_path);                                    // download the HTML of the current URL
    vector<string> allLinksData = extract_hyperlinks(file_path); // collect all hyperlinks from it
    cleanUp(allLinksData);                                       // keep only valid absolute URLs
    m.unlock();             // unlock the mutex
    clock_t stopTime = clock();
    float elapsedTime = (float)(stopTime - startTime) / CLOCKS_PER_SEC;
    cout << "Time taken by thread " << bot_id << " to fetch and parse the page: " << elapsedTime << " seconds" << endl;
    cout << "Thread_id: " << bot_id << "\tLink: " << url << endl << endl; // print the URL and thread id
    for (string i : allLinksData) // depth-first recursion over the extracted links
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(10)); // brief pause so the server is not overloaded
        if (visited[i] != 1) { // cheap pre-check; the recursive call re-checks under the lock
            dfs_crawler(i.c_str(), file_path, depth + 1, bot_id);
        }
    }
}
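// Minimal sketch (an assumption, not wired into dfs_crawler above): the manual
// lock()/unlock() pair can be replaced with RAII so the mutex is released on
// every return path, even if an exception is thrown mid-section.
bool mark_visited(const string& url)
{
    lock_guard<mutex> guard(m);     // locks m here, unlocks automatically on return
    if (visited[url]) return false; // some other thread already claimed this URL
    visited[url] = true;            // claim it atomically w.r.t. the other crawlers
    return true;                    // caller is free to crawl this URL
}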
//---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
int main()
{
    const char* filename = "1.txt"; // shared file where each fetched page's HTML is stored
    thread t1(dfs_crawler, "https://www.iitism.ac.in/", filename, 0, 1);
    thread t2(dfs_crawler, "https://en.wikipedia.org/wiki/Main_page", filename, 0, 2);
    thread t3(dfs_crawler, "https://codeforces.com/", filename, 0, 3);
    t1.join(); // join() makes the main thread wait for each crawler thread,
    t2.join(); // so the program does not terminate while crawls are still running
    t3.join();
    return 0;
}
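// Build sketch (assumes the libcurl development package is installed):
//   g++ -std=c++17 crawler.cpp -lcurl -pthread -o crawler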