spider.py
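"""Recursively spider one or more domains, extract the visible text from
every same-domain page, and save each page's text to its own file in an
output directory."""
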
import requests
from bs4 import BeautifulSoup
import os
import time
import argparse
import shutil

# ANSI escape code variables for console coloring
ANSI_RESET = "\u001b[0m"
ANSI_GREEN = "\u001b[32m"
ANSI_RED = "\u001b[31m"
ANSI_CYAN = "\u001b[36m"
ANSI_YELLOW = "\u001b[33m"


def spider_site(url, output_directory, pause_time, visited_urls):
    # Add the scheme if the URL doesn't already have one
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'https://' + url
    # Skip URLs that have already been visited
    if url in visited_urls:
        return
    # Add the URL to the visited URLs set
    visited_urls.add(url)
    # Get the HTML content of the URL
    try:
        print(f"{ANSI_CYAN}Requesting content for {url}...{ANSI_RESET}")
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception if the request was unsuccessful
        html = response.content
        print(f"{ANSI_GREEN}Successfully retrieved content for {url}{ANSI_RESET}")
    except requests.RequestException as e:
        print(f"{ANSI_RED}Error getting content for {url}: {e}{ANSI_RESET}")
        return
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    # Extract all the text on the page
    text = soup.get_text()
    # Remove any blank lines
    text = os.linesep.join([s for s in text.splitlines() if s.strip()])
    # Build a filesystem-safe filename from the URL, prefixed with "(reader)"
    # when the page exposes an article or reader view
    filename = url.replace('://', ' - ').replace('/', ' - ')
    if soup.find_all(class_='article') or soup.find_all(class_='reader'):
        filename = f"(reader) {filename}"
    filename = f"url - {filename}.txt"
    # Save the text to a file in the output directory
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    try:
        with open(os.path.join(output_directory, filename), 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"{ANSI_CYAN}Scanned and saved {url} to {os.path.join(output_directory, filename)}{ANSI_RESET}")
    except Exception as e:
        print(f"{ANSI_RED}Error saving file for {url}: {e}{ANSI_RESET}")
    # Find all links on the page
    links = soup.find_all('a')
    # Iterate through the links
    for link in links:
        # Get the href attribute of the link (None when the <a> tag has no href)
        href = link.get('href')
        # Follow only absolute URLs on the same domain
        if href and href.startswith('http') and url.split('/')[2] in href:
            # Recursively spider the linked page after a pause
            time.sleep(pause_time)
            spider_site(href, output_directory, pause_time, visited_urls)
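

# Note: the same-domain test in spider_site (url.split('/')[2] in href) is a
# plain substring match, so it can also match URLs that merely contain the
# domain. A stricter check could compare parsed hostnames instead; a minimal
# sketch, not part of the original script:
#
#   from urllib.parse import urlparse
#   def same_domain(a, b):
#       return urlparse(a).netloc == urlparse(b).netloc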


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Recursively spider a list of domains, extract all text, and save it to files in an output directory.')
    parser.add_argument('-d', '--domains', nargs='+', help='List of domains to spider', required=True)
    parser.add_argument('-o', '--output', help='Output directory to save text files to', default='output')
    parser.add_argument('-p', '--pause', type=int, default=1, help='Time to pause between spidering each page (in seconds)')
    parser.add_argument('-c', '--clean', action='store_true', help='Remove existing files in the output directory')
    args = parser.parse_args()
    # Remove existing files in the output directory if the clean flag is set
    if args.clean and os.path.exists(args.output):
        shutil.rmtree(args.output)
    # Spider each domain with its own set of visited URLs
    for domain in args.domains:
        visited_urls = set()  # Visited URLs are tracked per domain
        print(f"{ANSI_YELLOW}Spidering site: {domain}{ANSI_RESET}")
        spider_site(f"https://{domain}", args.output, args.pause, visited_urls)
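
# Example usage (illustrative; substitute your own domains):
#   python spider.py -d example.com docs.example.org -o output -p 2 --clean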