crawler_0-0-1.py
import sys
import os
from collections import deque
from urllib.parse import urljoin, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup

# Assuming the 'webpage2markdown_callable' module is correctly set up in the
# 'web2markdown' directory alongside this script.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'web2markdown'))
from webpage2markdown_callable import convert_url_to_markdown


def get_domain_name(url):
    """
    Extract the full domain name from a URL, used to name the output folder
    and the URL list file.
    """
    parsed_url = urlparse(url)
    return parsed_url.netloc


def is_valid_url(url):
    """
    Keep only URLs that belong to the Chroma documentation site.
    """
    return url.startswith("https://docs.trychroma.com/")


def get_base_url(url):
    """
    Return the scheme and host of a URL, e.g. "https://docs.trychroma.com".
    """
    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}"


def strip_fragment(url):
    """
    Remove the fragment from a URL, preserving other components including parameters.
    """
    parsed_url = urlparse(url)
    return urlunparse(parsed_url._replace(fragment=""))
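
# For illustration: strip_fragment("https://docs.trychroma.com/guides#embeddings")
# returns "https://docs.trychroma.com/guides", so anchor links on the same
# page collapse to a single crawl target.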


def get_links(url):
    """
    Fetch all valid links from a given URL, excluding anchor links to the
    same page.
    """
    urls = set()
    try:
        # A timeout keeps the crawler from hanging on an unresponsive page.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Hrefs are resolved against the site root; this assumes the docs
            # site uses root-relative or absolute links.
            base_url = get_base_url(url)
            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = strip_fragment(urljoin(base_url, href))
                if is_valid_url(full_url):
                    urls.add(full_url)
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    return urls
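
# For illustration (hypothetical result, assuming the site is reachable):
#   get_links("https://docs.trychroma.com/")
#   -> {"https://docs.trychroma.com/getting-started", ...}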


def append_urls_to_file(urls, filename):
    """
    Append a list of URLs to a file.
    """
    with open(filename, 'a') as file:
        for url in urls:
            file.write(url + '\n')


def crawl_website(start_url, url_filename):
    """
    Crawl a website breadth-first from a given start URL, following only
    valid links and recording each discovered URL in url_filename.
    """
    visited = set()
    queue = deque([start_url])
    while queue:
        current_url = strip_fragment(queue.popleft())
        if current_url not in visited:
            visited.add(current_url)
            print(f"Crawling: {current_url}")
            for new_url in get_links(current_url):
                if new_url not in visited:
                    queue.append(new_url)
                    append_urls_to_file([new_url], url_filename)
    return visited
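
# Note: a URL can be enqueued more than once before it is first visited; the
# 'visited' check at dequeue time keeps it from being crawled twice, though
# it may then appear more than once in the URL list file.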


def convert_and_save(urls, output_dir):
    """
    Convert each URL in a list to Markdown and save the output in the
    specified directory.
    """
    for url in urls:
        print(f"Converting: {url}")
        # Derive the output filename from the last path segment, falling
        # back to "index" for the site root.
        filename = os.path.basename(strip_fragment(url)).replace('.html', '') or "index"
        filename += ".md"
        output_path = os.path.join(output_dir, filename)
        convert_url_to_markdown(url, output_path)
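
# Caveat: only the last path segment is used, so distinct pages that share a
# final segment (or several URLs ending in "/") map to the same .md file and
# overwrite one another.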


def main():
    """
    Main function to initiate the crawling and conversion process.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    start_url = "https://docs.trychroma.com/"
    domain_name = get_domain_name(start_url)
    # The output folder and URL list file are both named after the domain.
    output_dir = os.path.join(script_dir, domain_name)
    os.makedirs(output_dir, exist_ok=True)
    url_filename = os.path.join(output_dir, f"{domain_name}_urls.txt")
    # Ensure the URL list file exists before the crawler appends to it.
    with open(url_filename, 'a'):
        pass
    urls = crawl_website(start_url, url_filename)
    convert_and_save(urls, output_dir)


if __name__ == '__main__':
    main()
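
# Usage sketch (assumes the web2markdown helper sits next to this script and
# the machine is online):
#
#   python crawler_0-0-1.py
#
# This crawls https://docs.trychroma.com/, writes the discovered URLs to
# docs.trychroma.com/docs.trychroma.com_urls.txt, and saves one Markdown
# file per page in the docs.trychroma.com/ folder.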