import os
import json

import requests
from bs4 import BeautifulSoup


class WebScraper:
    """
    Scrapes the content of a webpage and saves it to a JSON file.
    """

    def __init__(self, base_url: str):
        self.base_url = base_url
        self.data = {
            'main_urls_titles': [],
            'section_tree_url': [],
            'section_title': [],
            'article_links': [],
            'article_link_title': [],
            'article_body': []
        }

    def make_request(self, url: str, selector: str) -> list:
        """
        Make a request to a webpage and return the tags that match the selector.
        """
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()  # treat HTTP error responses as failures
            soup = BeautifulSoup(response.content, 'lxml')
            return soup.select(selector)
        except requests.exceptions.RequestException as e:
            print(e)
            return []

    def concatenate_url(self, url: str, href: str, adjust=False) -> str:
        """
        Concatenate the base URL with the href attribute.

        With adjust=True the href is appended as-is; otherwise the last
        character of the URL (its trailing slash) is dropped first, since
        the href is expected to start with '/'.
        """
        if adjust:
            return url + href
        return url[:-1] + href

    def get_urls_text(self, url: str, tags: list, adjust=False) -> tuple:
        """
        Get the URLs and text content from a list of tags.
        """
        urls = []
        text = []
        for tag in tags:
            href = tag.get('href')
            if href is None:
                # Skip anchors that have no href attribute
                continue
            urls.append(self.concatenate_url(url, href, adjust=adjust))
            text.append(tag.get_text().replace('\n', ''))
        return urls, text

    def scrape(self):
        """
        Perform the scraping process and populate the data dictionary.
        """
        # Fetch the main page and collect the top-level category links
        main_tags = self.make_request(self.base_url, 'a.blocks-item-link')
        main_urls, main_urls_titles = self.get_urls_text(self.base_url, main_tags)
        self.data['main_urls_titles'] = main_urls_titles

        # Collect the section links and titles under each category page
        section_tree_url = []
        section_title = []
        for main_url in main_urls:
            section_tags = self.make_request(main_url, 'section.section h2 a')
            section_urls, section_titles = self.get_urls_text(self.base_url, section_tags, adjust=False)
            section_tree_url.append(section_urls)
            section_title.append(section_titles)
        section_links = (url for urls in section_tree_url for url in urls)

        # Collect the article links and titles within each section
        article_urls = []
        article_titles = []
        for sec_tr_url in section_links:
            article_tags = self.make_request(sec_tr_url, 'a.article-list-link')
            article_url, article_title = self.get_urls_text(self.base_url, article_tags, adjust=False)
            article_urls.append(article_url)
            article_titles.append(article_title)
        article_links = [url for urls in article_urls for url in urls]
        article_body_title = [title for titles in article_titles for title in titles]

        # Fetch each article body and pair it with its title
        article_body = []
        for article_url, title in zip(article_links, article_body_title):
            article_tags = self.make_request(article_url, 'div.article-body')
            if not article_tags:
                # Skip articles whose body could not be fetched
                continue
            article_text = article_tags[0].get_text()
            article_body.append((title, article_text.replace('\n', ' ').replace('\xa0', '')))

        self.data['section_tree_url'] = [url for urls in section_tree_url for url in urls]
        self.data['section_title'] = [title for titles in section_title for title in titles]
        self.data['article_links'] = article_links
        self.data['article_link_title'] = article_body_title
        self.data['article_body'] = [body for title, body in article_body]

    def save_to_json(self, filename: str):
        """
        Save the scraped data to a JSON file.
        """
        os.makedirs('./data', exist_ok=True)
        filepath = os.path.join('./data', filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.data, f, ensure_ascii=False, indent=4)
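

# Minimal usage sketch. The base URL below is a placeholder: the CSS
# selectors above ('a.blocks-item-link', 'a.article-list-link',
# 'div.article-body') suggest a Zendesk-style help center layout, so
# point base_url at a site with that structure. Keep the trailing
# slash, which concatenate_url relies on.
if __name__ == '__main__':
    scraper = WebScraper('https://support.example.com/')  # placeholder URL
    scraper.scrape()
    scraper.save_to_json('scraped_data.json')  # written to ./data/scraped_data.json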