-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclass_15_crawler_page.py
53 lines (45 loc) · 1.64 KB
/
class_15_crawler_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import random
import re
import time
import urllib
import urllib.parse

from lxml import html

from class_13_crawler_douban import MyCrawler
# Matches runs of two or more whitespace characters; used to tidy book titles.
_MULTI_SPACE_RE = re.compile(r'\s{2,}')


def _last_page_num(tree):
    """Return the number of result pages advertised by a tag page.

    Reads the text of the last direct ``<a>`` child of the paginator ``<div>``.
    Falls back to 1 when there is no such link, or its text is not an
    integer, so that tags without pagination links are still crawled once.
    """
    labels = tree.xpath("//div[@class='paginator']/a[last()]/text()")
    if not labels:
        return 1
    try:
        return int(labels[0])
    except ValueError:
        return 1


def _first_text(elements, default='N/A'):
    """Return the stripped ``.text`` of the first element, or *default*.

    *default* is returned when *elements* is empty or when the first element
    has no leading text (lxml reports that as ``.text is None``).
    """
    if not elements:
        return default
    text = elements[0].text
    return text.strip() if text else default


# Initialise the crawler.
douban_crawler = MyCrawler('douban_page.txt')

# Multi-tag crawl.
# 1. Download the tag index page and parse the tag names out of it.
tag_list_url = 'https://book.douban.com/tag/?view=type'
tag_content = douban_crawler.download(tag_list_url)
tag_tree = html.fromstring(tag_content)
tags = tag_tree.xpath('//td/a/text()')

# Crawl the first five tags, page by page.
for tag in tags[:5]:
    print("current tag:", tag)
    # Percent-encode the tag so it can be used as a URL path segment.
    tag_url = urllib.parse.quote(tag)

    # 2. Paginated extraction: the tag's landing page gives the page count.
    url = 'https://book.douban.com/tag/' + tag_url
    print(url)
    content = douban_crawler.download(url)
    tree = html.fromstring(content)
    last_page_num = _last_page_num(tree)
    print('Last Page Num :{}'.format(last_page_num))

    for num in range(last_page_num):
        # Each result page is addressed by a 20-item offset.
        url_page = url + '?start={}&type=T'.format(20 * num)
        # Download and parse *this* page (not the tag's landing page again).
        content_page = douban_crawler.download(url_page)
        tree_page = html.fromstring(content_page)
        print('当前页面url:', url_page)

        book_infos = tree_page.xpath("//li[@class='subject-item']")
        for book_info in book_infos:
            title_links = book_info.xpath('.//h2/a')
            if not title_links:
                # Without a title link there is no name or URL to report.
                continue
            book_name_elem = title_links[0]
            # Drop newlines and collapse layout whitespace inside the title.
            book_name = _MULTI_SPACE_RE.sub(
                '', book_name_elem.text_content().replace('\n', ''))
            # The fields below are extracted but not yet printed or stored.
            book_url = book_name_elem.attrib.get('href', 'N/A')
            book_pub_info = _first_text(book_info.xpath(".//div[@class='pub']"))
            book_intro = _first_text(book_info.xpath(".//div[@class='info']/p"))
            print('书名:', book_name)

        # Random pause between page requests to avoid hammering the site.
        time.sleep(random.randint(1, 10))