-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbook_spider.py
executable file
·84 lines (68 loc) · 3.35 KB
/
book_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Created by Kim on 2017/9/25
import requests
import random
import re
from book import Book
from bs4 import BeautifulSoup as bs
from urllib.parse import quote # url编码
from openpyxl import Workbook
# Pool of browser User-Agent strings; each entry of ``header`` is a ready-made
# headers dict, and the spider picks one at random for every request.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
    'Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
)
header = [{'User-Agent': agent} for agent in _USER_AGENTS]
def _tag_text(node):
    """Return the stripped text of a parsed HTML node, or '' when it is None."""
    return node.text.strip() if node is not None else ''


def _split_pub_info(pub_text):
    """Split a '/'-separated publication line into (authors, press, pub_time, price).

    The last three fields are taken as press, publication time and price;
    every field before them is treated as an author/translator name.
    """
    fields = [field.strip() for field in pub_text.split('/')]
    if len(fields) < 3:
        # NOTE(review): short lines are padded on the left with '', i.e. we
        # assume the *leading* fields are the missing ones — confirm against
        # real listings. Without padding the unpacking below raises ValueError.
        fields = [''] * (3 - len(fields)) + fields
    *authors, press, pub_time, price = fields
    return authors, press, pub_time, price


def _parse_book_item(item):
    """Build a Book from one ``li.subject-item`` node; return None if it has no title."""
    book_info = item.find('div', class_='info')
    if book_info is None:
        return None
    title_links = book_info.select('h2 a:nth-of-type(1)')
    if not title_links:
        return None
    name = (title_links[0].get('title') or '').strip()  # book title

    cover = item.find('img', width='90')  # cover image
    pic = (cover.get('src') or '').strip() if cover is not None else ''

    author, press, pub_time, price = _split_pub_info(
        _tag_text(book_info.find('div', class_='pub')))

    # Either rating node may be absent; fall back to '' instead of raising
    # AttributeError on None.
    rating = _tag_text(book_info.find('span', class_='rating_nums'))
    rating_num = re.sub(r'\D', '', _tag_text(book_info.find('span', class_='pl')))
    return Book(pic, name, author, press, pub_time, price, rating, rating_num)


def douban_book_spider(book_tag, page=2, timeout=10):
    """Crawl Douban Books listings for one tag and return the books found.

    Args:
        book_tag: tag name to crawl; it is URL-quoted before being placed in
            the request URL.
        page: number of listing pages to fetch, starting from the first.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A list of Book objects (empty when nothing could be fetched or parsed).

    A page whose request fails (network error, timeout, or HTTP error status)
    is reported with print() and skipped. Crawling stops at the first page
    that contains no book items.
    """
    book_list = []
    for current_page in range(page):
        # Douban tag listings show 20 books per page.
        url = 'https://book.douban.com/tag/' + quote(book_tag) + '?start=' + str(current_page * 20)
        try:
            response = requests.get(url, headers=random.choice(header), timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(e)
            # Without this, the code below would run on an unbound (or stale)
            # response object.
            continue
        soup = bs(response.text, 'html.parser')
        items = soup.find_all('li', class_='subject-item')
        if not items:
            # find_all returns an empty list (never None) when nothing
            # matches: no more results for this tag.
            break
        for item in items:
            book = _parse_book_item(item)
            if book is not None:
                book_list.append(book)
    return book_list
def write_to_excel_file(book_tag, book_dict):
    """Write the collected books to ``book_list.xlsx``, one worksheet per tag.

    Args:
        book_tag: iterable of tag names; a worksheet titled with each tag is
            created, in this order.
        book_dict: mapping of tag name -> list of Book objects. A tag that is
            missing, or maps to None or an empty list, gets a sheet holding
            only the header row.

    Returns:
        None. When ``book_dict`` is None or empty a message is printed and no
        file is written.
    """
    if not book_dict:
        print('没有找到相对应标签的书籍')
        return
    wb = Workbook()
    # Workbook() starts with one empty default sheet; remember it so it can be
    # dropped once the per-tag sheets exist.
    default_sheet = wb.active
    for tag in book_tag:
        ws = wb.create_sheet(title=tag)
        ws.append(['书名', '作者/译者', '出版社', '出版时间', '价格', '评分/评分人数', '封面'])
        # .get() tolerates tags that were never crawled; `or ()` covers None.
        for book in book_dict.get(tag) or ():
            author = book.author
            if isinstance(author, (list, tuple)):
                # NOTE(review): a worksheet cell cannot hold a list; join the
                # names in case Book keeps authors as a sequence — confirm
                # against the Book class.
                author = '/'.join(str(a) for a in author)
            # format() instead of '+' so a non-string rating cannot raise
            # TypeError.
            rating_cell = '{}/{}'.format(book.rating, book.rating_num)
            ws.append([book.name, author, book.press, book.time, book.price,
                       rating_cell, book.pic])
    # Keep the default sheet only when it is the sole sheet, since a workbook
    # must contain at least one worksheet to be saved.
    if len(wb.worksheets) > 1:
        wb.remove(default_sheet)
    wb.save('book_list.xlsx')
def get_books(book_tags):
    """Run the Douban spider for every tag and return a dict of tag -> result."""
    return {tag: douban_book_spider(tag) for tag in book_tags}
if __name__ == '__main__':
    # Script entry point: crawl a fixed set of tags, then export the results.
    tag = ['小说', '音乐', '历史', 'IT']
    books_by_tag = get_books(tag)
    write_to_excel_file(tag, books_by_tag)