一个提取起点小说名称的例子

# coding: utf-8
import urllib.request
import re


# Fetch the page source of a URL.
def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Uses a context manager so the underlying socket is always closed
    (the original leaked the response object).
    """
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')


# Persist text content to a file.
def save_text_file(file_name, contents):
    """Write *contents* to *file_name* as UTF-8 text, creating or
    overwriting the file."""
    with open(file_name, mode='w', encoding='utf-8') as output:
        output.write(contents)


# Extract data with a regular expression.
def get_novel_names(source):
    """Return all novel titles found in *source* (Qidian ranking-page
    HTML), in document order.

    The pattern anchors on the <h4><a ...> markup of the ranking list;
    the non-greedy group captures the link text (the title).
    """
    pattern = r'<h4><a href="//book.qidian.com/info/\d+" target="_blank" data-eid="qd_C40" data-bid="\d+">(.+?)</a></h4>'
    return re.findall(pattern, source)


novels = []  # accumulated titles from every crawled page
# Crawl ranking pages 1 and 2 and collect the titles they list.
for page_no in range(1, 3):
    url = "https://www.qidian.com/rank/fin?chn=21&page=" + str(page_no)
    novels += get_novel_names(get_html(url))
save_text_file("玄幻.txt", "\n".join(novels))

这是最简单的例子,但是连续爬https会遇到报错:http.client.RemoteDisconnected: Remote end closed connection without response。需要加上headers:

# Fetch the page source, presenting a browser User-Agent so the server
# does not drop the connection (http.client.RemoteDisconnected).
def get_html(url):
    """Fetch *url* with a browser-like User-Agent header and return the
    body decoded as UTF-8 (undecodable bytes are ignored).

    Bug fixed: the original header VALUE began with the literal text
    "User-Agent:", so a malformed value was sent on the wire; the value
    must be only the product string. The response is also now closed
    via a context manager instead of being leaked.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8', 'ignore')