一个提取起点小说名称的例子
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
| import urllib.request import re
def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Propagates whatever ``urllib`` raises on network/HTTP errors.
    """
    # Use a context manager so the HTTP response is always closed;
    # the original leaked the connection object.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')
def save_text_file(file_name, contents):
    """Write *contents* to *file_name* as UTF-8 text, replacing any existing file."""
    out = open(file_name, 'w', encoding='utf-8')
    try:
        out.write(contents)
    finally:
        out.close()
# Compiled once at import time. A raw string avoids the doubled backslashes
# of the original, and the literal dots in the domain are escaped so the
# pattern matches exactly "book.qidian.com".
_NOVEL_LINK_RE = re.compile(
    r'<h4><a href="//book\.qidian\.com/info/\d+" target="_blank" '
    r'data-eid="qd_C40" data-bid="\d+">(.+?)</a></h4>'
)


def get_novel_names(source):
    """Return the list of novel titles found in *source* (Qidian rank-page HTML)."""
    return _NOVEL_LINK_RE.findall(source)
if __name__ == '__main__':
    # Crawl the first two pages (range(1, 3) -> pages 1 and 2) of the
    # Qidian "finished" ranking for channel 21 and collect every title.
    novels = []
    for page in range(1, 3):
        html = get_html("https://www.qidian.com/rank/fin?chn=21&page=" + str(page))
        novels += get_novel_names(html)
    # Write the file once, after all pages are collected, instead of
    # rewriting it on every iteration.
    save_text_file("玄幻.txt", "\n".join(novels))
|
这是最简单的例子,但是连续爬 https 页面会遇到报错:http.client.RemoteDisconnected: Remote end closed connection without response。需要加上 headers:
1 2 3 4 5 6
def get_html(url):
    """Fetch *url* with a browser User-Agent and return the body as UTF-8 text.

    Qidian closes connections from the default urllib User-Agent
    (http.client.RemoteDisconnected), so we impersonate Chrome.
    Undecodable bytes are dropped ('ignore').
    """
    headers = {
        # BUG FIX: the header VALUE must not repeat the header name —
        # the original sent "User-Agent: User-Agent:Mozilla/...".
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/56.0.2924.87 Safari/537.36',
    }
    req = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode('utf-8', 'ignore')
|