1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
| # 载入相应的模块 import time import requests import openpyxl
# Wall-clock start of the run; the end of the script subtracts it from the
# finish time to report how long the crawl took.
time1 = time.time()

# Result table. The first row is the column header; the fetch code below adds
# rows in the same column order.
lists = [['answer_kname', 'headline', 'voteup_count', 'content']]
##################
# Download the answers of Zhihu question 64270965 page by page and add one
# row per answer to ``lists``.
url = 'https://www.zhihu.com/api/v4/questions/64270965/answers'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
# Value of the ``include`` query parameter; it is the same for every page.
include = 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_labeled,is_recognized,paid_info,paid_info_content;data[*].mark_infos[*].url;data[*].author.follower_count,badge[*].topics'


def fetch_answer_rows(offset, limit):
    """Request one page of answers and return it as table rows.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        limit: Value sent as the ``limit`` query parameter.

    Returns:
        A list of ``[author name, author headline, voteup_count, content]``
        rows, one per entry of the response's ``data`` list.

    Raises:
        requests.HTTPError: If the server replies with an error status.
    """
    params = {
        'include': include,
        'offset': str(offset),
        'limit': str(limit),
        'sort_by': 'default',
        'platform': 'desktop',
    }
    # A timeout keeps a single stalled connection from hanging the whole run.
    res = requests.get(url, headers=headers, params=params, timeout=30)
    # Surface the HTTP status instead of a later KeyError on 'data'.
    res.raise_for_status()
    return [
        [
            item['author']['name'],
            item['author']['headline'],
            item['voteup_count'],
            item['content'],
        ]
        for item in res.json()['data']
    ]


# (offset, limit) of every page, in request order. The offset-3 page has to be
# listed explicitly: a ``range(3, 3, 1)`` loop is empty and never requests it.
pages = [(0, 3), (3, 5)]
pages.extend((offset, 5) for offset in range(8, 278, 5))

for offset, limit in pages:
    lists.extend(fetch_answer_rows(offset, limit))
##################
# Export the collected table to an Excel workbook: every entry of ``lists``
# becomes one row of the sheet titled 'answers'.
workbook = openpyxl.Workbook()
worksheet = workbook.active
worksheet.title = 'answers'
for row in lists:
    worksheet.append(row)
workbook.save('即将步入研究生,有什么忠告.xlsx')
##################
# Write column 3 (the 'content' column) of every row to one UTF-8 HTML file.
# NOTE(review): the first row of ``lists`` is the header row, so the literal
# text 'content' is written before the first answer - confirm this is intended.
with open('知乎:即将步入研究生,有什么忠告.html', 'w', encoding='utf-8') as file_html:
    # The ``with`` block closes the file even if a write fails part-way.
    file_html.writelines(row[3] for row in lists)
##################
# Report the total running time in seconds, measured from ``time1``.
time2 = time.time()
elapsed = float(time2 - time1)
print('爬虫耗时:%.3f' % elapsed, '秒')
|