目前似乎可以通过输入问题的 ID 来获取该问题下的全部回答。
每一条回答都会以 JSON 格式存储到本地的 MongoDB 中。
import multiprocessing as mp
import multiprocessing.pool  # noqa: F401 -- loads the submodule so mp.pool.ThreadPool resolves

import pymongo
import requests

# Browser-like request headers sent with every API call.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

# Number of answers requested per page; also the stride between offsets.
PAGE_SIZE = 20

# Seconds to wait for the API before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30


def generate_url():
    """Prompt for a question ID and return the answers API URL for it."""
    question_id = input('输入问题的ID:').strip()
    url = 'https://www.zhihu.com/api/v4/questions/' + question_id + '/answers?'
    print('\n您即将采集的问题链接为:{}\n'.format(url))
    return url


def get_max_num(url):
    """Return the total number of answers reported for the question at *url*.

    Prints an error message and returns ``None`` when the request fails or
    the response does not contain ``paging.totals``.
    """
    params = {
        'offset': 0,
        'limit': 5,
        'sort_by': 'default',
        'platform': 'desktop',
    }
    try:
        r = requests.get(
            url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
        ).json()
        max_num = r['paging']['totals']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network failure, non-JSON body, or an unexpected payload shape.
        print('该问题不存在!')
        return None
    print('该问题共有{}条回答。\n'.format(max_num))
    return max_num


def generate_offsets():
    """Prompt for the question and return the list of page offsets to fetch.

    Sets the module globals ``url`` and ``max_num`` that the worker functions
    read. Returns an empty list when the answer count could not be determined.
    """
    global url, max_num
    url = generate_url()
    max_num = get_max_num(url)
    if max_num is None:
        # get_max_num already reported the problem; there is nothing to fetch.
        return []
    return list(range(0, max_num, PAGE_SIZE))


def get_answer_json(offset):
    """Fetch up to PAGE_SIZE answers starting at *offset*.

    Returns the ``data`` entry of the decoded JSON response.
    """
    params = {
        'include': 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_labeled,is_recognized,paid_info,paid_info_content;data[*].mark_infos[*].url;data[*].author.follower_count,badge[*].topics',
        'offset': offset,
        'limit': PAGE_SIZE,
        'sort_by': 'default',
        'platform': 'desktop',
    }
    r = requests.get(
        url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
    ).json()
    return r['data']


def create_data_collection():
    """Prompt for database/collection names and open the target collection.

    Sets the module globals ``database`` and ``collection`` to the pymongo
    objects on the MongoDB server at localhost:27017.
    """
    global database, collection
    database_name = input('输入用于存储的数据库名:')
    collection_name = input('\n输入用于存储的数据集合名:')
    client = pymongo.MongoClient('mongodb://localhost:27017')
    database = client[database_name]
    collection = database[collection_name]
    # Show the readable "db.collection" name rather than the object's repr.
    print('\n采集的回答将被存储到本地MongoDB的{}表中。\n'.format(collection.full_name))


def save_to_DB(offset):
    """Fetch the page of answers at *offset* and insert it into the collection.

    Relies on the globals set by generate_offsets() and
    create_data_collection(). Prints the running count of collected answers.
    """
    data = get_answer_json(offset)
    if data:
        # insert_many() rejects an empty document list, so skip empty pages.
        collection.insert_many(data)
    # The last page may be short (or end exactly on max_num); cap the count.
    print('已完成{}条回答的采集'.format(min(offset + PAGE_SIZE, max_num)))


def main():
    """Collect every answer of the chosen question into MongoDB."""
    offsets = generate_offsets()
    if not offsets:
        return
    create_data_collection()
    # The work is I/O-bound, so threads overlap the waits. Threads also share
    # the module globals and the MongoClient, which worker processes would
    # not do reliably under the "spawn" start method or after a fork.
    with mp.pool.ThreadPool() as pool:
        # map() blocks until every page is done and re-raises worker errors
        # instead of silently dropping them.
        pool.map(save_to_DB, offsets)
    print('任务完成!')


if __name__ == '__main__':
    main()
Comments | NOTHING