目前可以通过输入问题的ID来获取该问题下的全部回答。
每一条回答都会以JSON格式存储在本地的MongoDB中。
import pymongo,requests
import multiprocessing as mp
#浏览器头
headers = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
#输入问题的ID,获取链接
def generate_url():
    """Prompt for a question ID and build the answers endpoint URL.

    The ID is read from standard input and inserted verbatim into the
    URL. The resulting URL is echoed to the console before it is returned.

    Returns:
        The answers endpoint URL for the entered question ID, ending in
        ``/answers?``.
    """
    entered_id = input('输入问题的ID:')
    endpoint = '{}{}{}'.format(
        'https://www.zhihu.com/api/v4/questions/',
        entered_id,
        '/answers?',
    )
    print('\n您即将采集的问题链接为:{}\n'.format(endpoint))
    return endpoint
#获取回答数
def get_max_num(url):
    """Fetch the total number of answers for the question behind ``url``.

    Sends one small request (``limit=5``) to the answers endpoint and reads
    ``paging.totals`` from the JSON response.

    Args:
        url: Answers endpoint URL, as produced by ``generate_url()``.

    Returns:
        The value of ``paging.totals`` from the response, or ``None`` when
        the request fails or the response does not have the expected shape
        (a message is printed in that case).
    """
    params = {
        'offset': 0,
        'limit': 5,
        'sort_by': 'default',
        'platform': 'desktop'
    }
    try:
        r = requests.get(url, headers=headers, params=params).json()
        max_num = r['paging']['totals']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Only the expected failures are handled here: transport errors,
        # a body that is not JSON, or JSON without paging/totals. A bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        print('该问题不存在!')
        return None
    print('该问题共有{}条回答。\n'.format(max_num))
    return max_num
#生成链接、回答总数和偏移量的列表
def generate_offsets():
    """Ask for a question, look up its answer count and build page offsets.

    Sets the module-level globals ``url`` and ``max_num``, which
    ``get_answer_json()`` and ``save_to_DB()`` read later.

    Returns:
        A list of offsets ``0, 20, 40, ...`` below ``max_num``, one per page
        of 20 answers. The list is empty when the answer count could not be
        determined.
    """
    global url, max_num
    url = generate_url()
    max_num = get_max_num(url)
    if max_num is None:
        # get_max_num() has already reported the failure; without this
        # guard range(0, None, 20) would raise TypeError.
        return []
    return list(range(0, max_num, 20))
#从当前的偏移量(回答编号)开始,向后获取20个回答
def get_answer_json(offset):
    """Fetch one page of up to 20 answers starting at ``offset``.

    Reads the module-level ``url`` and ``headers`` for the request.

    Args:
        offset: Index of the first answer to request.

    Returns:
        The ``data`` entry of the decoded JSON response.
    """
    # Fields requested from the API for each answer.
    include_fields = 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,relevant_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,is_labeled,is_recognized,paid_info,paid_info_content;data[*].mark_infos[*].url;data[*].author.follower_count,badge[*].topics'
    query = dict(
        include=include_fields,
        offset=offset,
        limit=20,
        sort_by='default',
        platform='desktop',
    )
    response = requests.get(url, headers=headers, params=query)
    payload = response.json()
    return payload['data']
#创建用以存储数据的集合
def create_data_collection():
    """Prompt for a database and collection name and open the collection.

    Connects to the MongoDB server at ``mongodb://localhost:27017`` and
    stores the selected database and collection objects in the module-level
    globals ``database`` and ``collection``; ``save_to_DB()`` writes to
    ``collection``.
    """
    global database, collection
    # Keep the typed names in their own variables so the globals only ever
    # hold the pymongo objects, never the raw input strings.
    database_name = input('输入用于存储的数据库名:')
    collection_name = input('\n输入用于存储的数据集合名:')
    client = pymongo.MongoClient('mongodb://localhost:27017')
    database = client[database_name]
    collection = database[collection_name]
    # Show the readable "database.collection" name; formatting the
    # Collection object itself would print its full repr.
    print('\n采集的回答将被存储到本地MongoDB的{}表中。\n'.format(collection.full_name))
#调用get_answer_json()并将结果存储到数据库中
def save_to_DB(offset):
    """Fetch the page of answers at ``offset`` and store it in MongoDB.

    Calls ``get_answer_json()`` and inserts the result into the
    module-level ``collection``, then prints a progress line based on the
    module-level ``max_num``.

    Args:
        offset: Index of the first answer on the page to fetch.
    """
    data = get_answer_json(offset)
    if data:
        # insert_many() rejects an empty document list, so skip the write
        # when the page came back empty.
        collection.insert_many(data)
    if offset + 20 < max_num:
        print('已完成{}条回答的采集'.format(offset + 20))
    else:
        # Covers offset + 20 == max_num as well; the original </> pair
        # printed nothing when the total was an exact multiple of 20.
        print('已完成{}条回答的采集'.format(max_num))
        print('任务完成!')
if __name__ == '__main__':
    # Imported here so the entry point carries its own dependency.
    from multiprocessing.pool import ThreadPool

    offsets = generate_offsets()
    create_data_collection()
    # Run save_to_DB() on a pool of threads. Threads share the globals set
    # above (url, max_num, collection); worker processes started with the
    # "spawn" method would re-import this module and never see them.
    # The blocking map() also re-raises any worker exception, whereas
    # map_async() without .get() discards it silently.
    if offsets:
        with ThreadPool() as pool:
            pool.map(save_to_DB, offsets)
Comments | NOTHING