Let me say upfront: my roommate talked me into this!
Step 1: look at the page:
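Open the question in Chrome with the DevTools Network tab showing, scroll through a few answers, and filter the requests for "answers": the page loads its content from a paginated JSON API at /api/v4/questions/26297181/answers, and the request headers (cookie, user-agent, x-ab-param and so on) can be copied from the request you see there. A first probe of that endpoint might look like the sketch below; the include string and the logged-in cookie are redacted here, and without them Zhihu may reject the call, so treat this as an illustration rather than a guaranteed working request.

# A minimal probe of the answers API seen in the Network tab.
# Real requests also carry the include string, cookie, user-agent, etc.
# (redacted in the full script below), so this bare call may be rejected.
import requests

api = 'https://www.zhihu.com/api/v4/questions/26297181/answers'
r = requests.get(api, params={'limit': '5', 'offset': 0,
                              'platform': 'desktop', 'sort_by': 'default'})
print(r.status_code)
if r.ok:
    # the script below only uses the 'data' field of this JSON
    print(len(r.json()['data']), 'answers in this page')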
Step 2: the code:
import requests
from bs4 import BeautifulSoup
import re, time
import os
import pandas as pd

# URL of the question's answers API
url = 'https://www.zhihu.com/api/v4/questions/26297181/answers?'
# Browser request headers; find them in Chrome's Network tab by searching for "answers".
# The XXXX values are redacted -- copy them from your own browser session.
headers = {
    'authority': 'www.zhihu.com',
    'method': 'GET',
    'path': 'XXXXXXXXXXX',
    'scheme': 'https',
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cookie': 'XXXXXXXXXX',
    'referer': 'https://www.zhihu.com/question/26297181',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'XXXXXXXXXXXXXX',
    'x-ab-param': 'XXXXXXXX',
    'x-requested-with': 'fetch'
}

# Fetch one page of answers; by default each response's "data" holds 5 answers.
def get_json_data(offset):
    params = {
        'include': 'XXXXXXXXXX',  # redacted; copy it from the request seen in DevTools
        'limit': '5',
        # offset: which answer the current page starts from
        'offset': offset,
        'platform': 'desktop',
        'sort_by': 'default'
    }
    try:
        r = requests.get(url, headers=headers, params=params)
        if r.ok:
            return r.json()['data']
        else:
            print('Not ok:', r.status_code)
    except Exception as err:
        print(err)

# Pull the HTML content of the answers in one "data" page into the answer list.
def append_answer(data, answer):
    for item in data:
        answer.append(item['content'])

# Run the two functions above and collect the answer contents into one list.
def data_to_list():
    answer = []
    # how many answers to fetch
    max_num = int(input('input max num: '))
    for i in range(0, max_num + 1, 5):
        data = get_json_data(i)
        if data:
            append_answer(data, answer)
        if i % 100 == 0:
            print('Fetched {} answers'.format(i))
    return answer

# Parse each answer's HTML and extract the image links.
def get_pics_url_list(answer):
    pics = []
    for i in range(len(answer)):
        soup = BeautifulSoup(answer[i], 'lxml')
        li = soup.find_all('img')
        for pic in li:
            pics.append(pic.get('data-original'))
    # Convert to a DataFrame (easier to drop duplicates and empty values)
    picdf = pd.DataFrame(pics, columns=['url'])
    # Drop duplicates
    df = picdf.drop_duplicates(subset='url')
    # Drop empty values
    df = df.dropna()
    # Rebuild the index
    df.index = range(len(df))
    return df

# Download and save the image files.
def get_and_save_pics(df):
    # make sure the local folder exists
    os.makedirs('pics', exist_ok=True)
    for i in range(0, len(df)):
        # local save path
        path = 'pics/' + str(i) + '.jpg'
        # image link
        url = df['url'][i]
        r = requests.get(url, headers=headers)
        with open(path, 'wb') as file:
            file.write(r.content)
            file.flush()
        # pause and print task status every 15 images
        if i % 15 == 0:
            time.sleep(1)
            print('Saved {} images'.format(i))

if __name__ == '__main__':
    answer = data_to_list()
    df = get_pics_url_list(answer)
    get_and_save_pics(df)
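As a quick sanity check of get_pics_url_list, you can feed it a single hand-written answer fragment (run with the script above already loaded). The fragment below is made up, but it mimics the data-original attribute the function looks for:

# Made-up answer HTML just to exercise the extraction logic above.
sample_answer = ('<p>some text</p>'
                 '<figure><img src="thumb.jpg" '
                 'data-original="https://pic1.zhimg.com/v2-example_r.jpg"></figure>')
print(get_pics_url_list([sample_answer]))
# expected: a one-row DataFrame whose 'url' column holds the data-original link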
Step 3: wait, then look at the results:
I keep them on my server and browse them with h5ai:
Comments (1)
Author 傲娇的小基基:
Hmm... I don't know why I imported re either, too lazy to change it.