The only libraries needed are requests and BeautifulSoup.
import requests
from bs4 import BeautifulSoup
Extract every link (each &lt;loc&gt; entry) from a sitemap XML file:
def get_sitemap_index(url, headers):
    # Parse the sitemap XML and return the text of every <loc> element.
    # Note: BeautifulSoup's 'xml' feature requires lxml to be installed.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'xml')
    return [x.text for x in soup.find_all('loc')]
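The helper above assumes every request succeeds. A slightly more defensive variant, as a sketch (the timeout value and the raise_for_status check are additions beyond the original flow):
def get_sitemap_links_safe(url, headers, timeout=10):
    # Same as get_sitemap_index, but raise on HTTP error responses and
    # hung connections instead of parsing an error page into an empty list.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'xml')
    return [x.text for x in soup.find_all('loc')]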
# Use a browser User-Agent, since some servers block the default requests UA.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
}
Fetch the sitemap index at the site root so we can traverse onward from it:
url = 'https://www.imtrq.com/sitemap.xml'
sitemap = get_sitemap_index(url, headers)
It is indeed a sitemap index (a list of sub-sitemaps); Baidu does not allow submitting a sitemap in this form.
sitemap[:5]
['https://www.imtrq.com/sitemap-misc.xml', 'https://www.imtrq.com/sitemap-tax-category-1.xml', 'https://www.imtrq.com/sitemap-pt-post-p1-2023-04.xml', 'https://www.imtrq.com/sitemap-pt-post-p1-2023-03.xml', 'https://www.imtrq.com/sitemap-pt-post-p1-2023-02.xml']
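To check this programmatically rather than by eye: per the sitemaps.org protocol, an index file uses a &lt;sitemapindex&gt; root element while a plain sitemap uses &lt;urlset&gt;. A minimal sketch:
def is_sitemap_index(url, headers):
    # True if the document's root is <sitemapindex> rather than <urlset>.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'xml')
    return soup.find('sitemapindex') is not None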
Collect the links of every post and page from the sub-sitemaps:
li = []
for url in sitemap:
    li += get_sitemap_index(url, headers)
li[10:15]
['https://www.imtrq.com/archives/3367', 'https://www.imtrq.com/archives/3357', 'https://www.imtrq.com/archives/3350', 'https://www.imtrq.com/archives/3343', 'https://www.imtrq.com/archives/3330']
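Sub-sitemaps can overlap, so an optional safeguard before submitting, offered as a sketch (the same-site filter reflects the fact that Baidu's push API only counts URLs belonging to the registered site):
# Deduplicate while preserving first-seen order, and keep only URLs
# under the site the push token was issued for.
li = [u for u in dict.fromkeys(li) if u.startswith('https://www.imtrq.com')]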
api = 'http://data.zz.baidu.com/urls?site=https://www.imtrq.com&token=******'
r = requests.post(api, data='\n'.join(li), headers={'Content-Type': 'text/plain'})
r.json()
{'remain': 98900, 'success': 275}
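The response reports the remaining daily quota ('remain') and how many URLs were accepted ('success'). If the list ever exceeds the per-request cap, it can be pushed in batches; a sketch (the batch size of 2000 is an assumption here, check Baidu's ziyuan documentation for the current limit):
def push_in_batches(api, urls, batch_size=2000):
    # POST the URL list in newline-joined chunks and collect each response.
    results = []
    for i in range(0, len(urls), batch_size):
        chunk = urls[i:i + batch_size]
        r = requests.post(api, data='\n'.join(chunk),
                          headers={'Content-Type': 'text/plain'})
        results.append(r.json())
    return results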