The announcements are published here: http://rcaj.nanjing.gov.cn/zhzbweb/m.html
It seems to be published only once a quarter, though, so a bit of quick code will do.
```python
import requests

# Fields to keep from each record.
columns = ['corpName', 'label', 'publicityBatch', 'publicityYear',
           'subsideWay', 'talentLevel', 'userName']

def get_announcement_list():
    """Fetch all announcement batches ('人才安居名单公示' = talent housing list announcements)."""
    url = 'http://rcaj.nanjing.gov.cn/zhzbweb/rcaj/tRcajRygs'
    data = {
        'name': '人才安居名单公示',
        'pageNo': 1,
        'pageSize': 500,
    }
    r = requests.post(url, data=data)
    return r.json()['list']

def get_talent_list(id, pageNo):
    """Fetch one page of names under a given announcement."""
    qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
    qdata = {
        'rygsId': id,
        'pageNo': pageNo,
        'userName': '',
    }
    r = requests.post(qurl, data=qdata)
    return r.json()

announcement_list = get_announcement_list()
for announcement in announcement_list:
    id = announcement['id']
    # The request for page 0 is only used to learn the page count.
    total_page = get_talent_list(id, 0)['totalPage']
    for page in range(1, total_page):
        talent_list = get_talent_list(id, page)['list']
        for talent in talent_list:
            item = {c: talent[c] for c in columns}
            print(item)
```
Knocked out in ten-odd minutes, so it's not pretty and not fast either; the Scrapy framework would be far more efficient.
Too lazy to rework it.
Of course print isn't a real storage method; I set up a database to hold the results.
```python
import pymysql

connection = pymysql.connect(host='localhost', port=3306, user='root',
                             password='password', db='mynj')
cursor = connection.cursor()

def insert_item(item):
    """Insert one record into the talent table via a parameterized query."""
    columns = ','.join(item.keys())
    placeholders = ','.join(['%s'] * len(item))
    sql = f'insert into talent({columns}) values({placeholders})'
    # Let the driver handle quoting and NULLs instead of escaping by hand.
    cursor.execute(sql, list(item.values()))
    connection.commit()
```
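The table itself isn't shown here; a minimal sketch of DDL that would fit these fields (the column names come from the scraped data, the VARCHAR sizes are my guesses):

```python
# Hypothetical schema for the talent table; every field arrives as text,
# so plain VARCHAR columns are enough. Sizes are assumptions.
cursor.execute('''
    create table if not exists talent(
        corpName       varchar(255),
        label          varchar(255),
        publicityBatch varchar(64),
        publicityYear  varchar(16),
        subsideWay     varchar(64),
        talentLevel    varchar(64),
        userName       varchar(64)
    ) default charset = utf8mb4
''')
```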
After that, the end of the first script just needs insert_item instead of print.
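That is, the inner loop becomes:

```python
for talent in talent_list:
    item = {c: talent[c] for c in columns}
    insert_item(item)  # was: print(item)
```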
Turns out it really is too slow without Scrapy…
```python
import scrapy
import requests

class TalentSpider(scrapy.Spider):
    name = 'talent'
    allowed_domains = ['mynj.cn', 'nanjing.gov.cn']
    columns = ['corpName', 'label', 'publicityBatch', 'publicityYear',
               'subsideWay', 'talentLevel', 'userName']

    def get_announcement_list(self):
        """Fetch all announcement batches (same endpoint as the plain script)."""
        url = 'http://rcaj.nanjing.gov.cn/zhzbweb/rcaj/tRcajRygs'
        data = {
            'name': '人才安居名单公示',
            'pageNo': 1,
            'pageSize': 500,
        }
        r = requests.post(url, data=data)
        return r.json()['list']

    def get_talent_list(self, id, pageNo):
        """Fetch one page synchronously; only used to read totalPage."""
        qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
        qdata = {
            'rygsId': id,
            'pageNo': pageNo,
            'userName': '',
        }
        r = requests.post(qurl, data=qdata)
        return r.json()

    def start_requests(self):
        announcement_list = self.get_announcement_list()
        for announcement in announcement_list:
            id = announcement['id']
            total_page = self.get_talent_list(id, 0)['totalPage']
            # Hand the per-page requests to Scrapy so they run concurrently.
            for page in range(1, total_page):
                qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
                qdata = {
                    'rygsId': str(id),
                    'pageNo': str(page),
                    'userName': '',
                }
                yield scrapy.FormRequest(qurl, formdata=qdata, callback=self.parse)

    def parse(self, response):
        talent_list = response.json()['list']
        for talent in talent_list:
            item = {c: talent[c] for c in self.columns}
            yield item
```
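To store what the spider yields, a Scrapy item pipeline can reuse the same MySQL insert; a minimal sketch, assuming the table above (the class name and module path are my own):

```python
# pipelines.py — a minimal sketch; reuses the parameterized insert from above.
import pymysql

class MySQLPipeline:
    def open_spider(self, spider):
        self.connection = pymysql.connect(host='localhost', port=3306,
                                          user='root', password='password',
                                          db='mynj')
        self.cursor = self.connection.cursor()

    def close_spider(self, spider):
        self.connection.close()

    def process_item(self, item, spider):
        columns = ','.join(item.keys())
        placeholders = ','.join(['%s'] * len(item))
        sql = f'insert into talent({columns}) values({placeholders})'
        self.cursor.execute(sql, list(item.values()))
        self.connection.commit()
        return item
```

Enable it in settings.py with `ITEM_PIPELINES = {'myproject.pipelines.MySQLPipeline': 300}` (the `myproject` path is assumed).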
That should do it.