在这个地方有公示:http://rcaj.nanjing.gov.cn/zhzbweb/m.html

不过它好像是每季度公示一次,所以可以简单写点代码。

import requests
# Fields to extract from each talent record returned by the API.
columns = ['corpName', 'label', 'publicityBatch', 'publicityYear', 'subsideWay', 'talentLevel', 'userName']

def get_announcement_list():
    """Fetch all housing-subsidy announcements (人才安居名单公示).

    Returns the ``list`` field of the JSON response; each element is an
    announcement dict (callers read its ``id`` key).
    """
    url = 'http://rcaj.nanjing.gov.cn/zhzbweb/rcaj/tRcajRygs'
    data = {
        'name': '人才安居名单公示',
        'pageNo': 1,
        # One oversized page so a single request covers every announcement.
        'pageSize': 500,
        }
    # Explicit timeout: requests has no default and would otherwise block
    # forever on a stalled connection; raise_for_status surfaces HTTP
    # errors instead of failing later inside .json().
    r = requests.post(url, data=data, timeout=30)
    r.raise_for_status()
    return r.json()['list']


def get_talent_list(id, pageNo):
    """Fetch one page of the name list for announcement *id*.

    Returns the parsed JSON dict; callers read ``totalPage`` and ``list``.
    (Parameter name ``id`` shadows the builtin but is kept for
    caller compatibility.)
    """
    qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
    qdata = {
        'rygsId': id,
        'pageNo': pageNo,
        'userName': ''  # empty string -> no name filter
    }
    # Explicit timeout: requests.post would otherwise wait indefinitely.
    r = requests.post(qurl, data=qdata, timeout=30)
    r.raise_for_status()
    return r.json()

announcement_list = get_announcement_list()

# For every announcement: learn the page count, then pull each page and
# print one dict per person restricted to the fields in `columns`.
for announcement in announcement_list:
    rygs_id = announcement['id']  # renamed from `id` to avoid shadowing the builtin
    # Page 0 is requested only for its `totalPage` count; its `list` is discarded.
    total_page = get_talent_list(rygs_id, 0)['totalPage']
    # NOTE(review): range(1, total_page) stops at total_page - 1 — if the API's
    # pages are 1-based this silently skips the last page; confirm against the
    # endpoint before relying on completeness.
    for page in range(1, total_page):
        talent_list = get_talent_list(rygs_id, page)['list']
        for talent in talent_list:
            item = {c: talent[c] for c in columns}
            print(item)

十几分钟写出来的,不咋美观,效率也不高,用scrapy框架效率会高很多。

懒得改了。

不过存储方法肯定不能用print直接输出的,我是新建了个数据库来存。

import pymysql
# NOTE(review): hard-coded root credentials — move to env vars/config before
# sharing. Assumes a local `mynj` database with a `talent` table whose columns
# match the scraped fields; confirm the schema.
connection = pymysql.Connection(host='localhost',port=3306,user='root',password='password',db='mynj')
cursor = connection.cursor()

def insert_item(item):
    """Insert one scraped record into the `talent` table and commit.

    Uses a parameterized query: the original built the VALUES clause with
    string formatting, which is SQL-injectable (the values are scraped,
    untrusted text) and mangled embedded double quotes. Column names still
    come from item.keys(), which in this script is the fixed `columns` list.
    """
    cols = ','.join(item.keys())
    placeholders = ','.join(['%s'] * len(item))
    sql = f'insert into talent({cols}) values({placeholders})'
    # pymysql escapes each value itself; Python None is sent as SQL NULL.
    cursor.execute(sql, list(item.values()))
    connection.commit()

后面把 print 改成调用上面的 insert_item 就好了。


好像不用scrapy还是太慢了……

import scrapy
import requests

class TalentSpider(scrapy.Spider):
    """Scrapy spider that pages through Nanjing talent-housing (人才安居)
    publicity lists and yields one dict per person, restricted to `columns`.
    """

    name = 'talent'
    allowed_domains = ['mynj.cn', 'nanjing.gov.cn']
    # Fields extracted from each record in the API response.
    columns = ['corpName', 'label', 'publicityBatch', 'publicityYear', 'subsideWay', 'talentLevel', 'userName']

    def get_announcement_list(self):
        """Blocking fetch of all announcements in one oversized page.

        NOTE(review): this bypasses Scrapy's scheduler and blocks the
        reactor; tolerable as a one-off bootstrap request.
        """
        url = 'http://rcaj.nanjing.gov.cn/zhzbweb/rcaj/tRcajRygs'
        data = {
            'name': '人才安居名单公示',
            'pageNo': 1,
            'pageSize': 500,
            }
        # Explicit timeout: requests has no default and would hang forever.
        r = requests.post(url, data=data, timeout=30)
        r.raise_for_status()
        return r.json()['list']

    def get_talent_list(self, id, pageNo):
        """Blocking fetch of one page of an announcement's name list.

        Parameter name ``id`` shadows the builtin but is kept for
        compatibility with existing callers.
        """
        qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
        qdata = {
            'rygsId': id,
            'pageNo': pageNo,
            'userName': ''  # empty string -> no name filter
        }
        # Explicit timeout: requests.post would otherwise wait indefinitely.
        r = requests.post(qurl, data=qdata, timeout=30)
        r.raise_for_status()
        return r.json()

    def start_requests(self):
        """Seed one FormRequest per (announcement, page) pair.

        Page 0 is fetched synchronously only to learn ``totalPage``.
        NOTE(review): range(1, total_page) stops at total_page - 1 — if the
        API's pages are 1-based this skips the last page; confirm.
        """
        for announcement in self.get_announcement_list():
            rygs_id = announcement['id']  # avoid shadowing builtin `id`
            total_page = self.get_talent_list(rygs_id, 0)['totalPage']
            for page in range(1, total_page):
                yield scrapy.FormRequest(
                    'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list',
                    formdata={
                        'rygsId': str(rygs_id),
                        'pageNo': str(page),
                        'userName': ''
                    },
                    callback=self.parse,
                )

    def parse(self, response):
        """Extract the configured columns from each record on one page."""
        for talent in response.json()['list']:
            # `yield` is a statement, not a function — no parentheses needed.
            yield {c: talent[c] for c in self.columns}

这样吧