代码就不在这里贴了,GitHub上有,仓库里也有2023年5月7日爬取的数据。
GitHub链接:https://github.com/sqwqwqw1/Chinese-Auditors-Crawler
更新
中国注册会计师行业信息管理系统竟然大改版了,索性直接删除了以前的爬虫。
反而感觉清爽了不少。数据默认保存在本地的MongoDB中(mongodb\cicpa\cpa),如果要更换保存方式,修改pipeline即可。
数据来源
信息来自中国注册会计师行业信息管理系统和财政会计行业管理系统
逻辑
1、从财政会计行业管理系统查询注册会计师列表
2、记录注册会计师的注师证书编号和地区,在中国注册会计师行业信息管理系统发起检索,返回的数据示例如下:
{
"result": 1,
"msg": "success",
"info": {
"headInfo": {
"id": "0000010f-849c-6e98-8abc-aecb9e70286e",
"recver": null,
"perCode": "100000510872",
"loginPass": null,
"perstateCode": null,
"ascId": null,
"offId": "0000010f-8496-8857-403f-a440bedd405c",
"offGuid": null,
"proId": null,
"proIdName": null,
"quatypeCode": "01",
"certNum": "04070097",
"certTime": "",
"testWord": "无",
"regPlace": "110000",
"regTime": "1996-01-10",
"regGov": null,
"regWord": "会协字(1996)10号",
"perName": "马永香",
"perPy": null,
"gender": "2",
"brithDate": "1963-03-24",
"nationCode": "01",
"nativePlace": null,
"polityCode": "03",
"idCard": "110108196303246040",
"retireCode": "01",
"addr": "北京市宣武区广安门内大街319号广信嘉园319号C座9C室",
"phone": null,
"phoneDecode": null,
"mobile": null,
"mobileDecode": null,
"post": null,
"email": "[email protected]",
"profeName": "高级会计师",
"progradeCode": "02",
"stuexpCode": "05",
"degreeCode": null,
"major": "工程管理",
"college": "华东石油学院",
"englishDegree": null,
"hukouPlace": null,
"workPlace": null,
"filesPlace": "北京市人才交流中心",
"unittypeCode": null,
"enterTime": "2000-12-22",
"beforeUnit": "中洲会计师事务所",
"photoId": null,
"offDuty": "无",
"cardType": null,
"isInchg": null,
"areaCode": "01",
"isSafety": null,
"safetyUnit": null,
"offName": "北京中之光会计师事务所有限责任公司",
"partner": null,
"parMoney": null,
"partnerOffId": null,
"genderName": null,
"quatypeCodeName": null,
"perstateCodeName": null,
"remark": null,
"reservr1": null,
"reservr2": null,
"reservr3": null,
"strAge": null,
"currentPage": null,
"pageSize": null,
"areaName": null,
"certNumOrtestWord": null,
"isOffNormal": null,
"quatypeDesc": null,
"agreeGov": null,
"agreeTime": "1996-01-10",
"isPartner": "0",
"offAllCode": null,
"startDate": null,
"endDate": null,
"idcardNull": null,
"nameNull": null,
"birthNull": null,
"quatypeNull": null,
"stuexpNull": null,
"certnumNull": null,
"regwordNull": null,
"certOrTest": null,
"isStock": "0",
"startRegTime": null,
"endRegTime": null,
"displayItems": null,
"offCode": null,
"flag": "-1",
"npaGuid": "",
"newPhotoName": "",
"npaPhotoName": "",
"perPhotoName": "",
"srcPhotoPath": "",
"targetFileDir": "",
"isNquery": 1,
"perGuids": null,
"remarks": null,
"parGuid": null,
"regWords": null,
"reasons": null,
"times": null,
"isReNew": null,
"isFzyReNew": null,
"isZfAllow": null,
"impRes": null,
"impRow": 0,
"timeLimitCode": "1",
"perstateValue": null,
"quatypeValue": null,
"genderValue": null,
"nationValue": null,
"polityValue": null,
"retireValue": null,
"progradeValue": null,
"stuexpValue": null,
"degreeValue": null,
"unittypeValue": null,
"agreeTimeValue": null,
"proName": "海淀区",
"isExport": 0,
"ljcsAnnual": "领军人才年份",
"cpaetAnnual": "英语测试年份",
"khbg": "客户及报告情况",
"ywcxs": "40",
"rzzg": "最后一次通过任职资格检查时间",
"wcxs": "0",
"cjcf": "0",
"gyhd": "0",
"policyValue": "否",
"tableName": null,
"officeTableName": null,
"searchYear": null,
"searchMonth": null,
"parition": null,
"startAge": null,
"endAge": null,
"zynxType": null,
"zynx": null,
"regDate1": null,
"regDate2": null,
"offType": null,
"orgmode": null,
"startBirthDate": null,
"endBirthDate": null,
"proGuid": null,
"twoascGuid": null,
"idCardDecode": null
}
}
}
其实,如果不需要这么详细的数据,在财政会计行业管理系统的请求里把pageSize设置成999999,就可以一次导出所有数据。实际上我在start_requests里也是这么偷懒的,只是可能会牺牲一点性能。
Comments | 2 条评论
博主 2649340180
你好,我想知道里面每一个数据的name是什么,例如cardType这种,我自己在看行业信息网站上面的信息没有这么详细的,想问下作者这么多详细的数据怎么获得的,不胜感激,这里是我的联系方式13018013365
博主 Yaodo
@2649340180 我也不知道这些数据具体是什么,以前按F12是可以看到的,不过现在接口改了看不到。
好像也加了不少反爬措施,这个爬虫基本上是废了。