完全Follow的这一篇文章模型系列-PSM(stata实操),里面用到的数据集刚好是我以前装stata的时候自带的nlswork.dta,所以我可以直接跟着做。
import pandas as pd import numpy as np import statsmodels.api as sm from linearmodels import PanelOLS # 读取数据 data = pd.read_stata('nlswork.dta') data = data.set_index(['idcode', 'year']) # 数据描述 data.info() # 生成平方项 data['age2'] = np.power(data['age'], 2) data['ttl_exp2'] = np.power(data['ttl_exp'], 2) data['tenure2'] = np.power(data['tenure'], 2) # 设定协变量 xlist = ['grade', 'age', 'age2', 'ttl_exp', 'ttl_exp2', 'tenure', 'tenure2', 'not_smsa', 'south', 'race'] # 人为分割实验组(不是很科学!) data['treated'] = 0 data.loc[data[data.index.get_level_values(0)>2000].index, 'treated'] = 1 # 逻辑回归 formula = 'treated ~ {}'.format('+'.join(xlist)) logit_model = sm.Logit.from_formula(formula, data=data) p_result = logit_model.fit() print(p_result.summary()) # 倾向评分 data['ps'] = p_result.predict(data[xlist]) # 最近邻匹配 def neighbor_match(treated, nontreated, caliper=0.05): """ 1. 此函数并不充分科学; 2. 此函数并不充分高效。 """ treated = treated.reset_index() nontreated = nontreated.reset_index() matched = pd.DataFrame() non_matched = pd.DataFrame() for i in range(3): mapping_diff = lambda x: np.abs(x-treated['ps'][i]) nontreated['diff_ps'] = nontreated['ps'].map(mapping_diff) if nontreated['diff_ps'].min() < caliper: matched = matched.append(nontreated.loc[nontreated['diff_ps'].idxmin(),:]) else: non_matched = non_matched.append(treated.iloc[i,:]) matched = matched.set_index(['idcode', 'year']) try: non_matched = non_matched.set_index(['idcode', 'year']) except: pass return matched, non_matched treated = data[data['treated']==1] nontreated = data[data['treated']==0] matched, non_matched = neighbor_match(treated, nontreated, caliper=0.05)
Comments | NOTHING