一样是记录一下这周的大数据管理研究课学了什么,以防以后忘了什么叫向前逐步回归。同时,水一篇教程。
为了能一次性输出所有内容,并不加修改直接复制到word中,我在里面添了很多无关紧要的代码,也有很多画蛇添足的操作,希望未来回来查资料的我能发现。
import pandas as pd
from patsy import dmatrices
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the abalone dataset (no header row in the file, so supply names).
names = 'Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings'.split(',')
abalone = pd.read_csv('Homework_2_abalone.txt', names=names)

# Partition by sex: M = male, F = female, I = infant.
abalone_M = abalone[abalone['Sex'] == 'M']
abalone_F = abalone[abalone['Sex'] == 'F']
abalone_I = abalone[abalone['Sex'] == 'I']

# 80/20 train/test splits; each split_* is a [train_df, test_df] pair.
split_abalone = train_test_split(abalone, test_size=0.2, random_state=3)
split_abalone_M = train_test_split(abalone_M, test_size=0.2, random_state=3)
split_abalone_F = train_test_split(abalone_F, test_size=0.2, random_state=3)
split_abalone_I = train_test_split(abalone_I, test_size=0.2, random_state=3)


# --------------  single-variable regression  -------------- #
def Unary_linear_regression(data, key):
    """Fit `Rings ~ key` on the train split and evaluate on the test split.

    Parameters
    ----------
    data : [train_df, test_df] pair as produced by train_test_split.
    key : str
        Name of the single predictor column.

    Returns
    -------
    tuple
        (key, slope coefficient, test MSE, test R²).
    """
    # Train on data[0].
    train_y, train_X = dmatrices('Rings ~ {}'.format(key), data[0])
    results = sm.OLS(train_y, train_X).fit()
    coef = results.params[1]  # params[0] is the intercept, params[1] the slope
    # Evaluate on data[1].
    test_y, test_X = dmatrices('Rings ~ {}'.format(key), data[1])
    y_hat = results.predict(test_X)
    MSE = mean_squared_error(test_y, y_hat)
    r2 = r2_score(test_y, y_hat)
    return (key, coef, MSE, r2)


print('单变量回归:', end='')
for data in [(split_abalone, 'All'), (split_abalone_M, 'Male'),
             (split_abalone_F, 'Female'), (split_abalone_I, 'Infant')]:
    # One single-variable model per numeric column (columns[1:-1] skips Sex and Rings).
    li = [Unary_linear_regression(data[0], key) for key in abalone.columns[1:-1]]
    df = pd.DataFrame(li, columns=['Key', 'Coefficient', 'MSE', 'R²'])
    print('\n' + data[1] + ':')
    print(df.sort_values(by='MSE').reset_index(drop=True))


# --------------  multiple regression  ---------------- #
def forward_selected(data, target):
    """Choose predictors by forward stepwise selection on AIC.

    Starting from an empty model, repeatedly add the candidate whose
    inclusion gives the lowest AIC; stop when no candidate improves it.

    Parameters
    ----------
    data : DataFrame containing `target`, 'Sex', and the candidate columns.
    target : str
        Response column name.

    Returns
    -------
    str
        A patsy formula 'target ~ var1+var2+...' ('target ~ 1' if no
        variable was selected).
    """
    variables = set(data.columns)
    variables.remove('Sex')  # categorical; excluded from the search
    variables.remove(target)
    selected = []
    current_score = float('inf')
    while variables:
        # Score every remaining candidate added to the current model.
        scores = []
        for candidate in variables:
            formula = '{} ~ {}'.format(target, '+'.join(selected + [candidate]))
            y, X = dmatrices(formula, data)
            scores.append((sm.OLS(y, X).fit().aic, candidate))
        best_new_score, best_candidate = min(scores)  # lowest AIC wins
        if best_new_score < current_score:
            variables.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
        else:
            break  # no candidate improves the model; stop
    # Fall back to an intercept-only model if nothing was ever selected,
    # so the returned formula is always valid for dmatrices.
    return '{} ~ {}'.format(target, '+'.join(selected) or '1')


def multiple_regression(data, target):
    """Select a model on the train split, fit it, and evaluate on the test split.

    Parameters
    ----------
    data : ([train_df, test_df], label) pair; the label is printed as a header.
    target : str
        Response column name.

    Returns
    -------
    tuple
        (formula, coefficients without the intercept, test MSE, test R²).
        Also prints the statsmodels fit summary as a side effect.
    """
    print('\n\n' + data[1] + ':')
    data = data[0]  # keep only the [train, test] pair
    formula = forward_selected(data[0], target)
    train_y, train_X = dmatrices(formula, data[0])
    results = sm.OLS(train_y, train_X).fit()
    coef = results.params
    test_y, test_X = dmatrices(formula, data[1])
    y_hat = results.predict(test_X)
    MSE = mean_squared_error(test_y, y_hat)
    r2 = r2_score(test_y, y_hat)
    print(results.summary())
    return formula, coef[1:], MSE, r2  # coef[1:] drops the intercept


def predict_to_dict(data, target):
    """Run multiple_regression and pack the result into a flat dict.

    Keys are the selected variable names plus 'MSE' and 'R²', so the
    per-group dicts can later be assembled into one summary DataFrame.
    """
    formula, coef, MSE, r2 = multiple_regression(data, target)
    keys = formula.split('~')[1].strip().split('+') + ['MSE', 'R²']
    values = coef.tolist() + [MSE, r2]
    return dict(zip(keys, values))


print('\n\n多元回归:', end='')
li = [predict_to_dict(data, 'Rings')
      for data in [(split_abalone, 'All'), (split_abalone_M, 'Male'),
                   (split_abalone_F, 'Female'), (split_abalone_I, 'Infant')]]
# Transpose so each group is a column; rows are variables/metrics.
df = pd.DataFrame(li).T
df.columns = ['All', 'Male', 'Female', 'Infant']
print('\n\n\n结果汇总:')
print(df)
Comments | NOTHING