| | 样本 1 | 样本 2 | 合计 |
---|---|---|---|
诊断试验 1 | 43 真阳 TP | 5 假阳 FP | 48 |
诊断试验 2 | 7 假阴 FN | 45 真阴 TN | 52 |
合计 | 50 | 50 | 100 |
TPR, FPR 的值都在 [0,1] 内
样本 ID | 原本类别 | 预测为正 | 样本 ID | 原本类别 | 预测为正 |
---|---|---|---|---|---|
1 | 阳 | 0.95 | 6 | 阴 | 0.53 |
2 | 阳 | 0.86 | 7 | 阴 | 0.52 |
3 | 阴 | 0.70 | 8 | 阴 | 0.43 |
4 | 阳 | 0.65 | 9 | 阳 | 0.42 |
5 | 阳 | 0.55 | 10 | 阴 | 0.35 |
# Split the data into a training set and a test set (20% held out,
# stratified on the label, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=33, stratify=y
)

# Score the test set with the model; column 1 of predict_proba is kept.
y_predict = lr.predict_proba(x_test)[:, 1]

# Print (score rounded to 3 decimals, true label) pairs for test samples
# 1000..1055, one pair per line.
pairs = zip(y_predict[1000:1056].round(3), y_test[1000:1056])
print('\n'.join(str(pair) for pair in pairs))
(0.383, 1.0) (0.297, 0.0) (0.011, 0.0) (0.105, 0.0) (0.349, 0.0) (0.248, 0.0) (0.457, 0.0) (0.119, 0.0)
(0.383, 1.0) (0.297, 0.0) (0.011, 0.0) (0.105, 0.0) (0.349, 0.0) (0.248, 0.0) (0.457, 0.0) (0.119, 0.0)
(0.042, 0.0) (0.036, 0.0) (0.069, 0.0) (0.100, 0.0) (0.003, 0.0) (0.070, 0.0) (0.093, 0.0) (0.138, 0.0)
(0.042, 0.0) (0.036, 0.0) (0.069, 0.0) (0.100, 0.0) (0.003, 0.0) (0.070, 0.0) (0.093, 0.0) (0.138, 0.0)
(0.569, 0.0) (0.026, 0.0) (0.054, 0.0) (0.088, 0.0) (0.247, 0.0) (0.965, 0.0) (0.235, 0.0) (0.303, 0.0)
(0.569, 0.0) (0.026, 0.0) (0.054, 0.0) (0.088, 0.0) (0.247, 0.0) (0.965, 0.0) (0.235, 0.0) (0.303, 0.0)
(0.022, 0.0) (0.385, 0.0) (0.990, 1.0) (0.012, 0.0) (0.995, 1.0) (0.086, 0.0) (0.285, 1.0) (0.137, 0.0)
(0.022, 0.0) (0.385, 0.0) (0.990, 1.0) (0.012, 0.0) (0.995, 1.0) (0.086, 0.0) (0.285, 1.0) (0.137, 0.0)
(0.452, 0.0) (0.131, 0.0) (0.011, 0.0) (0.919, 0.0) (0.126, 0.0) (0.945, 1.0) (0.114, 0.0) (0.038, 1.0)
(0.452, 0.0) (0.131, 0.0) (0.011, 0.0) (0.919, 0.0) (0.126, 0.0) (0.945, 1.0) (0.114, 0.0) (0.038, 1.0)
(0.098, 0.0) (0.040, 0.0) (0.081, 0.0) (0.071, 1.0) (0.999, 1.0) (0.032, 0.0) (0.051, 0.0) (0.092, 0.0)
(0.098, 0.0) (0.040, 0.0) (0.081, 0.0) (0.071, 1.0) (0.999, 1.0) (0.032, 0.0) (0.051, 0.0) (0.092, 0.0)
(0.456, 0.0) (0.197, 0.0) (0.022, 0.0) (0.379, 0.0) (0.174, 0.0) (0.054, 0.0) (0.305, 0.0) (0.371, 0.0)
(0.456, 0.0) (0.197, 0.0) (0.022, 0.0) (0.379, 0.0) (0.174, 0.0) (0.054, 0.0) (0.305, 0.0) (0.371, 0.0)
`y_predict` 中是预测为违约的概率,`y_test` 是 `Default` 一列的值。

ID | 实际 | 预测为正 | 阈值 = 1.0 | 属于 |
---|---|---|---|---|
1 | 阳 | 0.95 | | |
2 | 阳 | 0.86 | | |
3 | 阴 | 0.70 | | |
4 | 阳 | 0.65 | | |
5 | 阳 | 0.55 | | |
6 | 阴 | 0.53 | | |
7 | 阴 | 0.52 | | |
8 | 阴 | 0.43 | | |
9 | 阳 | 0.42 | | |
10 | 阴 | 0.35 | | |
FPR = 被预测为阳性的阴性数 / 实际的阴性数 = 0.0
TPR = 被预测为阳性的阳性数 / 实际的阳性数 = 0.0
# Load the data set and a previously trained model, then compute the ROC
# curve and the AUC of that model on the held-out test split.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# `sklearn.externals.joblib` was removed from scikit-learn (0.23+); prefer the
# standalone package and fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

data = pd.read_table('dataset13.txt', sep='\t')
y = data['Default'].values                      # label column
x = data.drop(['Default'], axis=1).values       # every other column

# Split into training and test sets (20% test, stratified on the label).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=33, stratify=y)

# Load the trained model.
# NOTE: joblib.load unpickles the file -- only load model files you trust.
lr = joblib.load("train_model.m")
y_predict = lr.predict_proba(x_test)[:, 1]

# metrics.roc_curve() yields fpr, tpr and the thresholds they correspond to.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_predict)

# metrics.auc() integrates the (fpr, tpr) curve to give roc_auc.
roc_auc = metrics.auc(fpr, tpr)
# Draw the ROC curve on an 8 x 6 figure.
fig, ax = plt.subplots(figsize=(8, 6))

# ROC curve itself, labelled with the AUC rounded to two decimals.
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')

# Show the legend in the lower-right corner.
ax.legend(loc='lower right')

# Red dashed diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], 'r--')

# Axis ranges, padded slightly beyond [0, 1].
ax.set_xlim(-0.01, 1.01)
ax.set_ylim(-0.01, 1.01)

# Axis labels and figure title.
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.set_title('Receiver Operating Characteristic Curve')

plt.show()
# Precision-recall curve for the same test-set predictions.
pre, rec, the2 = metrics.precision_recall_curve(y_test, y_predict)

# Precision is drawn on the x axis and recall on the y axis, matching the
# axis labels set below.
plt.plot(pre, rec)

# Axis ranges, padded slightly beyond [0, 1].
plt.xlim(-0.01, 1.01)
plt.ylim(-0.01, 1.01)

# Axis labels and figure title.
plt.xlabel('Precision Rate')
plt.ylabel('Recall Rate')
plt.title('Precision-Recall Curve')

plt.show()
# Load the data set and the saved model from "train_model2.m", then compute
# the ROC curve and the AUC of that model on the held-out test split.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# `sklearn.externals.joblib` was removed from scikit-learn (0.23+); prefer the
# standalone package and fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

data = pd.read_table('dataset13.txt', sep='\t')
y = data['Default'].values                      # label column
x = data.drop(['Default'], axis=1).values       # every other column

# Split into training and test sets (20% test, stratified on the label).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=33, stratify=y)

# Load the trained model; the file must exist before this script is run.
# NOTE: joblib.load unpickles the file -- only load model files you trust.
rf_clf = joblib.load("train_model2.m")
y_predict = rf_clf.predict_proba(x_test)[:, 1]

# metrics.roc_curve() yields fpr, tpr and the thresholds they correspond to.
fpr, tpr, threshold = metrics.roc_curve(y_test, y_predict)

# metrics.auc() integrates the (fpr, tpr) curve to give roc_auc.
roc_auc = metrics.auc(fpr, tpr)
# Draw the ROC curve on an 8 x 6 figure.
fig, ax = plt.subplots(figsize=(8, 6))

# ROC curve itself, labelled with the AUC rounded to two decimals.
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')

# Show the legend in the lower-right corner.
ax.legend(loc='lower right')

# Red dashed diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], 'r--')

# Axis ranges, padded slightly beyond [0, 1].
ax.set_xlim(-0.01, 1.01)
ax.set_ylim(-0.01, 1.01)

# Axis labels and figure title.
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.set_title('Receiver Operating Characteristic Curve')

plt.show()
模型报错了!
# Train a random-forest classifier with a small grid search, print its AUC on
# the test split, and save the fitted search object to "train_model2.m".
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

# `sklearn.externals.joblib` was removed from scikit-learn (0.23+); prefer the
# standalone package and fall back to the vendored copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

data = pd.read_table('dataset13.txt', sep='\t')
y = data['Default'].values                      # label column
x = data.drop(['Default'], axis=1).values       # every other column

# Same split parameters as the evaluation snippets (20% test, seed 33,
# stratified on the label).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=33, stratify=y)

rf = RandomForestClassifier()

# Hyper-parameters to tune.
tuned_parameters = {'n_estimators': [180, 190], 'max_depth': [8, 10]}

# Grid search scored by ROC AUC with 5-fold cross-validation on 2 workers.
rf_clf = GridSearchCV(rf, tuned_parameters, scoring='roc_auc', n_jobs=2, cv=5)
rf_clf.fit(x_train, y_train)

# Evaluate on the held-out test split.
y_predict = rf_clf.predict_proba(x_test)[:, 1]
test_auc = roc_auc_score(y_test, y_predict)
print('随机森林模型test AUC:')
print(test_auc)

# Persist the fitted search object under the file name "train_model2.m".
joblib.dump(rf_clf, 'train_model2.m')
将训练好的模型下载后,上传至 7.3