- 交叉验证模型。
- 混淆矩阵。
- ROC曲线。
- Cohen's κ score。
import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import warnings warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

# Build a synthetic dataset: two Gaussian features and one categorical feature.
N = 1000  # number of samples
data = {
    'A': np.random.normal(100, 8, N),
    'B': np.random.normal(60, 5, N),
    'C': np.random.choice([1, 2, 3], size=N, p=[0.2, 0.3, 0.5]),
}
df = pd.DataFrame(data=data)


def get_label(A, B, C):
    """Return 1 if any labelling rule fires, otherwise 0.

    A sample is labelled 1 when A is below 95, when C equals 1, or when B
    lies outside the closed range [52, 68].
    """
    rule_fired = A < 95 or C == 1 or B > 68 or B < 52
    return 1 if rule_fired else 0


# Label every row from its three feature values.
df['label'] = [
    get_label(a, b, c) for a, b, c in zip(df['A'], df['B'], df['C'])
]

# Split into train (70%) and test (30%) sets.
X = np.asarray(df[['A', 'B', 'C']])
y = np.asarray(df['label'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
from sklearn import linear_model
from sklearn.model_selection import cross_val_score

# Fit a logistic-regression baseline on the training split.
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)

# The score is evaluated on the held-out test split (X_test, y_test), so the
# message names the test set rather than the train set.
print(">> Score of the classifier on the test set is: ", round(clf.score(X_test, y_test), 2))
>> Score of the classifier on the train set is: 0.74
交叉验证背后的想法很简单 - 我们选择一些数字k,通常k = 5或k = 10(5是sklearn中的默认值)。我们将数据分成k个大小相等的部分,并在其中的k - 1个部分上对机器学习模型进行训练,在剩余部分上对机器学习模型的性能进行检验。我们这样做k次,我们可以平均分数得到一个CV分数。
# 10-fold cross-validation of the classifier on the training split.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)
print('>> Mean CV score is: ', round(np.mean(scores), 3))

# Show how the per-fold scores are distributed.
pltt = sns.distplot(
    pd.Series(scores, name='CV scores distribution'),
    color='r',
)
>> Mean CV score is: 0.729
from sklearn.metrics import confusion_matrix

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns are the predicted classes. Passing the predictions
# first would transpose the matrix and swap false positives with false
# negatives.
C = confusion_matrix(y_test, clf.predict(X_test))
df_cm = pd.DataFrame(C, range(2), range(2))

# Draw the matrix as an annotated heatmap.
sns.set(font_scale=1.4)
pltt = sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="YlGnBu", fmt='g')
让我们仔细看看混淆矩阵。如果我们允许FP率(假阳性率)为1,即把所有样本都预测为正类,那么TP率(真阳性率)也将等于1;一般来说,如果TP率和FP率相等,我们的预测就与随机猜测一样好。
ROC曲线定义为TP率(真阳性率)作为FP率(假阳性率)的函数的图像。因此,根据上面的讨论,一个优于随机猜测的分类器的ROC曲线将位于直线 y = x 的上方。
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    roc_curve,
)

# Column 1 of predict_proba is passed as the score for each split.
train_scores = clf.predict_proba(X_train)[:, 1]
test_scores = clf.predict_proba(X_test)[:, 1]
pltt = plot_ROC(y_train, train_scores, y_test, test_scores)
(1)ROC曲线下面积(AUC)是衡量分类器质量的重要指标。ROC AUC是机器学习中常用的一种工具。
(2)图中标记的点是TP和FP rates,正如我们在混淆矩阵中看到的那样。
(3)如果ROC曲线位于直线 y = x 的下方,则意味着只要把分类器的预测结果反转,我们就可以得到一个有信息量(优于随机猜测)的分类器。下面是绘制ROC曲线的Python代码。
def _draw_roc_panel(ax, fpr, tpr, point_fpr, point_tpr, title):
    """Draw one ROC curve, the chance diagonal and a marked operating point."""
    ax.plot(fpr, tpr)
    ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
    ax.plot(point_fpr, point_tpr, marker='o', color='black')
    ax.text(point_fpr, point_tpr, s='(%.3f,%.3f)' % (point_fpr, point_tpr))
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)


def plot_ROC(y_train_true, y_train_prob, y_test_true, y_test_prob):
    """Plot the ROC curves of the train and test sets side by side.

    The operating threshold is chosen on the train set as the point that
    maximises sensitivity + specificity (TPR + 1 - FPR). The same threshold
    is then used to classify the test set, and the resulting (FPR, TPR)
    point is marked on the test panel.

    Args:
        y_train_true: true binary labels of the train set.
        y_train_prob: predicted scores/probabilities for the train set.
        y_test_true: true binary labels of the test set.
        y_test_prob: predicted scores/probabilities for the test set.

    Returns:
        The threshold selected on the train set.

    Side effects:
        Saves the figure with ``plt.savefig('ROC', dpi=500)`` and shows it.
    """
    fpr_train, tpr_train, thresholds_train = roc_curve(
        y_train_true, y_train_prob, pos_label=True)
    best_idx = np.argmax(tpr_train + (1 - fpr_train))
    best_threshold = thresholds_train[best_idx]
    best_fpr_train = fpr_train[best_idx]
    best_tpr_train = tpr_train[best_idx]
    # AUC must be computed from the continuous scores; feeding thresholded
    # 0/1 predictions would report the area of a single-point curve instead
    # of the area under the curve that is actually plotted.
    auc_train = roc_auc_score(y_train_true, y_train_prob)

    fig = plt.figure(figsize=(10, 5))
    _draw_roc_panel(fig.add_subplot(121), fpr_train, tpr_train,
                    best_fpr_train, best_tpr_train,
                    'ROC curve (Train), AUC = %.4f' % auc_train)

    fpr_test, tpr_test, _ = roc_curve(y_test_true, y_test_prob, pos_label=True)
    # roc_curve thresholds mean "score >= threshold is positive", so use >=
    # to stay consistent with how the threshold was selected.
    y_test_pred = np.asarray(y_test_prob) >= best_threshold
    cm_test = confusion_matrix(y_test_true, y_test_pred)
    auc_test = roc_auc_score(y_test_true, y_test_prob)
    # Rows of cm_test are true classes: row 1 = positives, row 0 = negatives.
    tpr_score = float(cm_test[1][1]) / (cm_test[1][1] + cm_test[1][0])
    fpr_score = float(cm_test[0][1]) / (cm_test[0][0] + cm_test[0][1])

    _draw_roc_panel(fig.add_subplot(122), fpr_test, tpr_test,
                    fpr_score, tpr_score,
                    'ROC curve (Test), AUC = %.4f' % auc_test)

    plt.savefig('ROC', dpi=500)
    plt.show()
    return best_threshold
Cohen's κ score
Cohen's κ score衡量两个分类器在同一数据上的一致性。它被定义为 κ = 1 − (1 − p_o) / (1 − p_e),其中 p_o 是观察到的一致性概率,p_e 是两个分类器随机(偶然)达成一致的概率。
from sklearn import svm

# Fit a second classifier (SVM) on the same training split.
clf2 = svm.SVC()
clf2.fit(X_train, y_train)

# Report the SVM's own score (clf2, not the logistic regression clf); it is
# evaluated on the held-out test split, so the message names the test set.
print(">> Score of the classifier on the test set is: ", round(clf2.score(X_test, y_test), 2))
>> Score of the classifier on the train set is: 0.74
# Predictions of both classifiers on the same test samples.
y = clf.predict(X_test)
y2 = clf2.predict(X_test)
n = len(y)

# Observed agreement: fraction of samples on which the predictions coincide.
p_o = (y == y2).sum() / n

# Random agreement: probability that both predict 1 plus both predict 0.
both_one = y.sum() * y2.sum() / (n ** 2)
both_zero = (1 - y).sum() * (1 - y2).sum() / (n ** 2)
p_e = both_one + both_zero

kappa = 1 - (1 - p_o) / (1 - p_e)
print(">> Cohen's Kappa score is: ", round(kappa, 2))
>> Cohen's Kappa score is: 0.4
这表明两个分类器之间存在一定的一致性。κ = 0 表示两个分类器的一致程度与随机情况相当(没有超出偶然的一致性),而当两者的一致程度低于随机水平时,κ < 0。