学習曲線と検証曲線を使って、バイアスとバリアンスを可視化
以下をまずは見て。
train_size を指定することで、学習曲線を描画
import matplotlib.pyplot as plt from sklearn.model_selection import learning_curve pipe_lr = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', random_state=1)) ## cv => cross-validation train_sizes, train_scores, test_scores = learning_curve( estimator=pipe_lr, X=X_train, y=y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1 ) train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) ## 分散を誤差範囲として、fill_between でプロット。alpha は濃さを表す ## https://matplotlib.org/gallery/recipes/fill_between_alpha.html plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy') plt.fill_between(train_sizes, train_mean+train_std, train_mean-train_std, alpha=0.15, color='blue') plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy') plt.fill_between(train_sizes, test_mean+test_std, test_mean-test_std, alpha=0.15, color='green') plt.grid() plt.xlabel('Number of training sample') plt.ylabel('Accuracy') plt.legend(loc='lower right') plt.ylim([0.8, 1.05]) plt.tight_layout() plt.show()
param_range を指定することで、検証曲線を描画
from sklearn.model_selection import validation_curve param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] train_scores, test_scores = validation_curve( estimator=pipe_lr, X=X_train, y=y_train, param_name='logisticregression__C', param_range=param_range, cv=10 ) train_mean = np.mean(train_scores, axis=1) train_std = np.std(train_scores, axis=1) test_mean = np.mean(test_scores, axis=1) test_std = np.std(test_scores, axis=1) plt.plot(param_range, train_mean, color='blue', marker='o', markersize=5, label='training accuracy') plt.fill_between(param_range, train_mean+train_std, train_mean-train_std, alpha=0.15, color='blue') plt.plot(param_range, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy') plt.fill_between(param_range, test_mean+test_std, test_mean-test_std, alpha=0.15, color='green') plt.grid() plt.xscale('log') plt.xlabel('Parameter C') plt.ylabel('Accuracy') plt.legend(loc='lower right') plt.ylim([0.8, 1.0]) plt.tight_layout() plt.show()