Implementing an Ensemble Classifier
In general, an ensemble classifier achieves better performance than any of the individual classifiers it is built from.
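Why this tends to hold can be made concrete with a short calculation: assuming n independent base classifiers that each err with probability ε, the majority vote errs only when more than half of them are wrong at the same time, which is a binomial tail probability. The snippet below is a small illustrative sketch of that argument, not part of the pipeline built in this section (ensemble_error is a hypothetical helper):

from scipy.special import comb
import math

def ensemble_error(n_classifier, error):
    # The majority vote errs when at least ceil(n/2) of the
    # n independent base classifiers err at the same time.
    k_start = int(math.ceil(n_classifier / 2.0))
    probs = [comb(n_classifier, k) * error**k * (1 - error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

# Eleven base classifiers, each wrong 25% of the time: the
# majority vote is wrong only about 3.4% of the time.
print(ensemble_error(n_classifier=11, error=0.25))

First, prepare the data: the last 100 Iris samples (versicolor and virginica) restricted to two features (sepal width and petal length), split 50/50 into training and test sets.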
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1, stratify=y)
Classifiers to be ensembled
- Logistic regression classifier
- Decision tree classifier
- k-nearest neighbors classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy',
                              random_state=1)
# p is the exponent used in the distance computation (p=2 with
# metric='minkowski' is the Euclidean distance).
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and k-NN are scale-sensitive, so standardize
# their inputs inside a pipeline; the decision tree needs no scaling.
pipe1 = Pipeline([('sc', StandardScaler()), ('clf', clf1)])
pipe3 = Pipeline([('sc', StandardScaler()), ('clf', clf3)])

clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
10-fold cross validation:

ROC AUC: 0.87 (+/- 0.17) [Logistic regression]
ROC AUC: 0.89 (+/- 0.16) [Decision tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]
Comparison with the ensemble classifier
from sklearn.ensemble import VotingClassifier

# With voting='soft', the class label is predicted from the argmax of
# the sums of the predicted probabilities, which is recommended for an
# ensemble of well-calibrated classifiers.
vote_clf = VotingClassifier(
    estimators=[('pipe1', pipe1), ('clf2', clf2), ('pipe3', pipe3)],
    voting='soft')

clf_labels = ['Logistic regression', 'Decision tree', 'KNN',
              'VotingClassifier']
all_clf = [pipe1, clf2, pipe3, vote_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
ROC AUC: 0.87 (+/- 0.17) [Logistic regression]
ROC AUC: 0.89 (+/- 0.16) [Decision tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]
ROC AUC: 0.94 (+/- 0.13) [VotingClassifier]
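The soft-voting ensemble clearly beats each of its members in cross-validation. As a sanity check one might also score it on the held-out test split; the following is a minimal sketch of that step (not part of the original cells), using sklearn's roc_auc_score:

from sklearn.metrics import roc_auc_score

vote_clf.fit(X_train, y_train)
# With voting='soft', predict_proba is available; use the probability
# of the positive class as the score input for ROC AUC.
y_score = vote_clf.predict_proba(X_test)[:, 1]
print('Test ROC AUC: %0.2f' % roc_auc_score(y_true=y_test, y_score=y_score))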
Tuning the hyperparameters with grid search
Checking the parameters of vote_clf
print(vote_clf.get_params())
{'clf2': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
             max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=1, splitter='best'),
 'clf2__class_weight': None,
 'clf2__criterion': 'entropy',
 'clf2__max_depth': 1,
 'clf2__max_features': None,
 'clf2__max_leaf_nodes': None,
 'clf2__min_impurity_decrease': 0.0,
 'clf2__min_impurity_split': None,
 'clf2__min_samples_leaf': 1,
 'clf2__min_samples_split': 2,
 'clf2__min_weight_fraction_leaf': 0.0,
 'clf2__presort': False,
 'clf2__random_state': 1,
 'clf2__splitter': 'best',
 'estimators': [('pipe1', Pipeline(memory=None,
                     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                            ('clf', LogisticRegression(C=0.001, class_weight=None, dual=False,
                                fit_intercept=True, intercept_scaling=1, max_iter=100,
                                multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
                                solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])),
                ('clf2', DecisionTreeClassifier(class_weight=None, criterion='entropy',
                     max_depth=1, max_features=None, max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False,
                     random_state=1, splitter='best')),
                ('pipe3', Pipeline(memory=None,
                     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                            ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                metric='minkowski', metric_params=None, n_jobs=1,
                                n_neighbors=1, p=2, weights='uniform'))]))],
 'flatten_transform': None,
 'n_jobs': 1,
 'pipe1': Pipeline(memory=None,
              steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                     ('clf', LogisticRegression(C=0.001, class_weight=None, dual=False,
                         fit_intercept=True, intercept_scaling=1, max_iter=100,
                         multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
                         solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
 'pipe1__clf': LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                   penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
                   verbose=0, warm_start=False),
 'pipe1__clf__C': 0.001,
 'pipe1__clf__class_weight': None,
 'pipe1__clf__dual': False,
 'pipe1__clf__fit_intercept': True,
 'pipe1__clf__intercept_scaling': 1,
 'pipe1__clf__max_iter': 100,
 'pipe1__clf__multi_class': 'ovr',
 'pipe1__clf__n_jobs': 1,
 'pipe1__clf__penalty': 'l2',
 'pipe1__clf__random_state': 1,
 'pipe1__clf__solver': 'liblinear',
 'pipe1__clf__tol': 0.0001,
 'pipe1__clf__verbose': 0,
 'pipe1__clf__warm_start': False,
 'pipe1__memory': None,
 'pipe1__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe1__sc__copy': True,
 'pipe1__sc__with_mean': True,
 'pipe1__sc__with_std': True,
 'pipe1__steps': [('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                  ('clf', LogisticRegression(C=0.001, class_weight=None, dual=False,
                      fit_intercept=True, intercept_scaling=1, max_iter=100,
                      multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
                      solver='liblinear', tol=0.0001, verbose=0, warm_start=False))],
 'pipe3': Pipeline(memory=None,
              steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                     ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                         metric='minkowski', metric_params=None, n_jobs=1,
                         n_neighbors=1, p=2, weights='uniform'))]),
 'pipe3__clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                   metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform'),
 'pipe3__clf__algorithm': 'auto',
 'pipe3__clf__leaf_size': 30,
 'pipe3__clf__metric': 'minkowski',
 'pipe3__clf__metric_params': None,
 'pipe3__clf__n_jobs': 1,
 'pipe3__clf__n_neighbors': 1,
 'pipe3__clf__p': 2,
 'pipe3__clf__weights': 'uniform',
 'pipe3__memory': None,
 'pipe3__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe3__sc__copy': True,
 'pipe3__sc__with_mean': True,
 'pipe3__sc__with_std': True,
 'pipe3__steps': [('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                  ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                      metric='minkowski', metric_params=None, n_jobs=1,
                      n_neighbors=1, p=2, weights='uniform'))],
 'voting': 'soft',
 'weights': None}
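The keys follow scikit-learn's double-underscore convention: nested estimators are addressed as <component>__<parameter>, so 'pipe1__clf__C' reaches the C of the logistic regression inside pipe1, and 'clf2__max_depth' the depth of the decision tree. To scan the tunable names without wading through the full dump, one can print just the keys:

# List only the parameter names; each double underscore descends
# one level into the nested estimators.
for name in sorted(vote_clf.get_params().keys()):
    print(name)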
Running the grid search
from sklearn.model_selection import GridSearchCV

params = {
    'clf2__max_depth': [1, 2],
    'pipe1__clf__C': [0.001, 1.0, 100.0]
}
grid = GridSearchCV(estimator=vote_clf, param_grid=params,
                    cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r" % (
        grid.cv_results_['mean_test_score'][r],
        grid.cv_results_['std_test_score'][r] / 2.0,
        grid.cv_results_['params'][r]))

print('')
print('Best parameters: %s' % grid.best_params_)
# best_score_ is the mean cross-validated ROC AUC of the best
# parameter setting, not accuracy.
print('ROC AUC: %.2f' % grid.best_score_)
0.933 +/- 0.07 {'clf2__max_depth': 1, 'pipe1__clf__C': 0.001}
0.973 +/- 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
0.973 +/- 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 100.0}
0.947 +/- 0.07 {'clf2__max_depth': 2, 'pipe1__clf__C': 0.001}
0.973 +/- 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 1.0}
0.973 +/- 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 100.0}

Best parameters: {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
ROC AUC: 0.97
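Since GridSearchCV refits by default (refit=True), grid.best_estimator_ holds the ensemble retrained on the full training split with the winning parameters. A short sketch of evaluating it on the held-out test data (again assuming ROC AUC as the metric; not part of the original cells):

from sklearn.metrics import roc_auc_score

best_clf = grid.best_estimator_  # already refit on all of X_train
y_score = best_clf.predict_proba(X_test)[:, 1]
print('Test ROC AUC: %0.2f' % roc_auc_score(y_true=y_test, y_score=y_score))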