Implementing an Ensemble Classifier
In general, an ensemble classifier achieves better performance than any of the individual classifiers it is built from.
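Why this tends to hold can be made concrete with a short calculation: assuming n independent base classifiers that each err with probability ε, the majority vote errs only when more than half of them are wrong at the same time, which is a binomial tail probability. The snippet below is a small illustrative sketch of that argument, not part of the pipeline built in this section (ensemble_error is a hypothetical helper):

from scipy.special import comb
import math

def ensemble_error(n_classifier, error):
    # The majority vote errs when at least ceil(n/2) of the
    # n independent base classifiers err at the same time.
    k_start = int(math.ceil(n_classifier / 2.0))
    probs = [comb(n_classifier, k) * error**k * (1 - error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

# Eleven base classifiers, each wrong 25% of the time: the
# majority vote is wrong only about 3.4% of the time.
print(ensemble_error(n_classifier=11, error=0.25))

First, prepare the data: the last 100 Iris samples (versicolor and virginica) restricted to two features (sepal width and petal length), split 50/50 into training and test sets.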
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1, stratify=y)
Classifiers to be ensembled
- Logistic regression classifier
- Decision tree classifier
- k-nearest neighbors classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy',
                              random_state=1)
# p is the exponent used in the distance computation (p=2 with
# metric='minkowski' is the Euclidean distance).
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and k-NN are scale-sensitive, so standardize
# their inputs inside a pipeline; the decision tree needs no scaling.
pipe1 = Pipeline([('sc', StandardScaler()), ('clf', clf1)])
pipe3 = Pipeline([('sc', StandardScaler()), ('clf', clf3)])

clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
10-fold cross validation:

ROC AUC: 0.87 (+/- 0.17) [Logistic regression]
ROC AUC: 0.89 (+/- 0.16) [Decision tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]
Comparison with the ensemble classifier
from sklearn.ensemble import VotingClassifier

# With voting='soft', the class label is predicted from the argmax of
# the sums of the predicted probabilities, which is recommended for an
# ensemble of well-calibrated classifiers.
vote_clf = VotingClassifier(
    estimators=[('pipe1', pipe1), ('clf2', clf2), ('pipe3', pipe3)],
    voting='soft')

clf_labels = ['Logistic regression', 'Decision tree', 'KNN',
              'VotingClassifier']
all_clf = [pipe1, clf2, pipe3, vote_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
ROC AUC: 0.87 (+/- 0.17) [Logistic regression]
ROC AUC: 0.89 (+/- 0.16) [Decision tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]
ROC AUC: 0.94 (+/- 0.13) [VotingClassifier]
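The soft-voting ensemble clearly beats each of its members in cross-validation. As a sanity check one might also score it on the held-out test split; the following is a minimal sketch of that step (not part of the original cells), using sklearn's roc_auc_score:

from sklearn.metrics import roc_auc_score

vote_clf.fit(X_train, y_train)
# With voting='soft', predict_proba is available; use the probability
# of the positive class as the score input for ROC AUC.
y_score = vote_clf.predict_proba(X_test)[:, 1]
print('Test ROC AUC: %0.2f' % roc_auc_score(y_true=y_test, y_score=y_score))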
Tuning the hyperparameters with grid search
Checking the parameters of vote_clf
print(vote_clf.get_params())
{'clf2': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
             max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=1, splitter='best'),
 'clf2__class_weight': None,
 'clf2__criterion': 'entropy',
 'clf2__max_depth': 1,
 'clf2__max_features': None,
 'clf2__max_leaf_nodes': None,
 'clf2__min_impurity_decrease': 0.0,
 'clf2__min_impurity_split': None,
 'clf2__min_samples_leaf': 1,
 'clf2__min_samples_split': 2,
 'clf2__min_weight_fraction_leaf': 0.0,
 'clf2__presort': False,
 'clf2__random_state': 1,
 'clf2__splitter': 'best',
 'estimators': [('pipe1', Pipeline(memory=None,
                     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                            ('clf', LogisticRegression(C=0.001, class_weight=None, dual=False,
                                fit_intercept=True, intercept_scaling=1, max_iter=100,
                                multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
                                solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])),
                ('clf2', DecisionTreeClassifier(class_weight=None, criterion='entropy',
                     max_depth=1, max_features=None, max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False,
                     random_state=1, splitter='best')),
                ('pipe3', Pipeline(memory=None,
                     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                            ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                metric='minkowski', metric_params=None, n_jobs=1,
                                n_neighbors=1, p=2, weights='uniform'))]))],
 'flatten_transform': None,
 'n_jobs': 1,
 'pipe1': Pipeline(memory=None,
              steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                     ('clf', LogisticRegression(C=0.001, class_weight=None, dual=False,
                         fit_intercept=True, intercept_scaling=1, max_iter=100,
                         multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
                         solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]),
 'pipe1__clf': LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                   penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
                   verbose=0, warm_start=False),
 'pipe1__clf__C': 0.001,
 'pipe1__clf__class_weight': None,
 'pipe1__clf__dual': False,
 'pipe1__clf__fit_intercept': True,
 'pipe1__clf__intercept_scaling': 1,
 'pipe1__clf__max_iter': 100,
 'pipe1__clf__multi_class': 'ovr',
 'pipe1__clf__n_jobs': 1,
 'pipe1__clf__penalty': 'l2',
 'pipe1__clf__random_state': 1,
 'pipe1__clf__solver': 'liblinear',
 'pipe1__clf__tol': 0.0001,
 'pipe1__clf__verbose': 0,
 'pipe1__clf__warm_start': False,
 'pipe1__memory': None,
 'pipe1__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe1__sc__copy': True,
 'pipe1__sc__with_mean': True,
 'pipe1__sc__with_std': True,
 'pipe1__steps': [('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                  ('clf', LogisticRegression(C=0.001, class_weight=None, dual=False,
                      fit_intercept=True, intercept_scaling=1, max_iter=100,
                      multi_class='ovr', n_jobs=1, penalty='l2', random_state=1,
                      solver='liblinear', tol=0.0001, verbose=0, warm_start=False))],
 'pipe3': Pipeline(memory=None,
              steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                     ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                         metric='minkowski', metric_params=None, n_jobs=1,
                         n_neighbors=1, p=2, weights='uniform'))]),
 'pipe3__clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                   metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform'),
 'pipe3__clf__algorithm': 'auto',
 'pipe3__clf__leaf_size': 30,
 'pipe3__clf__metric': 'minkowski',
 'pipe3__clf__metric_params': None,
 'pipe3__clf__n_jobs': 1,
 'pipe3__clf__n_neighbors': 1,
 'pipe3__clf__p': 2,
 'pipe3__clf__weights': 'uniform',
 'pipe3__memory': None,
 'pipe3__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe3__sc__copy': True,
 'pipe3__sc__with_mean': True,
 'pipe3__sc__with_std': True,
 'pipe3__steps': [('sc', StandardScaler(copy=True, with_mean=True, with_std=True)),
                  ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                      metric='minkowski', metric_params=None, n_jobs=1,
                      n_neighbors=1, p=2, weights='uniform'))],
 'voting': 'soft',
 'weights': None}
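The keys follow scikit-learn's double-underscore convention: nested estimators are addressed as <component>__<parameter>, so 'pipe1__clf__C' reaches the C of the logistic regression inside pipe1, and 'clf2__max_depth' the depth of the decision tree. To scan the tunable names without wading through the full dump, one can print just the keys:

# List only the parameter names; each double underscore descends
# one level into the nested estimators.
for name in sorted(vote_clf.get_params().keys()):
    print(name)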
Running the grid search
from sklearn.model_selection import GridSearchCV

params = {
    'clf2__max_depth': [1, 2],
    'pipe1__clf__C': [0.001, 1.0, 100.0]
}
grid = GridSearchCV(estimator=vote_clf, param_grid=params,
                    cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r" % (
        grid.cv_results_['mean_test_score'][r],
        grid.cv_results_['std_test_score'][r] / 2.0,
        grid.cv_results_['params'][r]))

print('')
print('Best parameters: %s' % grid.best_params_)
# best_score_ is the mean cross-validated ROC AUC of the best
# parameter setting, not accuracy.
print('ROC AUC: %.2f' % grid.best_score_)
0.933 +/- 0.07 {'clf2__max_depth': 1, 'pipe1__clf__C': 0.001}
0.973 +/- 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
0.973 +/- 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 100.0}
0.947 +/- 0.07 {'clf2__max_depth': 2, 'pipe1__clf__C': 0.001}
0.973 +/- 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 1.0}
0.973 +/- 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 100.0}

Best parameters: {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
ROC AUC: 0.97
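Since GridSearchCV refits by default (refit=True), grid.best_estimator_ holds the ensemble retrained on the full training split with the winning parameters. A short sketch of evaluating it on the held-out test data (again assuming ROC AUC as the metric; not part of the original cells):

from sklearn.metrics import roc_auc_score

best_clf = grid.best_estimator_  # already refit on all of X_train
y_score = best_clf.predict_proba(X_test)[:, 1]
print('Test ROC AUC: %0.2f' % roc_auc_score(y_true=y_test, y_score=y_score))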