Training and validating binary classifiers

Hold-out validation of six classifiers: logistic regression, random forest, k-nearest neighbors, XGBoost, a multilayer perceptron, and an SVM. Each model's hyperparameters are tuned with GridSearchCV under stratified 10-fold cross-validation, and the tuned model is then scored on a held-out validation split.

import warnings

import numpy as np
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score

warnings.simplefilter('ignore', DeprecationWarning)

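The split below assumes df and df_std from an earlier step: df is a DataFrame with a binary 'target' column, and df_std holds the standardized feature columns. A minimal sketch of those inputs, using the scikit-learn breast-cancer data purely as a stand-in for the actual dataset:

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Placeholder data: substitute the real dataset used by this analysis.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df_std = pd.DataFrame(StandardScaler().fit_transform(df.drop(columns='target')),
                      columns=data.feature_names)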

x_train, x_test, y_train, y_test = train_test_split(df_std, df['target'], test_size=0.1, stratify=df['target'], random_state=10)

# Model 1: logistic regression

param_grid = [{'C': [10.0 ** i for i in range(-6, 4)], 'penalty': ['l2'], 'random_state': [15]}]

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

_model = GridSearchCV(LogisticRegression(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf1_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train, random_state=12)
clf = LogisticRegression(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 2: random forest

param_grid = [{'n_estimators': [50 * (i + 1) for i in range(10)], 'random_state': [123]}]

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=101)

_model = GridSearchCV(RandomForestClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)

clf2_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train, random_state=13)
clf = RandomForestClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 3: k-nearest neighbors

param_grid = [{'n_neighbors': [i + 4 for i in range(5)], 'p': [1, 2], 'metric': ['minkowski']}]

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=121)

_model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)

clf3_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train, random_state=14)
clf = KNeighborsClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 4: XGBoost

param_grid = [{'max_depth': [i + 1 for i in range(3)], 'n_estimators': [25, 50, 75, 100], 'random_state': [123]}]

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=104)

_model = GridSearchCV(xgb.XGBClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)

clf4_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train, random_state=16)
clf = xgb.XGBClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 5: multilayer perceptron

param_grid = [
    {
        'solver': ['lbfgs'],
        'alpha': [10**-5, 10**-4, 10**-3],
        'hidden_layer_sizes': [(i + 2, 2) for i in range(3)],
        'random_state': [123]
    }
] 

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=154)

_model = GridSearchCV(MLPClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)

clf5_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train, random_state=15)
clf = MLPClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 6: support vector machine

from sklearn.svm import SVC

param_grid = [
    {
        'C': [10.0 ** i for i in range(-4, 1)],
        'kernel': ['linear', 'rbf'],
        'gamma': [10.0 ** i for i in range(-3, 2)],
        'random_state': [109],
        'probability': [True]  # needed so the stacking step can call predict_proba
    }
]

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)

_model = GridSearchCV(SVC(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf6_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(x_train, y_train, test_size=0.1, stratify=y_train, random_state=12)
clf = SVC(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))
# Stacking: a meta-model learns to predict the class from the out-of-fold predictions of the base models above

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone

class StackingAveragedModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, meta_params=None, n_folds=5):
        self.base_models = base_models
        self.meta_base_model = meta_model  # the meta-model class itself
        self.meta_model = meta_model(**(meta_params or {}))
        self.n_folds = n_folds
   
    def fit(self, X, y):
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=123)

        # Out-of-fold prediction matrix: one row per sample, one column per
        # base model; each entry is P(y=1) from a clone that did not see
        # that sample during training.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kf.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict_proba(X[holdout_index])[:, 1]
                out_of_fold_predictions[holdout_index, i] = y_pred

        # The meta-model is trained on the out-of-fold predictions only.
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    def predict(self, X):
        # For each base model, average the predicted probabilities of its
        # per-fold clones, then classify with the meta-model.
        meta_features = np.column_stack([
            np.column_stack([model.predict_proba(X)[:, 1] for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

    def best_params_(self, X, y, param_grid):
        # Rebuild the out-of-fold prediction matrix exactly as in fit(), then
        # grid-search the meta-model's hyperparameters on it.
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=123)

        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kf.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict_proba(X[holdout_index])[:, 1]
                out_of_fold_predictions[holdout_index, i] = y_pred

        _model = GridSearchCV(self.meta_base_model(), param_grid, cv=kf, scoring='f1')
        # Fit on the out-of-fold predictions (the meta-model's actual inputs),
        # not on the raw training features.
        _model.fit(out_of_fold_predictions, y)
        return _model.best_params_
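Note that best_params_ here is a method that runs its own grid search over the out-of-fold predictions; this departs from the scikit-learn convention, where best_params_ is an attribute set by fit, so the class is not a drop-in scikit-learn estimator.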

# Stacking model: combine the six tuned base models with a logistic-regression meta-model

# liblinear supports both the l1 and l2 penalties (the default lbfgs solver is l2-only)
param_grid = [{'C': [0.1 + (0.1 * i) for i in range(10)], 'penalty': ['l1', 'l2'], 'solver': ['liblinear'], 'random_state': [19]}]
# param_grid = [{'C': [0.0001 * (10 ** i) for i in range(5)], 'kernel': ['linear', 'rbf'], 'gamma': [0.001 * (10 ** i) for i in range(5)], 'random_state': [114], 'probability': [True]}] 

clf1 = LogisticRegression(**clf1_params)
clf2 = RandomForestClassifier(**clf2_params)
clf3 = KNeighborsClassifier(**clf3_params)
clf4 = xgb.XGBClassifier(**clf4_params)
clf5 = MLPClassifier(**clf5_params)
clf6 = SVC(**clf6_params)

meta_clf = LogisticRegression  # pass the class itself; StackingAveragedModels instantiates it

averaged_models = StackingAveragedModels(base_models=(clf1, clf2, clf3, clf4, clf5, clf6), meta_model=meta_clf)


meta_params = averaged_models.best_params_(x_train.values, y_train.values, param_grid)
averaged_models = StackingAveragedModels(base_models=(clf1, clf2, clf3, clf4, clf5, clf6), meta_model=meta_clf, meta_params=meta_params)
averaged_models.fit(x_train.values, y_train.values)
print('f1_score (train)', f1_score(y_train.values, averaged_models.predict(x_train.values)))
print('f1_score (test)', f1_score(y_test.values, averaged_models.predict(x_test.values)))
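Comparing the training and test F1 scores gives a quick check on how much the stacked model overfits its training split.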

Estimating the spread of the F1 score across randomly sampled train/test splits

# Ensemble learning: repeat the hold-out evaluation over ten random splits
ls = []

for i in range(10):
    x_train, x_test, y_train, y_test = train_test_split(df_std, df['target'], test_size=0.1, stratify=df['target'], random_state=i)
    averaged_models = StackingAveragedModels(base_models=(clf1, clf2, clf3, clf4, clf5, clf6), meta_model=meta_clf, meta_params=meta_params)
    averaged_models.fit(x_train.values, y_train.values)
    ls.append(f1_score(y_test.values, averaged_models.predict(x_test.values)))

ls = np.array(ls)
print('f1_score: {:.4f} ± {:.4f}'.format(ls.mean(), ls.std(ddof=0)))
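Reporting the mean and standard deviation over the ten splits shows how sensitive a single hold-out F1 score is to the particular split, which a one-shot evaluation cannot reveal.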

Reference: Kaggle Ensembling Guide | MLWave