Training and validating binary classifiers
Grid search and holdout validation for logistic regression, XGBoost, and four other base classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score
import xgboost as xgb
import numpy as np
import warnings

warnings.simplefilter('ignore', DeprecationWarning)

# NOTE: GridSearchCV's deprecated iid argument is omitted (it was removed in scikit-learn 0.24)
x_train, x_test, y_train, y_test = train_test_split(
    df_std, df['target'], test_size=0.1, stratify=df['target'], random_state=10)

# Model 1: logistic regression
param_grid = [{'C': [0.000001 * (10 ** i) for i in range(10)],
               'penalty': ['l2'],
               'random_state': [15]}]
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
_model = GridSearchCV(LogisticRegression(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf1_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=12)
clf = LogisticRegression(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 2: random forest
param_grid = [{'n_estimators': [50 * (i + 1) for i in range(10)],
               'random_state': [123]}]
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=101)
_model = GridSearchCV(RandomForestClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf2_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=13)
clf = RandomForestClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 3: k-nearest neighbors
param_grid = [{'n_neighbors': [i + 4 for i in range(5)],
               'p': [1, 2],
               'metric': ['minkowski']}]
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=121)
_model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf3_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=14)
clf = KNeighborsClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 4: XGBoost
param_grid = [{'max_depth': [i + 1 for i in range(3)],
               'n_estimators': [25, 50, 75, 100],
               'random_state': [123]}]
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=104)
_model = GridSearchCV(xgb.XGBClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf4_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=16)
clf = xgb.XGBClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 5: multilayer perceptron
param_grid = [{'solver': ['lbfgs'],
               'alpha': [10**-5, 10**-4, 10**-3],
               'hidden_layer_sizes': [(i + 2, 2) for i in range(3)],
               'random_state': [123]}]
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=154)
_model = GridSearchCV(MLPClassifier(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf5_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=15)
clf = MLPClassifier(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))

# Model 6: support vector machine
param_grid = [{'C': [0.0001 * (10 ** i) for i in range(5)],
               'kernel': ['linear', 'rbf'],
               'gamma': [0.001 * (10 ** i) for i in range(5)],
               'random_state': [109],
               'probability': [True]}]
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
_model = GridSearchCV(SVC(), param_grid, cv=kf, scoring='f1')
_model.fit(x_train, y_train)
print(_model.best_params_)
clf6_params = _model.best_params_

_x_train, x_val, _y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, stratify=y_train, random_state=12)
clf = SVC(**_model.best_params_)
clf.fit(_x_train, _y_train)
pred = clf.predict(x_val)
print('f1_score', f1_score(y_val, pred))
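The six blocks above repeat the same tune-then-validate pattern: a 10-fold stratified grid search on the training set, followed by a refit on a 90/10 holdout split. The following sketch factors that pattern into a helper; the name tune_and_validate and its signature are my own shorthand, not part of the original code.

# A minimal sketch factoring out the repeated pattern above.
# The helper name tune_and_validate is an assumption, not from the original code.
def tune_and_validate(estimator_cls, param_grid, x_train, y_train,
                      cv_seed=123, split_seed=12):
    # 10-fold stratified grid search, optimizing F1
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=cv_seed)
    search = GridSearchCV(estimator_cls(), param_grid, cv=kf, scoring='f1')
    search.fit(x_train, y_train)
    print(search.best_params_)
    # Refit the best configuration on a 90/10 holdout split and report F1
    _x_train, x_val, _y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.1, stratify=y_train, random_state=split_seed)
    clf = estimator_cls(**search.best_params_)
    clf.fit(_x_train, _y_train)
    print('f1_score', f1_score(y_val, clf.predict(x_val)))
    return search.best_params_

# Example: Model 1 would reduce to
# clf1_params = tune_and_validate(LogisticRegression, param_grid, x_train, y_train)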
# Meta-model that predicts the class from the combined tendencies of the base models' predictions
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone

class StackingAveragedModels(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, meta_params=None, n_folds=5):
        self.base_models = base_models
        self.meta_base_model = meta_model              # the meta-model class
        self.meta_model = meta_model(**(meta_params or {}))
        self.n_folds = n_folds

    def fit(self, X, y):
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=123)
        # Out-of-fold predictions of the base models become the meta-model's features,
        # so the meta-model never sees a prediction made on data the base model trained on
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kf.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict_proba(X[holdout_index])[:, 1]
                out_of_fold_predictions[holdout_index, i] = y_pred
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        # Average each base model's per-fold probabilities, then feed them to the meta-model
        meta_features = np.column_stack([
            np.column_stack([model.predict_proba(X)[:, 1]
                             for model in base_models]).mean(axis=1)
            for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

    def best_params_(self, X, y, param_grid):
        # Grid-search the meta-model on the base models' out-of-fold predictions
        self.base_models_ = [list() for _ in self.base_models]
        kf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=123)
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kf.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict_proba(X[holdout_index])[:, 1]
                out_of_fold_predictions[holdout_index, i] = y_pred
        _model = GridSearchCV(self.meta_base_model(), param_grid, cv=kf, scoring='f1')
        # Fit on the meta-features, not on x_train (the original fit on x_train was a bug)
        _model.fit(out_of_fold_predictions, y)
        return _model.best_params_

# Stacking model
param_grid = [{'C': [0.1 + (0.1 * i) for i in range(10)],
               'penalty': ['l1', 'l2'],
               'solver': ['liblinear'],   # liblinear supports both l1 and l2 penalties
               'random_state': [19]}]
# Alternative meta-model grid: SVC
# param_grid = [{'C': [0.0001 * (10 ** i) for i in range(5)], 'kernel': ['linear', 'rbf'],
#                'gamma': [0.001 * (10 ** i) for i in range(5)], 'random_state': [114],
#                'probability': [True]}]

clf1 = LogisticRegression(**clf1_params)
clf2 = RandomForestClassifier(**clf2_params)
clf3 = KNeighborsClassifier(**clf3_params)
clf4 = xgb.XGBClassifier(**clf4_params)
clf5 = MLPClassifier(**clf5_params)
clf6 = SVC(**clf6_params)
meta_clf = LogisticRegression

averaged_models = StackingAveragedModels(
    base_models=(clf1, clf2, clf3, clf4, clf5, clf6), meta_model=meta_clf)
meta_params = averaged_models.best_params_(x_train.values, y_train.values, param_grid)
averaged_models = StackingAveragedModels(
    base_models=(clf1, clf2, clf3, clf4, clf5, clf6),
    meta_model=meta_clf, meta_params=meta_params)
averaged_models.fit(x_train.values, y_train.values)
print('f1_score', f1_score(y_train.values, averaged_models.predict(x_train.values)))
print('f1_score', f1_score(y_test.values, averaged_models.predict(x_test.values)))
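For reference, scikit-learn (0.22 and later) ships a built-in StackingClassifier that uses the same out-of-fold scheme. The snippet below is only a cross-check sketch, not part of the original pipeline; its scores will not match exactly, because StackingClassifier refits each base model once on the full training set for prediction instead of averaging the per-fold models.

# Cross-check with scikit-learn's built-in stacking (not in the original code)
from sklearn.ensemble import StackingClassifier

stack = StackingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3),
                ('xgb', clf4), ('mlp', clf5), ('svc', clf6)],
    final_estimator=LogisticRegression(**meta_params),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=123),
    stack_method='predict_proba')
stack.fit(x_train, y_train)
print('f1_score', f1_score(y_test, stack.predict(x_test)))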
Estimating the spread of the F1 score from randomly sampled data
# Ensemble learning: refit the stacked model on 10 different random splits
ls = []
for i in range(10):
    x_train, x_test, y_train, y_test = train_test_split(
        df_std, df['target'], test_size=0.1, stratify=df['target'], random_state=i)
    averaged_models = StackingAveragedModels(
        base_models=(clf1, clf2, clf3, clf4, clf5, clf6),
        meta_model=meta_clf, meta_params=meta_params)
    averaged_models.fit(x_train.values, y_train.values)
    ls.append(f1_score(y_test.values, averaged_models.predict(x_test.values)))
ls = np.array(ls)
print('f1_score: {:.4f} ± {:.4f}'.format(ls.mean(), ls.std(ddof=0)))
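If an interval estimate is preferred over mean ± standard deviation, a t-based confidence interval can be computed from the same ten scores. This is a sketch, not part of the original code: it adds scipy as a dependency and assumes the F1 scores are approximately normally distributed.

# Rough 95% confidence interval for the mean F1 (assumes normality; scipy is an addition)
from scipy import stats

mean, sem = ls.mean(), stats.sem(ls)   # stats.sem uses ddof=1 by default
lo, hi = stats.t.interval(0.95, df=len(ls) - 1, loc=mean, scale=sem)
print('95% CI for mean f1: [{:.4f}, {:.4f}]'.format(lo, hi))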