ランダムフォレストで特徴選択する方法
特徴量の重要度に基づいて特徴選択する方法を本から抜粋（次元削減による特徴抽出とは別の手法）
# Feature selection via random-forest feature importances: rank the Wine
# dataset's features, plot the ranking, then keep only the features whose
# importance exceeds a threshold using SelectFromModel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Wine dataset: no header row; column 0 is the class label, 1..13 are features.
df_wine = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None)
feat_labels = df_wine.columns[1:]

# NOTE(review): X_train / y_train are assumed to come from an earlier
# train/test split of df_wine — confirm against the preceding cells.
forest = RandomForestClassifier(n_estimators=500, random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
# argsort returns the indices that would sort the array; [::-1] -> descending.
indices = np.argsort(importances)[::-1]

# Bar chart of all features, most important first.
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), importances[indices], align='center')
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.tight_layout()
plt.show()

# Keep only features with importance >= 0.1; prefit=True reuses the
# already-fitted forest instead of refitting inside SelectFromModel.
sfm = SelectFromModel(forest, threshold=0.1, prefit=True)
X_selected = sfm.transform(X_train)
print('Number of samples that meet this criterion:', X_selected.shape[0])
# The selected columns are the top-ranked ones, so indices[f] walks them
# in descending importance order.
for f in range(X_selected.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[indices[f]],
                            importances[indices[f]]))
Number of samples that meet this criterion: 124 1) 13 0.185453 2) 7 0.174751 3) 10 0.143920 4) 12 0.136162 5) 1 0.118529