Clustering: Implementing DBSCAN
Among clustering algorithms, DBSCAN assigns cluster labels without assuming that clusters have a spherical shape.
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons

# two interleaving half-moons, a shape that centroid-based methods handle poorly
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)
plt.scatter(X[:, 0], X[:, 1])
plt.tight_layout()
plt.show()
from sklearn.cluster import DBSCAN

"""
Labels each sample as a core point, border point, or noise point.
eps         : maximum distance between two samples for them to be
              considered neighbors
min_samples : minimum number of neighbors required for a core point
metric      : distance metric
"""
db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1], c='lightblue', marker='o',
            s=40, edgecolor='black', label='cluster 1')
plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1], c='red', marker='s',
            s=40, edgecolor='black', label='cluster 2')
plt.legend()
plt.tight_layout()
plt.show()
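Note that fit_predict marks noise points with the label -1, which the scatter plot above silently drops. A quick check (reusing the y_db from the block above):

import numpy as np

# DBSCAN labels noise points as -1
print('Noise points: %d' % np.sum(y_db == -1))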
Grouping with bottom-up (agglomerative) hierarchical clustering
Creating the data
Create a random matrix of shape (5, 3).
import pandas as pd
import numpy as np

np.random.seed(123)
variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']
X = np.random.random_sample([5, 3]) * 10
# wrap the matrix in a pandas DataFrame
df = pd.DataFrame(X, columns=variables, index=labels)
Computing the distances
from scipy.spatial.distance import pdist, squareform

# pdist      : computes the pairwise distances
# squareform : expands the condensed distance matrix returned by pdist
#              into a symmetric matrix
row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')),
                        columns=labels, index=labels)
from scipy.cluster.hierarchy import linkage

# method='complete': complete linkage; at each step, merge the two clusters
# whose most distant members are closest to each other
row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')
Running hierarchical clustering
# 'no. of items in clust.': number of samples in the newly merged cluster
pd.DataFrame(
    row_clusters,
    columns=['row label 1', 'row label 2',
             'distance', 'no. of items in clust.'],
    index=['cluster %d' % (i + 1) for i in range(row_clusters.shape[0])]
)
- Running it with sklearn
from sklearn.cluster import AgglomerativeClustering

ac = AgglomerativeClustering(n_clusters=3, affinity='euclidean',
                             linkage='complete')
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)
Cluster labels: [1 0 0 2 1]
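The same flat clusters can also be pulled straight out of the scipy linkage matrix; a minimal sketch using scipy's fcluster (its labels start at 1 and may be numbered differently from sklearn's):

from scipy.cluster.hierarchy import fcluster

# cut the dendrogram so that at most 3 flat clusters remain
print(fcluster(row_clusters, t=3, criterion='maxclust'))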
Displaying the linkage matrix as a dendrogram
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

# note: the `labels` variable was overwritten by fit_predict above,
# so pass the original row IDs explicitly
row_dendr = dendrogram(row_clusters, labels=list(df.index))
plt.ylabel('Euclidean distance')
plt.tight_layout()
plt.show()
Visualizing with a heat map
fig = plt.figure(figsize=(8, 8), facecolor='white')
# [x position, y position, width, height]
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
# orientation: rotate the dendrogram so it lies on its side
row_dendr = dendrogram(row_clusters, orientation='left')

# the leaf order after clustering comes from 'leaves';
# the heat map is placed to the right of the dendrogram
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')
axd.set_xticks([])
axd.set_yticks([])
for i in axd.spines.values():
    i.set_visible(False)
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))  # was set_xticklabels twice
plt.show()
Apparently NULLs don't matter when taking MAX in MySQL
When taking MAX in MySQL, I wondered how it would behave if the column contained NULLs.
It turned out there was nothing to worry about: the aggregate functions simply skip NULL values, as the session below shows.
mysql> select (NULL < 1);
+------------+
| (NULL < 1) |
+------------+
|       NULL |
+------------+
1 row in set (0.00 sec)

mysql> select (NULL > 0);
+------------+
| (NULL > 0) |
+------------+
|       NULL |
+------------+
1 row in set (0.00 sec)

mysql> select * FROM (
    SELECT NULL v
    UNION SELECT 0 v
    UNION SELECT 5 v
    UNION SELECT 10 v
) t;
+------+
| v    |
+------+
| NULL |
|    0 |
|    5 |
|   10 |
+------+
4 rows in set (0.00 sec)

mysql> SELECT SUM(t.v), MIN(t.v), MAX(t.v), COUNT(t.v), AVG(t.v)
FROM (
    SELECT NULL v
    UNION SELECT 0 v
    UNION SELECT 5 v
    UNION SELECT 10 v
) t;
+----------+----------+----------+------------+----------+
| SUM(t.v) | MIN(t.v) | MAX(t.v) | COUNT(t.v) | AVG(t.v) |
+----------+----------+----------+------------+----------+
|       15 |        0 |       10 |          3 |   5.0000 |
+----------+----------+----------+------------+----------+
1 row in set (0.00 sec)
Validating cluster analysis on unlabeled data
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# create a sample dataset for clustering
X, y = make_blobs(n_samples=150, n_features=2, centers=3,
                  cluster_std=0.5, shuffle=True, random_state=1)
# plot the samples
plt.scatter(X[:, 0], X[:, 1], c='white', marker='o',
            edgecolors='black', s=50)
plt.grid()
plt.tight_layout()
plt.show()
The elbow method
Visualize which value of k is optimal; a good choice is the k around which the "elbow" forms.
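Here "distortion" is the within-cluster sum of squared errors (SSE), which scikit-learn exposes as the inertia_ attribute; with μj the centroid of cluster Cj:

SSE = Σj Σ(x ∈ Cj) || x - μj ||^2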
from sklearn.cluster import KMeans

distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, init='k-means++',
                max_iter=300, random_state=0)
    km.fit(X)
    distortions.append(km.inertia_)

plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.show()
Silhouette plots
Compute the silhouette coefficient of each sample from its cluster's cohesion and separation, and draw the plot.
The closer a silhouette coefficient is to 1, the better the sample fits its cluster.
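Concretely, for a sample i, with a(i) the mean distance to the other samples in its own cluster (cohesion) and b(i) the mean distance to the samples in the nearest other cluster (separation):

s(i) = ( b(i) - a(i) ) / max( a(i), b(i) )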
import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples

km = KMeans(n_clusters=3, init='k-means++', n_init=10,
            max_iter=300, tol=1e-04, random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
# silhouette coefficient of every sample, as a list
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    # cm: color map
    color = cm.jet(float(i) / n_clusters)
    # horizontal bar chart, one bar per sample
    plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals,
             height=1.0, edgecolor='none', color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)

silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color='red', linestyle="--")
# yticks: tick positions on the y axis
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.show()
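The red dashed line is just the mean of the per-sample values; sklearn's silhouette_score computes the same average directly:

from sklearn.metrics import silhouette_score

# mean silhouette coefficient over all samples
print('Average silhouette: %.3f' % silhouette_score(X, y_km, metric='euclidean'))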
Implementing ensemble classifiers
In general, an ensemble classifier performs better than each of its individual member classifiers.
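The usual back-of-the-envelope argument: if n independent base classifiers each err with probability ε, the majority vote errs only when more than half of them are wrong at the same time. A minimal sketch of that binomial calculation (n=11 and ε=0.25 are illustrative values, not taken from this post):

import numpy as np
from scipy.special import comb

def ensemble_error(n_classifier, error):
    # the majority vote is wrong only when more than half
    # of the base classifiers are wrong simultaneously
    k_start = int(np.ceil(n_classifier / 2.))
    probs = [comb(n_classifier, k) *
             error**k * (1 - error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

print(ensemble_error(n_classifier=11, error=0.25))  # ~0.034, far below 0.25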
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

iris = datasets.load_iris()
# use only the Versicolor and Virginica classes, with two features
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1, stratify=y)
Classifiers to ensemble
- Logistic regression classifier
- Decision tree classifier
- k-nearest neighbors classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy',
                              random_state=1)
# p is the exponent used in the distance calculation; metric selects the metric
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f ( + / - %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
10-fold cross validation:

ROC AUC: 0.87 ( + / - 0.17) [Logistic regression]
ROC AUC: 0.89 ( + / - 0.16) [Decision tree]
ROC AUC: 0.88 ( + / - 0.15) [KNN]
Comparison with an ensemble classifier
from sklearn.ensemble import VotingClassifier

# From the sklearn docs: if 'soft', predicts the class label based on
# the argmax of the sums of the predicted probabilities, which is
# recommended for an ensemble of well-calibrated classifiers.
vote_clf = VotingClassifier(estimators=[('pipe1', pipe1), ('clf2', clf2),
                                        ('pipe3', pipe3)],
                            voting='soft')
clf_labels = ['Logistic regression', 'Decision tree', 'KNN',
              'VotingClassifier']
all_clf = [pipe1, clf2, pipe3, vote_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f ( + / - %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
ROC AUC: 0.87 ( + / - 0.17) [Logistic regression]
ROC AUC: 0.89 ( + / - 0.16) [Decision tree]
ROC AUC: 0.88 ( + / - 0.15) [KNN]
ROC AUC: 0.94 ( + / - 0.13) [VotingClassifier]
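The scores above are cross-validation scores on the training data; as a sanity check, the voting classifier can also be fit once and scored on the held-out split (a sketch reusing the X_test / y_test from the split above):

from sklearn.metrics import roc_auc_score

vote_clf.fit(X_train, y_train)
# ROC AUC needs the predicted probability of the positive class
y_proba = vote_clf.predict_proba(X_test)[:, 1]
print('Test ROC AUC: %.2f' % roc_auc_score(y_true=y_test, y_score=y_proba))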
Tuning hyperparameters with grid search
Checking the parameters of vote_clf
print(vote_clf.get_params())
{'clf2': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=1, splitter='best'),
 'clf2__class_weight': None,
 'clf2__criterion': 'entropy',
 'clf2__max_depth': 1,
 'clf2__max_features': None,
 'clf2__max_leaf_nodes': None,
 'clf2__min_impurity_decrease': 0.0,
 'clf2__min_impurity_split': None,
 'clf2__min_samples_leaf': 1,
 'clf2__min_samples_split': 2,
 'clf2__min_weight_fraction_leaf': 0.0,
 'clf2__presort': False,
 'clf2__random_state': 1,
 'clf2__splitter': 'best',
 'estimators': [('pipe1', Pipeline(memory=None, steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=1, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)]])), ('clf2', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=1, splitter='best')), ('pipe3', Pipeline(memory=None, steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform')]]))],
 'flatten_transform': None,
 'n_jobs': 1,
 'pipe1': Pipeline(memory=None, steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=1, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)]]),
 'pipe1__clf': LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=1, solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 'pipe1__clf__C': 0.001,
 'pipe1__clf__class_weight': None,
 'pipe1__clf__dual': False,
 'pipe1__clf__fit_intercept': True,
 'pipe1__clf__intercept_scaling': 1,
 'pipe1__clf__max_iter': 100,
 'pipe1__clf__multi_class': 'ovr',
 'pipe1__clf__n_jobs': 1,
 'pipe1__clf__penalty': 'l2',
 'pipe1__clf__random_state': 1,
 'pipe1__clf__solver': 'liblinear',
 'pipe1__clf__tol': 0.0001,
 'pipe1__clf__verbose': 0,
 'pipe1__clf__warm_start': False,
 'pipe1__memory': None,
 'pipe1__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe1__sc__copy': True,
 'pipe1__sc__with_mean': True,
 'pipe1__sc__with_std': True,
 'pipe1__steps': [['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=1, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)]],
 'pipe3': Pipeline(memory=None, steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform')]]),
 'pipe3__clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform'),
 'pipe3__clf__algorithm': 'auto',
 'pipe3__clf__leaf_size': 30,
 'pipe3__clf__metric': 'minkowski',
 'pipe3__clf__metric_params': None,
 'pipe3__clf__n_jobs': 1,
 'pipe3__clf__n_neighbors': 1,
 'pipe3__clf__p': 2,
 'pipe3__clf__weights': 'uniform',
 'pipe3__memory': None,
 'pipe3__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe3__sc__copy': True,
 'pipe3__sc__with_mean': True,
 'pipe3__sc__with_std': True,
 'pipe3__steps': [['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2, weights='uniform')]],
 'voting': 'soft',
 'weights': None}
Running the grid search
from sklearn.model_selection import GridSearchCV

params = {
    'clf2__max_depth': [1, 2],
    'pipe1__clf__C': [0.001, 1.0, 100.0]
}
grid = GridSearchCV(estimator=vote_clf, param_grid=params,
                    cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f + / - %0.2f %r" % (
        grid.cv_results_['mean_test_score'][r],
        grid.cv_results_['std_test_score'][r] / 2.0,
        grid.cv_results_['params'][r]
    ))
print('')
print('Best parameters: %s' % grid.best_params_)
# best_score_ is the mean ROC AUC of the best combination, not accuracy
print('ROC AUC: %.2f' % grid.best_score_)
0.933 + / - 0.07 {'clf2__max_depth': 1, 'pipe1__clf__C': 0.001}
0.973 + / - 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
0.973 + / - 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 100.0}
0.947 + / - 0.07 {'clf2__max_depth': 2, 'pipe1__clf__C': 0.001}
0.973 + / - 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 1.0}
0.973 + / - 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 100.0}

Best parameters: {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
ROC AUC: 0.97
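Since GridSearchCV refits the best parameter combination on the full training set by default, the tuned model is directly available as best_estimator_. A sketch of scoring it on the held-out split (the roc_auc_score check is an assumption, not from this post):

from sklearn.metrics import roc_auc_score

best_clf = grid.best_estimator_  # already refit on all of X_train
print('Test ROC AUC: %.2f' % roc_auc_score(
    y_true=y_test, y_score=best_clf.predict_proba(X_test)[:, 1]))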
Precision and recall with sklearn
Reuses the X_train, y_train, ... loaded in the post below.
Precision, recall, and the F1 score
From precision (PRE) and recall (REC), a combined performance metric called the F1 score can be computed:
PRE = TP / ( TP + FP )
REC = TP / ( TP + FN )
f1 = 2 * ( PRE * REC ) / ( PRE + REC )
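A quick numeric check of these formulas with made-up counts (TP=40, FP=5, FN=10 are hypothetical, only to exercise the arithmetic):

# hypothetical confusion-matrix counts, not from this post
TP, FP, FN = 40, 5, 10
PRE = TP / (TP + FP)                 # 0.889
REC = TP / (TP + FN)                 # 0.800
f1 = 2 * (PRE * REC) / (PRE + REC)   # 0.842
print('PRE=%.3f REC=%.3f F1=%.3f' % (PRE, REC, f1))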
from sklearn.metrics import precision_score, recall_score, f1_score

# y_test and y_pred come from the model fitted in the post mentioned above
print('Precision : %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1 : %.3f' % f1_score(y_true=y_test, y_pred=y_pred))
Precision : 0.974
Recall: 0.905
F1 : 0.938
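The three numbers can be cross-checked against the confusion matrix (again assuming the y_test / y_pred from the earlier post):

from sklearn.metrics import confusion_matrix

# rows are the true classes, columns the predicted classes
print(confusion_matrix(y_true=y_test, y_pred=y_pred))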
Scoring the grid search with the F1 score
from sklearn.metrics import make_scorer

# pos_label: which class label is treated as the positive class
scorer = make_scorer(f1_score, pos_label=0)
# pipe_svc and param_grid are also defined in the earlier post
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                  scoring=scorer, cv=10, n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
0.9880219137963148
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}