Clustering: Implementing DBSCAN

Among clustering algorithms, DBSCAN assigns cluster labels without assuming
that clusters have a spherical shape.

from sklearn.datasets import make_moons
import matplotlib.pyplot as plt

X, y = make_moons(
    n_samples=200,
    noise=0.05,
    random_state=0
)
plt.scatter(X[:, 0], X[:, 1])
plt.tight_layout()
plt.show()
from sklearn.cluster import DBSCAN

"""
コア点、ボーダー点、ノイズ点 にラベル付を行う。

eps : 隣接点とみなす 2 点間の最大距離
min_samples : ボーダー点の最大個数
metric : 距離の計算方法
"""
db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)
plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1],
            c='lightblue', marker='o', s=40,
            edgecolor='black', 
            label='cluster 1')
plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1],
            c='red', marker='s', s=40,
            edgecolor='black', 
            label='cluster 2')
plt.legend()
plt.tight_layout()
#plt.savefig('images/11_16.png', dpi=300)
plt.show()
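
DBSCAN marks noise points with the label -1, so it is worth checking how many samples were discarded as noise. A minimal sketch (on this low-noise moons data the count should be near zero):

import numpy as np

## fit_predict assigns -1 to noise points
print('Noise points: %d' % np.sum(y_db == -1))
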
Grouping data with bottom-up (agglomerative) hierarchical clustering

Creating the data

Create a random matrix of shape (5, 3).

import pandas as pd
import numpy as np

np.random.seed(123)

variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']

X = np.random.random_sample([5, 3])*10

## Convert to a pandas DataFrame
df = pd.DataFrame(X, columns=variables, index=labels)

Computing distances

from scipy.spatial.distance import pdist, squareform

## pdist : computes pairwise distances (returned in condensed form)
## squareform : converts pdist's condensed distance matrix into a symmetric square matrix
row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')),
                        columns=labels,
                        index=labels)
from scipy.cluster.hierarchy import linkage
# method='complete' : complete linkage; the distance between two clusters is
# taken between their two farthest members, and the closest pair of clusters
# is merged at each step
row_clusters = linkage(pdist(df, metric='euclidean'), method='complete')

Running hierarchical clustering

# no. of items in clust. : number of samples in the newly merged cluster
pd.DataFrame(
    row_clusters,
    columns=[
        'row label 1',
        'row label 2',
        'distance',
        'no. of items in clust.'
    ],
    index=['cluster %d' % (i + 1) for i in range(row_clusters.shape[0])]
)
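
The same tree can also be cut into a fixed number of flat clusters with scipy's fcluster (a minimal sketch; choosing 3 clusters here is an assumption for illustration):

from scipy.cluster.hierarchy import fcluster

## criterion='maxclust' : cut the tree so that at most 3 flat clusters remain
flat_labels = fcluster(row_clusters, t=3, criterion='maxclust')
print(flat_labels)
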
  • Running it with sklearn
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(
    n_clusters=3,
    affinity='euclidean',
    linkage='complete'
)
labels = ac.fit_predict(X)
print('Cluster labels: %s' % labels)
Cluster labels: [1 0 0 2 1]

Representing the linkage matrix as a dendrogram

from scipy.cluster.hierarchy import dendrogram

## pass the original row labels; the variable `labels` now holds the cluster
## labels returned by ac.fit_predict above
row_dendr = dendrogram(
    row_clusters,
    labels=df.index.tolist(),
)
plt.ylabel('Euclidean distance')
plt.tight_layout()
plt.show()

Visualization as a heatmap

fig = plt.figure(figsize=(8, 8), facecolor='white')
## left, bottom, width, height (in figure coordinates)
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
## orientation : rotate the dendrogram so it can sit beside the heatmap
row_dendr = dendrogram(row_clusters, orientation='left')

## The row order after clustering is taken from 'leaves'.
## The heatmap is placed to the right of the dendrogram.
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')

## hide the dendrogram's ticks and frame
axd.set_xticks([])
axd.set_yticks([])
for i in axd.spines.values():
    i.set_visible(False)
    
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))
plt.show()

NULLs in MySQL's MAX apparently don't matter

When I went to take a MAX in MySQL I wondered how it would behave with NULLs
around, but there was nothing to worry about: comparisons with NULL yield
NULL, and aggregate functions simply ignore NULL values.

mysql> select (NULL < 1);
+------------+
| (NULL < 1) |
+------------+
|       NULL |
+------------+
1 row in set (0.00 sec)

mysql> select (NULL > 0);
+------------+
| (NULL > 0) |
+------------+
|       NULL |
+------------+
1 row in set (0.00 sec)


mysql> select *   FROM (   SELECT NULL v   UNION   SELECT 0 v   UNION   SELECT 5 v   UNION   SELECT 10 v ) t;
+------+
| v    |
+------+
| NULL |
|    0 |
|    5 |
|   10 |
+------+
4 rows in set (0.00 sec)


mysql> SELECT   SUM(t.v)  , MIN(t.v)  , MAX(t.v)  , COUNT(t.v)  , AVG(t.v)  FROM (   SELECT NULL v   UNION   SELECT 0 v   UNION   SELECT 5 v   UNION   SELECT 10 v ) t;
+----------+----------+----------+------------+----------+
| SUM(t.v) | MIN(t.v) | MAX(t.v) | COUNT(t.v) | AVG(t.v) |
+----------+----------+----------+------------+----------+
|       15 |        0 |       10 |          3 |   5.0000 |
+----------+----------+----------+------------+----------+
1 row in set (0.00 sec)

fdays.blogspot.com
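
As a side note, pandas behaves analogously: its aggregations skip missing values (NaN) by default. A minimal sketch mirroring the query above:

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 0, 5, 10])
## NaN is ignored, matching the MySQL results: 15.0 0.0 10.0 3 5.0
print(s.sum(), s.min(), s.max(), s.count(), s.mean())
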

Evaluating cluster analysis on unlabeled data

## Create sample data for clustering
from sklearn.datasets import make_blobs
X, y = make_blobs(
    n_samples=150,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=1
)

## Plot the samples
plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', edgecolors='black', s=50)
plt.grid()
plt.tight_layout()
plt.show()

The elbow method

Visualize what the best k is: a k around the point where the curve forms an
"elbow" is a good choice.

from sklearn.cluster import KMeans
distortions = []

for i in range(1, 11):
    km = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        random_state=0,
    )
    km.fit(X)
    distortions.append(km.inertia_)  ## inertia_ : within-cluster sum of squared errors (SSE)
    
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.show()

Silhouette plots

Compute each sample's silhouette coefficient from cluster cohesion and
separation, and plot the result. With cohesion a (mean distance to the other
samples in the same cluster) and separation b (mean distance to the samples in
the nearest other cluster), the coefficient is s = ( b - a ) / max( a, b ).
The closer the silhouette coefficient is to 1, the better the sample fits its
cluster.

km = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0
)
y_km = km.fit_predict(X)

import numpy as np
from matplotlib import cm
from sklearn.metrics import silhouette_samples
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]

## Returns the silhouette coefficient of each sample as an array
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []

for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    
    ## cm : matplotlib colormap
    color = cm.jet(float(i) / n_clusters)
    
    ## Draw horizontal bars
    plt.barh(
        range(y_ax_lower, y_ax_upper),
        c_silhouette_vals,
        height=1.0,
        edgecolor='none',
        color=color
    )
    yticks.append((y_ax_lower + y_ax_upper) / 2.)
    y_ax_lower += len(c_silhouette_vals)

silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color='red', linestyle="--")

## yticks : y-axis tick positions and labels
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.tight_layout()
plt.show()
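
If only the average matters, sklearn also exposes it directly; a minimal sketch using silhouette_score:

from sklearn.metrics import silhouette_score

## mean silhouette coefficient over all samples (same value as np.mean(silhouette_vals))
print(silhouette_score(X, y_km, metric='euclidean'))
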

Code and sites for studying machine learning

Reusing the code here might make implementation easier.

github.com

Python Data Science Handbook

sebastianraschka.com

https://sebastianraschka.com/pdf/books/dlb/appendix_d_calculus.pdf

Implementing an ensemble classifier

In general, an ensemble classifier achieves better performance than any of its
individual member classifiers.
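
The usual back-of-the-envelope argument: if n classifiers err independently with the same error rate, a majority vote fails only when more than half of them err. A minimal sketch (ensemble_error is a hypothetical helper name):

import math
from scipy.special import comb

def ensemble_error(n_classifier, error):
    ## the majority vote is wrong when at least ceil(n/2) members are wrong
    k_start = int(math.ceil(n_classifier / 2.0))
    probs = [comb(n_classifier, k) * error**k * (1 - error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

## 11 base classifiers, each wrong 25% of the time -> ensemble error around 0.034
print(ensemble_error(n_classifier=11, error=0.25))
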

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
)
iris = datasets.load_iris()
X, y = iris.data[50:, [1,2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1, stratify=y)

Classifiers to ensemble

  • Logistic regression classifier
  • Decision tree classifier
  • k-nearest neighbors classifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

clf1 = LogisticRegression(
    penalty='l2',
    C=0.001,
    random_state=1
)

clf2 = DecisionTreeClassifier(
    max_depth=1,
    criterion='entropy',
    random_state=1
)

## p : exponent of the Minkowski distance (p=2 is Euclidean); metric : the distance metric
clf3 = KNeighborsClassifier(
    n_neighbors=1,
    p=2,
    metric='minkowski'
)

pipe1 = Pipeline(
    [['sc', StandardScaler()], ['clf', clf1]]
)

pipe3 = Pipeline(
    [['sc', StandardScaler()], ['clf', clf3]]
)

clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
print('10-fold cross validation:\n')

for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        cv=10,
        scoring='roc_auc'
    )

    print("ROC AUC: %0.2f ( + / - %0.2f) [%s]" % (scores.mean(), scores.std(),  label)) 
10-fold cross validation:

ROC AUC: 0.87 ( + / - 0.17) [Logistic regression]
ROC AUC: 0.89 ( + / - 0.16) [Decision tree]
ROC AUC: 0.88 ( + / - 0.15) [KNN]

Comparison against the ensemble classifier

from sklearn.ensemble import VotingClassifier

## voting='soft' : predict the class label from the argmax of the summed
## predicted probabilities (recommended for an ensemble of well-calibrated classifiers)
vote_clf = VotingClassifier(estimators=[('pipe1', pipe1), ('clf2', clf2), ('pipe3', pipe3)], voting='soft')
clf_labels =  ['Logistic regression', 'Decision tree', 'KNN', 'VotingClassifier']
all_clf = [pipe1, clf2, pipe3, vote_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        cv=10,
        scoring='roc_auc'
    )

    print("ROC AUC: %0.2f ( + / - %0.2f) [%s]" % (scores.mean(), scores.std(),  label)) 
ROC AUC: 0.87 ( + / - 0.17) [Logistic regression]
ROC AUC: 0.89 ( + / - 0.16) [Decision tree]
ROC AUC: 0.88 ( + / - 0.15) [KNN]
ROC AUC: 0.94 ( + / - 0.13) [VotingClassifier]
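
As a follow-up (not part of the original run), the ensemble can also be scored once on the held-out test set; a minimal sketch:

from sklearn.metrics import roc_auc_score

vote_clf.fit(X_train, y_train)
## with voting='soft', column 1 of predict_proba holds the positive-class probability
print('Test ROC AUC: %.3f' % roc_auc_score(y_test, vote_clf.predict_proba(X_test)[:, 1]))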

Tuning hyperparameters with grid search

Checking the parameters of vote_clf

print(vote_clf.get_params())
{'clf2': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=1,
             splitter='best'),
 'clf2__class_weight': None,
 'clf2__criterion': 'entropy',
 'clf2__max_depth': 1,
 'clf2__max_features': None,
 'clf2__max_leaf_nodes': None,
 'clf2__min_impurity_decrease': 0.0,
 'clf2__min_impurity_split': None,
 'clf2__min_samples_leaf': 1,
 'clf2__min_samples_split': 2,
 'clf2__min_weight_fraction_leaf': 0.0,
 'clf2__presort': False,
 'clf2__random_state': 1,
 'clf2__splitter': 'best',
 'estimators': [('pipe1', Pipeline(memory=None,
        steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
             penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
             verbose=0, warm_start=False)]])),
  ('clf2',
   DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, presort=False, random_state=1,
               splitter='best')),
  ('pipe3', Pipeline(memory=None,
        steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
              metric_params=None, n_jobs=1, n_neighbors=1, p=2,
              weights='uniform')]]))],
 'flatten_transform': None,
 'n_jobs': 1,
 'pipe1': Pipeline(memory=None,
      steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]]),
 'pipe1__clf': LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'pipe1__clf__C': 0.001,
 'pipe1__clf__class_weight': None,
 'pipe1__clf__dual': False,
 'pipe1__clf__fit_intercept': True,
 'pipe1__clf__intercept_scaling': 1,
 'pipe1__clf__max_iter': 100,
 'pipe1__clf__multi_class': 'ovr',
 'pipe1__clf__n_jobs': 1,
 'pipe1__clf__penalty': 'l2',
 'pipe1__clf__random_state': 1,
 'pipe1__clf__solver': 'liblinear',
 'pipe1__clf__tol': 0.0001,
 'pipe1__clf__verbose': 0,
 'pipe1__clf__warm_start': False,
 'pipe1__memory': None,
 'pipe1__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe1__sc__copy': True,
 'pipe1__sc__with_mean': True,
 'pipe1__sc__with_std': True,
 'pipe1__steps': [['sc',
   StandardScaler(copy=True, with_mean=True, with_std=True)],
  ['clf',
   LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
             penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
             verbose=0, warm_start=False)]],
 'pipe3': Pipeline(memory=None,
      steps=[['sc', StandardScaler(copy=True, with_mean=True, with_std=True)], ['clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=1, p=2,
            weights='uniform')]]),
 'pipe3__clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=1, p=2,
            weights='uniform'),
 'pipe3__clf__algorithm': 'auto',
 'pipe3__clf__leaf_size': 30,
 'pipe3__clf__metric': 'minkowski',
 'pipe3__clf__metric_params': None,
 'pipe3__clf__n_jobs': 1,
 'pipe3__clf__n_neighbors': 1,
 'pipe3__clf__p': 2,
 'pipe3__clf__weights': 'uniform',
 'pipe3__memory': None,
 'pipe3__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe3__sc__copy': True,
 'pipe3__sc__with_mean': True,
 'pipe3__sc__with_std': True,
 'pipe3__steps': [['sc',
   StandardScaler(copy=True, with_mean=True, with_std=True)],
  ['clf',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
              metric_params=None, n_jobs=1, n_neighbors=1, p=2,
              weights='uniform')]],
 'voting': 'soft',
 'weights': None}
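
The doubly underscored names above (pipe1__clf__C and so on) are how nested parameters are addressed, both in get_params() and in a grid; a minimal sketch of setting one directly:

## address the LogisticRegression C inside pipe1 via its nested parameter name
vote_clf.set_params(pipe1__clf__C=100.0)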

Running the grid search

from sklearn.model_selection import GridSearchCV
params = {
    'clf2__max_depth': [1, 2],
    'pipe1__clf__C': [0.001, 1.0, 100.0]
}

grid = GridSearchCV(
    estimator=vote_clf,
    param_grid=params,
    cv=10,
    scoring='roc_auc'
)

grid.fit(X_train, y_train)

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f + / - %0.2f %r" % (
        grid.cv_results_['mean_test_score'][r],
        grid.cv_results_['std_test_score'][r] / 2.0,
        grid.cv_results_['params'][r]
    ))
print('')
print('Best parameters: %s' % grid.best_params_)
## with scoring='roc_auc', best_score_ is the mean ROC AUC, despite the 'Accuracy' label
print('Accuracy: %.2f' % grid.best_score_)
0.933 + / - 0.07 {'clf2__max_depth': 1, 'pipe1__clf__C': 0.001}
0.973 + / - 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
0.973 + / - 0.04 {'clf2__max_depth': 1, 'pipe1__clf__C': 100.0}
0.947 + / - 0.07 {'clf2__max_depth': 2, 'pipe1__clf__C': 0.001}
0.973 + / - 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 1.0}
0.973 + / - 0.04 {'clf2__max_depth': 2, 'pipe1__clf__C': 100.0}

Best parameters: {'clf2__max_depth': 1, 'pipe1__clf__C': 1.0}
Accuracy: 0.97

Precision and recall with sklearn

Uses the X_train, y_train, ... loaded in the post below.

kidnohr.hatenadiary.com

Precision, recall, and the F1 score

The F1 score is a performance metric built from precision (PRE) and recall (REC).

PRE = TP / ( TP + FP )

REC = TP / ( TP + FN )

F1 = 2 * ( PRE * REC ) / ( PRE + REC )
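
As a worked check, hypothetical counts of TP = 38, FP = 1, FN = 4 reproduce the output below: PRE = 38 / 39 ≈ 0.974, REC = 38 / 42 ≈ 0.905, and F1 = 2 * 38 / ( 2 * 38 + 1 + 4 ) = 76 / 81 ≈ 0.938 (the same value as 2 * ( PRE * REC ) / ( PRE + REC )).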

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
)
## y_test and y_pred come from the model trained in the linked post
print('Precision : %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1 : %.3f' % f1_score(y_true=y_test, y_pred=y_pred))
Precision : 0.974
Recall: 0.905
F1 : 0.938

Scoring the grid search with the F1 score

from sklearn.metrics import make_scorer

## pos_label : which class label is treated as the positive class
scorer = make_scorer(f1_score, pos_label=0)

## pipe_svc and param_grid are the ones defined in the linked post
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring=scorer,
    cv=10,
    n_jobs=-1
)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
0.9880219137963148
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}