House Priceの分析2
前処理
%matplotlib inline import numpy as np import pandas as pd import matplotlib.pyplot as plt import scipy.stats as stats import sklearn.linear_model as linear_model import seaborn as sns import xgboost as xgb # <-- アンサンブル学習に使う from sklearn.model_selection import KFold from IPython.display import HTML, display from sklearn.manifold import TSNE from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler pd.options.display.max_rows = 1000 pd.options.display.max_columns = 20 train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/train.csv') quantitative = [f for f in train.columns if train.dtypes[f] != 'object'] quantitative.remove('SalePrice') quantitative.remove('Id') qualitative = [f for f in train.columns if train.dtypes[f] == 'object'] ## 欠陥データの多い特徴を確認 missing = train.isnull().sum() missing = missing[missing > 0] missing.sort_values(inplace=True) missing.plot.bar() ## SalePriceの分布のフィッティング ## ジョンソン SU 分布が一番マッチするらしいが、ちょっと知らない分布だった import scipy.stats as st y = train['SalePrice'] plt.figure(1); plt.title('Johnson SU') sns.distplot(y, kde=False, fit=st.johnsonsu) plt.figure(2); plt.title('Normal') sns.distplot(y, kde=False, fit=st.norm) plt.figure(3); plt.title('Log Normal') sns.distplot(y, kde=False, fit=st.lognorm) ## SHAPIRO-WILKの正規性検定で、p値が0.01より小さい場合、 ## 正規分布に従っているという帰無仮説が棄却される test_normality = lambda x: stats.shapiro(x.fillna(0))[1] < 0.01 normal = pd.DataFrame(train[quantitative]) normal = normal.apply(test_normality) ## 正規分布に従わないカテゴリが存在するか print(not normal.any())
可視化
# Melt the quantitative columns into long form (variable, value) and draw
# one histogram per feature on a two-column FacetGrid
# (sharex/sharey=False keeps every panel on its own scale).
melted_quant = pd.melt(train, value_vars=quantitative)
grid = sns.FacetGrid(melted_quant, col="variable", col_wrap=2,
                     sharex=False, sharey=False)
grid = grid.map(sns.distplot, "value")

## Treat the qualitative features as pandas categoricals; missing entries
## become an explicit 'MISSING' category so they show up in the plots.
for col in qualitative:
    train[col] = train[col].astype('category')
    if train[col].isnull().any():
        train[col] = train[col].cat.add_categories(['MISSING'])
        train[col] = train[col].fillna('MISSING')

def boxplot(x, y, **kwargs):
    """FacetGrid helper: boxplot of y grouped by x, x labels rotated."""
    sns.boxplot(x=x, y=y)
    x = plt.xticks(rotation=90)

melted_qual = pd.melt(train, id_vars=['SalePrice'], value_vars=qualitative)
grid = sns.FacetGrid(melted_qual, col="variable", col_wrap=2,
                     sharex=False, sharey=False, size=5)
grid = grid.map(boxplot, "value", "SalePrice")

## One-way ANOVA: for each qualitative feature, test whether the SalePrice
## groups formed by its categories differ significantly. A larger
## 'disparity' (log of the inverse p-value) means a stronger effect.
def anova(frame):
    """Return a DataFrame of (feature, ANOVA p-value), sorted ascending."""
    pvals = []
    for col in qualitative:
        groups = [frame[frame[col] == level]['SalePrice'].values
                  for level in frame[col].unique()]
        pvals.append(stats.f_oneway(*groups)[1])
    result = pd.DataFrame()
    result['feature'] = qualitative
    result['pval'] = pvals
    return result.sort_values('pval')

a = anova(train)
a['disparity'] = np.log(1. / a['pval'].values)
sns.barplot(data=a, x='feature', y='disparity')
x = plt.xticks(rotation=90)
定性 + 定量データの相関係数(スピアマン)
## Convert each category to an integer 1..n (not one-hot):
## categories are ranked by their mean SalePrice, smallest mean first.
def encode(frame, feature):
    """Add an ordinal column `feature+'_E'` to frame, ranked by mean SalePrice."""
    tbl = pd.DataFrame()
    tbl['val'] = frame[feature].unique()
    tbl.index = tbl.val
    # Mean SalePrice per category, aligned to the unique values via the index.
    tbl['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
    tbl = tbl.sort_values('spmean')
    tbl['ordering'] = range(1, tbl.shape[0] + 1)
    rank_of = tbl['ordering'].to_dict()
    for level, rank in rank_of.items():
        frame.loc[frame[feature] == level, feature + '_E'] = rank

qual_encoded = []
for q in qualitative:
    encode(train, q)
    qual_encoded.append(q + '_E')
print(qual_encoded)

## Spearman rank correlation of every feature against SalePrice,
## plotted sorted ascending on a tall horizontal bar chart.
def spearman(frame, features):
    """Bar-plot Spearman correlations of `features` vs SalePrice."""
    corrs = [frame[f].corr(frame['SalePrice'], 'spearman') for f in features]
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = corrs
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25 * len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')

features = quantitative + qual_encoded
spearman(train, features)

## Scatter-like plot of mean SalePrice per feature value.
def pairplot(x, y, **kwargs):
    """FacetGrid helper: plot mean of y grouped by x."""
    ax = plt.gca()
    grouped = pd.DataFrame({'time': x, 'val': y}).groupby('time').mean()
    grouped.plot(ax=ax)
    plt.xticks(rotation=90)

f = pd.melt(train, id_vars=['SalePrice'], value_vars=quantitative + qual_encoded)
g = sns.FacetGrid(f, col="variable", col_wrap=2,
                  sharex=False, sharey=False, size=5)
g = g.map(pairplot, "value", "SalePrice")
SalePriceが200000ドル以上の群と未満の群とで、各特徴量の平均の差を確認
# Compare the two price groups feature by feature: relative difference of
# the per-feature means between houses at/above and below $200k.
features = quantitative
standard = train[train['SalePrice'] < 200000]
pricey = train[train['SalePrice'] >= 200000]

def _relative_gap(col):
    """(mean over pricey - mean over standard) / mean over standard, NaN->0."""
    lo = standard[col].fillna(0.).mean()
    hi = pricey[col].fillna(0.).mean()
    return (hi - lo) / lo

diff = pd.DataFrame()
diff['feature'] = features
diff['difference'] = [_relative_gap(f) for f in features]
sns.barplot(data=diff, x='feature', y='difference')
x = plt.xticks(rotation=90)
TSNE を使ってクラスタを可視化
features = quantitative + qual_encoded

## Project all features to 2-D with t-SNE for visualization.
embedder = TSNE(n_components=2, random_state=0, perplexity=50)
X = train[features].fillna(0.).values
embedding = embedder.fit_transform(X)

## Standardize, then reduce to 30 principal components before clustering.
scaler = StandardScaler()
scaled = scaler.fit_transform(X)
pca = PCA(n_components=30)
pca.fit(scaled)
components = pca.transform(scaled)

## K-means in PCA space; color the t-SNE scatter by cluster label.
kmeans = KMeans(n_clusters=5)
kmeans.fit(components)
fr = pd.DataFrame({'tsne1': embedding[:, 0],
                   'tsne2': embedding[:, 1],
                   'cluster': kmeans.labels_})
sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)

## Fraction of total variance explained by the 30 retained components.
print(np.sum(pca.explained_variance_ratio_))
Johnsonsu分布への変換
Box-Cox とは違う変換方法
y = train['SalePrice'].values

def johnson(y):
    """Fit a Johnson SU distribution to y and return the transformed values.

    Returns (yt, gamma, eta, epsilon, lbda): the transformed array plus the
    four fitted parameters, so the transform can be inverted later.
    """
    gamma, eta, epsilon, lbda = stats.johnsonsu.fit(y)
    standardized = (y - epsilon) / lbda
    yt = gamma + eta * np.arcsinh(standardized)
    return yt, gamma, eta, epsilon, lbda

def johnson_inverse(y, gamma, eta, epsilon, lbda):
    """Undo `johnson`: map transformed values back to the original scale."""
    return lbda * np.sinh((y - gamma) / eta) + epsilon

## Round-trip sanity check: transform, invert, and plot both distributions.
yt, g, et, ep, l = johnson(y)
yt2 = johnson_inverse(yt, g, et, ep, l)
plt.figure(1)
sns.distplot(yt)
plt.figure(2)
sns.distplot(yt2)