House Priceの分析3
回帰分析
## Root-mean-squared error between log(actual) and log(predicted) (RMSLE),
## matching the competition metric.
def error(actual, predicted):
    actual = np.log(actual)
    predicted = np.log(predicted)
    return np.sqrt(np.sum(np.square(actual - predicted)) / len(actual))

## Replace a column of `train` with its log1p transform (handles zeros).
def log_transform(feature):
    train[feature] = np.log1p(train[feature].values)

## Add a squared copy of a column as '<feature>2'.
def quadratic(feature):
    train[feature + '2'] = train[feature] ** 2

# Log-transform the skewed area/size features first ...
for feat in ['GrLivArea', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF',
             'LotArea', 'LotFrontage', 'KitchenAbvGr', 'GarageArea']:
    log_transform(feat)

# ... then add quadratic terms (TotalBsmtSF/2ndFlrSF/GrLivArea are squared
# on the already log-transformed values).
squared = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF',
           '2ndFlrSF', 'Neighborhood_E', 'RoofMatl_E', 'GrLivArea']
for feat in squared:
    quadratic(feat)
qdr = [feat + '2' for feat in squared]

# Binary "has X" indicator features (NaN compares as not > 0, i.e. 0).
flag_sources = [
    ('HasBasement', 'TotalBsmtSF'),
    ('HasGarage', 'GarageArea'),
    ('Has2ndFloor', '2ndFlrSF'),
    ('HasMasVnr', 'MasVnrArea'),
    ('HasWoodDeck', 'WoodDeckSF'),
    ('HasPorch', 'OpenPorchSF'),
    ('HasPool', 'PoolArea'),
]
for flag, src in flag_sources:
    train[flag] = train[src].apply(lambda x: 1 if x > 0 else 0)
train['IsNew'] = train['YearBuilt'].apply(lambda x: 1 if x > 2000 else 0)

boolean = [flag for flag, _ in flag_sources] + ['IsNew']

# Fit LassoLarsCV on log(SalePrice) and report training RMSLE.
features = quantitative + qual_encoded + boolean + qdr
lasso = linear_model.LassoLarsCV(max_iter=10000)
X = train[features].fillna(0.).values
Y = train['SalePrice'].values
lasso.fit(X, np.log(Y))
Ypred = np.exp(lasso.predict(X))
error(Y, Ypred)
pyplot の 円グラフをいい感じに描く
## Draw one pie chart per k-means cluster centre on a 5x2 grid and save
## the figure as time_cluster_center.png.
f, a = plt.subplots(nrows=5, ncols=2, figsize=(14, 20))
a = a.ravel()
for idx, ax in enumerate(a):
    v_list = km_center[idx]
    df_timeband_meanrate = pd.DataFrame({'timeband': name_list, 'rate': v_list})
    # Show cluster index and its member count, plus the table itself.
    print(idx, np.bincount(y_km)[idx])
    display(df_timeband_meanrate)
    # Blank slice labels; the legend carries the timeband names instead.
    ax.pie(df_timeband_meanrate["rate"], labels=[''] * 8, autopct='%.2f %%')
    ax.set_title('cluster: {}, number: {}'.format(idx, np.bincount(y_km)[idx]))
    ax.legend(loc="upper left", bbox_to_anchor=(0.85, 1.025),
              labels=df_timeband_meanrate["timeband"])
plt.tight_layout()
plt.savefig('time_cluster_center.png', dpi=100)
plt.show()
House Priceの分析2
前処理
%matplotlib inline import numpy as np import pandas as pd import matplotlib.pyplot as plt import scipy.stats as stats import sklearn.linear_model as linear_model import seaborn as sns import xgboost as xgb # <-- アンサンブル学習に使う from sklearn.model_selection import KFold from IPython.display import HTML, display from sklearn.manifold import TSNE from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler pd.options.display.max_rows = 1000 pd.options.display.max_columns = 20 train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/train.csv') quantitative = [f for f in train.columns if train.dtypes[f] != 'object'] quantitative.remove('SalePrice') quantitative.remove('Id') qualitative = [f for f in train.columns if train.dtypes[f] == 'object'] ## 欠陥データの多い特徴を確認 missing = train.isnull().sum() missing = missing[missing > 0] missing.sort_values(inplace=True) missing.plot.bar() ## SalePriceの分布のフィッティング ## ジョンソン SU 分布が一番マッチするらしいが、ちょっと知らない分布だった import scipy.stats as st y = train['SalePrice'] plt.figure(1); plt.title('Johnson SU') sns.distplot(y, kde=False, fit=st.johnsonsu) plt.figure(2); plt.title('Normal') sns.distplot(y, kde=False, fit=st.norm) plt.figure(3); plt.title('Log Normal') sns.distplot(y, kde=False, fit=st.lognorm) ## SHAPIRO-WILKの正規性検定で、p値が0.01より小さい場合、 ## 正規分布に従っているという帰無仮説が棄却される test_normality = lambda x: stats.shapiro(x.fillna(0))[1] < 0.01 normal = pd.DataFrame(train[quantitative]) normal = normal.apply(test_normality) ## 正規分布に従わないカテゴリが存在するか print(not normal.any())
可視化
# Melt quantitative columns into long (variable, value) format.
f = pd.melt(train, value_vars=quantitative)
# One distribution plot per variable, two per row, independent axes.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")

## Treat qualitative columns as categoricals; missing values become an
## explicit 'MISSING' category so they show up in plots.
for c in qualitative:
    train[c] = train[c].astype('category')
    if train[c].isnull().any():
        train[c] = train[c].cat.add_categories(['MISSING'])
        train[c] = train[c].fillna('MISSING')

def boxplot(x, y, **kwargs):
    sns.boxplot(x=x, y=y)
    x = plt.xticks(rotation=90)

f = pd.melt(train, id_vars=['SalePrice'], value_vars=qualitative)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(boxplot, "value", "SalePrice")

## One-way ANOVA: for each qualitative feature, test whether SalePrice
## differs significantly between its classes. 'disparity' is
## log(1/p-value), so larger means a stronger difference.
def anova(frame):
    anv = pd.DataFrame()
    anv['feature'] = qualitative
    pvals = []
    for c in qualitative:
        samples = [frame[frame[c] == cls]['SalePrice'].values
                   for cls in frame[c].unique()]
        pvals.append(stats.f_oneway(*samples)[1])
    anv['pval'] = pvals
    return anv.sort_values('pval')

a = anova(train)
a['disparity'] = np.log(1. / a['pval'].values)
sns.barplot(data=a, x='feature', y='disparity')
x = plt.xticks(rotation=90)
定性 + 定量データの相関係数(スピアマン)
## Encode a categorical feature as integers 1..n (NOT one-hot), ordered
## so the category with the lowest mean SalePrice gets 1. Writes the
## result into a new '<feature>_E' column of `frame`.
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['spmean'] = (frame[[feature, 'SalePrice']]
                          .groupby(feature).mean()['SalePrice'])
    ordering = ordering.sort_values('spmean')
    ordering['ordering'] = range(1, ordering.shape[0] + 1)
    ordering = ordering['ordering'].to_dict()
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature + '_E'] = o

qual_encoded = []
for q in qualitative:
    encode(train, q)
    qual_encoded.append(q + '_E')
print(qual_encoded)

## Spearman rank correlation of every feature against SalePrice,
## plotted ascending on a tall horizontal bar chart.
def spearman(frame, features):
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = [frame[f].corr(frame['SalePrice'], 'spearman')
                       for f in features]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25 * len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')

features = quantitative + qual_encoded
spearman(train, features)

## Plot the mean SalePrice grouped by each feature value.
def pairplot(x, y, **kwargs):
    ax = plt.gca()
    ts = pd.DataFrame({'time': x, 'val': y})
    ts = ts.groupby('time').mean()
    ts.plot(ax=ax)
    plt.xticks(rotation=90)

f = pd.melt(train, id_vars=['SalePrice'], value_vars=quantitative + qual_encoded)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(pairplot, "value", "SalePrice")
200000ドルを上回る物件と下回る物件の差を確認
## Relative difference of each quantitative feature's mean between
## houses priced at/above $200k and the rest.
features = quantitative
standard = train[train['SalePrice'] < 200000]
pricey = train[train['SalePrice'] >= 200000]

diff = pd.DataFrame()
diff['feature'] = features
diff['difference'] = [
    (pricey[f].fillna(0.).mean() - standard[f].fillna(0.).mean())
    / standard[f].fillna(0.).mean()
    for f in features
]
sns.barplot(data=diff, x='feature', y='difference')
x = plt.xticks(rotation=90)
TSNE を使ってクラスタを可視化
features = quantitative + qual_encoded

## 2-D t-SNE embedding for visualization.
model = TSNE(n_components=2, random_state=0, perplexity=50)
X = train[features].fillna(0.).values
tsne = model.fit_transform(X)

## Standardize, then reduce to 30 principal components.
std = StandardScaler()
s = std.fit_transform(X)
pca = PCA(n_components=30)
pca.fit(s)
pc = pca.transform(s)

## K-means in PCA space; colour the t-SNE scatter by cluster label.
kmeans = KMeans(n_clusters=5)
kmeans.fit(pc)
fr = pd.DataFrame({'tsne1': tsne[:, 0], 'tsne2': tsne[:, 1],
                   'cluster': kmeans.labels_})
sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)

## Total variance explained by the 30 components.
print(np.sum(pca.explained_variance_ratio_))
Johnsonsu分布への変換
box cox と違う変換方法
y = train['SalePrice'].values

## Forward Johnson SU transform: fit the four distribution parameters,
## then map y to an approximately normal variable.
def johnson(y):
    gamma, eta, epsilon, lbda = stats.johnsonsu.fit(y)
    yt = gamma + eta * np.arcsinh((y - epsilon) / lbda)
    return yt, gamma, eta, epsilon, lbda

## Exact inverse of johnson(); recovers the original values.
def johnson_inverse(y, gamma, eta, epsilon, lbda):
    return lbda * np.sinh((y - gamma) / eta) + epsilon

# Round-trip: yt2 should match the original y.
yt, g, et, ep, l = johnson(y)
yt2 = johnson_inverse(yt, g, et, ep, l)
plt.figure(1)
sns.distplot(yt)
plt.figure(2)
sns.distplot(yt2)
House Priceの分析1
タスク
Goal
It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable.
Metric
Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)
分析
kaggleのデータを使って、データの相関などを調べる
#invite people for the Kaggle party import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np from scipy.stats import norm from sklearn.preprocessing import StandardScaler from scipy import stats import warnings warnings.filterwarnings('ignore') %matplotlib inline df_train = pd.read_csv('kaggle/kaggle1/train.csv') ## df_trainの情報 df_train['SalePrice'].describe() ## seaornでヒストグラムと、分布プロット sns.distplot(df_train['SalePrice']) ## 歪度( Skewness )と尖度 ( Kurtosis ) を出す print("Skewness: %f" % df_train['SalePrice'].skew()) print("Kurtosis: %f" % df_train['SalePrice'].kurt()) ## 主観的に相関がありそうな項目を調べる var = 'GrLivArea' data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1) data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000)) ## box plot でデータのばらつきも見つつ、相関も見る #box plot overallqual/saleprice var = 'OverallQual' data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1) f, ax = plt.subplots(figsize=(8, 6)) fig = sns.boxplot(x=var, y="SalePrice", data=data) fig.axis(ymin=0, ymax=800000); ## 全体の相関図を見て、ヒートマップを作成 #correlation matrix corrmat = df_train.corr() f, ax = plt.subplots(figsize=(12, 9)) sns.heatmap(corrmat, vmax=.8, square=True, annot=True, fmt='.0%'); ## 相関係数の高いTOPのカテゴリを抽出 k = 10 #number of variables for heatmap cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index cm = np.corrcoef(df_train[cols].values.T) sns.set(font_scale=1.25) hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values) plt.show() ## 上位相関係数のカテゴリを抽出 #scatterplot sns.set() cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt'] sns.pairplot(df_train[cols], size = 2.5) plt.show();
missing data
#missing data
## Rank columns by missing-value count and fraction.
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

## Drop every column with more than one missing value, then the single
## row where 'Electrical' is missing.
# FIX: use the `columns=` keyword — the positional `axis` argument to
# DataFrame.drop was deprecated and removed in pandas 2.0.
df_train = df_train.drop(columns=missing_data[missing_data['Total'] > 1].index)
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
df_train.isnull().sum().max()  #just checking that there's no missing data missing...
標準化
## Standardize SalePrice and show the 10 most extreme values at each end.
# FIX: Series no longer supports multi-dimensional indexing like
# `s[:, np.newaxis]` (removed in pandas 1.0); go through the underlying
# ndarray instead.
saleprice_scaled = StandardScaler().fit_transform(
    df_train['SalePrice'].values.reshape(-1, 1))
low_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][:10]
high_range = saleprice_scaled[saleprice_scaled[:, 0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

# Drop the two GrLivArea outliers.
#deleting points
df_train.sort_values(by = 'GrLivArea', ascending = False)[:2]
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)

# Histogram fitted to a normal, plus a QQ plot, to check normality.
#histogram and normal probability plot
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)

# Taking the log makes SalePrice approximately normal.
df_train['SalePrice'] = np.log(df_train['SalePrice'])
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)

# Log-transform TotalBsmtSF only where it is positive (log of 0 is
# undefined); the binary HasBsmt flag records which rows had a basement.
#create column for new variable (one is enough because it's a binary categorical feature)
#if area>0 it gets 1, for area==0 it gets 0
df_train['HasBsmt'] = pd.Series(len(df_train['TotalBsmtSF']), index=df_train.index)
df_train['HasBsmt'] = 0
df_train.loc[df_train['TotalBsmtSF']>0,'HasBsmt'] = 1
#transform data
df_train.loc[df_train['HasBsmt']==1,'TotalBsmtSF'] = np.log(df_train['TotalBsmtSF'])
#histogram and normal probability plot
sns.distplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], plot=plt)

## Scatter plot after the transforms.
#scatter plot
plt.scatter(df_train['GrLivArea'], df_train['SalePrice']);
カテゴリをダミー変数に変換
# One-hot encode every remaining categorical column; numeric columns
# pass through unchanged.
#convert categorical variable into dummy
df_train = pd.get_dummies(df_train)
思い出したように CentOS7 に MySQL5.7 をインストールした
# Remove any existing MySQL packages and wipe the old data directory.
yum remove mysql-server mysql-libs mysql-devel mysql*
rm -rf /var/lib/mysql/

# Add the MySQL community repo, then switch from the 8.0 to the 5.7 channel.
yum -y install https://dev.mysql.com/get/mysql80-community-release-el7-1.noarch.rpm
yum-config-manager --disable mysql80-community
yum-config-manager --enable mysql57-community

# Install server, client, utilities and postfix.
yum -y install mysql mysql-devel mysql-server
yum -y install mysql mysql-utilities
yum -y install postfix

# Enable and start the service.
systemctl enable mysqld.service
systemctl start mysqld.service
systemctl status mysqld.service

# Initial root password is written to the log at first start.
cat /var/log/mysqld.log | grep 'password is generated'
mysql_secure_installation

# Edit the cnf
vim /etc/my.cnf

# Log in as root with the initial password
mysql -uroot -p
- /etc/my.cnf
[mysqld]
#
# Remove leading # and set to the amount of RAM for the most important data
# cache in MySQL. Start at 70% of total RAM for dedicated server, else 10%.
# innodb_buffer_pool_size = 128M
#
# Remove leading # to turn on a very important data integrity option: logging
# changes to the binary log between backups.
# log_bin
#
# Remove leading # to set options mainly useful for reporting servers.
# The server defaults are faster for transactions and fast SELECTs.
# Adjust sizes as needed, experiment to find the optimal values.
# join_buffer_size = 128M
# sort_buffer_size = 2M
# read_rnd_buffer_size = 2M
datadir=/var/lib/mysql
socket=/var/lib/mysql/mysql.sock

# Disabling symbolic-links is recommended to prevent assorted security risks
symbolic-links=0

log-error=/var/log/mysqld.log
pid-file=/var/run/mysqld/mysqld.pid

# Added below: relax password policy, enable slow-query logging,
# and expire binary logs after two weeks.
validate-password=OFF
log-slow-admin-statements = 1
log-queries-not-using-indexes = 1
slow_query_log = 1
slow_query_log_file = /var/log/mysql/slow.log
long_query_time = 0.5
expire_logs_days = 14
rootにパスワードなしで入る
mysql> use mysql;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed
mysql> update user set authentication_string=password('') where user='root';
Query OK, 1 row affected (0.00 sec)
Rows matched: 1  Changed: 1  Warnings: 0
mermaid.js がデータフローに便利と聞いて
atom で atom-mermaid
を追加
apm install atom-mermaid
以下の実装でいろいろ書けそう。
%% Node shape samples: square, circle, rounded box, rhombus, ribbon.
graph TB;
    id1[四角]
    id2((丸))
    id3(角丸四角)
    id4{ひし形}
    id5>リボン]