House Priceの分析1
タスク
Goal
It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable.
Metric
Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)
分析
Kaggleのデータを使って、SalePriceと各変数の相関などを調べる。
# Exploratory analysis of the Kaggle House Prices training set:
# load the data, inspect the SalePrice distribution, and look at how the
# other variables correlate with SalePrice.

# invite people for the Kaggle party
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

warnings.filterwarnings('ignore')

# Render figures inline when running under IPython/Jupyter. The bare
# `%matplotlib inline` magic is a syntax error in a plain .py file, so it is
# invoked through the IPython API and skipped when no IPython shell exists.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # plain Python interpreter: figures appear via plt.show()

df_train = pd.read_csv('kaggle/kaggle1/train.csv')

# Summary statistics of the target (shown when this is the last expression of
# a notebook cell).
df_train['SalePrice'].describe()

# Histogram plus distribution plot of the target with seaborn.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; kept here
# because the later cells rely on its `fit=` option.
sns.distplot(df_train['SalePrice'])

# Skewness and kurtosis of the target.
print(f"Skewness: {df_train['SalePrice'].skew():f}")
print(f"Kurtosis: {df_train['SalePrice'].kurt():f}")

# Examine a feature that subjectively looks correlated with the price.
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

# Box plot (OverallQual vs. SalePrice): shows the spread within each level as
# well as the relationship between the two.
var = 'OverallQual'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)  # returns the Axes drawn on
fig.axis(ymin=0, ymax=800000);  # trailing ';' hides the echoed value in notebooks

# Correlation matrix of all numeric columns, drawn as a heatmap.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently and `DataFrame.corr()` would raise on the string columns.
corrmat = df_train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, fmt='.0%');

# Zoom in on the k columns most correlated with SalePrice.
k = 10  # number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

# Pairwise scatter plots for a hand-picked set of highly correlated columns.
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `size` was renamed to `height` in seaborn 0.9.
sns.pairplot(df_train[cols], height=2.5)
plt.show()
missing data
# Missing data: per-column count and fraction of null values, most-missing
# columns first.
null_mask = df_train.isnull()
total = null_mask.sum().sort_values(ascending=False)
# Every column of the mask has len(df_train) entries, so this is the same
# ratio as isnull().sum() / isnull().count(), already in `total`'s order.
percent = total / len(df_train)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

# Discard what is not needed among the columns/rows with missing values:
# first every column that has more than one missing value ...
# (`axis` must be passed by keyword; pandas >= 2.0 rejects the positional form.)
df_train = df_train.drop(missing_data[missing_data['Total'] > 1].index, axis=1)
# ... then the rows where 'Electrical' is missing.
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
df_train.isnull().sum().max()  # just checking that there's no missing data left
標準化
# Standardise SalePrice (zero mean, unit variance) and print the ten lowest
# and ten highest scaled values to look for outliers.
# StandardScaler needs a 2-D array; index the underlying ndarray because
# multi-dimensional indexing on a Series is not supported by modern pandas.
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'].values[:, np.newaxis])
sorted_scaled = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = sorted_scaled[:10]
high_range = sorted_scaled[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

# Outlier removal: list the two rows with the largest GrLivArea, then delete
# the rows with Id 1299 and 524 (presumably the two rows listed; verify
# against the data).
df_train.sort_values(by='GrLivArea', ascending=False)[:2]
df_train = df_train.drop(df_train[df_train['Id'].isin([1299, 524])].index)

# Histogram with a fitted normal curve, plus a normal probability (Q-Q) plot,
# to check whether SalePrice follows a normal distribution.
sns.distplot(df_train['SalePrice'], fit=norm);  # ';' hides the echoed value in notebooks
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)

# Taking the logarithm brings SalePrice close to a normal distribution.
df_train['SalePrice'] = np.log(df_train['SalePrice'])
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)

# TotalBsmtSF: log-transform only the non-zero values (log(0) is undefined).
# HasBsmt is a binary indicator: 1 if TotalBsmtSF > 0, 0 otherwise.
df_train['HasBsmt'] = (df_train['TotalBsmtSF'] > 0).astype('int64')
has_bsmt = df_train['HasBsmt'] == 1
# Cast to float first so the log values can be stored without an implicit
# dtype change, and take the log of the selected rows only so zeros are never
# passed to np.log.
df_train['TotalBsmtSF'] = df_train['TotalBsmtSF'].astype(float)
df_train.loc[has_bsmt, 'TotalBsmtSF'] = np.log(df_train.loc[has_bsmt, 'TotalBsmtSF'])

# Histogram and normal probability plot of the transformed basement areas.
# Rows are selected with the HasBsmt mask rather than `TotalBsmtSF > 0`,
# because a transformed value can legitimately be 0 (log(1)).
sns.distplot(df_train.loc[has_bsmt, 'TotalBsmtSF'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train.loc[has_bsmt, 'TotalBsmtSF'], plot=plt)

# Scatter plot of GrLivArea against the (log-transformed) SalePrice.
plt.scatter(df_train['GrLivArea'], df_train['SalePrice']);
カテゴリをダミー変数に変換
# Replace the categorical columns of the training frame with dummy
# (indicator) columns, rebinding df_train to the encoded frame.
df_train = pd.get_dummies(data=df_train)