MySQLのレプリケーションが遅延した場合
レプリケーション遅延
まずは、 show slave status
で Seconds_Behind_Master と Slave_IO_Running / Slave_SQL_Running を確認する。
どれくらい遅延しているか確認できる。
slave1 [localhost] {msandbox} ((none)) > show slave status\G *************************** 1. row *************************** Slave_IO_State: Waiting for master to send event Master_Host: 127.0.0.1 Master_User: rsandbox Master_Port: 19972 Connect_Retry: 60 Master_Log_File: mysql-bin.000003 Read_Master_Log_Pos: 31122248 Relay_Log_File: mysql_sandbox19973-relay-bin.000008 Relay_Log_Pos: 30512168 Relay_Master_Log_File: mysql-bin.000003 Slave_IO_Running: Yes Slave_SQL_Running: Yes Replicate_Do_DB: Replicate_Ignore_DB: Replicate_Do_Table: Replicate_Ignore_Table: Replicate_Wild_Do_Table: Replicate_Wild_Ignore_Table: Last_Errno: 0 Last_Error: Skip_Counter: 0 Exec_Master_Log_Pos: 30512022 Relay_Log_Space: 31122709 Until_Condition: None Until_Log_File: Until_Log_Pos: 0 Master_SSL_Allowed: No Master_SSL_CA_File: Master_SSL_CA_Path: Master_SSL_Cert: Master_SSL_Cipher: Master_SSL_Key: Seconds_Behind_Master: 1 Master_SSL_Verify_Server_Cert: No Last_IO_Errno: 0 Last_IO_Error: Last_SQL_Errno: 0 Last_SQL_Error: Replicate_Ignore_Server_Ids: Master_Server_Id: 1 1 row in set (0.00 sec)
Slave SQL スレッド（Slave_SQL_Running 側）が遅い場合は、以下のコマンドで、現在実行中のクエリを確認
> show processlist\G
House Priceの分析5
前処理
import pandas as pd import numpy as np import seaborn as sns import matplotlib import matplotlib.pyplot as plt from scipy.stats import skew from scipy.stats.stats import pearsonr %config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook %matplotlib inline train = pd.read_csv("kaggle/kaggle1/train.csv") test = pd.read_csv("kaggle/kaggle1/train.csv") ## id 以外 all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'], test.loc[:,'MSSubClass':'SaleCondition'])) ## histgramで確認 matplotlib.rcParams['figure.figsize'] = (12.0, 6.0) prices = pd.DataFrame({"price":train["SalePrice"], "log(price + 1)":np.log1p(train["SalePrice"])}) prices.hist() train["SalePrice"] = np.log1p(train["SalePrice"]) #log transform skewed numeric features: numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index # 歪度の高い特徴を pick up skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness skewed_feats = skewed_feats[skewed_feats > 0.75] skewed_feats = skewed_feats.index all_data[skewed_feats] = np.log1p(all_data[skewed_feats]) all_data = pd.get_dummies(all_data) #filling NA's with the mean of the column: all_data = all_data.fillna(all_data.mean()) #creating matrices for sklearn: X_train = all_data[:train.shape[0]] X_test = all_data[train.shape[0]:] y = train.SalePrice from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV from sklearn.model_selection import cross_val_score def rmse_cv(model): rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5)) return(rmse) model_ridge = Ridge() リッジ回帰のベストパラメータ確認 alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75] cv_ridge = [rmse_cv(Ridge(alpha = alpha)).mean() for alpha in alphas] cv_ridge = pd.Series(cv_ridge, index = alphas) cv_ridge.plot(title = "Validation - Just Do It") plt.xlabel("alpha") plt.ylabel("rmse") # 最小値 cv_ridge.min() # LASSO 回帰器 model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005]).fit(X_train, y) 
rmse_cv(model_lasso).mean() coef = pd.Series(model_lasso.coef_, index = X_train.columns) print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables") imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)]) matplotlib.rcParams['figure.figsize'] = (8.0, 10.0) imp_coef.plot(kind = "barh") plt.title("Coefficients in the Lasso Model") #let's look at the residuals as well: matplotlib.rcParams['figure.figsize'] = (6.0, 6.0) preds = pd.DataFrame({"preds":model_lasso.predict(X_train), "true":y}) preds["residuals"] = preds["true"] - preds["preds"] preds.plot(x = "preds", y = "residuals",kind = "scatter")
xgboost modelを使用
import xgboost as xgb dtrain = xgb.DMatrix(X_train, label = y) dtest = xgb.DMatrix(X_test) params = {"max_depth":2, "eta":0.1} model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100) model.loc[30:,["test-rmse-mean", "train-rmse-mean"]].plot() model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1) #the params were tuned using xgb.cv model_xgb.fit(X_train, y) xgb_preds = np.expm1(model_xgb.predict(X_test)) lasso_preds = np.expm1(model_lasso.predict(X_test)) ## 予想と比較 predictions = pd.DataFrame({"xgb":xgb_preds, "lasso":lasso_preds}) predictions.plot(x = "xgb", y = "lasso", kind = "scatter")
MySQL から pandas.Dataframe へ読み込む
MySQLから、pandasのDataFrameに読み込む方法
import pandas as pd import MySQLdb def pd_dbread(table, columns_list): """ 接続サンプル """ # 接続する con = MySQLdb.connect( user='aaa', passwd='aaa', host='127.0.0.1', db='aaa', charset='utf8' ) # カーソルを取得する cur= con.cursor() # クエリを実行する sql = """ select {columns} from {table} """.format( columns=",".join(["`{}`".format(l) for l in columns_list]), table="`{}`".format(table) ) df_read = pd.read_sql(sql, con,) con.close() return df_read aaa_columns = [ 'id', 'name', ] table_name = "aaa" df_aaa = pd_dbread(table_name, aaa_columns)
random.shuffle() は in-place で並べ替えて None を返すため、戻り値を使えない問題
こうすればいいらしい。
>>> import random >>> x = ['foo', 'bar', 'black', 'sheep'] # O(N) operations・・・shuffle と同じロジック >>> random.sample(x, len(x)) ['bar', 'sheep', 'black', 'foo'] # O(NlogN) operation >>> sorted(x, key=lambda k: random.random()) ['sheep', 'foo', 'black', 'bar'] >>> x ['foo', 'bar', 'black', 'sheep']
House Priceの分析4
XGBRegressorっていう、回帰モデルがあるので確認。 そもそも xgboost が結構界隈では有名らしい。
import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import Imputer data = pd.read_csv('kaggle/kaggle1/train.csv') data.isnull().any(axis=0) # Imputerは欠損値を mean(平均), median(中央値), mode(最頻値)のどれかに置き換える data.dropna(axis=0, subset=['SalePrice'], inplace=True) y = data.SalePrice X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object']) train_X, test_X, train_y, test_y = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25) my_imputer = Imputer() train_X = my_imputer.fit_transform(train_X) test_X = my_imputer.transform(test_X) from xgboost import XGBRegressor my_model = XGBRegressor() # Add silent=True to avoid printing out updates with each cycle my_model.fit(train_X, train_y, verbose=False) # make predictions predictions = my_model.predict(test_X) from sklearn.metrics import mean_absolute_error print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y))) my_model = XGBRegressor(n_estimators=1000) my_model.fit(train_X, train_y, early_stopping_rounds=5, eval_set=[(test_X, test_y)], verbose=False) my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05) my_model.fit(train_X, train_y, early_stopping_rounds=5, eval_set=[(test_X, test_y)], verbose=False)
House Priceの分析3
回帰分析
## 平均二乗誤差をルート def error(actual, predicted): actual = np.log(actual) predicted = np.log(predicted) return np.sqrt(np.sum(np.square(actual-predicted))/len(actual)) ## 対数変換 def log_transform(feature): train[feature] = np.log1p(train[feature].values) ## 二乗項の追加 def quadratic(feature): train[feature+'2'] = train[feature]**2 log_transform('GrLivArea') log_transform('1stFlrSF') log_transform('2ndFlrSF') log_transform('TotalBsmtSF') log_transform('LotArea') log_transform('LotFrontage') log_transform('KitchenAbvGr') log_transform('GarageArea') quadratic('OverallQual') quadratic('YearBuilt') quadratic('YearRemodAdd') quadratic('TotalBsmtSF') quadratic('2ndFlrSF') quadratic('Neighborhood_E') quadratic('RoofMatl_E') quadratic('GrLivArea') qdr = ['OverallQual2', 'YearBuilt2', 'YearRemodAdd2', 'TotalBsmtSF2', '2ndFlrSF2', 'Neighborhood_E2', 'RoofMatl_E2', 'GrLivArea2'] train['HasBasement'] = train['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0) train['HasGarage'] = train['GarageArea'].apply(lambda x: 1 if x > 0 else 0) train['Has2ndFloor'] = train['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0) train['HasMasVnr'] = train['MasVnrArea'].apply(lambda x: 1 if x > 0 else 0) train['HasWoodDeck'] = train['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0) train['HasPorch'] = train['OpenPorchSF'].apply(lambda x: 1 if x > 0 else 0) train['HasPool'] = train['PoolArea'].apply(lambda x: 1 if x > 0 else 0) train['IsNew'] = train['YearBuilt'].apply(lambda x: 1 if x > 2000 else 0) boolean = ['HasBasement', 'HasGarage', 'Has2ndFloor', 'HasMasVnr', 'HasWoodDeck', 'HasPorch', 'HasPool', 'IsNew'] features = quantitative + qual_encoded + boolean + qdr lasso = linear_model.LassoLarsCV(max_iter=10000) X = train[features].fillna(0.).values Y = train['SalePrice'].values lasso.fit(X, np.log(Y)) Ypred = np.exp(lasso.predict(X)) error(Y, Ypred)
pyplot の 円グラフをいい感じに描く
f,a = plt.subplots(nrows=5, ncols=2, figsize=(14, 20)) a = a.ravel() for idx,ax in enumerate(a): v_list = km_center[idx] df_timeband_meanrate = pd.DataFrame( { 'timeband': name_list, 'rate': v_list }, ) print(idx, np.bincount(y_km)[idx]) display(df_timeband_meanrate) ax.pie(df_timeband_meanrate["rate"], labels=['' for i in range(8)],autopct='%.2f %%') ax.set_title('cluster: {}, number: {}'.format(idx, np.bincount(y_km)[idx])) ax.legend(loc="upper left", bbox_to_anchor=(0.85,1.025), labels=df_timeband_meanrate["timeband"]) plt.tight_layout( ) plt.savefig('time_cluster_center.png', dpi=100) plt.show()