mean encoding の方法、Kfold
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Grouping keys for the aggregation below.
# NOTE(review): the original list also contained 'cnt', i.e. the very column
# that is summed; grouping by it makes the sum meaningless, so it was dropped.
# Add the intended extra key here if one exists — TODO confirm.
index_cols = ['shop_id', 'item_id']
global_mean = 0.2  # value used to fill encodings that could not be computed

# NOTE: `filename` must be set to the path of the input CSV before running.
df = pd.read_csv(filename)

# Aggregate: the total `cnt` per key becomes the `target` column.
# Named aggregation replaces the nested-dict renamer ({'cnt': {'target': 'sum'}}),
# which pandas removed in 1.0; the columns come out flat, so no renaming is needed.
gb = df.groupby(index_cols, as_index=False).agg(target=('cnt', 'sum'))

# --- Mean encoding on the full data ---
# 1. Compute the per-item mean first, then map it onto gb.
item_id_target_mean = gb.groupby('item_id')['target'].mean()
gb['item_target_enc'] = gb['item_id'].map(item_id_target_mean)

# 2. Same result in one step: assign the transform output directly to gb.
gb['item_target_enc'] = gb.groupby('item_id')['target'].transform('mean')

# Assign the result back instead of a chained `inplace=True` call, which may
# operate on a temporary and leave gb unchanged.
gb['item_target_enc'] = gb['item_target_enc'].fillna(global_mean)

# --- Mean encoding with K-fold (out-of-fold) ---
skf = StratifiedKFold(n_splits=5, shuffle=False)
enc_pos = gb.columns.get_loc('item_target_enc')
for tr_ind, val_ind in skf.split(gb.values, gb['target'].values):
    X_tr, X_val = gb.iloc[tr_ind], gb.iloc[val_ind]
    # Means come from the training fold only and are looked up by item_id.
    # (Assigning X_tr's `transform` output to X_val would align on the row
    # index, which the two folds do not share, and yield only NaN.)
    fold_means = X_tr.groupby('item_id')['target'].mean()
    # Write into gb itself; X_val is a copy, so assigning to it is lost.
    gb.iloc[val_ind, enc_pos] = X_val['item_id'].map(fold_means).to_numpy()
# Items that never appeared in the training part of their fold have no mean.
gb['item_target_enc'] = gb['item_target_enc'].fillna(global_mean)


def target_encode(frame, features, smoothing_alpha=0.0):
    """Add out-of-fold mean-target encodings for the given features.

    For each of 5 stratified folds, the per-category mean of ``target`` is
    computed on the training part and mapped onto the validation part, so no
    row is encoded with its own target value.

    Args:
        frame: DataFrame that contains a ``target`` column, an ``_index``
            column used as the merge key, and every column in ``features``.
        features: Names of the columns to encode.
        smoothing_alpha: Strength of additive smoothing towards the global
            target mean, ``(mean * n + global_mean * alpha) / (n + alpha)``
            with ``n`` the category's row count in the training fold.
            The default ``0.0`` uses the raw per-category means.

    Returns:
        ``frame`` left-merged on ``_index`` with one new column
        ``<feature>_mean_target`` per feature. Categories that are absent
        from the training fold get the global target mean.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    y = frame['target']
    global_mean = y.mean()

    # Result holder with a fresh positional index, so the positional fold
    # indices from `kf.split` address the right rows whatever index `frame` has.
    encoded = pd.DataFrame({'_index': frame['_index'].to_numpy()})
    for feature in features:
        encoded[feature + '_mean_target'] = np.nan

    for tr_idx, val_idx in kf.split(frame, y):
        X_tr, X_val = frame.iloc[tr_idx], frame.iloc[val_idx]
        for feature in features:
            te = feature + '_mean_target'
            stats = X_tr.groupby(feature)['target'].agg(['mean', 'count'])
            if smoothing_alpha:
                fold_means = (
                    (stats['mean'] * stats['count'] + global_mean * smoothing_alpha)
                    / (stats['count'] + smoothing_alpha)
                )
            else:
                fold_means = stats['mean']
            encoded.iloc[val_idx, encoded.columns.get_loc(te)] = (
                X_val[feature].map(fold_means).to_numpy()
            )

    encoded = encoded.fillna(global_mean)
    return frame.merge(encoded, on='_index', how='left')
MySQL のトランザクション 消化ステータス確認
KILLED のプロセスが transaction 掴んで焦った話。KILLED を消すためにmysqldを強制終了すると、dead lock が発生するのでやめたほうがいい。
以下のメッセージが出て、追加の delete ができなかった。
transaction mysql Lock wait timeout exceeded; try restarting transaction
mysql> SHOW PROCESSLIST; 14654233 --> プロセス確認 mysql> SHOW ENGINE INNODB STATUS\G ---TRANSACTION 3747011, ACTIVE 1231 sec fetching rows, thread declared inside InnoDB 1887 mysql tables in use 1, locked 1 78241 lock struct(s), heap size 7594192, 6806394 row lock(s), undo log entries 6727648 MySQL thread id 14654233, OS thread handle 140280497481472, query id 478855377 localhost root updating delete from `goods` where modified_dt = '2018-10-01'
14654233 --> MySQL thread id(sql process id) select count(1) from `goods` where modified_dt = '2018-10-01'--> 10000000(delete 対象行) 6806394 --> lock rows(ロック行) 6727648 --> undo log entries(更新未完了行)
In [35]: 6806394 / 10000000 Out[35]: 0.6806394 (ロック完了割合) In [36]: 6727648 / 6806394 Out[36]: 0.9884305845356587 (delete 未完了割合)
binary の logloss と auc
Binary Class の測定
logloss
l_pred = [0.5, 0.5, 0.5, 0.5]
l_label = [0, 0, 0, 0]


def logloss(l_pred, l_label, eps=1e-15):
    """Return the mean binary cross-entropy (log loss).

    Args:
        l_pred: Predicted probabilities of the positive class.
        l_label: True labels (0 or 1), same length as ``l_pred``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` so that a
            prediction of exactly 0 or 1 does not produce ``log(0)``.

    Returns:
        ``-mean(y * log(p) + (1 - y) * log(1 - p))``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    pred = np.asarray(l_pred, dtype=float)
    label = np.asarray(l_label, dtype=float)
    if pred.shape != label.shape:
        raise ValueError("l_pred and l_label must have the same length")
    if pred.size == 0:
        raise ValueError("l_pred and l_label must not be empty")
    pred = np.clip(pred, eps, 1 - eps)
    # The negative-class term uses log(1 - p), not log(p).
    return -np.mean(label * np.log(pred) + (1 - label) * np.log(1 - pred))


logloss(l_pred, l_label)
auc
from io import StringIO

import pandas as pd

a = StringIO('''target prediction
1 0.38
0 0.51
1 0.90
1 0.84
1 0.48
0 0.03
0 0.45''')
# Parse the whitespace-separated sample into the frame that `auc` expects.
df = pd.read_csv(a, sep=r'\s+')


def auc(df):
    """Return the ROC AUC of ``df['prediction']`` against ``df['target']``.

    The AUC is the fraction of (positive, negative) pairs in which the
    positive row has the higher prediction; a tied pair counts as one half.
    It is computed from average ranks (Mann-Whitney U statistic) instead of
    an explicit loop over all pairs.

    Args:
        df: DataFrame with a binary ``target`` column (0 = negative,
            1 = positive) and a numeric ``prediction`` column.

    Returns:
        The AUC as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``target`` does not contain both classes.
    """
    is_pos = df['target'] == 1
    n_pos = int(is_pos.sum())
    n_neg = int((df['target'] == 0).sum())
    if n_pos == 0 or n_neg == 0:
        raise ValueError("AUC is undefined unless both classes are present")
    # Average ranks give tied predictions the same rank, so ties score 0.5.
    ranks = df['prediction'].rank(method='average')
    u_stat = ranks[is_pos].sum() - n_pos * (n_pos + 1) / 2
    return float(u_stat / (n_pos * n_neg))


auc(df)
syslogとか cronlogとか確認
OOM が出たときとかに syslog を確認
| ログファイル名 | 内容 |
| --- | --- |
| /var/log/messages | 一般的なシステム関連のメッセージ |
| /var/log/secure | セキュリティに関するメッセージ |
| /var/log/cron | 定期的に実行される処理結果に関するメッセージ |
| /var/log/maillog | メールに関するメッセージ |
| /var/log/spooler | 印刷に関するメッセージ |
| /var/log/boot.log | OS起動時に関するメッセージ |
# Search /var/log/messages and every other file matching the glob (e.g. rotated
# logs) for the literal OOM-killer message (-F: fixed string, no regex), and
# page through the matches with less.
sudo grep -F 'Out of memory: Kill process' /var/log/messages* | less
本番サーバと検証サーバを間違えないように
.bash_profile に以下の設定をすると、プロンプトが色付きになる
# Prompt layout: red "user@host", a plain ":", the working directory in bold,
# then the prompt character ("$", or "#" for root).
#   \[ ... \]  mark non-printing sequences so bash measures the prompt width correctly
#   \033[31m   red foreground;  \033[01m  bold;  \033[00m  reset all attributes
#   \u \h \w   user name, short host name, current working directory
export PS1="\[\033[31m\]\u@\h\[\033[00m\]:\[\033[01m\]\w\[\033[00m\]\\$ "