mean encoding の方法、Kfold

import pandas as pd
import numpy as np

# Columns used as the group-by key for the aggregation below.
# NOTE(review): 'cnt' is both a grouping key and the aggregated column here —
# confirm this is intended.
index_cols = ['shop_id', 'item_id', 'cnt']
# Fallback value used further down to fill missing encodings.
global_mean = 0.2

# NOTE(review): `filename` is not defined in this snippet; it has to be set
# before this line runs.
df = pd.read_csv(filename)

# Group by index_cols and sum 'cnt' into a column named 'target'.
# NOTE(review): this nested-dict renaming form of agg() was deprecated in
# pandas 0.20 and is rejected by pandas >= 1.0 — confirm the pandas version.
gb = df.groupby(index_cols,as_index=False).agg({'cnt':{'target':'sum'}})

# Fix column names: flatten the two-level column labels, keeping the first
# level when the last level is empty and the last level otherwise.
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]


# Mean (target) encoding of item_id, computed on the full frame.
# The two variants below produce the same column; either one is sufficient.

# 1. Compute the per-item mean first, then map it onto gb.
item_id_target_mean = gb.groupby('item_id').target.mean()
gb['item_target_enc'] = gb['item_id'].map(item_id_target_mean)

# 2. Compute and assign in one step with transform.
gb['item_target_enc'] = gb.groupby('item_id')['target'].transform('mean')

# Fill missing encodings with the global mean. Assign the result back instead
# of calling fillna(inplace=True) on a column selection: the in-place form
# operates on an intermediate object and is not guaranteed to update gb.
gb['item_target_enc'] = gb['item_target_enc'].fillna(global_mean)


from sklearn.model_selection import StratifiedKFold
# Out-of-fold mean encoding: every row is encoded with item means computed
# only from the other folds, so a row's own target never leaks into its
# encoding.
skf = StratifiedKFold(n_splits=5, shuffle=False)

# Collect the encodings positionally; rows whose item is unseen in the
# training part of their fold keep the global-mean default.
oof_item_target_enc = np.full(len(gb), global_mean, dtype=float)

for tr_ind, val_ind in skf.split(gb.values, gb['target'].values):
    X_tr, X_val = gb.iloc[tr_ind], gb.iloc[val_ind]
    # Means must be looked up by item_id. transform() on X_tr would return
    # values indexed by the *training* rows, which never align with X_val.
    fold_item_means = X_tr.groupby('item_id')['target'].mean()
    fold_enc = X_val['item_id'].map(fold_item_means).fillna(global_mean)
    oof_item_target_enc[val_ind] = fold_enc.to_numpy(dtype=float)

# Write back to gb itself; assigning into X_val only touches a temporary copy.
gb['item_target_enc'] = oof_item_target_enc


def target_encode(frame, features, alpha=None):
    """Add out-of-fold, smoothed mean-target encodings for ``features``.

    For each feature a column ``<feature>_mean_target`` is produced. Rows are
    split into 5 stratified folds (shuffled, ``random_state=1``); the rows of
    each fold are encoded with statistics computed on the other four folds:

        (category_mean * category_count + global_mean * alpha)
            / (category_count + alpha)

    Args:
        frame: DataFrame containing a ``target`` column, an ``_index`` column
            used as the merge key, and every column named in ``features``.
        features: Iterable of column names (strings) to encode.
        alpha: Smoothing weight given to the global mean. When ``None``
            (default) the number of distinct training-fold values of the
            feature is used.

    Returns:
        ``frame`` left-merged on ``_index`` with the new encoding columns.
        Categories unseen in the training folds get the global target mean.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    target = frame['target']
    global_mean = target.mean()

    # One positional buffer per output column, pre-filled with the fallback.
    # Positional writes keep this correct for any index on `frame`.
    encoded = {
        feature + '_mean_target': np.full(len(frame), global_mean, dtype=float)
        for feature in features
    }

    for tr_idx, val_idx in kf.split(frame, target):
        X_tr, X_val = frame.iloc[tr_idx], frame.iloc[val_idx]
        for feature in features:
            stats = X_tr.groupby(feature)['target'].agg(['mean', 'count'])
            weight = X_tr[feature].nunique() if alpha is None else alpha
            # Shrink each category mean towards the global mean; categories
            # with few rows are pulled harder than frequent ones.
            smoothed_means = (
                (stats['mean'] * stats['count'] + global_mean * weight)
                / (stats['count'] + weight)
            )
            fold_values = X_val[feature].map(smoothed_means).fillna(global_mean)
            encoded[feature + '_mean_target'][val_idx] = fold_values.to_numpy(dtype=float)

    new_X_frame = frame[['_index']].copy()
    for column, values in encoded.items():
        new_X_frame[column] = values
    return frame.merge(new_X_frame, on='_index', how='left')

MySQL のトランザクション 消化ステータス確認

KILLED 状態のプロセスがトランザクションを掴んだままになって焦った話。KILLED のプロセスを消すために mysqld を強制終了すると、デッドロックが発生するのでやめたほうがいい。
以下のメッセージが出て、追加の delete ができなかった。
ERROR 1205 (HY000): Lock wait timeout exceeded; try restarting transaction

mysql> SHOW PROCESSLIST;
14654233 --> プロセス確認

mysql> SHOW ENGINE INNODB STATUS\G

---TRANSACTION 3747011, ACTIVE 1231 sec fetching rows, thread declared inside InnoDB 1887
mysql tables in use 1, locked 1
78241 lock struct(s), heap size 7594192, 6806394 row lock(s), undo log entries 6727648
MySQL thread id 14654233, OS thread handle 140280497481472, query id 478855377 localhost root updating
delete from `goods` where modified_dt = '2018-10-01'
14654233 --> MySQL thread id(sql process id)
select count(1) from `goods` where modified_dt = '2018-10-01'--> 10000000(delete 対象行)
6806394 --> lock rows(ロック行)
6727648 --> undo log entries(変更済みで未コミットの行数 = delete 処理済みの行数)
In [35]: 6806394 / 10000000
Out[35]: 0.6806394 (ロック完了割合)

In [36]: 6727648 / 6806394
Out[36]: 0.9884305845356587 (ロック済みの行のうち delete 処理済みの割合)

binary の logloss と auc

Binary Class の測定

logloss

# Sample input: four predictions of 0.5, all paired with the label 0.
l_pred = [0.5, 0.5, 0.5, 0.5]
l_label = [0, 0, 0, 0]

def logloss(l_pred, l_label, eps=1e-15):
    """Return the mean binary log loss (cross-entropy) of the predictions.

    Args:
        l_pred: Sequence of predicted probabilities of the positive class.
        l_label: Sequence of true labels (0 or 1), same length as ``l_pred``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` so that a
            prediction of exactly 0 or 1 gives a large finite loss instead
            of ``inf``/``nan``.

    Returns:
        ``-mean(y * log(p) + (1 - y) * log(1 - p))`` as a float.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    probs = np.asarray(l_pred, dtype=float)
    labels = np.asarray(l_label, dtype=float)
    if probs.shape != labels.shape:
        raise ValueError('l_pred and l_label must have the same length')
    if probs.size == 0:
        raise ValueError('logloss is undefined for empty input')

    probs = np.clip(probs, eps, 1 - eps)
    # The negative-class term uses log(1 - p), not log(p).
    per_sample = labels * np.log(probs) + (1 - labels) * np.log(1 - probs)
    return -per_sample.mean()

# Score the sample predictions against the sample labels.
logloss(l_pred, l_label)

auc

from io import StringIO

# Sample data for the AUC example: one header row followed by
# whitespace-separated (target, prediction) pairs.
a = StringIO('''target    prediction
1 0.38
0 0.51
1 0.90
1 0.84
1 0.48
0 0.03
0 0.45''')

# Parse the sample into the frame that `auc(df)` is called with below. The
# header and the rows use different amounts of whitespace, hence the regex
# separator.
df = pd.read_csv(a, sep=r'\s+')

def auc(df):
    """Return the ROC AUC of ``df['prediction']`` against ``df['target']``.

    The AUC is the fraction of (positive, negative) row pairs in which the
    positive row has the higher prediction; a pair with equal predictions
    counts as half. It is computed from average ranks (Mann-Whitney U), so
    no explicit pair loop is needed and ties are handled deterministically.

    Args:
        df: DataFrame with a binary ``target`` column (0 = negative,
            1 = positive) and a numeric ``prediction`` column.

    Returns:
        The AUC as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``target`` does not contain both classes, because
            the AUC is undefined in that case.
    """
    is_positive = df['target'].to_numpy() == 1
    n_pos = int(is_positive.sum())
    n_neg = len(is_positive) - n_pos
    if n_pos == 0 or n_neg == 0:
        raise ValueError('auc needs at least one positive and one negative row')

    # Ascending ranks; tied predictions share their average rank.
    ranks = df['prediction'].rank(method='average').to_numpy()
    # Rank sum of the positives minus its minimum possible value gives the
    # number of (positive, negative) pairs ordered correctly.
    concordant = ranks[is_positive].sum() - n_pos * (n_pos + 1) / 2
    return concordant / (n_pos * n_neg)
    
# Score the frame bound to `df`.
# NOTE(review): presumably `df` is meant to be the StringIO sample `a` parsed
# into a DataFrame — confirm, since no such step is shown next to this call.
auc(df)

syslogとか cronlogとか確認

OOM が出たときとかに syslog を確認

ログファイル名    内容
 /var/log/messages     一般的なシステム関連のメッセージ
 /var/log/secure   セキュリティに関するメッセージ
 /var/log/cron     定期的に実行される処理結果に関するメッセージ
 /var/log/maillog  メールに関するメッセージ
 /var/log/spooler  印刷に関するメッセージ
 /var/log/boot.log     OS起動時に関するメッセージ
sudo grep -F 'Out of memory: Kill process' /var/log/messages* | less

www.infraeye.com