# How to do mean (target) encoding, including the K-fold approach

import pandas as pd
import numpy as np

# Columns used as grouping keys for the aggregation below.
# NOTE(review): 'cnt' is both a grouping key and the column being summed —
# confirm this is intended.
index_cols = ['shop_id', 'item_id', 'cnt']
# Fallback value written wherever an encoding is missing (NaN).
global_mean = 0.2

# NOTE(review): `filename` is not defined in this file; it has to be provided
# before this line runs.
df = pd.read_csv(filename)

# Sum 'cnt' within each group and call the result 'target'.
# Named aggregation is used instead of the nested-dict renamer
# ({'cnt': {'target': 'sum'}}), which pandas >= 1.0 rejects with
# SpecificationError. The output column is named directly, so the former
# MultiIndex column-flattening step is no longer needed (and would mangle
# plain string column names if it were kept).
gb = df.groupby(index_cols, as_index=False).agg(target=('cnt', 'sum'))


# mean encoding
# 1. Compute the per-item mean of 'target' first, then map it onto gb.
item_id_target_mean = gb.groupby('item_id').target.mean()
gb['item_target_enc'] = gb['item_id'].map(item_id_target_mean)

# 2. Assign the per-item mean straight into gb via transform.
gb['item_target_enc'] = gb.groupby('item_id')['target'].transform('mean')

# Replace missing encodings with the fallback. Assign the result back instead
# of calling fillna(inplace=True) on the selected column: the in-place form
# acts on an intermediate object and is not guaranteed to update gb.
gb['item_target_enc'] = gb['item_target_enc'].fillna(global_mean)


from sklearn.model_selection import StratifiedKFold
# Out-of-fold mean encoding: every row gets the per-item mean of 'target'
# computed only from the other folds.
# NOTE(review): the folds are stratified on 'target', which requires it to be
# class-like (discrete) — confirm, or switch to a plain K-fold splitter.
skf = StratifiedKFold(n_splits=5, shuffle=False)
# Column position, so results can be written back with positional indexing.
enc_col = gb.columns.get_loc('item_target_enc')

for tr_ind, val_ind in skf.split(gb.values, gb['target'].values):
    X_tr, X_val = gb.iloc[tr_ind], gb.iloc[val_ind]
    # One mean per item_id from the training fold. (transform('mean') would be
    # indexed by the training rows and align to nothing on the validation
    # rows, leaving every value NaN.)
    fold_means = X_tr.groupby('item_id')['target'].mean()
    # Write into gb itself by position; X_val is only a slice of gb, so
    # assigning to it would be lost.
    gb.iloc[val_ind, enc_col] = X_val['item_id'].map(fold_means).to_numpy()

# Items absent from a training fold have no mean there; use the fallback.
gb['item_target_enc'] = gb['item_target_enc'].fillna(global_mean)


def target_encode(frame, features, *, smoothing=0.0):
    """Return ``frame`` with out-of-fold mean-target encodings appended.

    The rows are split into 5 shuffled folds (``random_state=1``) stratified
    on ``frame['target']``. For each name in ``features``, the mean of
    ``'target'`` per category is computed on the training part of a fold and
    mapped onto the validation rows of that fold, producing a new column
    named ``<feature>_mean_target``. Values that remain missing (for example
    categories absent from the training part) are replaced by the mean of
    ``'target'`` over the whole frame.

    Args:
        frame: DataFrame containing a ``'target'`` column, an ``'_index'``
            column and every column listed in ``features``. The encodings are
            joined back on ``'_index'``; presumably it identifies rows
            uniquely -- verify against the caller.
        features: Iterable of column names to encode.
        smoothing: Non-negative weight of the overall mean in each category
            estimate: ``(mean * n + overall_mean * smoothing) / (n + smoothing)``
            with ``n`` the category's row count in the training part. The
            default ``0.0`` yields the plain category means.

    Returns:
        The result of left-merging ``frame`` with the encoding columns on
        ``'_index'``. ``frame`` itself is not modified.

    Raises:
        ValueError: If ``smoothing`` is negative.
    """
    if smoothing < 0:
        raise ValueError("smoothing must be non-negative")

    features = list(features)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    target = frame['target']
    global_mean = target.mean()

    # Holds the merge key plus one float column per feature, filled fold by
    # fold below.
    encoded = frame[['_index']].copy()
    enc_names = [feature + '_mean_target' for feature in features]
    for name in enc_names:
        encoded[name] = np.nan

    for tr_idx, val_idx in kf.split(frame, target):
        X_tr, X_val = frame.iloc[tr_idx], frame.iloc[val_idx]
        for feature, name in zip(features, enc_names):
            grouped = X_tr.groupby(feature)['target']
            fold_means = grouped.mean()
            if smoothing > 0:
                counts = grouped.count()
                fold_means = (
                    (fold_means * counts + global_mean * smoothing)
                    / (counts + smoothing)
                )
            # The fold indices are positions, not labels, so write with iloc;
            # label-based .loc is only equivalent for a default RangeIndex.
            col_pos = encoded.columns.get_loc(name)
            encoded.iloc[val_idx, col_pos] = (
                X_val[feature].map(fold_means).to_numpy(dtype=float)
            )

    encoded = encoded.fillna(global_mean)
    return frame.merge(encoded, on='_index', how='left')