import pandas as pd
import numpy as np
# Columns that identify one aggregated row of `gb`.
# NOTE(review): 'cnt' is used both as a grouping key here and as the column
# that is summed into `target` below, so every group holds a single `cnt`
# value. That looks unintended -- confirm against the dataset schema.
index_cols = ['shop_id', 'item_id', 'cnt']

# Value substituted wherever a per-item mean is not available.
global_mean = 0.2

# NOTE(review): `filename` is not defined in the visible code; it has to be
# bound before this line runs.
df = pd.read_csv(filename)

# Sum `cnt` per `index_cols` group into a flat `target` column.
# The nested-dict renamer (`agg({'cnt': {'target': 'sum'}})`) was removed in
# pandas 1.0, and aggregating a column that is also a grouping key is rejected
# by newer pandas, so the value is copied to `target` first and summed there.
# The result has the columns of `index_cols` plus `target`, with no MultiIndex
# to flatten.
gb = (
    df.assign(target=df['cnt'])
    .groupby(index_cols, as_index=False)['target']
    .sum()
)

# Naive (in-sample) mean encoding: each row receives the mean `target` of its
# item_id computed over all rows of `gb`.
item_id_target_mean = gb.groupby('item_id')['target'].mean()

# Assign the filled result back instead of calling `fillna(inplace=True)` on
# the column selection, which does not reliably update `gb`.
gb['item_target_enc'] = gb['item_id'].map(item_id_target_mean).fillna(global_mean)
from sklearn.model_selection import StratifiedKFold
# Out-of-fold mean encoding: every row of `gb` is encoded with item means that
# were computed on the other folds only, so a row's own target never leaks
# into its encoding.
# NOTE(review): StratifiedKFold stratifies on gb['target'], which requires a
# discrete (class-like) target -- confirm that is the case for this data.
skf = StratifiedKFold(n_splits=5, shuffle=False)

# Positional buffer for the encodings; rows whose item_id is absent from the
# corresponding training fold stay NaN until the fill below.
oof_item_enc = np.full(len(gb), np.nan)

for tr_ind, val_ind in skf.split(gb.values, gb['target'].values):
    X_tr, X_val = gb.iloc[tr_ind], gb.iloc[val_ind]
    # Map the training-fold means onto the validation rows by item_id.
    # Assigning a `transform` result from X_tr would align on X_tr's index,
    # which is disjoint from X_val's, and produce only NaN.
    fold_item_means = X_tr.groupby('item_id')['target'].mean()
    oof_item_enc[val_ind] = X_val['item_id'].map(fold_item_means).to_numpy(dtype=float)

# Write the result back to `gb` itself; X_val is only a copy of its rows.
gb['item_target_enc'] = pd.Series(oof_item_enc, index=gb.index).fillna(global_mean)
def target_encode(frame, features, *, smoothing=0.0):
    """Add out-of-fold mean-target encodings for ``features`` to ``frame``.

    The rows are split into 5 stratified folds (shuffled, ``random_state=1``).
    For every fold and every feature, the mean of ``target`` per feature value
    is computed on the training part and mapped onto the held-out rows.
    Values that do not occur in the training part receive the mean of
    ``target`` over the whole frame.

    Args:
        frame: DataFrame that contains a ``target`` column, an ``_index``
            column used as the merge key, and every column named in
            ``features``.
        features: Iterable of column names to encode. Each one produces a
            new column called ``<feature>_mean_target``.
        smoothing: Optional pseudo-count for additive smoothing. With the
            default ``0.0`` the plain per-value mean is used. With ``m > 0``
            each mean becomes ``(count * mean + m * prior) / (count + m)``,
            where ``prior`` is the mean of ``target`` over the whole frame.

    Returns:
        ``frame`` left-merged on ``_index`` with the encoded columns.

    Note:
        ``_index`` is expected to identify rows uniquely; duplicated keys
        make the final merge repeat rows.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    target = frame['target']
    prior = target.mean()

    # One positional buffer per output column. The fold indices yielded by
    # `split` are positions, not index labels, so they must not go through
    # `.loc`.
    encoded = {
        feature + '_mean_target': np.full(len(frame), np.nan)
        for feature in features
    }

    for tr_idx, val_idx in kf.split(frame, target):
        X_tr = frame.iloc[tr_idx]
        X_val = frame.iloc[val_idx]
        for feature in features:
            stats = X_tr.groupby(feature)['target'].agg(['mean', 'count'])
            if smoothing:
                # Shrink means of rare values toward the prior.
                fold_means = (
                    stats['mean'] * stats['count'] + prior * smoothing
                ) / (stats['count'] + smoothing)
            else:
                fold_means = stats['mean']
            encoded[feature + '_mean_target'][val_idx] = (
                X_val[feature].map(fold_means).to_numpy(dtype=float)
            )

    new_X_frame = frame[['_index']].copy()
    for te, values in encoded.items():
        # Fill only the encoded columns; `_index` is left untouched.
        new_X_frame[te] = pd.Series(values, index=new_X_frame.index).fillna(prior)

    return frame.merge(new_X_frame, on='_index', how='left')