標本平均と不偏標本分散とか信頼区間をpythonでする - 日に日に分からんことが増えていく…

# Total of the `pa` column over the rows whose paytype is 1.
# (Original note: "pa headcount" -- presumably a passenger count; confirm.)
train.loc[train["paytype"] == 1, "pa"].sum()

# Boolean mask: True for the rows that were paid in cash (paytype == 1).
train_iscash = train.paytype == 1

# 99% confidence interval (alpha=0.01) for the proportion of cash payments.
from statsmodels.stats.proportion import proportion_confint
# Series.sum() counts the True values in one vectorized pass; the builtin
# sum() would walk the Series element by element in Python.
# len() is the number of trials (every row of the mask).
proportion_confint(train_iscash.sum(), len(train_iscash), alpha=0.01)
(0.314, 0.315)

# Work on the `distance` column.
distance = train['distance']

# Sample mean.
distance.mean()

# Number of non-missing observations; mean()/std()/sem() all skip NaN, so
# the sample size has to be counted the same way.
n_obs = distance.count()

# Standard error of the mean: unbiased sample std (ddof=1) / sqrt(n).
# The earlier form `std(ddof=1) / sqrt(n - 1)` applied the small-sample
# correction twice (the sqrt(n - 1) divisor only pairs with ddof=0).
s = distance.sem(ddof=1)

# 95% confidence interval for the mean of `distance`.
# This is a t-interval; with a large N it is practically identical to the
# normal approximation.
# NOTE: _tconfint_generic is a private statsmodels helper with the signature
# (mean, std_mean, dof, alpha, alternative); the third argument is the
# degrees of freedom, i.e. n - 1, not the sample size.
from statsmodels.stats.weightstats import _tconfint_generic
_tconfint_generic(distance.mean(), s, n_obs - 1, 0.05, 'two-sided')
(3.14, 3.15)