Feature engineering: categorical features

# Split by quantile (equal-sized bins) --> categorical (pd.Interval)
allfeat['price_range'] = pd.qcut(allfeat['price'], 5)

# Split by value (equal-width bins) --> categorical (pd.Interval)
allfeat['age_range'] = pd.cut(allfeat['age'], 5)

>>> iv = pd.Interval(left=0, right=5)
>>> iv
Interval(0, 5, closed='right')

>>> 2.5 in iv
True
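
As a quick illustration of the quantile-vs-value distinction above (a sketch on made-up toy data, not the real dataset):

import pandas as pd

s = pd.Series([1, 2, 3, 4, 5, 100])
print(pd.qcut(s, 2).value_counts())  # equal-sized bins: 3 rows each
print(pd.cut(s, 2).value_counts())   # equal-width bins: the outlier ends up alone in the upper bin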


# Null handling: fill with the median
allfeat['fare'] = allfeat['fare'].fillna(train['fare'].median())

# Null handling: fill with a random value around the mean
# (avg and std are assumed here to be the mean / std of train['age'])
avg, std = train['age'].mean(), train['age'].std()
allfeat['age'] = allfeat['age'].fillna(value=np.random.randint(avg - std, avg + std))
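
Note that fillna with a scalar uses the same random draw for every missing row; a per-row variant (sketch, reusing avg and std from above):

null_mask = allfeat['age'].isnull()
allfeat.loc[null_mask, 'age'] = np.random.randint(avg - std, avg + std, size=null_mask.sum())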


# Append the one-hot encoded columns
allfeat = pd.concat([allfeat, pd.get_dummies(allfeat['age_range'])], axis=1)
allfeat.drop(columns=['age_range'], inplace=True)
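
The same thing in one step (a sketch; note that this form prefixes the new columns with the original column name):

allfeat = pd.get_dummies(allfeat, columns=['age_range'])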

Statistical processing of access logs

httpd.apache.org

LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D"
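
The trick used below is to split each combined-log line on double quotes, which yields exactly seven fields. A quick check on a made-up log line (hypothetical values):

sample = '127.0.0.1 - frank [07/Jan/2019:14:50:00 +0900] "GET /api/items?id=1 HTTP/1.1" 200 2326 "http://example.com/" "Mozilla/5.0" 1234'
print(sample.split('"'))
# 7 fields: ip/uid/user/datetime, request line, status + bytes, referer, ' ', user agent, %D (microseconds)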
from datetime import (datetime, date, timedelta)
import re

import pandas as pd
import numpy as np

from IPython.display import display, HTML
from pandas.plotting import table
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st


# Splitting each line on '"' gives the 7 fields named below
df_access = pd.read_csv('access_log.gz', compression='gzip', sep='"', header=None)

df_access.columns = [
    'ip_uid_username_datetime',
    'method_req',
    'status_byte',
    'referer',
    '_',
    'useragent',
    'time',
]

df_access[['ip', 'uid', 'username', 'datetime']] = df_access.ip_uid_username_datetime.str.strip().str.split(
    ' ', n=3, expand=True
)

df_access[['method', 'req', 'httpver']] = df_access.method_req.str.strip().str.split(
    ' ', n=2, expand=True
)

df_access[['status', 'byte']] = df_access.status_byte.str.strip().str.split(
    ' ', n=1, expand=True
)

_time = df_access.datetime.map(lambda x: x.strip('[]').split('+')[0].strip())

df_access['reqtime'] = pd.to_datetime(_time, format='%d/%b/%Y:%H:%M:%S')

df = df_access[['reqtime', 'req', 'status', 'time']].copy()

Frequency of 408 errors over time

df_408 = df.loc[(df.status=='408'), :]
df_408_g = df_408.groupby('reqtime').size().reset_index(name='c')
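
groupby('reqtime') counts hits per exact timestamp (one-second resolution); a per-minute count is a small variant (sketch):

df_408_per_min = df_408.set_index('reqtime').resample('1min').size()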

from matplotlib import dates as mdates

_from, _to = '2019-01-07 14:50:00', '2019-01-07 15:10:00'
_ = df_408_g.loc[(df_408_g.reqtime >= _from) & (df_408_g.reqtime <= _to), :]

fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(111)
ax.plot(_.reqtime, _.c, label='408 count')
ax.set(xlim=(pd.to_datetime(_from), pd.to_datetime(_to)))

ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d\n%H:%M'))
ax.xaxis.set_major_locator(mdates.MinuteLocator(interval=1))  
ax.legend()
fig.autofmt_xdate()
fig.show()

Measuring response time per API

df['api'] = df.req.str.split('?', n=1, expand=True)[0]
df['seconds'] = df.time.map(lambda x: x / 10**6)  # %D is in microseconds

df_api_g = df.groupby('api').agg({
    'seconds': ['mean', 'std', 'count', 'max', 'min']
})['seconds']

n = 6  # number of APIs to plot (chosen here arbitrarily; adjust as needed)
top_list = df_api_g.sort_values('mean', ascending=False).index[:n]
f, a = plt.subplots(nrows=n, ncols=1, figsize=(15, 18))
a = a.ravel()

for idx,ax in enumerate(a):
    l = top_list[idx]
    _df = df[['api', 'seconds']].loc[df.api==l, :]
    ax.hist(
        _df.seconds, log=True, range=(0,5),
        bins=100
    )
    ax.set_title(l)
    ax.set_xlim(0, 4)
plt.tight_layout()
plt.savefig('api_res_histogram.png', dpi=100)
plt.show()

I wonder whether response times follow a Poisson distribution.
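
(Poisson would apply to request counts per interval; the times themselves are continuous, so a quick sanity check could instead fit something like a log-normal using scipy.stats, which is imported above. A sketch reusing df and top_list:)

sec = df.loc[df.api == top_list[0], 'seconds']
sec = sec[sec > 0]
shape, loc, scale = st.lognorm.fit(sec, floc=0)
print('lognorm fit:', shape, loc, scale)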

Trying out an LSTM

Source: Python 機械学習プログラミング (Python Machine Learning) https://www.amazon.co.jp/dp/4295003379/

Extracting the tar archive

import tarfile
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()

ai.stanford.edu

Building the dataset

import pandas as pd
import os

base_path = 'aclImdb'
labels = {'pos':1, 'neg':0}
from tqdm import tqdm_notebook

rows = []
for s in ('test', 'train'):
    for l in labels.keys():
        path = os.path.join(base_path, s, l)
        for file in tqdm_notebook(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
# column names are referenced below as df['review'] / df['sentiment']
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

# Shuffle the data
from sklearn.utils import shuffle
df = shuffle(df).reset_index(drop=True)

# word -> int mapping
from string import punctuation
from collections import Counter

counts = Counter()
for i, review in tqdm_notebook(enumerate(df['review'])):
    text = ''.join([c if c not in punctuation else ' '+c+' ' for c in review]).lower()
    df.loc[i, 'review'] = text
    counts.update(text.split())

word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

mapped_reviews = []
for review in tqdm_notebook(df['review']):
    mapped_reviews.append([word_to_int[word] for word in review.split()])

# Left-pad (or truncate) each review to 200 tokens with zeros
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)

for i, row in tqdm_notebook(enumerate(mapped_reviews)):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]

LSTM

X_train = sequences[:25000, :]
y_train = df['sentiment'].values[:25000]
X_test = sequences[25000:, :]
y_test = df['sentiment'].values[25000:]

np.random.seed(123)

# Batch generator
def create_batch_generator(x, y=None, batch_size=64):
    n_batch = len(x) // batch_size
    x = x[:n_batch*batch_size]
    if y is not None:
        y = y[:n_batch*batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii:ii+batch_size], y[ii:ii+batch_size]
        else:
            yield x[ii:ii+batch_size]
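
A quick shape check of the generator (sketch; 100 matches the batch_size used further below):

bx, by = next(create_batch_generator(X_train, y_train, batch_size=100))
print(bx.shape, by.shape)  # (100, 200) (100,)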
import tensorflow as tf  # TensorFlow 1.x API (placeholders, tf.contrib)

class SentimentRNN(object):
    def __init__(self, n_words, seq_len=200,
                lstm_size=256, num_layers=1, batch_size=64,
                learning_rate=0.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  # number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size
        
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
            
    def build(self):
        # Placeholders
        tf_x = tf.placeholder(tf.int32,
                             shape=(self.batch_size, self.seq_len),
                             name='tf_x')
        tf_y = tf.placeholder(tf.float32,
                             shape=(self.batch_size),
                             name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                             name='tf_keepprob')
        
        embedding = tf.Variable(tf.random_uniform((self.n_words, self.embed_size), minval=-1, maxval=1))
        
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embeded_x')
        
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
             for i in range(self.num_layers)
            ]
        )
        
        # Define the initial state
        
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print('   << initial state >>  ', self.initial_state)
        
        lstm_outputs, self.final_state = \
            tf.nn.dynamic_rnn(cells, embed_x, initial_state=self.initial_state)
            
        print('\n  << lstm_output  >>  ', lstm_outputs)
        print('\n  << final_state  >>  ', self.final_state)
        
        logits = tf.layers.dense(inputs=lstm_outputs[:, -1],
                                units=1, activation=None, name='logits')
        logits = tf.squeeze(logits, name='logits_squeeze')
        
        print('\n  <<  logits          >>', logits)
        
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32, name='labels')
        }
        
        print('\n  <<  predictions      >>', predictions)
        
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y,
                                                   logits=logits),
            name='cost'
        )
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')
    
    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                
                for batch_x, batch_y in create_batch_generator(X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                           'tf_y:0': batch_y,
                           'tf_keepprob:0': 0.5,
                           self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op', self.final_state],
                        feed_dict=feed
                    )
                    
                    if iteration % 20 == 0:
                        print("Epoch: %d / %d Iteration: %d | Train loss: %.5f"
                             % (epoch + 1, num_epochs, iteration, loss))
                    iteration += 1
                
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)
    
    def predict(self, X_data,  return_proba=False):
        preds = []
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(sess, tf.train.latest_checkpoint('./model/'))
            test_state = sess.run(self.initial_state)

            for ii, batch_x in enumerate(create_batch_generator(X_data, None, batch_size=self.batch_size), 1):
                feed = {'tf_x:0': batch_x,
                           'tf_keepprob:0': 1.0,
                           self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(['probabilities:0', self.final_state],
                                               feed_dict=feed)
                else:
                    pred, test_state = sess.run(['labels:0', self.final_state],
                                               feed_dict=feed)
                preds.append(pred)

            return np.concatenate(preds)

Training and evaluation

n_words = max(list(word_to_int.values())) + 1
rnn = SentimentRNN(
    n_words=n_words,
    seq_len=sequence_length,
    embed_size=256,
    lstm_size=128,
    num_layers=1,
    batch_size=100,
    learning_rate=0.001
)
   << initial state >>   (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

  << lstm_output  >>   Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)

  << final_state  >>   (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)

  <<  logits          >> Tensor("logits_squeeze:0", shape=(100,), dtype=float32)

  <<  predictions      >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
rnn.train(X_train, y_train, num_epochs=40)
Epoch: 1 / 40 Iteration: 20 | Train loss: 0.68427
Epoch: 1 / 40 Iteration: 40 | Train loss: 0.64362
Epoch: 1 / 40 Iteration: 60 | Train loss: 0.67976
Epoch: 1 / 40 Iteration: 80 | Train loss: 0.60497
...
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print(np.sum(preds == y_true) / len(y_true))
INFO:tensorflow:Restoring parameters from ./model/sentiment-39.ckpt
0.854

Data analysis workflow

Preparation

  1. Prepare Problem a) Load libraries b) Load dataset
  2. Summarize Data a) Descriptive statistics b) Data visualizations
  3. Prepare Data a) Data Cleaning b) Feature Selection c) Data Transforms (Normalize,...)

  4. Directory layout

echo '.DS_Store
.ipynb_checkpoints/' > .gitignore

[ -d app ] || mkdir app/
[ -d app/utils ] || mkdir app/utils
[ -d app/utils/preprocessing ] || mkdir app/utils/preprocessing

[ -d config ] || mkdir config

[ -d data ] || mkdir data/
[ -d data/rawdata ] || mkdir data/rawdata && echo '*
!.gitignore' > data/rawdata/.gitignore
[ -d data/preprocessed_data ] || mkdir data/preprocessed_data && echo '*
!.gitignore' > data/preprocessed_data/.gitignore
[ -d data/model_params ] || mkdir data/model_params && echo '*
!.gitignore' > data/model_params/.gitignore
[ -d data/output ] || mkdir data/output && echo '*
!.gitignore' > data/output/.gitignore

[ -d log ] || mkdir log && echo '*
!.gitignore' > log/.gitignore

[ -d model ] || mkdir model

[ -d tmp ] || mkdir tmp && echo '*
!.gitignore' > tmp/.gitignore
  • When using dvc
echo '.DS_Store
.ipynb_checkpoints/' > .gitignore
echo 'README' >>README.md
[ -d app ] || mkdir app/
[ -d app/utils ] || mkdir app/utils
[ -d config ] || mkdir config
[ -d data ] || mkdir data/
[ -d data/input ] || mkdir data/input
[ -d data/preprocessed_data ] || mkdir data/preprocessed_data
[ -d data/output ] || mkdir data/output
[ -d log ] || mkdir log
[ -d model ] || mkdir model
[ -d tmp ] || mkdir tmp

Analysis

  1. Evaluate Algorithms a) Split-out validation dataset b) Test options and evaluation metric c) Spot Check Algorithms d) Compare Algorithms
  2. Improve Accuracy a) Algorithm Tuning b) Ensembles
  3. Finalize Model a) Predictions on validation dataset b) Create standalone model on entire training dataset c) Save model for later use

https://www.kaggle.com/dennise/coursera-competition-getting-started-eda

https://towardsdatascience.com/exploratory-data-analysis-eda-a-practical-guide-and-template-for-structured-data-abfbf3ee3bd9

Example (train: e-commerce site transactions)

Preparation

1. Prepare Problem a) Load libraries b) Load dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline 

from scipy import stats


test=pd.read_csv('test.csv.gz',compression='gzip')
train=pd.read_csv('train.csv.gz',compression='gzip')

# First look at the data
train.info()
train.describe() # if count equals the number of rows, the column can be assumed non-null
train.head()

2. Summarize Data a) Descriptive statistics b) Data visualizations

train.price.hist()
train.price.value_counts()
train.price.nunique()

# histogram
train.price.apply(lambda x: np.log10(x+2)).hist(figsize=(12,16))

# Plot in index order
train.block_num.plot(figsize=(20,4))

# Feature construction
train["category"]=train.name.apply(lambda x:x.split()[0])

date_format = '%d.%m.%Y'
train['day'] = pd.to_datetime(train['date'], format=date_format).dt.day
train['month'] = pd.to_datetime(train['date'], format=date_format).dt.month
train['year'] = pd.to_datetime(train['date'], format=date_format).dt.year
train['weekday'] = pd.to_datetime(train['date'], format=date_format).dt.dayofweek
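
The same features can be derived from a single parse instead of calling pd.to_datetime four times (a small variant):

_dt = pd.to_datetime(train['date'], format=date_format)
train['day'], train['month'], train['year'], train['weekday'] = _dt.dt.day, _dt.dt.month, _dt.dt.year, _dt.dt.dayofweek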

train["revenue"]=train.item_cnt_day * train.price

train.groupby("item_category_id").sum()["revenue"].hist(figsize=(20,4),bins=100)

# Drop unneeded columns
train.drop("date",axis=1,inplace=True)
train.drop("name",axis=1,inplace=True)
train.drop("item_cnt_day",axis=1,inplace=True)

# Revenue trend over time
train.groupby("block_num").sum()['revenue'].plot()
train.groupby("block_num").mean()['revenue'].plot()

# Scatter plot of prices for a specific product category
prices_hoge=train[train.category=="hogehoge"]["item_price"]

plt.figure(figsize=(20, 8), dpi=80)
plt.scatter(prices_hoge.index, prices_hoge,s=0.1)

# View a groupby result as a pivot table
# unstack <--> stack (reshaping hierarchical / MultiIndex data)
train.groupby(["block_num","item_category_id"]).sum()["revenue"].unstack()
train.pivot_table(index=['shop_id','item_id'], columns='block_num', values='item_cnt_day',aggfunc='sum').fillna(0.0)

train.groupby(["block_num","item_category_id"]).sum()["revenue"].unstack().plot(figsize=(20,20))
train.groupby(["block_num","shop_id"]).sum()["revenue"].unstack().plot(figsize=(20,20))

# Pairwise relationships between features
sns.pairplot(train)

# Fix data-entry errors
train.loc[train["shop_id"]==0,"shop_id"]=1

# Check the test data (ids present in train but missing from test)
test_list = test.shop_id.unique()
out_of_test = [i for i in train.shop_id.unique() if i not in test_list]

3. Prepare Data a) Data Cleaning b) Feature Selection c) Data Transforms (Normalize,...)

# a) Data Cleaning
columns_needed = ['item_id', 'shop_id', ...]
df = train[columns_needed]

# b) Feature Selection c) Data Transforms (Normalize,...)

df["price_category"]=np.nan
df["price_category"][(df["price"]>=0)&(df["price"]<=10000)]=0
df["price_category"][(df["price"]>10000)]=1

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df.category)
df["meta_category"] = le.transform(df.category)

scaler = preprocessing.StandardScaler()
df['star'] = scaler.fit_transform(df[['star']])

X_train = df.drop("item_cnt_month", axis=1)
# item_cnt_month is the prediction target, so it is excluded from the features
y_train = df["item_cnt_month"]

X_train.fillna(0, inplace=True)

Analysis

1. Evaluate Algorithms a) Split-out validation dataset b) Test options and evaluation metric c) Spot Check Algorithms d) Compare Algorithms

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Linear regression
linmodel=LinearRegression()
linmodel.fit(X_train, y_train)
lin_pred=linmodel.predict(X_train)

print('R-squared is %f' % r2_score(y_train, lin_pred))

# Gradient boosting model (LightGBM)
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_train)

print('R-squared is %f' % r2_score(y_train, pred_lgb))

http://kidnohr.hatenadiary.com/entry/2018/09/21/012446

2. Improve Accuracy a) Algorithm Tuning b) Ensembles

meta_feature = np.c_[lin_pred, pred_lgb]
meta_lr = LinearRegression()
meta_lr.fit(meta_feature, y_train)

meta_pred = meta_lr.predict(meta_feature)

print('R-squared is %f' % r2_score(y_train, meta_pred))
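
Since the meta-model above is fit on in-sample predictions of the base models, this R-squared is optimistic; a common variant (a sketch, not the workflow above) stacks on out-of-fold predictions instead:

from sklearn.model_selection import cross_val_predict

oof_lin = cross_val_predict(LinearRegression(), X_train, y_train, cv=5)
oof_lgb = cross_val_predict(lgb.LGBMRegressor(learning_rate=0.03, num_leaves=2**7), X_train, y_train, cv=5)
meta_feature_oof = np.c_[oof_lin, oof_lgb]
meta_lr_oof = LinearRegression().fit(meta_feature_oof, y_train)
print('R-squared is %f' % r2_score(y_train, meta_lr_oof.predict(meta_feature_oof)))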

3. Finalize Model a) Predictions on validation dataset b) Create standalone model on entire training dataset c) Save model for later use




Extras

groupby cheat sheet

chachay.hatenablog.com