Feature engineering for categorical features
# split by quantile (equal-frequency) --> categorical (pd.Interval)
allfeat['price_range'] = pd.qcut(allfeat['price'], 5)
# split by value (equal-width) --> categorical (pd.Interval)
allfeat['age_range'] = pd.cut(allfeat['age'], 5)

>>> iv = pd.Interval(left=0, right=5)
>>> iv
Interval(0, 5, closed='right')
>>> 2.5 in iv
True

# null handling: median
allfeat['fare'] = allfeat['fare'].fillna(train['fare'].median())

# null handling: a random value in [avg-std, avg+std]
avg, std = train['age'].mean(), train['age'].std()  # assumed definitions; not in the original
allfeat['age'] = allfeat['age'].fillna(value=np.random.randint(avg - std, avg + std))

# append one-hot encoded columns
allfeat = pd.concat([allfeat, pd.get_dummies(allfeat['age_range'])], axis=1)
allfeat.drop(columns=['age_range'], inplace=True)
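For reference, a minimal sketch (with a made-up series `s`) of the difference: `pd.qcut` cuts into equal-frequency bins, `pd.cut` into equal-width bins, and `labels=` replaces the `pd.Interval` categories with readable names.

import pandas as pd

# made-up, skewed series so the two methods differ visibly
s = pd.Series([1, 1, 2, 2, 3, 3, 4, 100])

pd.qcut(s, 2).value_counts()  # equal-frequency: 4 rows in each bin
pd.cut(s, 2).value_counts()   # equal-width: 7 rows land in the first bin

# named labels instead of pd.Interval categories
pd.cut(s, bins=[0, 3, 100], labels=['low', 'high'])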
Statistical processing of access logs
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D"
from datetime import datetime, date, timedelta
import re

import pandas as pd
import numpy as np
from IPython.display import display, HTML
from pandas.plotting import table  # pandas.tools.plotting was removed in pandas 0.20+
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

# split each line on the quote characters of the LogFormat
df_access = pd.read_csv('access_log.gz', compression='gzip',
                        delimiter='"', header=None)
df_access.columns = [
    'ip_uid_username_datetime',
    'method_req',
    'status_byte',
    'referer',
    '_',
    'useragent',
    'time',
]

df_access[['ip', 'uid', 'username', 'datetime']] = \
    df_access.ip_uid_username_datetime.str.strip().str.split(' ', n=3, expand=True)
df_access[['method', 'req', 'httpver']] = \
    df_access.method_req.str.strip().str.split(' ', n=2, expand=True)
df_access[['status', 'byte']] = \
    df_access.status_byte.str.strip().str.split(' ', n=1, expand=True)

_time = df_access.datetime.map(lambda x: x.strip('[]').split('+')[0].strip())
df_access['reqtime'] = pd.to_datetime(_time, format='%d/%b/%Y:%H:%M:%S')

df = df_access[['reqtime', 'req', 'status', 'time']]
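The `delimiter='"'` trick leans on the quote characters in the log line; as an alternative sketch (group names are assumptions matching the LogFormat above), each line can be parsed with a named-group regex:

import re

# combined LogFormat with %D appended; a sketch, not exhaustively tested
LOG_RE = re.compile(
    r'(?P<ip>\S+) (?P<uid>\S+) (?P<username>\S+) \[(?P<datetime>[^\]]+)\] '
    r'"(?P<request>[^"]*)" (?P<status>\d{3}) (?P<byte>\S+) '
    r'"(?P<referer>[^"]*)" "(?P<useragent>[^"]*)" (?P<time>\d+)'
)

def parse_line(line):
    m = LOG_RE.match(line)
    return m.groupdict() if m else None

# rows = filter(None, map(parse_line, open('access_log')))
# df_access = pd.DataFrame(rows)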
Frequency of 408 errors
df_408 = df.loc[df.status == '408', :]
df_408_g = df_408.groupby('reqtime').size().reset_index(name='c')

from matplotlib import dates as mdates

_from, _to = '2019-01-07 14:50:00', '2019-01-07 15:10:00'
_ = df_408_g.loc[(df_408_g.reqtime >= _from) & (df_408_g.reqtime <= _to), :]

fig = sns.mpl.pyplot.figure(figsize=(15, 6))
ax = fig.add_subplot(111)
ax.plot(_.reqtime, _.c, label='408 count')
ax.set(xlim=(_from, _to))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d\n%H:%M'))
ax.xaxis.set_major_locator(mdates.MinuteLocator(interval=1))
ax.legend()
fig.autofmt_xdate()
fig.show()
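Grouping on `reqtime` counts per second; a sketch bucketing per minute with `resample` instead (reusing `df_408` from above):

# per-minute 408 counts; '1min' is the resample frequency alias
per_min = (df_408.set_index('reqtime')
                 .resample('1min')
                 .size()
                 .rename('c'))
per_min.plot(figsize=(15, 6))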
Measuring response time per API
df['api'] = df.req.str.split('?', n=1, expand=True)[0]
df['seconds'] = df.time.astype(float) / 10**6  # %D is in microseconds

df_api_g = df.groupby('api').agg({
    'seconds': ['mean', 'std', 'count', 'max', 'min']
})['seconds']

top_list = df_api_g.sort_values('mean', ascending=False).index
n = 6  # number of APIs to plot; assumed, not defined in the original

f, a = plt.subplots(nrows=n, ncols=1, figsize=(15, 18))
a = a.ravel()
for idx, ax in enumerate(a):
    l = top_list[idx]
    _df = df.loc[df.api == l, ['api', 'seconds']]
    ax.hist(_df.seconds, log=True, range=(0, 5), bins=100)
    ax.set_title(l)
    ax.set_xlim(0, 4)
plt.tight_layout()
plt.savefig('api_res_histgram.png', dpi=100)
plt.show()
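Mean and std hide tail latency; a hedged sketch (same `df` with `api`/`seconds` columns) adding p95/p99 percentiles to the aggregation:

# tail latency per API: 95th/99th percentiles alongside the mean
def p95(x): return np.percentile(x, 95)
def p99(x): return np.percentile(x, 99)

df_tail = df.groupby('api')['seconds'].agg(['mean', p95, p99, 'count'])
df_tail.sort_values('p99', ascending=False).head(10)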
Do response times follow a Poisson distribution? (A Poisson distribution describes counts of arrivals per interval; the response times themselves usually look closer to log-normal.)
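A quick, non-rigorous check with `scipy.stats` (imported above as `st`; `top_list` from the previous block), fitting a log-normal and overlaying its density:

# fit a log-normal to one API's response times; assumes strictly positive samples
sample = df.loc[df.api == top_list[0], 'seconds']
shape, loc, scale = st.lognorm.fit(sample, floc=0)

x = np.linspace(0.001, sample.max(), 200)
plt.hist(sample, bins=100, density=True, alpha=0.5)
plt.plot(x, st.lognorm.pdf(x, shape, loc=loc, scale=scale))
plt.show()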
Writing an LSTM
Source: "Python 機械学習プログラミング" (Python Machine Learning) https://www.amazon.co.jp/dp/4295003379/
Extracting the tar archive
import tarfile

with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()
Building the dataset
import os

import numpy as np  # used below; missing from the original imports
import pandas as pd

base_path = 'aclImdb'
labels = {'pos': 1, 'neg': 0}
from tqdm import tqdm_notebook

df = pd.DataFrame()
for s in ('test', 'train'):
    for l in labels.keys():
        path = os.path.join(base_path, s, l)
        for file in tqdm_notebook(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
df.columns = ['review', 'sentiment']  # named here; the code below relies on these
# shuffle the data
from sklearn.utils import shuffle
df = shuffle(df).reset_index(drop=True)

# word2int mapping
from string import punctuation
from collections import Counter

counts = Counter()
for i, review in tqdm_notebook(enumerate(df['review'])):
    text = ''.join([c if c not in punctuation else ' ' + c + ' ' for c in review]).lower()
    df.loc[i, 'review'] = text
    counts.update(text.split())

word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

mapped_reviews = []
for review in tqdm_notebook(df['review']):
    mapped_reviews.append([word_to_int[word] for word in review.split()])

# left-pad with zeros to 200 tokens
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in tqdm_notebook(enumerate(mapped_reviews)):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]
LSTM
X_train = sequences[:25000, :]
y_train = df['sentiment'].values[:25000]  # .loc[:25000] is inclusive and would grab 25001 rows
X_test = sequences[25000:, :]
y_test = df['sentiment'].values[25000:]

np.random.seed(123)

# batch generator
def create_batch_generator(x, y=None, batch_size=64):
    n_batch = len(x) // batch_size
    x = x[:n_batch * batch_size]
    if y is not None:
        y = y[:n_batch * batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii:ii + batch_size], y[ii:ii + batch_size]
        else:
            yield x[ii:ii + batch_size]
import tensorflow as tf

class SentimentRNN(object):
    def __init__(self, n_words, seq_len=200, lstm_size=256, num_layers=1,
                 batch_size=64, learning_rate=0.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  # number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size

        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        # placeholders
        tf_x = tf.placeholder(tf.int32, shape=(self.batch_size, self.seq_len), name='tf_x')
        tf_y = tf.placeholder(tf.float32, shape=(self.batch_size), name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')

        embedding = tf.Variable(
            tf.random_uniform((self.n_words, self.embed_size), minval=-1, maxval=1))
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embeded_x')

        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
             for i in range(self.num_layers)]
        )

        # define the initial state
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print(' << initial state >> ', self.initial_state)

        lstm_outputs, self.final_state = \
            tf.nn.dynamic_rnn(cells, embed_x, initial_state=self.initial_state)
        print('\n << lstm_output >> ', lstm_outputs)
        print('\n << final_state >> ', self.final_state)

        logits = tf.layers.dense(inputs=lstm_outputs[:, -1], units=1,
                                 activation=None, name='logits')
        logits = tf.squeeze(logits, name='logits_squeeze')
        print('\n << logits >>', logits)

        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32, name='labels')
        }
        print('\n << predictions >>', predictions)

        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y, logits=logits),
            name='cost'
        )
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                for batch_x, batch_y in create_batch_generator(X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op', self.final_state],
                        feed_dict=feed
                    )
                    if iteration % 20 == 0:
                        print("Epoch: %d / %d Iteration: %d | Train loss: %.5f"
                              % (epoch + 1, num_epochs, iteration, loss))
                    iteration += 1
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)

    def predict(self, X_data, return_proba=False):
        preds = []
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(sess, tf.train.latest_checkpoint('./model/'))
            test_state = sess.run(self.initial_state)
            for ii, batch_x in enumerate(
                    create_batch_generator(X_data, None, batch_size=self.batch_size), 1):
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(
                        ['probabilities:0', self.final_state], feed_dict=feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state], feed_dict=feed)
                preds.append(pred)
        return np.concatenate(preds)
train+test
n_words = max(list(word_to_int.values())) + 1
rnn = SentimentRNN(
    n_words=n_words,
    seq_len=sequence_length,
    embed_size=256,
    lstm_size=128,
    num_layers=1,
    batch_size=100,
    learning_rate=0.001
)
 << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

 << lstm_output >>  Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)

 << final_state >>  (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)

 << logits >> Tensor("logits_squeeze:0", shape=(100,), dtype=float32)

 << predictions >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
rnn.train(X_train, y_train, num_epochs=40)
Epoch: 1 / 40 Iteration: 20 | Train loss: 0.68427
Epoch: 1 / 40 Iteration: 40 | Train loss: 0.64362
Epoch: 1 / 40 Iteration: 60 | Train loss: 0.67976
Epoch: 1 / 40 Iteration: 80 | Train loss: 0.60497
...
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print(np.sum(preds == y_true) / len(y_true))
INFO:tensorflow:Restoring parameters from ./model/sentiment-39.ckpt
0.854
str.translate() is handy
trans_dict = ('0123456789', 'abcdefghij')
a = {ord(i): ord(h) for i, h in zip(*trans_dict)}
'0120-333-666'.translate(a)
'abca-ddd-ggg'
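The same translation table can be built with the built-in `str.maketrans`, which takes two equal-length strings directly:

# str.maketrans builds the {ord: ord} mapping in one call
table = str.maketrans('0123456789', 'abcdefghij')
'0120-333-666'.translate(table)  # -> 'abca-ddd-ggg'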
Data analysis workflow
Preparation
- Prepare Problem
  a) Load libraries
  b) Load dataset
- Summarize Data
  a) Descriptive statistics
  b) Data visualizations
- Prepare Data
  a) Data Cleaning
  b) Feature Selection
  c) Data Transforms (Normalize, ...)
Directory layout
echo '.DS_Store
.ipynb_checkpoints/' > .gitignore
[ -d app ] || mkdir app/
[ -d app/utils ] || mkdir app/utils
[ -d app/utils/preprocessing ] || mkdir app/utils/preprocessing
[ -d config ] || mkdir config
[ -d data ] || mkdir data/
[ -d data/rawdata ] || mkdir data/rawdata && echo '*
!.gitignore' > data/rawdata/.gitignore
[ -d data/preprocessed_data ] || mkdir data/preprocessed_data && echo '*
!.gitignore' > data/preprocessed_data/.gitignore
[ -d data/model_params ] || mkdir data/model_params && echo '*
!.gitignore' > data/model_params/.gitignore
[ -d data/output ] || mkdir data/output && echo '*
!.gitignore' > data/output/.gitignore
[ -d log ] || mkdir log && echo '*
!.gitignore' > log/.gitignore
[ -d model ] || mkdir model
[ -d tmp ] || mkdir tmp && echo '*
!.gitignore' > tmp/.gitignore
- When using dvc
echo '.DS_Store
.ipynb_checkpoints/' > .gitignore
echo 'README' >> README.md
[ -d app ] || mkdir app/
[ -d app/utils ] || mkdir app/utils
[ -d config ] || mkdir config
[ -d data ] || mkdir data/
[ -d data/input ] || mkdir data/input
[ -d data/preprocessed_data ] || mkdir data/preprocessed_data
[ -d data/output ] || mkdir data/output
[ -d log ] || mkdir log
[ -d model ] || mkdir model
[ -d tmp ] || mkdir tmp
Analysis
- Evaluate Algorithms
  a) Split-out validation dataset
  b) Test options and evaluation metric
  c) Spot Check Algorithms
  d) Compare Algorithms
- Improve Accuracy
  a) Algorithm Tuning
  b) Ensembles
- Finalize Model
  a) Predictions on validation dataset
  b) Create standalone model on entire training dataset
  c) Save model for later use
https://www.kaggle.com/dennise/coursera-competition-getting-started-eda
https://towardsdatascience.com/exploratory-data-analysis-eda-a-practical-guide-and-template-for-structured-data-abfbf3ee3bd9
Example (train: e-commerce site transactions)
Preparation
1. Prepare Problem
  a) Load libraries
  b) Load dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
from scipy import stats

test = pd.read_csv('test.csv.gz', compression='gzip')
train = pd.read_csv('train.csv.gz', compression='gzip')

# first look at the data
train.info()
train.describe()  # if count matches the number of rows, the column can be considered non-null
train.head()
2. Summarize Data
  a) Descriptive statistics
  b) Data visualizations
train.price.hist()
train.price.value_counts()
train.price.nunique()

# histogram on a log scale
train.price.apply(lambda x: np.log10(x + 2)).hist(figsize=(12, 16))

# plot in index order
train.block_num.plot(figsize=(20, 4))

# derive new columns
train["category"] = train.name.apply(lambda x: x.split()[0])
date_format = '%d.%m.%Y'
train['day'] = pd.to_datetime(train['date'], format=date_format).dt.day
train['month'] = pd.to_datetime(train['date'], format=date_format).dt.month
train['year'] = pd.to_datetime(train['date'], format=date_format).dt.year
train['weekday'] = pd.to_datetime(train['date'], format=date_format).dt.dayofweek
train["revenue"] = train.item_cnt_day * train.price
train.groupby("item_category_id").sum()["revenue"].hist(figsize=(20, 4), bins=100)

# drop columns that are no longer needed
train.drop("date", axis=1, inplace=True)
train.drop("name", axis=1, inplace=True)
train.drop("item_cnt_day", axis=1, inplace=True)

# revenue over time
train.groupby("block_num").sum()['revenue'].plot()
train.groupby("block_num").mean()['revenue'].plot()

# scatter plot of prices for a specific product name
prices_hoge = train[train.category == "hogehoge"]["item_price"]
plt.figure(figsize=(20, 8), dpi=80)
plt.scatter(prices_hoge.index, prices_hoge, s=0.1)

# view groupby results as a pivot table
# unstack <--> stack (converting between long and wide form)
train.groupby(["block_num", "item_category_id"]).sum()["revenue"].unstack()
train.pivot_table(index=['shop_id', 'item_id'], columns='block_num',
                  values='item_cnt_day', aggfunc='sum').fillna(0.0)
train.groupby(["block_num", "item_category_id"]).sum()["revenue"].unstack().plot(figsize=(20, 20))
train.groupby(["block_num", "shop_id"]).sum()["revenue"].unstack().plot(figsize=(20, 20))

# pairwise correlations
sns.pairplot(train)

# fix data-entry mistakes
train.loc[train["shop_id"] == 0, "shop_id"] = 1

# check the contents of the test data
test_list = test.shop_id.unique()
out_of_test = [i for i in train.shop_id.unique() if i not in test_list]
3. Prepare Data
  a) Data Cleaning
  b) Feature Selection
  c) Data Transforms (Normalize, ...)
# a) Data Cleaning
columns_needed = ['item_id', 'shop_id', ...]
df = train[columns_needed]

# b) Feature Selection / c) Data Transforms (Normalize, ...)
df["price_category"] = np.nan
df.loc[(df["price"] >= 0) & (df["price"] <= 10000), "price_category"] = 0
df.loc[df["price"] > 10000, "price_category"] = 1

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df.category)
df["meta_category"] = le.transform(df.category)

scaler = preprocessing.StandardScaler()
scaler.fit(df[['star']])  # the original called le.fit here, presumably a typo
df['star'] = scaler.transform(df[['star']])

X_train = df.drop("item_cnt_month", axis=1)
y_train = df["item_cnt_month"]
X_train.fillna(0, inplace=True)
Analysis
1. Evaluate Algorithms
  a) Split-out validation dataset
  b) Test options and evaluation metric
  c) Spot Check Algorithms
  d) Compare Algorithms
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# linear regression
linmodel = LinearRegression()
linmodel.fit(X_train, y_train)
lin_pred = linmodel.predict(X_train)
print('R-squared is %f' % r2_score(y_train, lin_pred))  # r2_score takes (y_true, y_pred)

# boosting model
lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'learning_rate': 0.03,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**7,
    'bagging_freq': 1,
    'verbose': 0
}
model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), 100)
pred_lgb = model.predict(X_train)
print('R-squared is %f' % r2_score(y_train, pred_lgb))
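Both R² scores above are computed on the training data, which overstates performance; a hedged sketch with a held-out validation split and early stopping (`early_stopping_rounds` as a keyword works on older LightGBM; newer versions use callbacks):

from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
                                            test_size=0.2, random_state=42)
model = lgb.train(
    lgb_params,
    lgb.Dataset(X_tr, label=y_tr),
    num_boost_round=500,
    valid_sets=[lgb.Dataset(X_val, label=y_val)],
    early_stopping_rounds=20,
)
print('validation R-squared is %f' % r2_score(y_val, model.predict(X_val)))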
http://kidnohr.hatenadiary.com/entry/2018/09/21/012446
2. Improve Accuracy
  a) Algorithm Tuning
  b) Ensembles
# stack the two base-model predictions and fit a meta learner on them
meta_feature = np.c_[lin_pred, pred_lgb]
meta_lr = LinearRegression()
meta_lr.fit(meta_feature, y_train)  # the original passed final_train[33], an undefined leftover; the target here is y_train
meta_pred = meta_lr.predict(meta_feature)
print('R-squared is %f' % r2_score(y_train, meta_pred))
3. Finalize Model
  a) Predictions on validation dataset
  b) Create standalone model on entire training dataset
  c) Save model for later use
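For c), a minimal sketch persisting the fitted LightGBM model with `joblib` (the path is an assumption, reusing the directory layout above; `X_val` comes from the validation sketch earlier):

import joblib

# save the fitted model for later use
joblib.dump(model, 'data/model_params/lgb_model.pkl')

# ... later: reload and predict on the held-out data
model = joblib.load('data/model_params/lgb_model.pkl')
preds = model.predict(X_val)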
Bonus
groupby cheat sheet
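A minimal sketch of the groupby patterns used above (column names borrowed from the transaction example):

g = train.groupby('shop_id')

g.size()                          # row count per group
g['revenue'].sum()                # one aggregate on one column
g.agg({'revenue': ['sum', 'mean'], 'price': 'max'})  # several aggregates at once

train.groupby(['block_num', 'shop_id'])['revenue'].sum().unstack()  # long -> wide
train.pivot_table(index='shop_id', columns='block_num',
                  values='revenue', aggfunc='sum', fill_value=0)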