アクセスログの統計処理
LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D"
# --- Imports -----------------------------------------------------------------
from datetime import (datetime, date, timedelta)
import csv
import re

import pandas as pd
import numpy as np
from IPython.display import display, HTML
try:
    from pandas.plotting import table
except ImportError:
    # Older pandas releases exposed the same helper under pandas.tools.
    from pandas.tools.plotting import table
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

# --- Load the raw access log -------------------------------------------------
# Each log line is split on the double quote, which yields seven chunks for
# the LogFormat in use:
#   <ip l u [time]> "<request>" <status bytes> "<referer>" <space> "<UA>" <%D>
# `sep` and `delimiter` are aliases in read_csv, so only one may be given.
# QUOTE_NONE stops the parser from also treating '"' as a quoting character
# (e.g. for an empty "" field).
df_access = pd.read_csv(
    'access_log.gz',
    compression='gzip',
    sep='"',
    quoting=csv.QUOTE_NONE,
    header=None,
)
df_access.columns = [
    'ip_uid_username_datetime',
    'method_req',
    'status_byte',
    'referer',
    '_',
    'useagent',
    'time',
]

# --- Break the composite chunks into individual fields -----------------------
# `n` is passed by keyword: recent pandas no longer accepts it positionally.
# The split results are assigned to the listed columns in order.
df_access[['ip', 'uid', 'username', 'datetime']] = (
    df_access.ip_uid_username_datetime.str.strip().str.split(' ', n=3, expand=True)
)
df_access[['method', 'req', 'httpver']] = (
    df_access.method_req.str.strip().str.split(' ', n=2, expand=True)
)
df_access[['status', 'byte']] = (
    df_access.status_byte.str.strip().str.split(' ', n=1, expand=True)
)

# --- Parse the request timestamp ---------------------------------------------
# "[07/Jan/2019:14:50:00 +0900]" -> "07/Jan/2019:14:50:00".
# The UTC offset is dropped by cutting at the whitespace, so negative offsets
# ("-0500") are handled the same way as positive ones. The result is a naive
# timestamp in the server's local time.
_time = df_access.datetime.str.strip('[]').str.split().str[0]
df_access['reqtime'] = pd.to_datetime(_time, format='%d/%b/%Y:%H:%M:%S')

# Working frame for the analyses below. An explicit copy lets later cells add
# columns without writing into a slice of df_access.
df = df_access[['reqtime', 'req', 'status', 'time']].copy()
408 エラーの発生回数
from matplotlib import dates as mdates

# Number of 408 (Request Timeout) responses per request timestamp.
df_408 = df.loc[(df.status == '408'), :]
df_408_g = df_408.groupby('reqtime').size().reset_index(name='c')

# Time window to inspect. The strings are kept for reference; the Timestamp
# versions are used for filtering and for the axis limits so that matplotlib
# receives real datetime values rather than text.
_from, _to = '2019-01-07 14:50:00', '2019-01-07 15:10:00'
window_start, window_end = pd.Timestamp(_from), pd.Timestamp(_to)

in_window = (df_408_g.reqtime >= window_start) & (df_408_g.reqtime <= window_end)
df_408_window = df_408_g.loc[in_window, :]

# `plt` is the matplotlib.pyplot module imported at the top of the notebook.
fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(111)
ax.plot(df_408_window.reqtime, df_408_window.c, label='408 count')
ax.set_xlim(window_start, window_end)

# One major tick per minute, labelled "month/day" over "hour:minute".
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d\n%H:%M'))
ax.xaxis.set_major_locator(mdates.MinuteLocator(interval=1))
ax.legend()
fig.autofmt_xdate()
plt.show()
API 毎のレスポンスタイム計測
# Derive two columns on a new frame (assign returns a copy, so nothing is
# written into a slice of the raw log frame):
#   api     - request target with the query string removed
#   seconds - the `time` column divided by 10**6 (microseconds -> seconds)
df = df.assign(
    api=df.req.str.split('?', n=1, expand=True)[0],
    seconds=df.time / 10**6,
)

# Summary statistics of the response time for every API.
df_api_g = df.groupby('api')['seconds'].agg(['mean', 'std', 'count', 'max', 'min'])

# APIs ordered from slowest to fastest mean response time.
top_list = df_api_g.sort_values('mean', ascending=False).index

# How many of the slowest APIs to plot. `n` was used without being defined in
# this notebook; adjust N_TOP_APIS as needed. It is capped at the number of
# distinct APIs so every subplot has data to show.
N_TOP_APIS = 6
n = min(N_TOP_APIS, len(top_list))

# squeeze=False always returns a 2-D array of axes, so ravel() also works
# when only a single subplot is requested.
f, a = plt.subplots(nrows=n, ncols=1, figsize=(15, 18), squeeze=False)
a = a.ravel()

# One log-scaled histogram per API, slowest first.
for ax, api_name in zip(a, top_list):
    api_seconds = df.loc[df.api == api_name, 'seconds']
    ax.hist(api_seconds, log=True, range=(0, 5), bins=100)
    ax.set_title(api_name)
    ax.set_xlim(0, 4)

plt.tight_layout()
plt.savefig('api_res_histgram.png', dpi=100)
plt.show()
レスポンスタイムはポアソン分布になるのかな。