アクセスログの統計処理

httpd.apache.org

LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\" %D"
from datetime import (datetime, date, timedelta)
import re

import pandas as pd
import numpy as np

from IPython.display import display, HTML
from pandas.tools.plotting import table
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st


# Load the gzip-compressed access log, splitting every line on the double
# quote character. For the LogFormat used here that yields seven raw fields,
# some of which still bundle several space-separated values; those are
# broken apart below.
# NOTE: `sep` and `delimiter` are aliases in read_csv, so only one is passed.
df_access = pd.read_csv('access_log.gz', compression='gzip', sep='"', header=None)

df_access.columns = [
    'ip_uid_username_datetime',
    'method_req',
    'status_byte',
    'referer',
    '_',
    'useagent',
    'time',
]

# First raw field -> ip, uid, username and the bracketed timestamp
# (n=3 keeps the timestamp and its UTC offset together in the last piece).
df_access[['ip', 'uid', 'username', 'datetime']] = (
    df_access['ip_uid_username_datetime']
    .str.strip()
    .str.split(' ', n=3, expand=True)
    .rename(columns={0: 'ip', 1: 'uid', 2: 'username', 3: 'datetime'})
)

# Request line -> method, request target, protocol version.
df_access[['method', 'req', 'httpver']] = (
    df_access['method_req']
    .str.strip()
    .str.split(' ', n=2, expand=True)
    .rename(columns={0: 'method', 1: 'req', 2: 'httpver'})
)

# Status field -> status code and byte count (both kept as strings).
df_access[['status', 'byte']] = (
    df_access['status_byte']
    .str.strip()
    .str.split(' ', n=1, expand=True)
    .rename(columns={0: 'status', 1: 'byte'})
)

# Drop the surrounding brackets, then keep only the part before the first
# space. Cutting on the space (rather than on '+') also discards negative
# UTC offsets such as "-0500". The offset itself is ignored, so `reqtime`
# is a naive timestamp in the log's local time.
_time = df_access['datetime'].str.strip('[]').str.split(' ', n=1).str[0]

df_access['reqtime'] = pd.to_datetime(_time, format='%d/%b/%Y:%H:%M:%S')

# Work on an independent copy so that columns added to `df` later do not
# trigger chained-assignment warnings against `df_access`.
df = df_access[['reqtime', 'req', 'status', 'time']].copy()

408 エラーの発生回数

from matplotlib import dates as mdates

# Keep only the requests answered with status 408 and count how many share
# each request timestamp. `status` holds strings, hence the quoted '408'.
df_408 = df.loc[df.status == '408', :]
df_408_g = df_408.groupby('reqtime').size().reset_index(name='c')

# Time window to plot.
_from, _to = '2019-01-07 14:50:00', '2019-01-07 15:10:00'
in_window = (df_408_g.reqtime >= _from) & (df_408_g.reqtime <= _to)
df_408_win = df_408_g.loc[in_window, :]

fig = plt.figure(figsize=(15, 6))
ax = fig.add_subplot(111)
ax.plot(df_408_win.reqtime, df_408_win.c, label='408 count')
# Pass real timestamps as axis limits so the call does not rely on a
# string-aware date converter being registered with matplotlib.
ax.set(xlim=(pd.Timestamp(_from), pd.Timestamp(_to)))

# One major tick per minute, labelled "month/day" over "hour:minute".
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d\n%H:%M'))
ax.xaxis.set_major_locator(mdates.MinuteLocator(interval=1))
ax.legend()
fig.autofmt_xdate()
plt.show()

API ごとのレスポンスタイム計測

# Drop the query string so that requests are grouped by path only.
df['api'] = df['req'].str.split('?', n=1, expand=True)[0]
# Scale the logged duration by 1e-6.
# NOTE(review): assumes `time` is the %D field in microseconds — confirm.
df['seconds'] = df['time'] / 10**6

# Per-API summary statistics of the response time.
df_api_g = df.groupby('api').agg({
    'seconds': ['mean', 'std', 'count', 'max', 'min']
})['seconds']

# APIs ordered from slowest to fastest mean response time.
top_list = df_api_g.sort_values('mean', ascending=False).index

# NOTE(review): `n` was not defined anywhere in the visible source. Six rows
# is a placeholder that suits the 15x18 figure — adjust as needed. It is
# capped at the number of APIs available and floored at one, because
# plt.subplots rejects zero rows.
n = max(1, min(6, len(top_list)))
# squeeze=False always returns a 2-D array of axes, so ravel() also works
# when only a single row is requested.
f, a = plt.subplots(nrows=n, ncols=1, figsize=(15, 18), squeeze=False)
a = a.ravel()

# One log-scaled histogram per API, slowest first. zip() stops at the
# shorter sequence, so fewer APIs than axes simply leaves axes empty.
for ax, api in zip(a, top_list):
    seconds = df.loc[df['api'] == api, 'seconds']
    ax.hist(seconds, log=True, range=(0, 5), bins=100)
    ax.set_title(api)
    ax.set_xlim(0, 4)
plt.tight_layout()
plt.savefig('api_res_histgram.png', dpi=100)
plt.show()

レスポンスタイムはポアソン分布になるのかな。