#! /usr/bin/env bash
# Function that takes an Apache access log and outputs a tab-separated file
# (access_log.tmp) containing 12 fields:
# $1 f_ip IP address
# $2 f_client_id Client ID
# $3 f_user_id User ID
# $4 f_date Apache timestamp
# $5 f_unixtime Unix timestamp
# $6 f_method HTTP method (GET, POST etc.)
# $7 f_resource Request resource
# $8 f_protocol HTTP protocol (HTTP/1.1)
# $9 f_status Apache status code (200, 301 etc.)
# $10 f_bytes Number of bytes transferred
# $11 f_referrer Referrer URL
# $12 f_agent User agent
#
# You can use the function to further process the output. For instance, you
# could get specific fields between specific time periods.
#
# Usage:
# log_muncher "/path/to/access-log"
log_muncher() {
  # Parse an Apache access log into a 12-column, tab-separated file.
  #
  # Arguments:
  #   $1 - path to the access log
  #   $2 - output file (optional; defaults to access_log.tmp in the current
  #        directory)
  # Outputs:
  #   One line per log entry with the columns: IP address, client ID, user ID,
  #   Apache timestamp, Unix timestamp, HTTP method, requested resource, HTTP
  #   protocol, status code, bytes, referrer, user agent. The Unix timestamp
  #   is -1 when the Apache timestamp cannot be parsed.
  # Returns:
  #   The exit status of awk.
  local src=${1:-}
  local dest=${2:-access_log.tmp}

  # Each record is split on two multi-character delimiters: a space followed
  # by a double quote, and a double quote followed by a space. That yields
  # five fields; the first three hold several values each and are split
  # further inside the awk program.
  awk -F' "|" ' '
    # Convert an Apache timestamp (dd/Mon/yyyy:HH:MM:SS +zzzz) to seconds
    # since the Unix epoch, applying the UTC offset carried in the timestamp.
    # Returns -1 when the timestamp is malformed. The work is done with plain
    # arithmetic so the result does not depend on the local time zone of the
    # machine or on the non-POSIX mktime() extension. Names after the wide gap
    # in the parameter list are local variables.
    function apache_epoch(stamp,    t, i, mon, y, m, d, era, yoe, doy, days, off) {
      if (split(stamp, t, "[/: ]") != 7) return -1
      if (t[1] !~ /^[0-9]+$/) return -1
      for (i = 3; i <= 6; i++)
        if (t[i] !~ /^[0-9]+$/) return -1
      if (t[7] !~ /^[+-][0-9][0-9][0-9][0-9]$/) return -1
      # A valid month name is three letters long and starts at offset 1, 4, 7...
      mon = index("JanFebMarAprMayJunJulAugSepOctNovDec", t[2])
      if (length(t[2]) != 3 || mon % 3 != 1) return -1

      d = t[1] + 0
      m = (mon + 2) / 3
      y = t[3] + 0
      # Count days using a year that starts in March, so that the leap day is
      # the last day of the year and month lengths follow a fixed pattern.
      if (m <= 2) { y--; m += 9 } else { m -= 3 }
      era = int(y / 400)
      yoe = y - era * 400
      doy = int((153 * m + 2) / 5) + d - 1
      days = era * 146097 + yoe * 365 + int(yoe / 4) - int(yoe / 100) + doy - 719468

      # The offset says how far local time is ahead of UTC, so subtract it.
      off = substr(t[7], 2, 2) * 3600 + substr(t[7], 4, 2) * 60
      if (substr(t[7], 1, 1) == "-") off = -off
      return days * 86400 + t[4] * 3600 + t[5] * 60 + t[6] - off
    }

    {
      # $1 = "ip client user [timestamp]", $2 = request line,
      # $3 = "status bytes", $4 = referrer, $5 = user agent.
      f_referrer = $4
      f_agent = $5
      # The user agent is the last quoted field: the delimiter in front of it
      # only consumes the closing quote of the referrer, and nothing consumes
      # its own closing quote. Strip both leftover quotes.
      sub(/^"/, "", f_agent)
      sub(/"$/, "", f_agent)

      # The timestamp is the only part of $1 enclosed in square brackets.
      split($1, head, /[][]/)
      f_date = head[2]
      split(head[1], who, " ")
      f_ip = who[1]
      f_client_id = who[2]
      f_user_id = who[3]

      # Format explicitly so every awk prints the value as a plain integer.
      f_unixtime = sprintf("%.0f", apache_epoch(f_date))

      # Request line: method, resource, protocol.
      split($2, req, " ")
      f_method = req[1]
      f_resource = req[2]
      f_protocol = req[3]

      # Response summary: status code, bytes.
      split($3, res, " ")
      f_status = res[1]
      f_bytes = res[2]

      printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", f_ip, f_client_id, f_user_id, f_date, f_unixtime, f_method, f_resource, f_protocol, f_status, f_bytes, f_referrer, f_agent
    }
  ' "$src" > "$dest"
}
# Parse the file named by the first script argument. Without an argument an
# empty file operand is handed to awk, which (in implementations that skip
# empty operands) then reads standard input. Keep that working for piped
# input, but refuse to sit waiting on a terminal with no hint of what is wrong.
if [[ $# -eq 0 && -t 0 ]]; then
  printf 'Usage: %s /path/to/access-log\n' "${0##*/}" >&2
  exit 2
fi
log_muncher "$1"