Snippets

Catalyst2 Apache access log muncher

Created by Robert last modified
#! /usr/bin/env bash

# Function that takes an Apache access log and outputs a tab-separated file 
# (access_log.tmp) containing 12 fields:
#   $1  f_ip          IP address
#   $2  f_client_id   Client ID
#   $3  f_user_id     User ID
#   $4  f_date        Apache timestamp
#   $5  f_unixtime    Unix timestamp
#   $6  f_method      HTTP method (GET, POST etc.)
#   $7  f_resource    Request resource
#   $8  f_protocol    HTTP protocol (HTTP/1.1)
#   $9  f_status      Apache status code (200, 301 etc.)
#   $10 f_bytes       Number of bytes transferred
#   $11 f_referrer    Referrer URL
#   $12 f_agent       User agent
#
# You can use the function to further process the output. For instance, you 
# could get specific fields between specific time periods.
#
# Usage:
# log_muncher "/path/to/access-log"

log_muncher() {
  # Parse an Apache combined-format access log into a 12-column,
  # tab-separated file.
  #
  # Arguments:
  #   $1 - path to the access log ("-" makes awk read standard input)
  #   $2 - optional output path (default: access_log.tmp in the current
  #        directory)
  # Outputs:
  #   One tab-separated row per input line, written to the output file.
  #   Diagnostics go to stderr.
  # Returns:
  #   1 if the log path is missing or unreadable (the output file is left
  #   untouched in that case), otherwise the exit status of awk.
  #
  # Requires an awk that provides mktime() (for example gawk).
  local log_file=${1-}
  local out_file=${2:-access_log.tmp}

  # Validate before redirecting, so that a bad call cannot truncate an
  # existing output file.
  if [[ -z "$log_file" ]]; then
    printf 'Usage: log_muncher /path/to/access-log [output-file]\n' >&2
    return 1
  fi
  if [[ "$log_file" != "-" && ! -r "$log_file" ]]; then
    printf 'log_muncher: cannot read %s\n' "$log_file" >&2
    return 1
  fi

  # We're splitting each input line by two multi-character delimiters:
  # a space followed by a double quote, and a double quote followed by a
  # space. This gives us five fields. The first three contain multiple
  # sub-fields and need to be processed further.
  #
  # awk runs with TZ=UTC0 so that mktime() interprets the wall-clock time as
  # UTC regardless of the timezone of the machine; the offset recorded in
  # the log is then applied by hand.
  TZ=UTC0 awk -F' "|" ' '{
    p1 = $1
    p2 = $2
    p3 = $3
    f_referrer = $4
    f_agent = $5

    # The record ends with the closing quote of the user agent, which is not
    # followed by a space. The last separator therefore never matches and $5
    # keeps both of its quotes; strip them so it matches the referrer.
    sub(/^"/, "", f_agent)
    sub(/"$/, "", f_agent)

    # p1 holds "IP client-id user-id [timestamp]". Splitting on square
    # brackets separates the three leading fields from the timestamp.
    split(p1, p1_arr, /[][]/)
    p1a         = p1_arr[1]
    f_date      = p1_arr[2]

    split(p1a, p1a_arr, / /)
    f_ip        = p1a_arr[1]
    f_client_id = p1a_arr[2]
    f_user_id   = p1a_arr[3]

    # The Apache timestamp looks like "10/Oct/2000:13:55:36 -0700":
    # an 11-character date, an 8-character time and a 5-character offset.
    ts_date = substr(f_date, 1, 11)
    ts_time = substr(f_date, 13, 8)
    ts_zone = substr(f_date, 22, 5)

    split(ts_date, tsd, /\//)
    split(ts_time, tst, /:/)

    # Position of the month abbreviation in the lookup string: 1 for Jan,
    # 4 for Feb, and so on. Zero means the month was not recognised.
    mon_idx = index("JanFebMarAprMayJunJulAugSepOctNovDec", tsd[2])

    # -1 mirrors the mktime() failure value for timestamps we cannot parse.
    f_unixtime = -1
    if (mon_idx > 0 && ts_zone ~ /^[+-][0-9][0-9][0-9][0-9]$/) {
      # The optional seventh token of a mktime() datespec is a DST flag, not
      # a timezone, so the offset must not be passed there. Pass 0 instead.
      utc_spec = sprintf("%04d %02d %02d %02d %02d %02d 0", tsd[3], (mon_idx + 2) / 3, tsd[1], tst[1], tst[2], tst[3])
      utc_epoch = mktime(utc_spec)

      if (utc_epoch != -1) {
        # Local time = UTC + offset, hence UTC = local time - offset.
        zone_secs = substr(ts_zone, 2, 2) * 3600 + substr(ts_zone, 4, 2) * 60
        if (substr(ts_zone, 1, 1) == "-") {
          zone_secs = -zone_secs
        }
        f_unixtime = utc_epoch - zone_secs
      }
    }

    # p2 holds "METHOD resource protocol".
    split(p2, p2_arr, / /)
    f_method   = p2_arr[1]
    f_resource = p2_arr[2]
    f_protocol = p2_arr[3]

    # p3 holds "status bytes".
    split(p3, p3_arr, / /)
    f_status   = p3_arr[1]
    f_bytes    = p3_arr[2]

    # Print all the fields:
    printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", f_ip, f_client_id, f_user_id, f_date, f_unixtime, f_method, f_resource, f_protocol, f_status, f_bytes, f_referrer, f_agent

  }' "$log_file" > "$out_file"
}

# Entry point: munch the access log named by the first script argument.
log_muncher "${1}"

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.