stash-log-parser / logparser / src / Stash / Log / GitOpsAnalyser.hs

{-# LANGUAGE OverloadedStrings, BangPatterns #-}

module Stash.Log.GitOpsAnalyser
( GitOperationStats(..)
, analyseGitOperations
, RequestDurationStat(..)
, cloneRequestDuration
, isRefAdvertisement
, protocolCount
) where

import qualified Data.ByteString.Char8 as S
import qualified Data.HashMap.Strict as M
import Data.String.Utils (split)
import Data.List (foldl', groupBy)
import Data.Maybe (isJust, mapMaybe, fromMaybe)
import Data.Function (on)
import Text.Printf (printf)
import Stash.Log.Parser
import Stash.Log.Common (logDateEqHour, isSsh, isHttp)

data GitOperationStats = GitOperationStats {
     getOpStatDate              :: !String
    ,cacheMisses                :: ![Int] -- clone, fetch, shallow clone, push, ref advertisement
    ,cacheHits                  :: ![Int]
}

data RequestDurationStat = RequestDurationStat {
    getDurationDate             :: !LogDate
   ,getClientIp                 :: !String
   ,cacheMissDurations          :: ![Int]
   ,cacheHitDurations           :: ![Int] -- clone, fetch, shallow clone, push, ref advertisement
   ,requestUsername             :: !S.ByteString
}

-- | Parse and aggregate the log file input into a list of hourly GitOperationStats
analyseGitOperations :: Input -> [GitOperationStats]
analyseGitOperations rawLines =
    let formatLogDate date = printf "%04d-%02d-%02d %02d" (getYear date) (getMonth date)
                            (getDay date) (getHour date)
    in analyseGitOperations' logDateEqHour formatLogDate rawLines

-- | Return the duration of clone (clone and shallow clone) operations
cloneRequestDuration :: Input -> [RequestDurationStat]
cloneRequestDuration rawLines = collectRequestDurations rawLines authenticatedGitOp


-- =================================================================================

authenticatedGitOp :: LogLine -> Bool
authenticatedGitOp line = isJust (getUsername line)

collectRequestDurations :: Input -> (LogLine -> Bool) -> [RequestDurationStat]
collectRequestDurations rawLines p = map m $ filter f $ parseLogLines rawLines
        where clientIp line = head $ split "," (S.unpack $ getRemoteAdress line)
              ops           = [isClone, isFetch, isShallowClone, isPush, isRefAdvertisement]
              f line        = isOutgoingLogLine line && p line && isGitOperation line
              m line        =  let  duration    = getRequestDuration line
                                    zero        = replicate 5 0
                                    inc op      = if op line then (+duration) else id
                                    missOps     = map (inc . uncachedOperation) ops
                                    hitOps      = map (inc . cachedOperation) ops
                                    username'   = fromMaybe "-" $ getUsername line
                                    !misses     = zipWith id missOps zero
                                    !hits       = zipWith id hitOps zero
                               in RequestDurationStat (getDate line) (clientIp line) misses hits username'

emptyStats :: GitOperationStats
emptyStats = GitOperationStats "" zero zero
            where zero = replicate 5 0

analyseGitOperations' :: (LogDate -> LogDate -> Bool) -> (LogDate -> String) -> Input -> [GitOperationStats]
analyseGitOperations' comp formatLogDate rawLines =
    let groups = groupBy (comp `on` getDate) $ parseLogLines rawLines
    in map (summarizeGitOperations formatLogDate) groups


summarizeGitOperations :: (LogDate -> String) -> [LogLine] -> GitOperationStats
summarizeGitOperations formatLogDate = foldl' aggregate emptyStats . filter isOutgoingLogLine
                        where aggregate (GitOperationStats date misses hits) logLine =
                                let ops         = [isClone, isFetch, isShallowClone, isPush, isRefAdvertisement]
                                    inc op      = if op logLine then (+1) else (+0)
                                    missOps     = map (inc . uncachedOperation) ops
                                    hitOps      = map (inc . cachedOperation) ops
                                    date'       = if null date then formatLogDate $ getDate logLine else date
                                    misses'     = zipWith id missOps misses
                                    hits'       = zipWith id hitOps hits
                                in GitOperationStats date' misses' hits'

protocolCount :: Input -> [(S.ByteString,Integer)]
protocolCount line = M.toList $ foldl' count' M.empty (filter isOutgoingLogLine (filter isGitOperation $ mapMaybe parseLogLine line))
        where
            count' acc logLine = let !proto = getProtocol logLine
                                 in M.insertWith (+) proto 1 acc


-- =================================================================================
--                                Predicates
-- =================================================================================

isGitOperation :: LogLine -> Bool
isGitOperation line = any (\g -> g line) ops
            where ops = [isClone, isFetch, isShallowClone, isPush, isRefAdvertisement]

-- As of 1.1.2 of the clone cache plugin, refs are explicitly listed in the
-- labels field, most of the data we have does _not_ have that information though
isRefAdvertisement :: LogLine -> Bool
isRefAdvertisement logLine = authenticatedGitOp logLine && isOutgoingLogLine logLine && refAdvertisement logLine
            where
                action      = getAction logLine
                path        = getPath action
                method      = getMethod action
                refAdvertisement line
                            | isSsh line        = isRefs line || not (any (inLabel line) ["clone", "fetch", "shallow clone"])
                            | isHttp line       = ".git/info/refs" `S.isSuffixOf` path && "GET" == method
                            | otherwise         = False

isCacheHit :: LogLine -> Bool
isCacheHit logLine = inLabel logLine "cache:hit"

isCacheMiss :: LogLine -> Bool
isCacheMiss = not . flip inLabel "cache:hit" -- treat as cache miss if the cache:* label is missing

isFetch :: LogLine -> Bool
isFetch logLine = inLabel logLine "fetch" && not (inLabel logLine "clone" || inLabel logLine "shallow clone")

isClone :: LogLine -> Bool
isClone logLine = inLabel logLine "clone"

isShallowClone :: LogLine -> Bool
isShallowClone logLine = inLabel logLine "shallow clone"

isPush :: LogLine -> Bool
isPush logLine = inLabel logLine "push"

isRefs :: LogLine -> Bool
isRefs logLine = inLabel logLine "refs"

inLabel :: LogLine -> String -> Bool
inLabel logLine name =  let labels = getLabels logLine
                        in name `elem` labels

cachedOperation :: (LogLine -> Bool) -> LogLine -> Bool
cachedOperation op logLine = op logLine && isCacheHit logLine

uncachedOperation :: (LogLine -> Bool) -> LogLine -> Bool
uncachedOperation op logLine = op logLine && isCacheMiss logLine
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.