Commits

Stefan Saasen committed 31d8ad9

Foundation for reading log files for a date period (to enable progressive parsing)

  • Parent commits 1728d02

Files changed (6)

File logparser/logparser.cabal

                             containers,
                             text,
                             unordered-containers,
+                            time,
                             aeson,
                             bzlib,
                             QuickCheck >= 2.4.0.1
 executable logparser
     main-is:                Main.hs
     hs-source-dirs:         src
-    other-modules:          Stash.Log.Analyser, Stash.Log.Parser, Stash.Log.GitOpsAnalyser, Stash.Log.Common
+    other-modules:          Stash.Log.Analyser, Stash.Log.Parser, Stash.Log.GitOpsAnalyser, Stash.Log.Common, Stash.Log.Output, Stash.Log.Input
     build-depends:          base < 5 && >= 3,
                             bytestring >= 0.9,
                             MissingH,
                             containers,
                             cmdargs >= 0.10,
                             unordered-containers >= 0.2,
+                            time,
                             aeson
     ghc-options:
                             -Wall
+                            -- -Werror
                             -fno-warn-unused-do-bind
                             -rtsopts
                             -O2

File logparser/src/Main.hs

 import Stash.Log.Analyser hiding (ProtocolStats)
 import Stash.Log.GitOpsAnalyser
 import Stash.Log.Output
+import Stash.Log.Input
+import Control.Monad (liftM)
 import System.Console.CmdArgs
 import Prelude hiding (takeWhile)
 
 appShortDesc :: String
 appShortDesc = "Logparser for the Atlassian Stash access logs"
 
-data LogParser = MaxConn {files :: [FilePath]}
+data LogParser = MaxConn        {files :: [FilePath]}
                 | CountRequests {files :: [FilePath]}
-                | GitOperations {files :: [FilePath]}
-                | GitDurations {files :: [FilePath]}
+                | GitOperations {files :: [FilePath], progressive :: Bool}
+                | GitDurations  {files :: [FilePath], progressive :: Bool}
                 | ProtocolStats {files :: [FilePath]}
-                | Count {files :: [FilePath]}
-                | DebugParser {files :: [FilePath]}
+                | Count         {files :: [FilePath]}
+                | DebugParser   {files :: [FilePath], progressive :: Bool}
              deriving (Data,Typeable,Show,Eq)
 
+progressiveFlags :: Bool
+progressiveFlags = False &= help "Progressively parse the logfiles" &= typ "BOOL"
+
+
 maxConn :: LogParser
 maxConn         = MaxConn {files = def &= args}
                 &= name "maxConn"       &= help "Show the maximum number of concurrent requests per hour"
                 &= name "countRequests" &= help "Count the number of requests"
 
 gitOperations :: LogParser
-gitOperations   = GitOperations {files = def &= args}
+gitOperations   = GitOperations {files = def &= args, progressive = progressiveFlags}
                 &= name "gitOperations" &= help "Aggregate git operations per hour. Show counts for fetch, clone, push, pull and ref advertisement"
 
 gitDurations :: LogParser
-gitDurations    = GitDurations {files = def &= args}
+gitDurations    = GitDurations {files = def &= args, progressive = progressiveFlags}
                 &= name "gitDurations"  &= help "Show the duration of git operations over time"
 
 protocolStats :: LogParser
                 &= name "count"         &= help "Count the number of lines in the given logfile(s)"
 
 debugParser :: LogParser
-debugParser     = DebugParser {files = def &= args}
+debugParser     = DebugParser {files = def &= args, progressive = progressiveFlags}
                 &= name "debugParser"   &= help "Parse and print the first five lines of the log file"
 
 
 mode = cmdArgsMode $ modes [maxConn, countRequests, gitOperations, gitDurations, protocolStats, count, debugParser]
         &= help appShortDesc
         &= program appName &= summary (appName ++ " " ++ appVersion)
+        &= verbosity
 
 
 run :: LogParser -> IO ()
-run (MaxConn files)         = printPlotDataConcurrentConn plotDataConcurrentConnHour files
-run (CountRequests files)   = parseAndPrint countRequestLines files
-run (GitOperations files)   = printPlotDataGitOps analyseGitOperations files
-run (GitDurations files)    = printGitRequestDurations gitRequestDuration files
-run (ProtocolStats files)   = printProtocolData protocolStatsByHour files
-run (Count files)           = printCountLines countLines files
-run (DebugParser files)     = parseAndPrint showLines files
+run (MaxConn files)                     = stream plotDataConcurrentConnHour printPlotDataConcurrentConn newRunConfig "printPlotDataConcurrentConn" files
+run (CountRequests files)               = stream countRequestLines parseAndPrint newRunConfig "countRequestLines" files
+run (GitOperations files progressive)   = stream analyseGitOperations printPlotDataGitOps (RunConfig progressive) "printPlotDataGitOps" files
+run (GitDurations files progressive)    = stream gitRequestDuration printGitRequestDurations (RunConfig progressive) "gitRequestDuration" files
+run (ProtocolStats files)               = stream protocolStatsByHour printProtocolData newRunConfig "printProtocolData" files
+run (Count files)                       = printCountLines countLines files
+run (DebugParser files progressive)     = stream showLines parseAndPrint newRunConfig "showLines" files
+
+stream :: (Input -> a) -> (a -> IO ()) -> RunConfig -> String -> [FilePath] -> IO ()
+stream analyze output runConfig name files = output =<< (liftM analyze $ readLogFiles runConfig name files)
 
 main :: IO ()
 main = do
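
The new stream helper gives every subcommand the same shape: a pure analysis over the parsed log lines, a printer for its result, a RunConfig carrying the progressive flag, and a key under which the state for that analysis is tracked. Written out without the helper, the gitOperations case amounts to roughly the following (a sketch using only names introduced in this diff; the runGitOps name itself is made up):

    -- Sketch of what the stream call for gitOperations expands to.
    runGitOps :: Bool -> [FilePath] -> IO ()
    runGitOps progressive files = do
        -- read only the log files selected by the progressive-parsing state
        logLines <- readLogFiles (RunConfig progressive) "printPlotDataGitOps" files
        -- analyse the lines and hand the result to the pure printer
        printPlotDataGitOps (analyseGitOperations logLines)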

File logparser/src/Stash/Log/File.hs

-{-# LANGUAGE OverloadedStrings #-}
-
-module Stash.Log.File
-( sortLogFiles
-, toLines
-, readFiles
-, FileInfo(..)
-, extractFileInfo
-, isFileNewer
-) where
-
-import qualified Data.ByteString.Lazy.Char8 as L
-import qualified Codec.Compression.BZip as BZip
-import Data.Monoid (mappend)
-import Data.List (isSuffixOf, sortBy)
-import Data.String.Utils (split)
-import System.Path.NameManip
-import Control.Monad (liftM)
-import Debug.Trace
-
-data FileInfo = FileInfo {
-     year       :: String
-    ,month      :: String
-    ,day        :: String
-    ,counter    :: Int
-} deriving (Show, Eq, Ord)
-
-type Date = String
-
--- | Check whether the log file is more recent than the given date. This is
--- solely based on the date that is part of the filename.
-isFileNewer :: FilePath -> Date -> Bool
-isFileNewer file date = (Just $ base (unpack date)) <= extractFileInfo file
-        where base (year':month':day':_) = FileInfo year' month' day' 0
-              unpack                  = split "-"
-
--- | Sort the logfiles by date and log file sequence number
--- The logfile naming scheme is: "atlassian-stash-access-2012-11-29.0.log(.bz2)"
-sortLogFiles :: [FilePath] -> [FilePath]
-sortLogFiles = sortBy logFilePred
-    where sortPred (date1, num1) (date2, num2) = compare date1 date2 `mappend` compare num1 num2
-          logFilePred logFileName1 logFileName2 = sortPred (extractSortPairs logFileName1) (extractSortPairs logFileName2)
-          extractSortPairs path = maybe ("9999", 0) asPair $ extractFileInfo path
-          asPair (FileInfo year' month' day' counter') = (year' ++ "-" ++ month' ++ "-" ++ day', counter')
-          asPair _                                     = ("", 0)
-
--- | Try to extract the FileInfo out of the given file. This function assumes
--- that the given file follows the naming scheme for the access log archive
--- files.
-extractFileInfo :: FilePath -> Maybe FileInfo
-extractFileInfo path = let elems = drop 3 $ split "-" $ extractFile path
-                       in case elems of
-                               (year':month':(rest:_)) -> case split "." rest of
-                                                             (day':num:_) -> Just $ FileInfo year' month' day' (read num :: Int)
-                                                             _           -> Nothing
-                               _                     -> Nothing
-
--- | Read the list of files and return a list of lines. The input files will be
--- filtered using the function (FilePath -> Bool)
-toLines :: (FilePath -> Bool) -> [FilePath] -> IO [L.ByteString]
-toLines p files = liftM L.lines $ readFiles p files
-
--- | Read the list of files and turn them into a lazy ByteString. The input files will be
--- filtered using the function (FilePath -> Bool)
-readFiles :: (FilePath -> Bool) -> [FilePath] -> IO L.ByteString
-readFiles f files = trace ("filteredFiles: " ++ show filteredFiles)  fmap L.concat . mapM readCompressedOrUncompressed $ filteredFiles
-            where filteredFiles = filter f $ sortLogFiles files
-
--- =================================================================================
-
-extractFile :: FilePath -> String
-extractFile = last . slice_path
-
-readCompressedOrUncompressed :: FilePath -> IO L.ByteString
-readCompressedOrUncompressed path = if ".bz2" `isSuffixOf` path
-                                    then liftM BZip.decompress $ L.readFile path
-                                    else L.readFile path

File logparser/src/Stash/Log/Input.hs

+{-# LANGUAGE OverloadedStrings #-}
+
+module Stash.Log.Input
+( sortLogFiles
+, readFiles
+, FileInfo(..)
+, extractFileInfo
+, isFileNewer
+, filterLastDay
+, dropUntilDate
+, readLogFiles
+, RunConfig(..)
+, newRunConfig
+) where
+
+import qualified Data.ByteString.Lazy.Char8 as L
+import qualified Codec.Compression.BZip as BZip
+import qualified Data.Map as M
+import Data.Monoid (mappend)
+import Data.Maybe (isJust, fromMaybe, fromJust)
+import Data.List (isSuffixOf, sortBy, groupBy)
+import Data.String.Utils (split)
+import System.Path.NameManip
+import Control.Monad (liftM, liftM2)
+import Data.Aeson (decode)
+import Data.Time.Clock
+import Data.Time.Calendar
+import Debug.Trace
+
+data FileInfo = FileInfo {
+     year       :: String
+    ,month      :: String
+    ,day        :: String
+    ,counter    :: Int
+} deriving (Show, Ord, Eq)
+
+newtype FileDateInfo = FileDateInfo FileInfo
+
+-- | Ignore the counter for equality checks
+instance Eq FileDateInfo where
+  (FileDateInfo (FileInfo year1 month1 day1 _)) == (FileDateInfo (FileInfo year2 month2 day2 _))
+                            = year1 == year2 && month1 == month2 && day1 == day2
+
+type Date = String
+
+-- | Check whether the log file is more recent than the given date. This is
+-- solely based on the date that is part of the filename.
+isFileNewer :: FilePath -> Date -> Bool
+isFileNewer file date = (Just $ base (unpack date)) <= extractFileInfo file
+        where base (year':month':day':_) = FileInfo year' month' day' 0
+              unpack                  = split "-"
+
+-- | Sort the logfiles by date and log file sequence number
+-- The logfile naming scheme is: "atlassian-stash-access-2012-11-29.0.log(.bz2)"
+sortLogFiles :: [FilePath] -> [FilePath]
+sortLogFiles = sortBy logFilePred
+    where sortPred (date1, num1) (date2, num2) = compare date1 date2 `mappend` compare num1 num2
+          logFilePred logFileName1 logFileName2 = sortPred (extractSortPairs logFileName1) (extractSortPairs logFileName2)
+          extractSortPairs path = maybe ("9999", 0) asPair $ extractFileInfo path
+          asPair (FileInfo year' month' day' counter') = (year' ++ "-" ++ month' ++ "-" ++ day', counter')
+          asPair _                                     = ("", 0)
+
+-- | Try to extract the FileInfo out of the given file. This function assumes
+-- that the given file follows the naming scheme for the access log archive
+-- files.
+extractFileInfo :: FilePath -> Maybe FileInfo
+extractFileInfo path = let elems = drop 3 $ split "-" $ extractFile path
+                       in case elems of
+                               (year':month':(rest:_)) -> case split "." rest of
+                                                             (day':num:_) -> Just $ FileInfo year' month' day' (read num :: Int)
+                                                             _           -> Nothing
+                               _                     -> Nothing
+
+extractFileDateInfo :: FilePath -> Maybe FileDateInfo
+extractFileDateInfo path = fmap (FileDateInfo) $ extractFileInfo path
+
+-- | Read the list of files and return a list of lines. The input files will be
+-- filtered using the function (FilePath -> Bool)
+--toLines :: (FilePath -> Bool) -> [FilePath] -> IO [L.ByteString]
+--toLines p files = liftM L.lines $ readFiles p files
+
+toLines :: [FilePath] -> IO [L.ByteString]
+toLines files = liftM L.lines $ readFiles files
+
+-- | Read the list of files and turn them into a lazy ByteString. The input files will be
+-- filtered using the function (FilePath -> Bool)
+readFiles :: [FilePath] -> IO L.ByteString
+readFiles files = trace ("filteredFiles: " ++ show filteredFiles)  fmap L.concat . mapM readCompressedOrUncompressed $ filteredFiles
+            where filteredFiles = sortLogFiles files
+
+filterLastDay :: [FilePath] -> [FilePath]
+filterLastDay []    = []
+filterLastDay files = concat . init . gr $ filterLastFile files
+    where filterLastFile    = filter (isJust . extractFileInfo)
+          gr                = groupBy (\a b -> fromMaybe False $ liftM2 (==) (extractFileDateInfo a) (extractFileDateInfo b))
+
+dropUntilDate :: Date -> [FilePath] -> [FilePath]
+dropUntilDate date files = dropWhile (\f -> not $ isFileNewer f date) files
+
+-- =================================================================================
+
+extractFile :: FilePath -> String
+extractFile = last . slice_path
+
+readCompressedOrUncompressed :: FilePath -> IO L.ByteString
+readCompressedOrUncompressed path = if ".bz2" `isSuffixOf` path
+                                    then liftM BZip.decompress $ L.readFile path
+                                    else L.readFile path
+data RunConfig = RunConfig {
+    cfgProgressive :: Bool
+    } deriving (Show)
+
+newRunConfig :: RunConfig
+newRunConfig = RunConfig False
+
+readConfig :: String -> IO (Maybe String)
+readConfig key = do
+        json <- L.readFile "logparser.state"
+        return $ (decode json :: Maybe (M.Map String String)) >>= M.lookup key
+
+today :: IO (Integer,Int,Int) -- :: (year,month,day)
+today = getCurrentTime >>= return . toGregorian . utctDay
+
+readLogFiles :: RunConfig -> String -> [FilePath] -> IO [L.ByteString]
+readLogFiles cfg key path = do
+        date <- readConfig key
+        now <- today
+        let progressive = cfgProgressive cfg
+        trace ("date: " ++ show date ++ " key: " ++ key ++ " now: " ++ show now) (if progressive && (isJust date) then
+                toLines $ (dropUntilDate $ fromJust date) $ filterLastDay path
+            else
+                toLines path
+                )
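
Progressive parsing rests on two pieces introduced here: readConfig looks up the analysis key in logparser.state (decoded as a JSON object of string values, so a state file along the lines of {"printPlotDataGitOps": "2012-11-29"} is what readLogFiles appears to expect; the exact format is inferred from the decode type, not spelled out in this commit), while filterLastDay and dropUntilDate narrow the file list to complete days newer than that date. A small illustration with made-up file names:

    -- Illustration only: the file names are invented, the functions are the
    -- ones defined above.
    exampleFiles :: [FilePath]
    exampleFiles =
        [ "atlassian-stash-access-2012-11-28.0.log.bz2"
        , "atlassian-stash-access-2012-11-29.0.log.bz2"
        , "atlassian-stash-access-2012-11-30.0.log.bz2"
        ]

    -- filterLastDay drops the most recent (still growing) day, leaving the 28th
    -- and 29th; dropUntilDate then skips everything older than the date read
    -- from logparser.state, so only the 2012-11-29 file remains.
    remaining :: [FilePath]
    remaining = dropUntilDate "2012-11-29" (filterLastDay exampleFiles)
    -- == ["atlassian-stash-access-2012-11-29.0.log.bz2"]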

File logparser/src/Stash/Log/Output.hs

 import qualified Data.ByteString.Char8 as S
 import qualified Data.Map as M
 import Control.Monad (liftM)
+import Control.Monad.Reader
+import Data.Maybe (isJust, fromJust)
 import Stash.Log.Parser
 import Stash.Log.Analyser
 import Stash.Log.GitOpsAnalyser
-import Stash.Log.File
+import Stash.Log.Input
 import Text.Printf (printf)
-import Data.Aeson (decode)
+import Debug.Trace
 
-
-printProtocolData :: (Input -> [ProtocolStats]) -> [FilePath] -> IO ()
-printProtocolData f path = do
-        plotData <- liftM f $ readLogFiles "printProtocolData" path
+printProtocolData :: [ProtocolStats] -> IO ()
+printProtocolData plotData = do
         printf "# Date | SSH | HTTP(s)\n"
         mapM_ (\(ProtocolStats date ssh http) -> printf "%s|%d|%d\n" date ssh http) plotData
 
-printPlotDataGitOps :: (Input -> [GitOperationStats]) -> [FilePath] -> IO ()
-printPlotDataGitOps f path = do
-        plotData <- liftM f $ readLogFiles "printPlotDataGitOps" path
+printPlotDataGitOps :: [GitOperationStats] -> IO ()
+printPlotDataGitOps plotData = do
         printf "# Date | clone | fetch | shallow clone | push | ref advertisement | clone (hit) | fetch (hit) | shallow clone (hit) | push (hit) | ref advertisement (hit) | clone (miss) | fetch (miss) | shallow clone (miss) | push (miss) | ref advertisement (miss)\n"
         mapM_ (\(GitOperationStats date [a,b,c,d,e] [aHit,bHit,cHit,dHit,eHit])
                 -> printf "%s|%d|%d|%d|%d|%d|%d|%d|%d|%d|%d|%d|%d|%d|%d|%d\n" date (a+aHit) (b+bHit) (c+cHit) (d+dHit) (e+eHit) aHit bHit cHit dHit eHit a b c d e) plotData
 
-printPlotDataConcurrentConn :: (Input -> [DateValuePair]) -> [FilePath] -> IO ()
-printPlotDataConcurrentConn f path = do
-        plotData <- liftM f $ readLogFiles "printPlotDataConcurrentConn" path
+printPlotDataConcurrentConn :: [DateValuePair] -> IO ()
+printPlotDataConcurrentConn plotData = do
         printf "# Date | Max concurrent connection\n"
         mapM_ (\pd -> printf "%s|%d\n" (formatLogDate $ getLogDate pd) (getValue pd)) plotData
 
-printGitRequestDurations :: (Input -> [RequestDurationStat]) -> [FilePath] -> IO ()
-printGitRequestDurations g path = do
-        plotData <- liftM g $ readLogFiles "printCloneRequestDurations" path
+printGitRequestDurations :: [RequestDurationStat] -> IO ()
+printGitRequestDurations plotData = do
         printf "# Date | Clone duration (cache hit) | Clone duration (cache miss) | Fetch (hit) | Fetch (miss) | Shallow Clone (hit) | Shallow Clone (miss) | Push (hit) | Push (miss) | Ref adv (hit) | Ref adv (miss) | Client IP | Username \n"
         mapM_ (\(RequestDurationStat date clientIp [cm,fm,sm,pm,rm] [c,f,s,p,r] username)
                 -> printf "%s|%d|%d|%d|%d|%d|%d|%d|%d|%d|%d|%s|%s\n" (show date) c cm f fm s sm p pm r rm clientIp (S.unpack username)) plotData
 
-parseAndPrint :: (Show a) => (Input -> a) -> [FilePath] -> IO ()
-parseAndPrint f path = print . f . L.lines =<< readFiles (const True) path
+parseAndPrint :: (Show a) => a -> IO ()
+parseAndPrint d = print d
 
 printCountLines :: (Show a) => (L.ByteString -> a) -> [FilePath] -> IO ()
-printCountLines f path = print . f =<< readFiles (const True) path
+printCountLines f path = print . f =<< readFiles path
 
 -- =================================================================================
 formatLogDate :: LogDate -> String
 formatLogDate date = printf "%04d-%02d-%02d %02d:%02d" (getYear date) (getMonth date)
                             (getDay date) (getHour date) (getMinute date)
 
-readConfig :: String -> IO (Maybe String)
-readConfig key = do
-        json <- L.readFile "logparser.state"
-        return $ (decode json :: Maybe (M.Map String String)) >>= M.lookup key
-
-readLogFiles :: String -> [FilePath] -> IO [L.ByteString]
-readLogFiles key path = do
-        date <- readConfig key
-        toLines (createPredicate date) path
-        where createPredicate = maybe (const True) $ flip isFileNewer
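
With readConfig and readLogFiles moved into Stash.Log.Input, the printers above no longer touch the filesystem: each one receives an already-computed analysis result, so it can be exercised in isolation. A minimal sketch, assuming ProtocolStats carries a String date plus two integral counters (inferred from the format string, not stated explicitly in this diff):

    demo :: IO ()
    demo = printProtocolData
        [ ProtocolStats "2012-11-29 14" 12 345   -- hour bucket, SSH count, HTTP(s) count
        , ProtocolStats "2012-11-29 15"  7 289
        ]
    -- prints:
    -- # Date | SSH | HTTP(s)
    -- 2012-11-29 14|12|345
    -- 2012-11-29 15|7|289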

File logparser/tests/Properties.hs

 import Stash.Log.Analyser
 import Stash.Log.Parser
 import Stash.Log.GitOpsAnalyser
-import Stash.Log.File (sortLogFiles)
+import Stash.Log.Input
 import Data.Maybe
 import Test.QuickCheck hiding ((.&.))
 import Test.Framework (Test, defaultMain, testGroup)
                      "atlassian-stash-access.log",
                      "atlassian-stash-access-2012-11-29.2.log.bz2"]
 
+
+test_filterFilesDropLast = H.assertEqual
+    "Should drop the last day"
+    ["atlassian-stash-access-2012-11-28.0.log.bz2",
+     "atlassian-stash-access-2012-11-29.0.log.bz2",
+     "atlassian-stash-access-2012-11-29.1.log.bz2"]
+     (filterLastDay input)
+    where input = ["atlassian-stash-access-2012-11-28.0.log.bz2",
+                     "atlassian-stash-access-2012-11-29.0.log.bz2",
+                     "atlassian-stash-access-2012-11-29.1.log.bz2",
+                     "atlassian-stash-access-2012-11-30.0.log.bz2",
+                     "atlassian-stash-access-2012-11-30.1.log.bz2",
+                     "atlassian-stash-access.log"]
+
+test_filterFilesSkipUntilDate = H.assertEqual
+    "Should drop files up to the given date"
+    ["atlassian-stash-access-2012-11-30.0.log.bz2",
+     "atlassian-stash-access-2012-11-30.1.log.bz2",
+     "atlassian-stash-access-2012-11-31.0.log.bz2",
+     "atlassian-stash-access-2012-11-31.1.log.bz2",
+     "atlassian-stash-access.log"]
+     (dropUntilDate "2012-11-30" input)
+    where input = ["atlassian-stash-access-2012-11-28.0.log.bz2",
+                     "atlassian-stash-access-2012-11-29.0.log.bz2",
+                     "atlassian-stash-access-2012-11-29.1.log.bz2",
+                     "atlassian-stash-access-2012-11-30.0.log.bz2",
+                     "atlassian-stash-access-2012-11-30.1.log.bz2",
+                     "atlassian-stash-access-2012-11-31.0.log.bz2",
+                     "atlassian-stash-access-2012-11-31.1.log.bz2",
+                     "atlassian-stash-access.log"]
+
 ------------------------------------------------------------------------
 -- Test harness
 
         ,testCase "analyser/isRefAdvertisement http action" test_identifyRefAdvertisement_HttpAction
         ,testCase "analyser/isRefAdvertisement http label" test_identifyRefAdvertisement_HttpLabel
       ],
-      testGroup "Common"
+      testGroup "Files"
       [
-        testCase "common/sortLogFiles" test_sortFilesAsc
+        testCase "files/sortLogFiles" test_sortFilesAsc
+       ,testCase "files/drop last day" test_filterFilesDropLast
+       ,testCase "files/skip until date" test_filterFilesSkipUntilDate
       ],
       testGroup "parser"
       [ testCase "parser/parse empty String" test_logLineParserEmpty