Commits

Bryan O'Sullivan committed 0648acc Merge

Merge pull request #5 from jaspervdj/master

Further work on benchmarks

Comments (0)

Files changed (43)

tests/benchmarks/python/case_map.py

-#!/usr/bin/env python
-
-import utils, sys
-
-for f in sys.argv[1:]:
-    t = utils.benchmark(lambda: utils.with_utf8_file(f, lambda c: c.upper()))
-    sys.stderr.write('{0}: {1}\n'.format(f, t))

tests/benchmarks/python/cut.py

+#!/usr/bin/env python
+
+import utils, sys, codecs
+
+def cut(filename, l, r):
+    content = open(filename, encoding='utf-8')
+    for line in content:
+        print(line[l:r])
+
+for f in sys.argv[1:]:
+    t = utils.benchmark(lambda: cut(f, 20, 40))
+    sys.stderr.write('{0}: {1}\n'.format(f, t))

tests/benchmarks/python/file_read.py

-#!/usr/bin/env python
-
-import utils, sys
-
-for f in sys.argv[1:]:
-    t = utils.benchmark(lambda: utils.with_utf8_file(f, lambda c: len(c)))
-    sys.stderr.write('{0}: {1}\n'.format(f, t))

tests/benchmarks/python/sort.py

 #!/usr/bin/env python
 
-import utils, sys
+import utils, sys, codecs
 
-def sort(string):
-    lines = string.splitlines()
+def sort(filename):
+    content = open(filename, encoding='utf-8').read()
+    lines = content.splitlines()
     lines.sort()
-    return '\n'.join(lines)
+    print('\n'.join(lines))
 
 for f in sys.argv[1:]:
-    t = utils.benchmark(lambda: sys.stdout.write(
-                    				utils.with_utf8_file(f,sort).encode('utf-8'))
-                    				)
+    t = utils.benchmark(lambda: sort(f))
     sys.stderr.write('{0}: {1}\n'.format(f, t))
-

tests/benchmarks/python/strip_brackets.py

-#!/usr/bin/env python
-
-import utils, sys
-
-def strip_brackets(string):
-    d = 0
-    out = ''
-    for c in string:
-        if c == '{' or c == '[': d += 1
-
-        if d > 0:
-            out += ' '
-        else:
-            out += c
-
-        if c == '}' or c == ']': d -= 1
-
-    return out
-
-for f in sys.argv[1:]:
-    t = utils.benchmark(lambda: utils.with_utf8_file(f, strip_brackets))
-    sys.stderr.write('{0}: {1}\n'.format(f, t))

tests/benchmarks/python/strip_tags.py

+#!/usr/bin/env python
+
+import utils, sys
+
+def strip_tags(filename):
+    string = open(filename, encoding='utf-8').read()
+
+    d = 0
+    out = []
+
+    for c in string:
+        if c == '<': d += 1
+
+        if d > 0:
+            out += ' '
+        else:
+            out += c
+
+        if c == '>': d -= 1
+
+    print(''.join(out))
+
+for f in sys.argv[1:]:
+    t = utils.benchmark(lambda: strip_tags(f))
+    sys.stderr.write('{0}: {1}\n'.format(f, t))

tests/benchmarks/python/utils.py

         sys.stderr.write('Run {0}: {1}\n'.format(i, result))
         total += result
     return total / runs
-
-def with_utf8_file(filename, f):
-    contents = open(filename).read().decode('utf-8')
-    return f(contents)

tests/benchmarks/python/word_count.py

-#!/usr/bin/env python
-
-import utils, sys
-
-def word_count(string):
-    freqs = {}
-    for w in string.split():
-        w = w.lower()
-        if freqs.get(w):
-            freqs[w] += 1
-        else:
-            freqs[w] = 1
-    return freqs
-
-for f in sys.argv[1:]:
-    t = utils.benchmark(lambda: utils.with_utf8_file(f, word_count))
-    sys.stderr.write('{0}: {1}\n'.format(f, t))

tests/benchmarks/ruby/case_map.rb

-#!/usr/bin/env ruby
-
-require './utils.rb'
-
-ARGV.each do |f|
-  t = benchmark { with_utf8_file(f) { |c| c.upcase } }
-  STDERR.puts "#{f}: #{t}"
-end

tests/benchmarks/ruby/cut.rb

+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def cut(filename, l, r)
+  File.open(filename, 'r:utf-8') do |file|
+    file.each_line do |line|
+      puts line[l, r - l]
+    end
+  end
+end
+
+ARGV.each do |f|
+  t = benchmark { cut(f, 20, 40) }
+  STDERR.puts "#{f}: #{t}"
+end

tests/benchmarks/ruby/file_read.rb

-#!/usr/bin/env ruby
-
-require './utils.rb'
-
-ARGV.each do |f|
-  t = benchmark { with_utf8_file(f) { |c| c.size } }
-  STDERR.puts "#{f}: #{t}"
-end

tests/benchmarks/ruby/fold.rb

+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def fold(filename, max_width)
+  File.open(filename, 'r:utf-8') do |file|
+    # Words in this paragraph
+    paragraph = []
+
+    file.each_line do |line|
+      # If we encounter an empty line, we reformat and dump the current
+      # paragraph
+      if line.strip.empty?
+        puts fold_paragraph(paragraph, max_width)
+        puts
+        paragraph = []
+      # Otherwise, we append the words found in the line to the paragraph
+      else
+        paragraph.concat line.split
+      end
+    end
+
+    # Last paragraph
+    puts fold_paragraph(paragraph, max_width) unless paragraph.empty?
+  end
+end
+
+# Fold a single paragraph to the desired width
+def fold_paragraph(paragraph, max_width)
+  # Gradually build our output
+  str, *rest = paragraph
+  width = str.length
+
+  rest.each do |word|
+    if width + word.length + 1 <= max_width
+      str << ' ' << word
+      width += word.length + 1
+    else
+      str << "\n" << word
+      width = word.length
+    end
+  end
+
+  str
+end
+
+ARGV.each do |f|
+  t = benchmark { fold(f, 80) }
+  STDERR.puts "#{f}: #{t}"
+end

tests/benchmarks/ruby/sort.rb

 
 require './utils.rb'
 
-def sort(str)
-  str.lines.sort.join
+def sort(filename)
+  File.open(filename, 'r:utf-8') do |file|
+    content = file.read
+    puts content.lines.sort.join
+  end
 end
 
 ARGV.each do |f|
-  t = benchmark do
-    with_utf8_file(f) { |c| puts sort(c) }
-  end
+  t = benchmark { sort(f) }
   STDERR.puts "#{f}: #{t}"
 end

tests/benchmarks/ruby/strip_brackets.rb

-#!/usr/bin/env ruby
-
-require './utils.rb'
-
-def strip_brackets(str)
-  d = 0
-  out = ''
-
-  str.each_char do |c|
-    d += 1 if c == '{' || c == '['
-    out << if d > 0 then ' ' else c end
-    d -= 1 if c == '}' || c == ']'
-  end
-
-  out
-end
-
-ARGV.each do |f|
-  t = benchmark { with_utf8_file(f) { |c| strip_brackets(c) } }
-  STDERR.puts "#{f}: #{t}"
-end

tests/benchmarks/ruby/strip_tags.rb

+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def strip_tags(filename)
+  File.open(filename, 'r:utf-8') do |file|
+    str = file.read
+
+    d = 0
+
+    str.each_char do |c|
+      d += 1 if c == '<'
+      putc(if d > 0 then ' ' else c end)
+      d -= 1 if c == '>'
+    end
+  end
+end
+
+ARGV.each do |f|
+  t = benchmark { strip_tags(f) }
+  STDERR.puts "#{f}: #{t}"
+end

tests/benchmarks/ruby/utils.rb

 
   total / runs 
 end
-
-def with_utf8_file(filename)
-  File.open(filename, 'r:utf-8') do |file|
-    yield file.read
-  end
-end

tests/benchmarks/ruby/word_count.rb

-#!/usr/bin/env ruby
-
-require './utils.rb'
-
-def word_count(str)
-  freqs = Hash.new 0
-  str.split.each do |w|
-    freqs[w.downcase] += 1
-  end
-  freqs
-end
-
-ARGV.each do |f|
-  t = benchmark { with_utf8_file(f) { |c| word_count(c) } }
-  STDERR.puts "#{f}: #{t}"
-end

tests/benchmarks/src/Data/Text/Benchmarks.hs

     ( main
     ) where
 
-import Criterion.Main (Benchmark, defaultMain)
+import Criterion.Main (Benchmark, defaultMain, bgroup)
 import System.FilePath ((</>))
 import System.IO (IOMode (WriteMode), openFile, hSetEncoding, utf8)
 
 import qualified Data.Text.Benchmarks.Builder as Builder
-import qualified Data.Text.Benchmarks.CaseMap as CaseMap
-import qualified Data.Text.Benchmarks.Cut as Cut
 import qualified Data.Text.Benchmarks.DecodeUtf8 as DecodeUtf8
 import qualified Data.Text.Benchmarks.EncodeUtf8 as EncodeUtf8
 import qualified Data.Text.Benchmarks.Equality as Equality
-import qualified Data.Text.Benchmarks.FileIndices as FileIndices
 import qualified Data.Text.Benchmarks.FileRead as FileRead
 import qualified Data.Text.Benchmarks.FoldLines as FoldLines
-import qualified Data.Text.Benchmarks.HtmlCombinator as HtmlCombinator
-import qualified Data.Text.Benchmarks.Ordering as Ordering
 import qualified Data.Text.Benchmarks.Pure as Pure
 import qualified Data.Text.Benchmarks.ReadNumbers as ReadNumbers
 import qualified Data.Text.Benchmarks.Replace as Replace
-import qualified Data.Text.Benchmarks.Sort as Sort
-import qualified Data.Text.Benchmarks.StripBrackets as StripBrackets
-import qualified Data.Text.Benchmarks.WordCount as WordCount
+import qualified Data.Text.Benchmarks.Search as Search
+import qualified Data.Text.Benchmarks.WordFrequencies as WordFrequencies
+
+import qualified Data.Text.Benchmarks.Programs.BigTable as Programs.BigTable
+import qualified Data.Text.Benchmarks.Programs.Cut as Programs.Cut
+import qualified Data.Text.Benchmarks.Programs.Fold as Programs.Fold
+import qualified Data.Text.Benchmarks.Programs.Sort as Programs.Sort
+import qualified Data.Text.Benchmarks.Programs.StripTags as Programs.StripTags
+import qualified Data.Text.Benchmarks.Programs.Throughput as Programs.Throughput
 
 main :: IO ()
 main = benchmarks >>= defaultMain
 benchmarks = do
     sink <- openFile "/dev/null" WriteMode
     hSetEncoding sink utf8
-    sequence
+
+    -- Traditional benchmarks
+    bs <- sequence
         [ Builder.benchmark
-        , CaseMap.benchmark (tf "russian.txt") sink
-        , Cut.benchmark (tf "russian.txt") sink 30 60
         , DecodeUtf8.benchmark (tf "russian.txt")
-        , EncodeUtf8.benchmark sink "επανάληψη 竺法蘭共譯"
+        , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
         , Equality.benchmark (tf "japanese.txt")
-        , FileIndices.benchmark (tf "russian.txt") "принимая"
         , FileRead.benchmark (tf "russian.txt")
         , FoldLines.benchmark (tf "russian.txt")
-        , HtmlCombinator.benchmark sink
-        , Ordering.benchmark (tf "russian.txt")
         , Pure.benchmark (tf "japanese.txt")
         , ReadNumbers.benchmark (tf "numbers.txt")
-        , Replace.benchmark (tf "russian.txt") sink "принимая" "своем"
-        , Sort.benchmark (tf "russian.txt") sink
-        , StripBrackets.benchmark (tf "russian.txt") sink
-        , WordCount.benchmark (tf "russian.txt")
+        , Replace.benchmark (tf "russian.txt") "принимая" "своем"
+        , Search.benchmark (tf "russian.txt") "принимая"
+        , WordFrequencies.benchmark (tf "russian.txt")
         ]
+
+    -- Program-like benchmarks
+    ps <- bgroup "Programs" `fmap` sequence
+        [ Programs.BigTable.benchmark sink
+        , Programs.Cut.benchmark (tf "russian.txt") sink 20 40
+        , Programs.Fold.benchmark (tf "russian.txt") sink
+        , Programs.Sort.benchmark (tf "russian.txt") sink
+        , Programs.StripTags.benchmark (tf "yiwiki.xml") sink
+        , Programs.Throughput.benchmark (tf "russian.txt") sink
+        ]
+
+    return $ bs ++ [ps]
   where
     -- Location of a test file
     tf = ("../text/test" </>)

tests/benchmarks/src/Data/Text/Benchmarks/Builder.hs

 -- | Testing the internal builder monoid
 --
+-- Tested in this benchmark:
+--
+-- * Concatenating many small strings using a builder
+--
 {-# LANGUAGE OverloadedStrings #-}
 module Data.Text.Benchmarks.Builder
     ( benchmark

tests/benchmarks/src/Data/Text/Benchmarks/CaseMap.hs

--- | This benchmark converts a number of UTF-8 encoded files to uppercase
---
-module Data.Text.Benchmarks.CaseMap
-    ( benchmark
-    ) where
-
-import Criterion (Benchmark, bench)
-import System.IO (Handle)
-import qualified Data.ByteString as B
-import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
-
-benchmark :: FilePath -> Handle -> IO Benchmark
-benchmark fp sink = return $ bench "CaseMap" $
-    B.readFile fp >>= B.hPutStr sink . T.encodeUtf8 . T.toUpper . T.decodeUtf8

tests/benchmarks/src/Data/Text/Benchmarks/Cut.hs

--- | Cut into a file, selecting certain columns (e.g. lines 10 to 40)
---
-module Data.Text.Benchmarks.Cut
-    ( benchmark
-    ) where
-
-import Criterion (Benchmark, bgroup, bench)
-import System.IO (Handle, hPutStr)
-import qualified Data.ByteString as B
-import qualified Data.ByteString.Char8 as BC
-import qualified Data.ByteString.Lazy as BL
-import qualified Data.ByteString.Lazy.Char8 as BLC
-import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
-import qualified Data.Text.IO as T
-import qualified Data.Text.Lazy as TL
-import qualified Data.Text.Lazy.Encoding as TL
-import qualified Data.Text.Lazy.IO as TL
-
-benchmark :: FilePath -> Handle -> Int -> Int -> IO Benchmark
-benchmark p sink from to = return $ bgroup "Cut"
-    [ bench' "String" string
-    , bench' "ByteString" byteString
-    , bench' "LazyByteString" lazyByteString
-    , bench' "Text" text
-    , bench' "LazyText" lazyText
-    , bench' "TextByteString" textByteString
-    , bench' "LazyTextByteString" lazyTextByteString
-    ]
-  where
-    bench' n s = bench n (s p sink from to)
-
-string :: FilePath -> Handle -> Int -> Int -> IO ()
-string fp sink from to = do
-    s <- readFile fp
-    hPutStr sink $ cut s
-  where
-    cut = unlines . map (take (to - from) . drop from) . lines
-
-byteString :: FilePath -> Handle -> Int -> Int -> IO ()
-byteString fp sink from to = do
-    bs <- B.readFile fp
-    B.hPutStr sink $ cut bs
-  where
-    cut = BC.unlines . map (B.take (to - from) . B.drop from) . BC.lines
-
-lazyByteString :: FilePath -> Handle -> Int -> Int -> IO ()
-lazyByteString fp sink from to = do
-    bs <- BL.readFile fp
-    BL.hPutStr sink $ cut bs
-  where
-    cut = BLC.unlines . map (BL.take (to' - from') . BL.drop from') . BLC.lines
-    from' = fromIntegral from
-    to' = fromIntegral to
-
-text :: FilePath -> Handle -> Int -> Int -> IO ()
-text fp sink from to = do
-    t <- T.readFile fp
-    T.hPutStr sink $ cut t
-  where
-    cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
-
-lazyText :: FilePath -> Handle -> Int -> Int -> IO ()
-lazyText fp sink from to = do
-    t <- TL.readFile fp
-    TL.hPutStr sink $ cut t
-  where
-    cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
-    from' = fromIntegral from
-    to' = fromIntegral to
-
-textByteString :: FilePath -> Handle -> Int -> Int -> IO ()
-textByteString fp sink from to = do
-    t <- T.decodeUtf8 `fmap` B.readFile fp
-    B.hPutStr sink $ T.encodeUtf8 $ cut t
-  where
-    cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
-
-lazyTextByteString :: FilePath -> Handle -> Int -> Int -> IO ()
-lazyTextByteString fp sink from to = do
-    t <- TL.decodeUtf8 `fmap` BL.readFile fp
-    BL.hPutStr sink $ TL.encodeUtf8 $ cut t
-  where
-    cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
-    from' = fromIntegral from
-    to' = fromIntegral to

tests/benchmarks/src/Data/Text/Benchmarks/DecodeUtf8.hs

+-- | Test decoding of UTF-8
+--
+-- Tested in this benchmark:
+--
+-- * Decoding bytes using UTF-8
+--
+-- In some tests:
+--
+-- * Taking the length of the result
+--
+-- * Taking the init of the result
+--
+-- The latter are used for testing stream fusion.
+--
 module Data.Text.Benchmarks.DecodeUtf8
     ( benchmark
     ) where
 
-import Control.DeepSeq (rnf)
-import Criterion (Benchmark, bgroup, bench)
-import System.IO (IOMode (ReadMode), openFile, hGetContents, hSetEncoding, utf8)
+import Criterion (Benchmark, bgroup, bench, nf)
 import qualified Codec.Binary.UTF8.Generic as U8
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
-import qualified Data.Text.IO as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as TL
-import qualified Data.Text.Lazy.IO as TL
 
 benchmark :: FilePath -> IO Benchmark
-benchmark fp = return $ bgroup "DecodeUtf8"
-    [ bench "Strict" $ do
-        bs <- B.readFile fp
-        rnf (T.decodeUtf8 bs) `seq` return ()
-
-    , bench "StrictLength" $ do
-        bs <- B.readFile fp
-        rnf (T.length $ T.decodeUtf8 bs) `seq` return ()
-
-    , bench "StrictInitLength" $ do
-        bs <- B.readFile fp
-        rnf (T.length $ T.init $ T.decodeUtf8 bs) `seq` return ()
-
-    , bench "StrictIO" $ do
-        h <- openFile fp ReadMode
-        hSetEncoding h utf8
-        t <- T.hGetContents h
-        rnf t `seq` return ()
-
-    , bench "StrictLengthIO" $ do
-        h <- openFile fp ReadMode
-        hSetEncoding h utf8
-        t <- T.hGetContents h
-        rnf (T.length t) `seq` return ()
-
-    , bench "Lazy" $ do
-        bs <- BL.readFile fp
-        rnf (TL.decodeUtf8 bs) `seq` return ()
-
-    , bench "LazyLength" $ do
-        bs <- BL.readFile fp
-        rnf (TL.length $ TL.decodeUtf8 bs) `seq` return ()
-
-    , bench "LazyInitLength" $ do
-        bs <- BL.readFile fp
-        rnf (TL.length $ TL.init $ TL.decodeUtf8 bs) `seq` return ()
-
-    , bench "LazyIO" $ do
-        h <- openFile fp ReadMode
-        hSetEncoding h utf8
-        t <- TL.hGetContents h
-        rnf t `seq` return ()
-
-    , bench "LazyLengthIO" $ do
-        h <- openFile fp ReadMode
-        hSetEncoding h utf8
-        t <- TL.hGetContents h
-        rnf (TL.length t) `seq` return ()
-
-    , bench "String" $ do
-        h <- openFile fp ReadMode
-        hSetEncoding h utf8
-        t <- hGetContents h
-        rnf t `seq` return ()
-
-    , bench "StringLength" $ do
-        h <- openFile fp ReadMode
-        hSetEncoding h utf8
-        t <- hGetContents h
-        rnf (length t) `seq` return ()
-
-    , bench "LazyStringUtf8" $ do
-        s <- U8.toString `fmap` BL.readFile fp
-        rnf s `seq` return ()
-
-    , bench "LazyStringUtf8Length" $ do
-        s <- U8.toString `fmap` BL.readFile fp
-        rnf (length s) `seq` return ()
-
-    , bench "StrictStringUtf8" $ do
-        s <- U8.toString `fmap` B.readFile fp
-        rnf s `seq` return ()
-
-    , bench "StrictStringUtf8Length" $ do
-        s <- U8.toString `fmap` B.readFile fp
-        rnf (length s) `seq` return ()
-    ]
+benchmark fp = do
+    bs  <- B.readFile fp
+    lbs <- BL.readFile fp
+    return $ bgroup "DecodeUtf8"
+        [ bench "Strict" $ nf T.decodeUtf8 bs
+        , bench "StrictLength" $ nf (T.length . T.decodeUtf8) bs
+        , bench "StrictInitLength" $ nf (T.length . T.init . T.decodeUtf8) bs
+        , bench "Lazy" $ nf TL.decodeUtf8 lbs
+        , bench "LazyLength" $ nf (TL.length . TL.decodeUtf8) lbs
+        , bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs
+        , bench "StrictStringUtf8" $ nf U8.toString bs
+        , bench "StrictStringUtf8Length" $ nf (length . U8.toString) bs
+        , bench "LazyStringUtf8" $ nf U8.toString lbs
+        , bench "LazyStringUtf8Length" $ nf (length . U8.toString) lbs
+        ]

tests/benchmarks/src/Data/Text/Benchmarks/EncodeUtf8.hs

+-- | UTF-8 encode a text
+--
+-- Tested in this benchmark:
+--
+-- * Replicating a string a number of times
+--
+-- * UTF-8 encoding it
+--
 module Data.Text.Benchmarks.EncodeUtf8
     ( benchmark
     ) where
 
-import Criterion (Benchmark, bgroup, bench)
-import System.IO (Handle, hPutStr)
+import Criterion (Benchmark, bgroup, bench, whnf)
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
-import qualified Data.Text.IO as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as TL
-import qualified Data.Text.Lazy.IO as TL
 
-benchmark :: Handle -> String -> IO Benchmark
-benchmark sink string = return $ bgroup "EncodeUtf8"
-    [ bench "String" $ hPutStr sink $ concat $ replicate k string
-    , bench "Text" $ T.hPutStr sink $ T.replicate k text
-    , bench "TextLazy" $ TL.hPutStr sink $
-        TL.replicate (fromIntegral k) lazyText
-    , bench "TextByteString" $ B.hPutStr sink $
-        T.encodeUtf8 $ T.replicate k text
-    , bench "TextByteStringLazy" $ BL.hPutStr sink $
-        TL.encodeUtf8 $ TL.replicate (fromIntegral k) lazyText
-    ]
+benchmark :: String -> IO Benchmark
+benchmark string = do
+    return $ bgroup "EncodeUtf8"
+        [ bench "Text"     $ whnf (B.length . T.encodeUtf8)   text
+        , bench "LazyText" $ whnf (BL.length . TL.encodeUtf8) lazyText
+        ]
   where
     -- The string in different formats
-    text = T.pack string
-    lazyText = TL.pack string
+    text = T.replicate k $ T.pack string
+    lazyText = TL.replicate (fromIntegral k) $ TL.pack string
 
     -- Amount
     k = 100000

tests/benchmarks/src/Data/Text/Benchmarks/Equality.hs

 -- | Compare a string with a copy of itself that is identical except
 -- for the last character.
 --
+-- Tested in this benchmark:
+--
+-- * Comparison of strings (Eq instance)
+--
 module Data.Text.Benchmarks.Equality
-    (
-      benchmark
+    ( benchmark
     ) where
 
 import Criterion (Benchmark, bgroup, bench, whnf)

tests/benchmarks/src/Data/Text/Benchmarks/FileIndices.hs

--- | Search for a pattern in a file, find the number of occurences
---
-module Data.Text.Benchmarks.FileIndices
-    ( benchmark
-    ) where
-
-import Control.Exception (evaluate)
-import Criterion (Benchmark, bench, bgroup)
-import qualified Data.ByteString as B
-import qualified Data.ByteString.Lazy as BL
-import qualified Data.ByteString.Lazy.Search as BL
-import qualified Data.Text.Lazy as TL
-import qualified Data.Text.Lazy.Encoding as TL
-import qualified Data.Text.Lazy.IO as TL
-
-benchmark :: FilePath -> TL.Text -> IO Benchmark
-benchmark fp t = return $ bgroup "FileIndices"
-    [ bench "LazyText"           $ TL.readFile fp >>= evaluate . text t
-    , bench "LazyByteString"     $ BL.readFile fp >>= evaluate . byteString b
-    ]
-  where
-    b = B.concat $ BL.toChunks $ TL.encodeUtf8 t
-
-text :: TL.Text -> TL.Text -> Int
-text needle = fromIntegral . TL.count needle
-
-byteString :: B.ByteString -> BL.ByteString -> Int
-byteString needle = length . BL.indices needle

tests/benchmarks/src/Data/Text/Benchmarks/FileRead.hs

 -- | Benchmarks simple file reading
 --
+-- Tested in this benchmark:
+--
+-- * Reading a file from the disk
+--
 module Data.Text.Benchmarks.FileRead
     ( benchmark
     ) where

tests/benchmarks/src/Data/Text/Benchmarks/FoldLines.hs

--- | Fold over the lines in a file
+-- | Read a file line-by-line using handles, and perform a fold over the lines.
+-- The fold is used here to calculate the number of lines in the file.
+--
+-- Tested in this benchmark:
+--
+-- * Buffered, line-based IO
 --
 {-# LANGUAGE BangPatterns #-}
 module Data.Text.Benchmarks.FoldLines
 import qualified Data.Text.IO as T
 
 benchmark :: FilePath -> IO Benchmark
-benchmark fp = return $ bgroup "FoldLines"
-    [ bench "Text" $ withHandle $ foldLinesT (\n _ -> n + 1) (0 :: Int)
+benchmark fp = return $ bgroup "ReadLines"
+    [ bench "Text"       $ withHandle $ foldLinesT (\n _ -> n + 1) (0 :: Int)
     , bench "ByteString" $ withHandle $ foldLinesB (\n _ -> n + 1) (0 :: Int)
     ]
   where
         hClose h
         return x
 
--- Text line fold
+-- | Text line fold
 --
 foldLinesT :: (a -> T.Text -> a) -> a -> Handle -> IO a
 foldLinesT f z0 h = go z0

tests/benchmarks/src/Data/Text/Benchmarks/HtmlCombinator.hs

--- | Create a large HTML table and dump it to a handle
---
-{-# LANGUAGE OverloadedStrings #-}
-module Data.Text.Benchmarks.HtmlCombinator
-    ( benchmark
-    ) where
-
-import Criterion (Benchmark, bench)
-import Data.Monoid (mappend, mconcat)
-import Data.Text.Lazy.Builder (Builder, fromText, toLazyText)
-import Data.Text.Lazy.IO (hPutStr)
-import System.IO (Handle)
-import qualified Data.Text as T
-
-benchmark :: Handle -> IO Benchmark
-benchmark sink = return $ bench "HtmlCombinator" $ do
-    hPutStr sink "Content-Type: text/html\n\n<table>"
-    hPutStr sink . toLazyText . makeTable =<< rows
-    hPutStr sink "</table>"
-  where
-    -- We provide the number of rows in IO so the builder value isn't shared
-    -- between the benchmark samples.
-    rows :: IO Int
-    rows = return 20000
-    {-# NOINLINE rows #-}
-
-makeTable :: Int -> Builder
-makeTable n = mconcat $ replicate n $ mconcat $ map makeCol [1 .. 50]
-
-makeCol :: Int -> Builder
-makeCol 1 = fromText "<tr><td>1</td>"
-makeCol 50 = fromText "<td>50</td></tr>"
-makeCol i = fromText "<td>" `mappend` (fromInt i `mappend` fromText "</td>")
-
-fromInt :: Int -> Builder
-fromInt = fromText . T.pack . show

tests/benchmarks/src/Data/Text/Benchmarks/Ordering.hs

--- | For every 1000th line of a file, check how many lines in the file are
--- lexicographically smaller
---
-module Data.Text.Benchmarks.Ordering
-    ( benchmark
-    ) where
-
-import Control.Exception (evaluate)
-import Criterion (Benchmark, bgroup, bench)
-import qualified Data.ByteString.Char8 as B
-import qualified Data.ByteString.Lazy.Char8 as BL
-import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
-import qualified Data.Text.Lazy as TL
-import qualified Data.Text.Lazy.Encoding as TL
-
-benchmark :: FilePath -> IO Benchmark
-benchmark fp = return $ bgroup "Ordering"
-    [ bench "ByteString" $ B.readFile fp >>=
-        evaluate . numSmaller . B.lines
-    , bench "LazyByteString" $ BL.readFile fp >>=
-        evaluate . numSmaller . BL.lines
-
-    , bench "Text" $ B.readFile fp >>=
-        evaluate . numSmaller . T.lines . T.decodeUtf8
-    , bench "TextFusion" $ B.readFile fp >>=
-        evaluate . numSmallerFusion . T.lines . T.decodeUtf8
-    , bench "LazyText" $ BL.readFile fp >>=
-        evaluate . numSmaller . TL.lines . TL.decodeUtf8
-
-    , bench "String" $ readFile fp >>=
-        evaluate . numSmaller . lines
-    ]
-
--- | Take every Nth item from a list
---
-every :: Int -> [a] -> [a]
-every k = go k
-  where
-    go n (x : xs)
-        | n < k     = go (n+1) xs
-        | otherwise = x : go 1 xs
-    go _ _          = []
-
--- | Benchmark logic
---
-numSmaller :: (Ord a) => [a] -> Int
-numSmaller ls = sum . map f $ every 1000 ls
-  where
-    f x = length . filter ((== GT) . compare x) $ ls
-
--- | Test a comparison that could be fused: compare (toLower a) (toLower b)
--- Currently, this funcion performs very badly!
---
-numSmallerFusion :: [T.Text] -> Int
-numSmallerFusion ls = sum . map f $ every 1000 ls
-  where
-    f x = length . filter ((== GT) . compare (T.toLower x) . T.toLower) $ ls

tests/benchmarks/src/Data/Text/Benchmarks/Programs/BigTable.hs

+-- | Create a large HTML table and dump it to a handle
+--
+-- Tested in this benchmark:
+--
+-- * Creating a large HTML document using a builder
+--
+-- * Writing to a handle
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Text.Benchmarks.Programs.BigTable
+    ( benchmark
+    ) where
+
+import Criterion (Benchmark, bench)
+import Data.Monoid (mappend, mconcat)
+import Data.Text.Lazy.Builder (Builder, fromText, toLazyText)
+import Data.Text.Lazy.IO (hPutStr)
+import System.IO (Handle)
+import qualified Data.Text as T
+
+benchmark :: Handle -> IO Benchmark
+benchmark sink = return $ bench "BigTable" $ do
+    hPutStr sink "Content-Type: text/html\n\n<table>"
+    hPutStr sink . toLazyText . makeTable =<< rows
+    hPutStr sink "</table>"
+  where
+    -- We provide the number of rows in IO so the builder value isn't shared
+    -- between the benchmark samples.
+    rows :: IO Int
+    rows = return 20000
+    {-# NOINLINE rows #-}
+
+makeTable :: Int -> Builder
+makeTable n = mconcat $ replicate n $ mconcat $ map makeCol [1 .. 50]
+
+makeCol :: Int -> Builder
+makeCol 1 = fromText "<tr><td>1</td>"
+makeCol 50 = fromText "<td>50</td></tr>"
+makeCol i = fromText "<td>" `mappend` (fromInt i `mappend` fromText "</td>")
+
+fromInt :: Int -> Builder
+fromInt = fromText . T.pack . show

tests/benchmarks/src/Data/Text/Benchmarks/Programs/Cut.hs

+-- | Cut into a file, selecting certain columns (e.g. columns 10 to 40)
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Splitting into lines
+--
+-- * Taking a number of characters from the lines
+--
+-- * Joining the lines
+--
+-- * Writing back to a handle
+--
+module Data.Text.Benchmarks.Programs.Cut
+    ( benchmark
+    ) where
+
+import Criterion (Benchmark, bgroup, bench)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Char8 as BLC
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.Encoding as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> Handle -> Int -> Int -> IO Benchmark
+benchmark p sink from to = return $ bgroup "Cut"
+    [ bench' "String" string
+    , bench' "ByteString" byteString
+    , bench' "LazyByteString" lazyByteString
+    , bench' "Text" text
+    , bench' "LazyText" lazyText
+    , bench' "TextByteString" textByteString
+    , bench' "LazyTextByteString" lazyTextByteString
+    ]
+  where
+    bench' n s = bench n (s p sink from to)
+
+string :: FilePath -> Handle -> Int -> Int -> IO ()
+string fp sink from to = do
+    s <- readFile fp
+    hPutStr sink $ cut s
+  where
+    cut = unlines . map (take (to - from) . drop from) . lines
+
+byteString :: FilePath -> Handle -> Int -> Int -> IO ()
+byteString fp sink from to = do
+    bs <- B.readFile fp
+    B.hPutStr sink $ cut bs
+  where
+    cut = BC.unlines . map (B.take (to - from) . B.drop from) . BC.lines
+
+lazyByteString :: FilePath -> Handle -> Int -> Int -> IO ()
+lazyByteString fp sink from to = do
+    bs <- BL.readFile fp
+    BL.hPutStr sink $ cut bs
+  where
+    cut = BLC.unlines . map (BL.take (to' - from') . BL.drop from') . BLC.lines
+    from' = fromIntegral from
+    to' = fromIntegral to
+
+text :: FilePath -> Handle -> Int -> Int -> IO ()
+text fp sink from to = do
+    t <- T.readFile fp
+    T.hPutStr sink $ cut t
+  where
+    cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
+
+lazyText :: FilePath -> Handle -> Int -> Int -> IO ()
+lazyText fp sink from to = do
+    t <- TL.readFile fp
+    TL.hPutStr sink $ cut t
+  where
+    cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
+    from' = fromIntegral from
+    to' = fromIntegral to
+
+textByteString :: FilePath -> Handle -> Int -> Int -> IO ()
+textByteString fp sink from to = do
+    t <- T.decodeUtf8 `fmap` B.readFile fp
+    B.hPutStr sink $ T.encodeUtf8 $ cut t
+  where
+    cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines
+
+lazyTextByteString :: FilePath -> Handle -> Int -> Int -> IO ()
+lazyTextByteString fp sink from to = do
+    t <- TL.decodeUtf8 `fmap` BL.readFile fp
+    BL.hPutStr sink $ TL.encodeUtf8 $ cut t
+  where
+    cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines
+    from' = fromIntegral from
+    to' = fromIntegral to

tests/benchmarks/src/Data/Text/Benchmarks/Programs/Fold.hs

+-- | Benchmark which reformats paragraphs, like the @fold@ unix utility.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Splitting into paragraphs
+--
+-- * Reformatting the paragraphs to a certain line width
+--
+-- * Concatenating the results using the text builder
+--
+-- * Writing back to a handle
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Text.Benchmarks.Programs.Fold
+    ( benchmark
+    ) where
+
+import Data.List (foldl')
+import Data.List (intersperse)
+import Data.Monoid (mempty, mappend, mconcat)
+import System.IO (Handle)
+import Criterion (Benchmark, bench)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.Builder as TLB
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark i o = return $
+    bench "Fold" $ T.readFile i >>= TL.hPutStr o . fold 80
+
+-- | We represent a paragraph by a word list
+--
+type Paragraph = [T.Text]
+
+-- | Fold a text
+--
+fold :: Int -> T.Text -> TL.Text
+fold maxWidth = TLB.toLazyText . mconcat .
+    intersperse "\n\n" . map (foldParagraph maxWidth) . paragraphs
+
+-- | Fold a paragraph
+--
+foldParagraph :: Int -> Paragraph -> TLB.Builder
+foldParagraph _    []       = mempty
+foldParagraph max' (w : ws) = fst $ foldl' go (TLB.fromText w, T.length w) ws
+  where
+    go (builder, width) word
+        | width + len + 1 <= max' =
+            (builder `mappend` " " `mappend` word', width + len + 1)
+        | otherwise =
+            (builder `mappend` "\n" `mappend` word', len)
+      where
+        word' = TLB.fromText word
+        len = T.length word
+
+-- | Divide a text into paragraphs
+--
+paragraphs :: T.Text -> [Paragraph]
+paragraphs = splitParagraphs . map T.words . T.lines
+  where
+    splitParagraphs ls = case break null ls of
+        ([], []) -> []
+        (p,  []) -> [concat p]
+        (p,  lr) -> concat p : splitParagraphs (dropWhile null lr)

tests/benchmarks/src/Data/Text/Benchmarks/Programs/Sort.hs

+-- | This benchmark sorts the lines of a file, like the @sort@ unix utility.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Splitting into lines
+--
+-- * Sorting the lines
+--
+-- * Joining the lines
+--
+-- * Writing back to a handle
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Text.Benchmarks.Programs.Sort
+    ( benchmark
+    ) where
+
+import Criterion (Benchmark, bgroup, bench)
+import Data.Monoid (mconcat)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Char8 as BLC
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.List as L
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+import qualified Data.Text.Lazy.Builder as TLB
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark i o = return $ bgroup "Sort"
+    [ bench "String" $ readFile i >>= hPutStr o . string
+    , bench "ByteString" $ B.readFile i >>= B.hPutStr o . byteString
+    , bench "LazyByteString" $ BL.readFile i >>= BL.hPutStr o . lazyByteString
+    , bench "Text" $ T.readFile i >>= T.hPutStr o . text
+    , bench "LazyText" $ TL.readFile i >>= TL.hPutStr o . lazyText
+    , bench "TextBuilder" $ T.readFile i >>= TL.hPutStr o . textBuilder
+    ]
+
+string :: String -> String
+string = unlines . L.sort . lines
+
+byteString :: B.ByteString -> B.ByteString
+byteString = BC.unlines . L.sort . BC.lines
+
+lazyByteString :: BL.ByteString -> BL.ByteString
+lazyByteString = BLC.unlines . L.sort . BLC.lines
+
+text :: T.Text -> T.Text
+text = T.unlines . L.sort . T.lines
+
+lazyText :: TL.Text -> TL.Text
+lazyText = TL.unlines . L.sort . TL.lines
+
+-- | Text variant using a builder monoid for the final concatenation
+--
+textBuilder :: T.Text -> TL.Text
+textBuilder = TLB.toLazyText . mconcat . L.intersperse (TLB.singleton '\n') .
+    map TLB.fromText . L.sort . T.lines

tests/benchmarks/src/Data/Text/Benchmarks/Programs/StripTags.hs

+-- | Program to replace the text between HTML tags (<>) by whitespace
+--
+-- This program was originally contributed by Petr Prokhorenkov.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Replacing text between HTML tags (<>) with whitespace
+--
+-- * Writing back to a handle
+--
+{-# OPTIONS_GHC -fspec-constr-count=5 #-}
+module Data.Text.Benchmarks.Programs.StripTags
+    ( benchmark
+    ) where
+     
+import Criterion (Benchmark, bgroup, bench)
+import Data.List (mapAccumL)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as BC
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark i o = return $ bgroup "StripTags"
+    [ bench "String" $ readFile i >>= hPutStr o . string
+    , bench "ByteString" $ B.readFile i >>= B.hPutStr o . byteString
+    , bench "Text" $
+        B.readFile i >>= B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8
+    ]
+
+string :: String -> String
+string = snd . mapAccumL step 0
+
+text :: T.Text -> T.Text
+text = snd . T.mapAccumL step 0
+
+byteString :: B.ByteString -> B.ByteString
+byteString = snd . BC.mapAccumL step 0
+
+step :: Int -> Char -> (Int, Char)
+step d c
+    | d > 0 || d' > 0 = (d', ' ')
+    | otherwise       = (d', c)
+  where
+    d' = d + depth c
+    depth '>' = 1
+    depth '<' = -1
+    depth _   = 0

tests/benchmarks/src/Data/Text/Benchmarks/Programs/Throughput.hs

+-- | This benchmark simply reads and writes a file using the various string
+-- libraries. The point of it is that we can make better estimations on how
+-- much time the other benchmarks spend doing IO.
+--
+-- Note that we expect ByteStrings to be a whole lot faster, since they do not
+-- do any actual encoding/decoding here, while String and Text do have UTF-8
+-- encoding/decoding.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Encoding/decoding, where the string type requires it
+--
+-- * Writing back to a handle
+--
+module Data.Text.Benchmarks.Programs.Throughput
+    ( benchmark
+    ) where
+
+import Criterion (Benchmark, bgroup, bench)
+import System.IO (Handle, hPutStr)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark fp sink = return $ bgroup "Throughput"
+    [ bench "String" $ readFile fp >>= hPutStr sink
+    , bench "ByteString" $ B.readFile fp >>= B.hPutStr sink
+    , bench "LazyByteString" $ BL.readFile fp >>= BL.hPutStr sink
+    , bench "Text" $ T.readFile fp >>= T.hPutStr sink
+    , bench "LazyText" $ TL.readFile fp >>= TL.hPutStr sink
+    ]

tests/benchmarks/src/Data/Text/Benchmarks/Pure.hs

 -- | Benchmarks various pure functions from the Text library
 --
+-- Tested in this benchmark:
+--
+-- * Most pure functions defined on the string types
+--
 {-# LANGUAGE BangPatterns, GADTs, MagicHash #-}
 {-# OPTIONS_GHC -fno-warn-orphans #-}
 module Data.Text.Benchmarks.Pure

tests/benchmarks/src/Data/Text/Benchmarks/ReadNumbers.hs

 -- | Read numbers from a file with a just a number on each line, find the
--- minimum of those numbers.
+-- minimum of those numbers. The file contains different kinds of numbers:
+--
+-- * Decimals
+--
+-- * Hexadecimals
+--
+-- * Floating point numbers
+--
+-- * Floating point numbers in scientific notation
+--
+-- The different benchmarks will only take into account the values they can
+-- parse.
+--
+-- Tested in this benchmark:
+--
+-- * Lexing/parsing of different numerical types
 --
 module Data.Text.Benchmarks.ReadNumbers
     ( benchmark
     ) where
 
-import Control.Exception (evaluate)
-import Criterion (Benchmark, bgroup, bench)
+import Criterion (Benchmark, bgroup, bench, whnf)
 import Data.List (foldl')
 import Numeric (readDec, readFloat, readHex)
 import qualified Data.ByteString.Char8 as B
 import qualified Data.ByteString.Lex.Double as B
 import qualified Data.ByteString.Lex.Lazy.Double as BL
 import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
 import qualified Data.Text.Lazy as TL
-import qualified Data.Text.Lazy.Encoding as TL
+import qualified Data.Text.Lazy.IO as TL
 import qualified Data.Text.Lazy.Read as TL
 import qualified Data.Text.Read as T
 
 benchmark :: FilePath -> IO Benchmark
-benchmark fp = return $ bgroup "ReadNumbers"
-    [ bench "DecimalString" $ readFile fp >>= evaluate .
-        int . string readDec . lines
-    , bench "HexadecimalString" $ readFile fp >>= evaluate .
-        int . string readHex . lines
-    , bench "DoubleString" $ readFile fp >>= evaluate .
-        double . string readFloat . lines
+benchmark fp = do
+    -- Read all files into lines: string, text, lazy text, bytestring, lazy
+    -- bytestring
+    s <- lines `fmap` readFile fp
+    t <- T.lines `fmap` T.readFile fp
+    tl <- TL.lines `fmap` TL.readFile fp
+    b <- B.lines `fmap` B.readFile fp
+    bl <- BL.lines `fmap` BL.readFile fp
+    return $ bgroup "ReadNumbers"
+        [ bench "DecimalString"     $ whnf (int . string readDec) s
+        , bench "HexadecimalString" $ whnf (int . string readHex) s
+        , bench "DoubleString"      $ whnf (double . string readFloat) s
 
-    , bench "DecimalText" $ B.readFile fp >>= evaluate .
-        int . text (T.signed T.decimal) . T.lines . T.decodeUtf8
-    , bench "HexadecimalText" $ B.readFile fp >>= evaluate .
-        int . text (T.signed T.hexadecimal) . T.lines . T.decodeUtf8
-    , bench "DoubleText" $ B.readFile fp >>= evaluate .
-        double . text T.double . T.lines . T.decodeUtf8
-    , bench "RationalText" $ B.readFile fp >>= evaluate .
-        double . text T.rational . T.lines . T.decodeUtf8
+        , bench "DecimalText"     $ whnf (int . text (T.signed T.decimal)) t
+        , bench "HexadecimalText" $ whnf (int . text (T.signed T.hexadecimal)) t
+        , bench "DoubleText"      $ whnf (double . text T.double) t
+        , bench "RationalText"    $ whnf (double . text T.rational) t
 
-    , bench "DecimalLazyText" $ BL.readFile fp >>= evaluate .
-        int . text (TL.signed TL.decimal) . TL.lines . TL.decodeUtf8
-    , bench "HexadecimalLazyText" $ BL.readFile fp >>= evaluate .
-        int . text (TL.signed TL.hexadecimal) . TL.lines . TL.decodeUtf8
-    , bench "DoubleLazyText" $ BL.readFile fp >>= evaluate .
-        double . text TL.double . TL.lines . TL.decodeUtf8
-    , bench "RationalLazyText" $ BL.readFile fp >>= evaluate .
-        double . text TL.rational . TL.lines . TL.decodeUtf8
+        , bench "DecimalLazyText" $
+            whnf (int . text (TL.signed TL.decimal)) tl
+        , bench "HexadecimalLazyText" $
+            whnf (int . text (TL.signed TL.hexadecimal)) tl
+        , bench "DoubleLazyText" $
+            whnf (double . text TL.double) tl
+        , bench "RationalLazyText" $
+            whnf (double . text TL.rational) tl
 
-    , bench "DecimalByteString" $ B.readFile fp >>= evaluate .
-        int . byteString B.readInt . B.lines
-    , bench "DoubleByteString" $ B.readFile fp >>= evaluate .
-        double . byteString B.readDouble . B.lines
+        , bench "DecimalByteString" $ whnf (int . byteString B.readInt) b
+        , bench "DoubleByteString"  $ whnf (double . byteString B.readDouble) b
 
-    , bench "DecimalLazyByteString" $ BL.readFile fp >>= evaluate .
-        int . byteString BL.readInt . BL.lines
-    , bench "DoubleLazyByteString" $ BL.readFile fp >>= evaluate .
-        double . byteString BL.readDouble . BL.lines
-    ]
+        , bench "DecimalLazyByteString" $
+            whnf (int . byteString BL.readInt) bl
+        , bench "DoubleLazyByteString" $
+            whnf (double . byteString BL.readDouble) bl
+        ]
   where
     -- Used for fixing types
     int :: Int -> Int

tests/benchmarks/src/Data/Text/Benchmarks/Replace.hs

--- | Replace a string by another string in a file
+-- | Replace a string by another string
+--
+-- Tested in this benchmark:
+--
+-- * Search and replace of a pattern in a text
 --
 module Data.Text.Benchmarks.Replace
     ( benchmark
     ) where
 
-import Criterion (Benchmark, bgroup, bench)
-import System.IO (Handle)
+import Criterion (Benchmark, bgroup, bench, nf)
 import qualified Data.ByteString.Char8 as B
 import qualified Data.ByteString.Lazy as BL
 import qualified Data.ByteString.Lazy.Search as BL
 import qualified Data.Text.Lazy.Encoding as TL
 import qualified Data.Text.Lazy.IO as TL
 
-benchmark :: FilePath -> Handle -> String -> String -> IO Benchmark
-benchmark fp sink pat sub = return $ bgroup "Replace"
-    -- We have benchmarks for lazy text and lazy bytestrings. We also benchmark
-    -- without the acual replacement, so we can get an idea of what time is
-    -- spent on IO and computations.
-    [ bench "LazyText" $ TL.readFile fp >>=
-        TL.hPutStr sink . TL.replace tpat tsub
-    , bench "LazyTextNull" $ TL.readFile fp >>= TL.hPutStr sink
-
-    , bench "LazyByteString" $ BL.readFile fp >>=
-        BL.hPutStr sink . BL.replace bpat bsub
-    , bench "LazyByteStringNull" $ BL.readFile fp >>= BL.hPutStr sink
-    ]
+benchmark :: FilePath -> String -> String -> IO Benchmark
+benchmark fp pat sub = do
+    tl <- TL.readFile fp
+    bl <- BL.readFile fp
+    return $ bgroup "Replace"
+        [ bench "LazyText"       $ nf (TL.length . TL.replace tpat tsub) tl
+        , bench "LazyByteString" $ nf (BL.length . BL.replace bpat bsub) bl
+        ]
   where
     tpat = TL.pack pat
     tsub = TL.pack sub

tests/benchmarks/src/Data/Text/Benchmarks/Search.hs

+-- | Search for a pattern in a file, find the number of occurrences
+--
+-- Tested in this benchmark:
+--
+-- * Searching all occurrences of a pattern using library routines
+--
+module Data.Text.Benchmarks.Search
+    ( benchmark
+    ) where
+
+import Criterion (Benchmark, bench, bgroup, whnf)
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Search as BL
+import qualified Data.ByteString.Search as B
+import qualified Data.Text as T
+import qualified Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+
+benchmark :: FilePath -> T.Text -> IO Benchmark
+benchmark fp needleT = do
+    b  <- B.readFile fp
+    bl <- BL.readFile fp
+    t  <- T.readFile fp
+    tl <- TL.readFile fp
+    return $ bgroup "FileIndices"
+        [ bench "ByteString"     $ whnf (byteString needleB)     b
+        , bench "LazyByteString" $ whnf (lazyByteString needleB) bl
+        , bench "Text"           $ whnf (text needleT)           t
+        , bench "LazyText"       $ whnf (lazyText needleTL)      tl
+        ]
+  where
+    needleB = T.encodeUtf8 needleT
+    needleTL = TL.fromChunks [needleT]
+
+byteString :: B.ByteString -> B.ByteString -> Int
+byteString needle = length . B.indices needle
+
+lazyByteString :: B.ByteString -> BL.ByteString -> Int
+lazyByteString needle = length . BL.indices needle
+
+text :: T.Text -> T.Text -> Int
+text = T.count
+
+lazyText :: TL.Text -> TL.Text -> Int
+lazyText needle = fromIntegral . TL.count needle

tests/benchmarks/src/Data/Text/Benchmarks/Sort.hs

--- | Implements the unix @sort@ program
---
-{-# LANGUAGE OverloadedStrings #-}
-module Data.Text.Benchmarks.Sort
-    ( benchmark
-    ) where
-
-import Criterion (Benchmark, bench)
-import Data.Monoid (mconcat)
-import System.IO (Handle)
-import qualified Data.ByteString as B
-import qualified Data.ByteString.Lazy as BL
-import qualified Data.List as L
-import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
-import qualified Data.Text.Lazy as TL
-import qualified Data.Text.Lazy.Builder as TLB
-import qualified Data.Text.Lazy.Encoding as TL
-
-benchmark :: FilePath -> Handle -> IO Benchmark
-benchmark fp sink = return $ bench "Sort" $ do
-    t <- T.decodeUtf8 `fmap` B.readFile fp
-    BL.hPutStr sink $ TL.encodeUtf8 $ sort t
-
-sort :: T.Text -> TL.Text
-sort = TLB.toLazyText . mconcat . L.intersperse (TLB.fromText "\n") .
-    map TLB.fromText . L.sort . T.lines

tests/benchmarks/src/Data/Text/Benchmarks/StripBrackets.hs

--- | Program to replace everything between brackets by spaces
---
--- This program was originally contributed by Petr Prokhorenkov.
---
-module Data.Text.Benchmarks.StripBrackets
-    ( benchmark
-    ) where
-     
-import Criterion (Benchmark, bench)
-import System.IO (Handle)
-import qualified Data.ByteString as B
-import qualified Data.Text as T
-import qualified Data.Text.Encoding as T
-
-benchmark :: FilePath -> Handle -> IO Benchmark
-benchmark fp sink = return $ bench "StripBrackets" $ do
-    t <- T.decodeUtf8 `fmap` B.readFile fp
-    B.hPutStr sink $ T.encodeUtf8 $ stripBrackets t
-
-stripBrackets :: T.Text -> T.Text
-stripBrackets = snd . T.mapAccumL f (0 :: Int)
-  where
-    f depth c =
-        let depth' = depth + d' c
-            c' | depth > 0 || depth' > 0 = ' '
-               | otherwise = c
-        in (depth', c')
-
-    d' '{' = 1
-    d' '[' = 1
-    d' '}' = -1
-    d' ']' = -1
-    d' _   = 0

tests/benchmarks/src/Data/Text/Benchmarks/WordCount.hs

--- | A word frequence count program
---
-module Data.Text.Benchmarks.WordCount
-    ( benchmark
-    ) where
-
-import Control.Exception (evaluate)
-import Criterion (Benchmark, bench)
-import Data.List (foldl')
-import Data.Map (Map)
-import qualified Data.Map as M
-import qualified Data.Text as T
-import qualified Data.Text.IO as T
-
-benchmark :: FilePath -> IO Benchmark
-benchmark fp = return $ bench "WordCount" $ do
-    t <- T.readFile fp
-    evaluate $ M.size $ wordCount t
-
-wordCount :: T.Text -> Map T.Text Int
-wordCount =
-    foldl' (\m k -> M.insertWith (+) k 1 m) M.empty . map T.toLower . T.words

tests/benchmarks/src/Data/Text/Benchmarks/WordFrequencies.hs

+-- | A word frequency count using the different string types
+--
+-- Tested in this benchmark:
+--
+-- * Splitting into words
+--
+-- * Converting to lowercase
+--
+-- * Comparing: Eq/Ord instances
+--
+module Data.Text.Benchmarks.WordFrequencies
+    ( benchmark
+    ) where
+
+import Criterion (Benchmark, bench, bgroup, whnf)
+import Data.Char (toLower)
+import Data.List (foldl')
+import Data.Map (Map)
+import qualified Data.ByteString.Char8 as B
+import qualified Data.Map as M
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+
+benchmark :: FilePath -> IO Benchmark
+benchmark fp = do
+    s <- readFile fp
+    b <- B.readFile fp
+    t <- T.readFile fp
+    return $ bgroup "WordFrequencies"
+        [ bench "String"     $ whnf (frequencies . words . map toLower)     s
+        , bench "ByteString" $ whnf (frequencies . B.words . B.map toLower) b
+        , bench "Text"       $ whnf (frequencies . T.words . T.toLower)     t
+        ]
+
+frequencies :: Ord a => [a] -> Map a Int
+frequencies = foldl' (\m k -> M.insertWith (+) k 1 m) M.empty
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.