Bryan O'Sullivan avatar Bryan O'Sullivan committed a2e92ce

Portable native UTF-8 decoder gives 3.7x faster decoding

This code is derived from Björn Höhrmann's UTF-8 decoder. Compared
to the original Haskell decoder from cac7dbcbc392, it's between
2.17 and 3.68 times faster. It's even between 1.18 and 3.58 times
faster than the improved Haskell decoder from 71ead801296a.

The x86-specific decoding path gives a substantial win for entirely
and partly ASCII text, e.g. HTML and XML, at the cost of being about
17% slower than the portable C decoder for entirely non-ASCII text.

Comments (0)

Files changed (2)

Data/Text/Encoding.hs

-{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE BangPatterns, ForeignFunctionInterface, MagicHash,
+    UnliftedFFITypes #-}
 -- |
 -- Module      : Data.Text.Encoding
 -- Copyright   : (c) 2008, 2009 Tom Harper,
     ) where
 
 import Control.Exception (evaluate, try)
+import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
 import Data.Bits ((.&.))
 import Data.ByteString as B
 import Data.ByteString.Internal as B
-import Data.ByteString.Unsafe as B
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal (Text(..), textP)
 import Data.Text.UnsafeChar (ord, unsafeWrite)
 import Data.Text.UnsafeShift (shiftL, shiftR)
 import Data.Word (Word8)
+import Foreign.C.Types (CSize)
 import Foreign.ForeignPtr (withForeignPtr)
-import Foreign.Ptr (plusPtr)
-import Foreign.Storable (poke)
+import Foreign.Marshal.Utils (with)
+import Foreign.Ptr (Ptr, plusPtr)
+import Foreign.Storable (peek, poke)
+import GHC.Base (MutableByteArray#)
 import System.IO.Unsafe (unsafePerformIO)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Encoding.Fusion as E
 import qualified Data.Text.Encoding.Utf16 as U16
-import qualified Data.Text.Encoding.Utf8 as U8
 import qualified Data.Text.Fusion as F
 
 -- $strict
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> ByteString -> Text
-decodeUtf8With onErr bs = textP (fst a) 0 (snd a)
+decodeUtf8With onErr (PS fp off len) = textP (fst a) 0 (snd a)
  where
-  a   = A.run2 (A.new len >>= outer 0 0)
-  len = B.length bs
-  idx = B.unsafeIndex bs
+  a = A.run2 (A.new len >>= unsafeIOToST . go)
   desc = "Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream"
-  outer n0 m0 arr = go n0 m0
-   where
-    go !n !m =
-      if m < len
-      then let !x1 = idx m
-               !m1 = m + 1
-               barf = case onErr desc (Just x1) of
-                        Nothing -> go n m1
-                        Just c -> do
-                          w <- unsafeWrite arr n c
-                          go (n+w) m1
-           in if U8.validate1 x1 then do
-                A.unsafeWrite arr n (fromIntegral x1)
-                go (n+1) m1
-              else if m1 < len then
-                let !x2 = idx m1; !m2 = m + 2 in
-                if U8.validate2 x1 x2 then do
-                  w <- unsafeWrite arr n (U8.chr2 x1 x2)
-                  go (n+w) m2
-                else if m2 < len then
-                  let !x3 = idx m2; !m3 = m + 3 in
-                  if U8.validate3 x1 x2 x3 then do
-                    w <- unsafeWrite arr n (U8.chr3 x1 x2 x3)
-                    go (n+w) m3
-                  else if m3 < len then
-                    let !x4 = idx m3 in
-                    if U8.validate4 x1 x2 x3 x4 then do
-                      w <- unsafeWrite arr n (U8.chr4 x1 x2 x3 x4)
-                      go (n+w) (m+4)
-                    else barf
-                  else barf
-                else barf
-              else barf
-      else return (arr,n)
-{-# INLINE[0] decodeUtf8With #-}
+  go dest = withForeignPtr fp $ \ptr ->
+    with (0::CSize) $ \destOffPtr -> do
+      let end = ptr `plusPtr` (off + len)
+          loop curPtr = do
+            curPtr' <- c_decode_utf8 (A.maBA dest) destOffPtr curPtr end
+            if curPtr' == end
+              then do
+                n <- peek destOffPtr
+                return (dest,fromIntegral n)
+              else do
+                x <- peek curPtr'
+                case onErr desc (Just x) of
+                  Nothing -> loop $ curPtr' `plusPtr` 1
+                  Just c -> do
+                    destOff <- peek destOffPtr
+                    w <- unsafeSTToIO $
+                         unsafeWrite dest (fromIntegral destOff) c
+                    poke destOffPtr (destOff + fromIntegral w)
+                    loop $ curPtr' `plusPtr` 1
+      loop (ptr `plusPtr` off)
+{- INLINE[0] decodeUtf8With #-}
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text that is known
 -- to be valid.
 encodeUtf32BE :: Text -> ByteString
 encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 {-# INLINE encodeUtf32BE #-}
+
+foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
+    :: MutableByteArray# s -> Ptr CSize
+    -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
+/*
+ * Copyright (c) 2011 Bryan O'Sullivan <bos@serpentine.com>.
+ *
+ * Portions copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>.
+ *
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ */
+
 #include <string.h>
+#include <stdint.h>
+#include <stdio.h>
 
 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
 		     size_t n)
 {
   return memcmp(a + (aoff<<1), b + (boff<<1), n<<1);
 }
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+  /*
+   * The first part of the table maps bytes to character classes that
+   * to reduce the size of the transition table and create bitmasks.
+   */
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+  /*
+   * The second part is a transition table that maps a combination of
+   * a state of the automaton and a character class to a state.
+   */
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12, 
+};
+
+static inline uint32_t
+decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
+  uint32_t type = utf8d[byte];
+
+  *codep = (*state != UTF8_ACCEPT) ?
+    (byte & 0x3fu) | (*codep << 6) :
+    (0xff >> type) & (byte);
+
+  return *state = utf8d[256 + *state + type];
+}
+
+/*
+ * A best-effort decoder. Runs until it hits either end of input or
+ * the start of an invalid byte sequence.
+ *
+ * At exit, updates *destoff with the next offset to write to, and
+ * returns the next source offset to read from.
+ */
+uint8_t const *
+_hs_text_decode_utf8(uint16_t *dest, size_t *destoff,
+		     const uint8_t const *src, const uint8_t const *srcend)
+{
+  uint16_t *d = dest + *destoff;
+  const uint8_t const *s = src;
+  uint32_t state = UTF8_ACCEPT;
+
+  while (s < srcend) {
+    uint32_t codepoint;
+
+#if defined(__i386__) || defined(__x86_64__)
+    /*
+     * This code will only work on a little-endian system that
+     * supports unaligned loads.
+     *
+     * It gives a substantial speed win on data that is purely or
+     * partly ASCII (e.g. HTML), at only a slight cost on purely
+     * non-ASCII text.
+     */
+
+    if (state == UTF8_ACCEPT) {
+      while (s < srcend - 4) {
+	codepoint = *((uint32_t *) s);
+	if ((codepoint & 0x80808080) != 0)
+	  break;
+	s += 4;
+
+	/*
+	 * Tried 32-bit stores here, but the extra bit-twiddling
+	 * slowed the code down.
+	 */
+
+	*d++ = (uint16_t) (codepoint & 0xff);
+	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
+	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
+	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
+      }
+    }
+#endif
+
+    if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
+      if (state != UTF8_REJECT)
+	continue;
+      break;
+    }
+
+    if (codepoint <= 0xffff)
+      *d++ = (uint16_t) codepoint;
+    else {
+      *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10));
+      *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF));
+    }
+  }
+
+  /* Error recovery - if we're not in a valid finishing state, back up. */
+  if (state != UTF8_ACCEPT)
+    s -= 1;
+
+  *destoff = d - dest;
+
+  return s;
+}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.