Commits

Bryan O'Sullivan  committed b687ad7

A valiant attempt at improving UTF-8 encoding performance.

This didn't actually work - it slowed down aeson encoding by almost 2x!

  • Participants
  • Parent commits 9d0dc6d

Comments (0)

Files changed (3)

File Data/Text/Encoding.hs

 #else
 import Control.Monad.ST (unsafeIOToST, unsafeSTToIO)
 #endif
-import Data.Bits ((.&.))
 import Data.ByteString as B
 import Data.ByteString.Internal as B
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
 import Data.Text.Internal (Text(..), textP)
-import Data.Text.UnsafeChar (ord, unsafeWrite)
-import Data.Text.UnsafeShift (shiftL, shiftR)
+import Data.Text.UnsafeChar (unsafeWrite)
 import Data.Word (Word8)
 import Foreign.C.Types (CSize)
-import Foreign.ForeignPtr (withForeignPtr)
+import Foreign.ForeignPtr (newForeignPtr, withForeignPtr)
 import Foreign.Marshal.Utils (with)
 import Foreign.Ptr (Ptr, plusPtr)
 import Foreign.Storable (peek, poke)
-import GHC.Base (MutableByteArray#)
+import GHC.Base (ByteArray#, MutableByteArray#)
 import System.IO.Unsafe (unsafePerformIO)
 import qualified Data.Text.Array as A
 import qualified Data.Text.Encoding.Fusion as E
-import qualified Data.Text.Encoding.Utf16 as U16
+
 import qualified Data.Text.Fusion as F
 
 -- $strict
 
 -- | Encode text using UTF-8 encoding.
 encodeUtf8 :: Text -> ByteString
-encodeUtf8 (Text arr off len) = unsafePerformIO $ do
-  let size0 = max len 4
-  mallocByteString size0 >>= start size0 off 0
- where
-  start size n0 m0 fp = withForeignPtr fp $ loop n0 m0
-   where
-    loop n1 m1 ptr = go n1 m1
-     where
-      go !n !m
-        | n == off+len = return (PS fp 0 m)
-        | otherwise = do
-            let poke8 k v = poke (ptr `plusPtr` k) (fromIntegral v :: Word8)
-                ensure k act
-                  | size-m >= k = act
-                  | otherwise = {-# SCC "resizeUtf8/ensure" #-} do
-                      let newSize = size `shiftL` 1
-                      fp' <- mallocByteString newSize
-                      withForeignPtr fp' $ \ptr' ->
-                        memcpy ptr' ptr (fromIntegral m)
-                      start newSize n m fp'
-                {-# INLINE ensure #-}
-            case A.unsafeIndex arr n of
-             w| w <= 0x7F  -> poke8 m w >> go (n+1) (m+1)
-              | w <= 0x7FF -> ensure 2 $ do
-                  poke8 m     $ (w `shiftR` 6) + 0xC0
-                  poke8 (m+1) $ (w .&. 0x3f) + 0x80
-                  go (n+1) (m+2)
-              | 0xD800 <= w && w <= 0xDBFF -> ensure 4 $ do
-                  let c = ord $ U16.chr2 w (A.unsafeIndex arr (n+1))
-                  poke8 m     $ (c `shiftR` 18) + 0xF0
-                  poke8 (m+1) $ ((c `shiftR` 12) .&. 0x3F) + 0x80
-                  poke8 (m+2) $ ((c `shiftR` 6) .&. 0x3F) + 0x80
-                  poke8 (m+3) $ (c .&. 0x3F) + 0x80
-                  go (n+2) (m+4)
-              | otherwise -> ensure 3 $ do
-                  poke8 m     $ (w `shiftR` 12) + 0xE0
-                  poke8 (m+1) $ ((w `shiftR` 6) .&. 0x3F) + 0x80
-                  poke8 (m+2) $ (w .&. 0x3F) + 0x80
-                  go (n+1) (m+3)
+encodeUtf8 (Text arr off len)
+    | len == 0  = empty
+    | otherwise = unsafePerformIO $ do
+  with (fromIntegral len ::CSize) $ \lenPtr -> do
+    dptr <- c_encode_utf8 lenPtr (A.aBA arr) (fromIntegral off)
+    fp <- newForeignPtr c_free_finalizer dptr
+    dlen <- peek lenPtr
+    return (PS fp 0 (fromIntegral dlen))
 
 -- | Decode text from little endian UTF-16 encoding.
 decodeUtf16LEWith :: OnDecodeError -> ByteString -> Text
 foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
     :: MutableByteArray# s -> Ptr CSize
     -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
+
+foreign import ccall unsafe "_hs_text_encode_utf8" c_encode_utf8
+    :: Ptr CSize -> ByteArray# -> CSize -> IO (Ptr Word8)

File cbits/cbits.c

  * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
  */
 
-#include <string.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
 		     size_t n)
 
   return s;
 }
+
+uint8_t *_hs_text_encode_utf8(size_t *plen,
+			      const uint16_t const *src, size_t srcoff)
+{
+  size_t srclen = *plen;
+  size_t destlen = srclen > 4 ? srclen : 4;
+  uint8_t *dest = malloc(destlen);
+  uint8_t *d = dest;
+  const uint8_t const *dend = dest + destlen;
+  const uint16_t const *s = src + srcoff;
+  const uint16_t const *srcend = s + srclen;
+  
+  do {
+    const uint16_t w = *s++;
+    if (w < 0x80) {
+      if (dend - d < 1) {
+	destlen *= 2;
+	uint8_t *newdest = realloc(dest, destlen);
+	d = newdest + (d - dest);
+	dest = newdest;
+	dend = newdest + destlen;
+      }
+      *d++ = w;
+      while (s < srcend && d < dend && *s < 0x80) {
+	*d++ = *s++;
+      }
+    } else if (w < 0x800) {
+      if (dend - d < 2) {
+	destlen *= 2;
+	uint8_t *newdest = realloc(dest, destlen);
+	d = newdest + (d - dest);
+	dest = newdest;
+	dend = newdest + destlen;
+      }
+      *d++ = 0xc0 | (w >> 6);
+      *d++ = 0x80 | (w & 0x3f);
+    } else if (w >= 0xd800 && w < 0xdc00) {
+      if (dend - d < 4) {
+	destlen *= 2;
+	uint8_t *newdest = realloc(dest, destlen);
+	d = newdest + (d - dest);
+	dest = newdest;
+	dend = newdest + destlen;
+      }
+      const uint32_t c = (((((uint32_t) w) - 0xd800) << 10) |
+			  (((uint32_t) *s++) - 0xdc00)) + 0x10000;
+      *d++ = 0xf0 | (c >> 18);
+      *d++ = 0x80 | ((c >> 12) & 0x3f);
+      *d++ = 0x80 | ((c >> 6) & 0x3f);
+      *d++ = 0x80 | (c & 0x3f);
+    } else {
+      if (dend - d < 3) {
+	destlen *= 2;
+	uint8_t *newdest = realloc(dest, destlen);
+	d = newdest + (d - dest);
+	dest = newdest;
+	dend = newdest + destlen;
+      }
+      *d++ = 0xe0 | (w >> 12);
+      *d++ = 0x80 | ((w >> 6) & 0x3f);
+      *d++ = 0x80 | (w & 0x3f);
+    }
+  } while (s < srcend);
+
+  *plen = d - dest;
+
+  return dest;
+}
 
 library
   c-sources: cbits/cbits.c
+  cc-options: -Wall
 
   exposed-modules:
     Data.Text