Commits

Anonymous committed c36259c

Add new `Data.Text.Encoding.decodeLatin1` ISO-8859-1 decoding function

This has about an order of magnitude lower runtime and/or call-overhead as
compared to the more generic `text-icu` approach, e.g. according to criterion
with GHC 7.4.1 on Linux/x86_64:

* 12 times faster for empty input strings,
* 6 times faster for 16-byte strings, and
* 3 times faster for 1024-byte strings.

`decodeLatin1` is also faster compared to using `decodeUtf8` for plain ASCII:

* 2 times faster for 16-byte input strings,
* ~40% faster for 1024-byte strings.

Comments (0)

Files changed (3)

Data/Text/Encoding.hs

     -- * Decoding ByteStrings to Text
     -- $strict
       decodeASCII
+    , decodeLatin1
     , decodeUtf8
     , decodeUtf16LE
     , decodeUtf16BE
 import Data.ByteString as B
 import Data.ByteString.Internal as B
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
-import Data.Text.Internal (Text(..))
+import Data.Text.Internal (Text(..), textP)
 import Data.Text.Private (runText)
 import Data.Text.UnsafeChar (ord, unsafeWrite)
 import Data.Text.UnsafeShift (shiftL, shiftR)
 -- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
 -- encoded text.
 --
--- This function is deprecated.  Use 'decodeUtf8' instead.
+-- This function is deprecated.  Use 'decodeLatin1' instead.
 decodeASCII :: ByteString -> Text
 decodeASCII = decodeUtf8
 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
 
+-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
+--
+-- 'decodeLatin1' is semantically equivalent to
+--  @Data.Text.pack . Data.ByteString.Char8.unpack@
+decodeLatin1 :: ByteString -> Text
+decodeLatin1 (PS fp off len) = textP a 0 len
+ where
+  a = A.run (A.new len >>= unsafeIOToST . go)
+  go dest = withForeignPtr fp $ \ptr -> do
+    c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
+    return dest
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> ByteString -> Text
 decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
 foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
     :: MutableByteArray# s -> Ptr CSize
     -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
+
+foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
+    :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()
 }
 
 /*
+ * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
+ * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
+ * an UTF16 array
+ */
+void
+_hs_text_decode_latin1(uint16_t *dest, const uint8_t const *src,
+                       const uint8_t const *srcend)
+{
+  const uint8_t *p = src;
+  while (p != srcend)
+    *dest++ = *p++;
+}
+
+/*
  * A best-effort decoder. Runs until it hits either end of input or
  * the start of an invalid byte sequence.
  *

tests/Tests/Properties.hs

 tl_chunk_unchunk    = (TL.fromChunks . TL.toChunks) `eq` id
 tl_from_to_strict   = (TL.fromStrict . TL.toStrict) `eq` id
 
+-- Note: this silently truncates code-points > 255 to 8-bit due to 'B.pack'
+encodeL1 :: T.Text -> B.ByteString
+encodeL1 = B.pack . map (fromIntegral . fromEnum) . T.unpack
+
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) == a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
 tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) == a
     where a  = TL.map (\c -> chr (ord c `mod` 128)) t
+t_latin1 t   = E.decodeLatin1 (encodeL1 a) == a
+    where a  = T.map (\c -> chr (ord c `mod` 256)) t
 t_utf8       = forAll genUnicode $ (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = forAll genUnicode $ (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 tl_utf8      = forAll genUnicode $ (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
     testGroup "transcoding" [
       testProperty "t_ascii" t_ascii,
       testProperty "tl_ascii" tl_ascii,
+      testProperty "t_latin1" t_latin1,
       testProperty "t_utf8" t_utf8,
       testProperty "t_utf8'" t_utf8',
       testProperty "tl_utf8" tl_utf8,