Commits

Bryan O'Sullivan committed 61d94dd Merge

Merge pull request #18 from hvr/pull-req-16

Add new `Data.Text.Encoding.decodeLatin1` ISO-8859-1 decoding function

Comments (0)

Files changed (4)

Data/Text/Encoding.hs

     -- * Decoding ByteStrings to Text
     -- $strict
       decodeASCII
+    , decodeLatin1
     , decodeUtf8
     , decodeUtf16LE
     , decodeUtf16BE
 import Data.ByteString as B
 import Data.ByteString.Internal as B
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
-import Data.Text.Internal (Text(..))
+import Data.Text.Internal (Text(..), textP)
 import Data.Text.Private (runText)
 import Data.Text.UnsafeChar (ord, unsafeWrite)
 import Data.Text.UnsafeShift (shiftL, shiftR)
 -- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
 -- encoded text.
 --
--- This function is deprecated.  Use 'decodeUtf8' instead.
+-- This function is deprecated.  Use 'decodeLatin1' instead.
 decodeASCII :: ByteString -> Text
 decodeASCII = decodeUtf8
 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
 
+-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
+--
+-- 'decodeLatin1' is semantically equivalent to
+--  @Data.Text.pack . Data.ByteString.Char8.unpack@
+decodeLatin1 :: ByteString -> Text
+decodeLatin1 (PS fp off len) = textP a 0 len
+ where
+  a = A.run (A.new len >>= unsafeIOToST . go)
+  go dest = withForeignPtr fp $ \ptr -> do
+    c_decode_latin1 (A.maBA dest) (ptr `plusPtr` off) (ptr `plusPtr` (off+len))
+    return dest
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> ByteString -> Text
 decodeUtf8With onErr (PS fp off len) = runText $ \done -> do
 foreign import ccall unsafe "_hs_text_decode_utf8" c_decode_utf8
     :: MutableByteArray# s -> Ptr CSize
     -> Ptr Word8 -> Ptr Word8 -> IO (Ptr Word8)
+
+foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
+    :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO ()

Data/Text/Lazy/Encoding.hs

     -- * Decoding ByteStrings to Text
     -- $strict
       decodeASCII
+    , decodeLatin1
     , decodeUtf8
     , decodeUtf16LE
     , decodeUtf16BE
 -- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
 -- encoded text.
 --
--- This function is deprecated.  Use 'decodeUtf8' instead.
+-- This function is deprecated.  Use 'decodeLatin1' instead.
 decodeASCII :: B.ByteString -> Text
 decodeASCII = decodeUtf8
 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
 
+-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
+decodeLatin1 :: B.ByteString -> Text
+decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks
+
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
 decodeUtf8With onErr bs0 = fast bs0
 }
 
 /*
+ * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode
+ * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to
+ * an UTF16 array
+ */
+void
+_hs_text_decode_latin1(uint16_t *dest, const uint8_t const *src,
+                       const uint8_t const *srcend)
+{
+  const uint8_t *p = src;
+
+#if defined(__i386__) || defined(__x86_64__)
+  /* This optimization works on a little-endian systems by using
+     (aligned) 32-bit loads instead of 8-bit loads
+   */
+
+  /* consume unaligned prefix */
+  while (p != srcend && (uintptr_t)p & 0x3)
+    *dest++ = *p++;
+
+  /* iterate over 32-bit aligned loads */
+  while (p < srcend - 3) {
+    const uint32_t w = *((const uint32_t *)p);
+
+    *dest++ =  w        & 0xff;
+    *dest++ = (w >> 8)  & 0xff;
+    *dest++ = (w >> 16) & 0xff;
+    *dest++ = (w >> 24) & 0xff;
+
+    p += 4;
+  }
+#endif
+
+  /* handle unaligned suffix */
+  while (p != srcend)
+    *dest++ = *p++;
+}
+
+/*
  * A best-effort decoder. Runs until it hits either end of input or
  * the start of an invalid byte sequence.
  *

tests/Tests/Properties.hs

 import Test.Framework.Providers.QuickCheck2 (testProperty)
 import qualified Data.Bits as Bits (shiftL, shiftR)
 import qualified Data.ByteString as B
+import qualified Data.ByteString.Lazy as BL
 import qualified Data.List as L
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as E
 tl_chunk_unchunk    = (TL.fromChunks . TL.toChunks) `eq` id
 tl_from_to_strict   = (TL.fromStrict . TL.toStrict) `eq` id
 
+-- Note: this silently truncates code-points > 255 to 8-bit due to 'B.pack'
+encodeL1 :: T.Text -> B.ByteString
+encodeL1 = B.pack . map (fromIntegral . fromEnum) . T.unpack
+encodeLazyL1 :: TL.Text -> BL.ByteString
+encodeLazyL1 = BL.fromChunks . map encodeL1 . TL.toChunks
+
 t_ascii t    = E.decodeASCII (E.encodeUtf8 a) == a
     where a  = T.map (\c -> chr (ord c `mod` 128)) t
 tl_ascii t   = EL.decodeASCII (EL.encodeUtf8 a) == a
     where a  = TL.map (\c -> chr (ord c `mod` 128)) t
+t_latin1 t   = E.decodeLatin1 (encodeL1 a) == a
+    where a  = T.map (\c -> chr (ord c `mod` 256)) t
+tl_latin1 t  = EL.decodeLatin1 (encodeLazyL1 a) == a
+    where a  = TL.map (\c -> chr (ord c `mod` 256)) t
 t_utf8       = forAll genUnicode $ (E.decodeUtf8 . E.encodeUtf8) `eq` id
 t_utf8'      = forAll genUnicode $ (E.decodeUtf8' . E.encodeUtf8) `eq` (id . Right)
 tl_utf8      = forAll genUnicode $ (EL.decodeUtf8 . EL.encodeUtf8) `eq` id
     testGroup "transcoding" [
       testProperty "t_ascii" t_ascii,
       testProperty "tl_ascii" tl_ascii,
+      testProperty "t_latin1" t_latin1,
+      testProperty "tl_latin1" tl_latin1,
       testProperty "t_utf8" t_utf8,
       testProperty "t_utf8'" t_utf8',
       testProperty "tl_utf8" tl_utf8,