text / Data / Text / Lazy / Encoding.hs

The default branch has multiple heads

deian 23bec39 



Bryan O'Sullivan 935c76d 

Bryan O'Sullivan 9071533 
Bryan O'Sullivan 935c76d 

Bryan O'Sullivan 7dfeb13 
Bryan O'Sullivan 935c76d 






Bryan O'Sullivan 6d549dc 

Bryan O'Sullivan 935c76d 



Bryan O'Sullivan 6d549dc 
John Millikin b0a0b16 
Herbert Valerio … cad006c 
John Millikin b0a0b16 
John Millikin 9a8885a 



Bryan O'Sullivan dedeed7 



John Millikin 9a8885a 
Bryan O'Sullivan 4a127e8 
John Millikin 9a8885a 



Bryan O'Sullivan 935c76d 

Bryan O'Sullivan b73783a 
John Millikin b0a0b16 



Bryan O'Sullivan 935c76d 

Bryan O'Sullivan dedeed7 
Bryan O'Sullivan 54ee4c2 
Bryan O'Sullivan dedeed7 
Bryan O'Sullivan 54ee4c2 
Bryan O'Sullivan dedeed7 
Bryan O'Sullivan 54ee4c2 
John Millikin b0a0b16 
Bryan O'Sullivan d03a77d 
Bryan O'Sullivan 54ee4c2 

John Millikin b0a0b16 
Bryan O'Sullivan d03a77d 
Bryan O'Sullivan 5153e29 

Bryan O'Sullivan 6d549dc 












Bryan O'Sullivan e996bc7 


Herbert Valerio … cad006c 
John Millikin b0a0b16 
Bryan O'Sullivan e996bc7 

John Millikin b0a0b16 
Herbert Valerio … cad006c 



Bryan O'Sullivan 5153e29 
John Millikin b0a0b16 
Bryan O'Sullivan 83171b7 
Bryan O'Sullivan 54ee4c2 
































Bryan O'Sullivan 83171b7 
Bryan O'Sullivan 54ee4c2 
Bryan O'Sullivan dedeed7 

Bryan O'Sullivan 6d549dc 

Bryan O'Sullivan dedeed7 

Bryan O'Sullivan 6d549dc 
Bryan O'Sullivan 83171b7 


Bryan O'Sullivan 54ee4c2 




Bryan O'Sullivan dedeed7 
















John Millikin b0a0b16 
Bryan O'Sullivan d03a77d 

John Millikin b0a0b16 
John Millikin 9a8885a 





Bryan O'Sullivan 6d549dc 



John Millikin 9a8885a 









Bryan O'Sullivan 6d549dc 



John Millikin 9a8885a 



John Millikin b0a0b16 









John Millikin 9a8885a 





Bryan O'Sullivan 6d549dc 



John Millikin 9a8885a 









Bryan O'Sullivan 6d549dc 



John Millikin 9a8885a 



John Millikin b0a0b16 







{-# LANGUAGE BangPatterns,CPP #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module      : Data.Text.Lazy.Encoding
-- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
--               duncan@haskell.org
-- Stability   : experimental
-- Portability : portable
--
-- Functions for converting lazy 'Text' values to and from lazy
-- 'ByteString', using several standard encodings.
--
-- To gain access to a much larger variety of encodings, use the
-- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>

module Data.Text.Lazy.Encoding
    (
    -- * Decoding ByteStrings to Text
    -- $strict
      decodeASCII
    , decodeLatin1
    , decodeUtf8
    , decodeUtf16LE
    , decodeUtf16BE
    , decodeUtf32LE
    , decodeUtf32BE

    -- ** Catchable failure
    , decodeUtf8'

    -- ** Controllable error handling
    , decodeUtf8With
    , decodeUtf16LEWith
    , decodeUtf16BEWith
    , decodeUtf32LEWith
    , decodeUtf32BEWith

    -- * Encoding Text to ByteStrings
    , encodeUtf8
    , encodeUtf16LE
    , encodeUtf16BE
    , encodeUtf32LE
    , encodeUtf32BE
    ) where

import Control.Exception (evaluate, try)
import Data.Bits ((.&.))
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)
import System.IO.Unsafe (unsafePerformIO)
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Internal as B
import qualified Data.ByteString.Unsafe as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.Text.Lazy.Encoding.Fusion as E
import qualified Data.Text.Lazy.Fusion as F

-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
-- encoded in one of the Unicode Transformation Formats (UTF) operate
-- in a /strict/ mode: each will throw an exception if given invalid
-- input.
--
-- Each function has a variant, whose name is suffixed with -'With',
-- that gives greater control over the handling of decoding errors.
-- For instance, 'decodeUtf8' will throw an exception, but
-- 'decodeUtf8With' allows the programmer to determine what to do on a
-- decoding error.

-- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
--
-- This function is deprecated.  Use 'decodeLatin1' instead.
decodeASCII :: B.ByteString -> Text
decodeASCII = decodeUtf8
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}

-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
decodeLatin1 :: B.ByteString -> Text
decodeLatin1 = foldr (chunk . TE.decodeLatin1) empty . B.toChunks

-- | Decode a 'ByteString' containing UTF-8 encoded text.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With onErr bs0 = fast bs0
  where
    decode = TE.decodeUtf8With onErr
    fast (B.Chunk p ps) | isComplete p = chunk (decode p) (fast ps)
                        | otherwise    = chunk (decode h) (slow t ps)
      where (h,t) = S.splitAt pivot p
            pivot | at 1      = len-1
                  | at 2      = len-2
                  | otherwise = len-3
            len  = S.length p
            at n = len >= n && S.unsafeIndex p (len-n) .&. 0xc0 == 0xc0
    fast B.Empty = empty
    slow i bs = {-# SCC "decodeUtf8With'/slow" #-}
                case B.uncons bs of
                  Just (w,bs') | isComplete i' -> chunk (decode i') (fast bs')
                               | otherwise     -> slow i' bs'
                    where i' = S.snoc i w
                  Nothing -> case S.uncons i of
                               Just (j,i') ->
                                 case onErr desc (Just j) of
                                   Nothing -> slow i' bs
                                   Just c  -> Chunk (T.singleton c) (slow i' bs)
                               Nothing ->
                                 case onErr desc Nothing of
                                   Nothing -> empty
                                   Just c  -> Chunk (T.singleton c) empty
    isComplete bs = {-# SCC "decodeUtf8With'/isComplete" #-}
                    ix 1 .&. 0x80 == 0 ||
                    (len >= 2 && ix 2 .&. 0xe0 == 0xc0) ||
                    (len >= 3 && ix 3 .&. 0xf0 == 0xe0) ||
                    (len >= 4 && ix 4 .&. 0xf8 == 0xf0)
      where len = S.length bs
            ix n = S.unsafeIndex bs (len-n)
    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
{-# INLINE[0] decodeUtf8With #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code.  For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode
{-# INLINE[0] decodeUtf8 #-}

-- This rule seems to cause performance loss.
{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
   forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text..
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result.  If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
decodeUtf8' bs = unsafePerformIO $ do
                   let t = decodeUtf8 bs
                   try (evaluate (rnf t `seq` t))
  where
    rnf Empty        = ()
    rnf (Chunk _ ts) = rnf ts
{-# INLINE decodeUtf8' #-}

encodeUtf8 :: Text -> B.ByteString
encodeUtf8 (Chunk c cs) = B.Chunk (TE.encodeUtf8 c) (encodeUtf8 cs)
encodeUtf8 Empty        = B.Empty

-- | Decode text from little endian UTF-16 encoding.
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
{-# INLINE decodeUtf16LEWith #-}

-- | Decode text from little endian UTF-16 encoding.
--
-- If the input contains any invalid little endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}

-- | Decode text from big endian UTF-16 encoding.
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
{-# INLINE decodeUtf16BEWith #-}

-- | Decode text from big endian UTF-16 encoding.
--
-- If the input contains any invalid big endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}

-- | Encode text using little endian UTF-16 encoding.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16LE) [] txt)
{-# INLINE encodeUtf16LE #-}

-- | Encode text using big endian UTF-16 encoding.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16BE) [] txt)
{-# INLINE encodeUtf16BE #-}

-- | Decode text from little endian UTF-32 encoding.
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
{-# INLINE decodeUtf32LEWith #-}

-- | Decode text from little endian UTF-32 encoding.
--
-- If the input contains any invalid little endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}

-- | Decode text from big endian UTF-32 encoding.
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
{-# INLINE decodeUtf32BEWith #-}

-- | Decode text from big endian UTF-32 encoding.
--
-- If the input contains any invalid big endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}

-- | Encode text using little endian UTF-32 encoding.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)
{-# INLINE encodeUtf32LE #-}

-- | Encode text using big endian UTF-32 encoding.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt)
{-# INLINE encodeUtf32BE #-}
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.