Commits

John Lenz  committed 062f3c0

Use text-icu to decode messages

Pass content-charsets other than ISO-8859-1 and UTF-8 to ICU for decoding: this
was needed to decode a GB2312 document.

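Also, update to the latest yesod-platform (in practice this means the newer email-validate, whose isValid is now given a UTF-8 encoded ByteString instead of a String, plus a relaxed process-conduit upper bound)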

  • Participants
  • Parent commits a9ba9b9

Comments (0)

Files changed (3)

File notmuch-web.cabal

                  , monad-control
                  , old-locale
                  , process
-                 , process-conduit               >= 0.5        && < 0.6
+                 , process-conduit               >= 0.5        && < 1.1
                  , pwstore-fast
                  , random
                  , shakespeare-css
                  , tagsoup
                  , template-haskell
                  , text
+                 , text-icu
                  , time
                  , transformers
                  , unordered-containers

File src/Handler/Compose.hs

                        Right [] -> Right $ Nothing
                        Right addrs -> Just <$> mapM checkAddr addrs
     where
-        checkAddr a@(Address _ e) | E.isValid (T.unpack e) = Right a
+        checkAddr a@(Address _ e) | E.isValid (T.encodeUtf8 e) = Right a
         checkAddr (Address _ e)   | otherwise = Left $ SomeMessage $ MsgInvalidEmail e
 
 showAddresses :: [Address] -> T.Text

File src/Handler/View.hs

 import Data.List (find)
 import Text.Blaze.Html5 (preEscapedToHtml)
 import Blaze.ByteString.Builder (fromByteString)
+import qualified Data.ByteString as B
+import qualified Data.Conduit as C
 import qualified Data.Conduit.List as C
 import qualified Data.Conduit.Text as C
+import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.Lazy as TL
+import qualified Data.Text.ICU.Convert as ICU
 import qualified Data.Map as M
 import qualified Data.Tree as TR
 import qualified Data.CaseInsensitive as CI
 
+decodePart :: Maybe T.Text -> C.Source (C.ResourceT IO) B.ByteString -> Handler TL.Text
+decodePart charset src = case charset of
+                            Just "ISO-8859-1" -> decodeConduit C.iso8859_1
+                            Just "UTF-8"      -> decodeConduit C.utf8
+                            Just x            -> decodeICU x
+                            Nothing           -> decodeConduit C.utf8
+  where decodeConduit c = TL.fromChunks <$> lift (src $= C.decode c $$ C.consume)
+        decodeICU x = do $(logInfo) ("Decoding using ICU: " `T.append` x)
+                         lift $ do raw <- src $$ C.consume
+                                   c <- liftIO $ ICU.open (T.unpack x) (Just True)
+                                   return $ TL.fromChunks [ICU.toUnicode c $ B.concat raw]
+
 messagePart :: MessageID -> MessagePart -> Widget
 messagePart mid p@(MessagePart {partContentType = "text/html"}) = do
     let ((_ :: IO MessagePart), src) = notmuchMessagePart mid $ partID p
-    let codec = case partContentCharset p of
-                  Just "ISO-8859-1" -> C.iso8859_1
-                  Just "UTF-8" -> C.utf8
-                  _ -> C.ascii
-    html <- lift (lift (src $= C.decode codec $$ C.consume))
+    html <- lift $ decodePart (partContentCharset p) src
     [whamlet|
 <div .message-part .message-html>
-    #{preEscapedToHtml $ filterHtml $ TL.fromChunks html}
+    #{preEscapedToHtml $ filterHtml html}
 |]
 messagePart mid m@(MessagePart {partContent = Left ""}) = [whamlet|
 <div .message-part .message-attachment>