haskellblog / src / Blog / Formats.hs

{-# LANGUAGE DeriveDataTypeable #-}
module Blog.Formats ( Format(..)
                    , getFormatter


import Blog.Utils (regexReplace, regexReplaceCustom, regexReplaceCustomFull, regexReplaceS)
import Control.Arrow ((>>>))
import Data.Data
import Data.Maybe (fromJust)
import Data.Typeable
import Ella.GenUtils (utf8)
import qualified Data.ByteString.Lazy.Char8 as LB
import qualified Data.ByteString.Lazy.UTF8 as UTF8
import qualified Data.Map as Map
import qualified Text.Pandoc as Pandoc
import qualified Text.Pandoc.Definition as PD
import qualified Text.XHtml as X

data Format = Rawhtml
            | Plaintext
            | RST
            deriving (Eq, Ord, Show, Read, Enum, Data, Typeable)

formatRawhtml :: String -> String
formatRawhtml = id

url_regex =
    "https?://"                                    ++ -- http:// or https://
    "(?:(?:[a-zA-Z0-9-]+\\.)+[a-zA-Z]{2,6}|"       ++ -- domain...
    "localhost|"                                   ++ --localhost...
    "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"   ++ -- ...or ip
    "(?::\\d+)?"                                   ++ -- optional port
    "(?:/[^\\s<>\"]+|/?)"                            -- path

escapeHtml = regexReplace "&" (LB.pack "&amp;") >>>
             regexReplace "<" (LB.pack "&lt;") >>>
             regexReplace ">" (LB.pack "&gt;")

escapeQuotes = regexReplace "\"" (LB.pack "&quot;")

normaliseCRLF = regexReplace "\r\n" (LB.pack "\n")
normaliseCRLF_S = regexReplaceS "\r\n" "\n"

-- Need to auto convert HTTP URLs in HTML strings into anchors
-- Problem: if we put 'linkify' step before 'escapeHtml', the HTML anchors
-- inserted will be double escaped.  If we put it after, then we can't write
-- a regex that will allow '>' to mark the end of the URL, because it has
-- already been escaped to '&gt;'.  Solution: use a URL regex that splits
-- into 'blocks', with different constructors for URLs and normal text,
-- with different escaping strategies.

data PlainTextBlocks = PTText LB.ByteString
                     | PTUrl LB.ByteString

parseLinks = regexReplaceCustomFull url_regex PTText PTUrl

escapeBlocks (PTUrl s) = (LB.pack "<a href=\"") `LB.append` (escapeQuotes $ escapeHtml s) `LB.append` (LB.pack "\">") `LB.append` (escapeHtml s) `LB.append` (LB.pack "</a>")
escapeBlocks (PTText s) = escapeHtml s

preserveLeadingWhitespace = regexReplaceCustom "^(\\s+)" (regexReplace " " (LB.pack "&nbsp;"))

nl2br = regexReplace "\n" (LB.pack "<br />\n")

formatPlaintext :: String -> String
formatPlaintext   = utf8 >>>
                    parseLinks >>>
                    map escapeBlocks >>>
                    LB.concat >>>
                    normaliseCRLF >>>
                    nl2br >>>
                    preserveLeadingWhitespace >>>

removeRawHtml :: PD.Pandoc -> PD.Pandoc
removeRawHtml (PD.Pandoc m blocks) = PD.Pandoc m (filter (not . isRawHtml) blocks)
      isRawHtml (PD.RawHtml s) = True
      isRawHtml _ = False

formatRST :: String -> String
formatRST = normaliseCRLF_S >>>
            Pandoc.readRST Pandoc.defaultParserState >>>
            removeRawHtml >>>
            Pandoc.writeHtmlString Pandoc.defaultWriterOptions { Pandoc.writerStandalone = False }

formatters :: Map.Map Format (String -> String)
formatters = Map.fromList
             [ (Rawhtml, formatRawhtml)
             , (Plaintext, formatPlaintext)
             , (RST, formatRST)

getFormatter f = fromJust $ Map.lookup f formatters
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.