Commits

Luke Plant committed eba05d7

Fixed autolinking so that < and > can be used to delimit URLs

Comments (0)

Files changed (2)

src/Blog/Formats.hs

 
 where
 
-import Blog.Utils (regexReplace, regexReplaceCustom, regexReplaceS)
+import Blog.Utils (regexReplace, regexReplaceCustom, regexReplaceCustomFull, regexReplaceS)
 import Control.Arrow ((>>>))
 import Data.Data
 import Data.Maybe (fromJust)
     "localhost|"                                   ++ --localhost...
     "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"   ++ -- ...or ip
     "(?::\\d+)?"                                   ++ -- optional port
-    "(?:/\\S+|/?)"
+    "(?:/[^\\s<>\"]+|/?)"                            -- path
 
 escapeHtml = regexReplace "&" (LB.pack "&amp;") >>>
              regexReplace "<" (LB.pack "&lt;") >>>
 normaliseCRLF = regexReplace "\r\n" (LB.pack "\n")
 normaliseCRLF_S = regexReplaceS "\r\n" "\n"
 
--- | Convert HTTP URLS in HTML strings into anchors
-linkify = regexReplaceCustom url_regex (\s -> (LB.pack "<a href=\"") `LB.append` (escapeQuotes s) `LB.append` (LB.pack "\">") `LB.append` s `LB.append` (LB.pack "</a>"))
+-- Need to auto convert HTTP URLs in HTML strings into anchors
+-- Problem: if we put 'linkify' step before 'escapeHtml', the HTML anchors
+-- inserted will be double escaped.  If we put it after, then we can't write
+-- a regex that will allow '>' to mark the end of the URL, because it has
+-- already been escaped to '&gt;'.  Solution: use a URL regex that splits
+-- into 'blocks', with different constructors for URLs and normal text,
+-- with different escaping strategies.
+
+data PlainTextBlocks = PTText LB.ByteString
+                     | PTUrl LB.ByteString
+
+parseLinks = regexReplaceCustomFull url_regex PTText PTUrl
+
+escapeBlocks (PTUrl s) = (LB.pack "<a href=\"") `LB.append` (escapeQuotes $ escapeHtml s) `LB.append` (LB.pack "\">") `LB.append` (escapeHtml s) `LB.append` (LB.pack "</a>")
+escapeBlocks (PTText s) = escapeHtml s
 
 preserveLeadingWhitespace = regexReplaceCustom "^(\\s+)" (regexReplace " " (LB.pack "&nbsp;"))
 
 
 formatPlaintext :: String -> String
 formatPlaintext   = utf8 >>>
-                    escapeHtml >>>
+                    parseLinks >>>
+                    map escapeBlocks >>>
+                    LB.concat >>>
                     normaliseCRLF >>>
-                    linkify >>>
                     nl2br >>>
                     preserveLeadingWhitespace >>>
                     UTF8.toString

src/Blog/Utils.hs

   -> (LB.ByteString -> LB.ByteString)  -- ^ transformation function applied to all matches
   -> LB.ByteString                     -- ^ text to operate on
   -> LB.ByteString
-regexReplaceCustom !regex replacef !text = go text []
+regexReplaceCustom !regex replacef !text = LB.concat $ regexReplaceCustomFull regex id replacef text
+
+regexReplaceCustomFull ::
+  (RegexMaker Regex CompOption ExecOption source) =>
+  source                               -- ^ regular expression
+  -> (LB.ByteString -> a)              -- ^ transformation function applied to all non-matches
+  -> (LB.ByteString -> a)              -- ^ transformation function applied to all matches
+  -> LB.ByteString                     -- ^ text to operate on
+  -> [a]
+regexReplaceCustomFull !regex keepf replacef !text = go text []
  where go str res =
            if LB.null str
-           then LB.concat . reverse $ res
+           then reverse res
            else case (str =~~ regex) :: Maybe (LB.ByteString, LB.ByteString, LB.ByteString) of
-                  Nothing -> LB.concat . reverse $ (str:res)
-                  Just (bef, match , aft) -> go aft (replacef(match):bef:res)
+                  Nothing -> reverse $ (keepf str):res
+                  Just (bef, match , aft) -> go aft (replacef(match):(keepf bef):res)
 
 
 -- | Replace using a regular expression. String version