haskellblog / src / Blog / Formats.hs

Diff from to

src/Blog/Formats.hs

 
 where
 
-import Blog.Utils (regexReplace, regexReplaceCustom, regexReplaceS)
+import Blog.Utils (regexReplace, regexReplaceCustom, regexReplaceCustomFull, regexReplaceS)
 import Control.Arrow ((>>>))
 import Data.Data
 import Data.Maybe (fromJust)
     "localhost|"                                   ++ --localhost...
     "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"   ++ -- ...or ip
     "(?::\\d+)?"                                   ++ -- optional port
-    "(?:/\\S+|/?)"
+    "(?:/[^\\s<>\"]+|/?)"                            -- path
 
 escapeHtml = regexReplace "&" (LB.pack "&amp;") >>>
              regexReplace "<" (LB.pack "&lt;") >>>
 normaliseCRLF = regexReplace "\r\n" (LB.pack "\n")
 normaliseCRLF_S = regexReplaceS "\r\n" "\n"
 
--- | Convert HTTP URLS in HTML strings into anchors
-linkify = regexReplaceCustom url_regex (\s -> (LB.pack "<a href=\"") `LB.append` (escapeQuotes s) `LB.append` (LB.pack "\">") `LB.append` s `LB.append` (LB.pack "</a>"))
+-- Need to auto convert HTTP URLs in HTML strings into anchors
+-- Problem: if we put 'linkify' step before 'escapeHtml', the HTML anchors
+-- inserted will be double escaped.  If we put it after, then we can't write
+-- a regex that will allow '>' to mark the end of the URL, because it has
+-- already been escaped to '&gt;'.  Solution: use a URL regex that splits
+-- into 'blocks', with different constructors for URLs and normal text,
+-- with different escaping strategies.
+
+data PlainTextBlocks = PTText LB.ByteString
+                     | PTUrl LB.ByteString
+
+parseLinks = regexReplaceCustomFull url_regex PTText PTUrl
+
+escapeBlocks (PTUrl s) = (LB.pack "<a href=\"") `LB.append` (escapeQuotes $ escapeHtml s) `LB.append` (LB.pack "\">") `LB.append` (escapeHtml s) `LB.append` (LB.pack "</a>")
+escapeBlocks (PTText s) = escapeHtml s
 
 preserveLeadingWhitespace = regexReplaceCustom "^(\\s+)" (regexReplace " " (LB.pack "&nbsp;"))
 
 
 formatPlaintext :: String -> String
 formatPlaintext   = utf8 >>>
-                    escapeHtml >>>
+                    parseLinks >>>
+                    map escapeBlocks >>>
+                    LB.concat >>>
                     normaliseCRLF >>>
-                    linkify >>>
                     nl2br >>>
                     preserveLeadingWhitespace >>>
                     UTF8.toString
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.