Commits

Anonymous committed dcd779f

html entities

Comments (0)

Files changed (3)

acceptor-main.lua

 thirddata.acceptor.globals    = { }
 thirddata.acceptor.parsers    = { }
 
+environment.loadluafile("char-ent") -- xml entity list
+
 local aux        = thirddata.acceptor.aux
 local document   = thirddata.acceptor.document
 local finalizers = thirddata.acceptor.finalizers

formatters/acceptor_f-mediawiki.lua

 local unpack      = unpack or table.unpack
 local md5hex      = md5.hex
 
+local utf8char    = unicode.utf8.char
 local utf8len     = unicode.utf8.len
 local utf8sub     = unicode.utf8.sub
 local utf8upper   = unicode.utf8.upper
   --return [[{\iffalse]] .. data [[\fi}]] -- unsafe
   return ""
 end
+------------------------------------------------------------------------
+--  html entities
+------------------------------------------------------------------------
+--- Char entities get redirected to the entities table in char-ent.lua
+--- (thanks for its existence, Hans!)
+--- 
+--- Decimal character references are passed straight to utf8char(); hex
+--- references have to be converted to a number first.
+
+--- Convert the digits of a hexadecimal character reference (the part
+--- between "&#x" and ";") into the corresponding UTF-8 string.
+--- The base is passed to tonumber() explicitly so the conversion does
+--- not depend on how the runtime parses a "0x" prefix.
+formatter.hex_entity = function (hexnum)
+  return utf8char(tonumber(hexnum, 16))
+end
 
 --======================================================================
 --  Misc

parsers/acceptor_p-mediawiki.lua

 local utf8char = aux.utf8char
 local dbg      = aux.debug_rule
 
+local entities = characters.entities
+
 local C, Carg, Cb, Cc, Cg, Cf, Cmt, Cp, Cs, Ct 
   = lpeg.C, lpeg.Carg, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cf, lpeg.Cmt, lpeg.Cp, lpeg.Cs, lpeg.Ct
 
 
 
 local stringlower = string.lower
+local utf8_char   = unicode.utf8.char
 
 local balanced_anything = P{
   "balanced",
 local colon                 = P":"
 local dot                   = P"."
 local double_quote          = P[["]]
+local open_guillemet        = P"«"
+local close_guillemet       = P"»"
 local equals                = P"="
 local gartenzaun            = P"#"
 local html_unsafe_symbol    = P"<" + P">" + P"&"
                             
 local html_entity_char      = ucase_letter + lcase_letter + decimal_digit
 local html_entity_chars     = html_entity_char^1
-local html_entity           = (P"&"   * html_entity_chars * P";")
-                            + (P"&#"  * decimal_number    * P";")
-                            + (P"&#x" * hex_number        * P";")
+local html_entity           = (P"&#x" / "") * (hex_number        / formatter.hex_entity) * (semicolon / "")
+                            + (P"&#"  / "") * (decimal_number    / utf8_char)            * (semicolon / "")
+                            + (P"&"   / "") * (html_entity_chars / entities)             * (semicolon / "")
                             
 local character             = whitespace_char + non_whitespace_char + html_entity
 local characters            = character^1 -- not in the spec but referred to everywhere
   --- <http://www.mediawiki.org/wiki/Markup_spec/BNF/Inline_text#Formatting>
   text_with_formatting = V"formatting"
                        + V"behaviour_switch"
-                       + V"open_guillemet"
-                       + V"close_guillemet"
+                       + open_guillemet  + close_guillemet
                        + V"nbsp_before" -- those were unreferenced in the BNF
                        + V"nbsp_after"
                        + html_entity
   bold_toggle          = Cs(P[[''']]   / [=[\togglewiki[bold]]=]      ),
   italic_toggle        = Cs(P[['']]    / [=[\togglewiki[italic]]=]    ),
   --- Those were undefined
-  open_guillemet       = P"«",
-  close_guillemet      = P"»",
 
   ------------------------------------------------------------------------
   --  References
 
 do
   local pre_escape_chars = {
-    ["&"] = [[{\letterampersand}]],
     ["$"] = [[{\letterdollar}]],
     ["^"] = [[{\letterhat}]],
     --["_"] = [[{\letterunderscore}]],
     ["%"] = [[{\letterpercent}]],
   }
   local post_escape_chars = {
+    ["&"] = [[{\letterampersand}]],
     ["#"] = [[{\letterhash}]],
     ["|"] = [[{\letterbar}]],
   }