Philipp Gesang avatar Philipp Gesang committed 9cd858f

parser caching (-> *great* speedup); generalized the yer-hack

Comments (0)

Files changed (10)

doc/context/third/transliterator/transliterator.tex

 part of the document be processed according to dutch rules, leaving the overall
 \type{\language[#1]} configuration unchanged for the rest of the content.
 
+Another argument, \type{deficient_font}, can be used in
+combination with the modes \type{all}, \type{ru_old} and
+\type{iso9_ocs}. It lets you circumvent the deficiency that some
+fonts show concerning the characters that ISO~9 assigns to
+cyrillic “ь” and “ъ”. Set it to {\em yes} to enable it.
+
 The actual transliteration is done using the macro
 \type{\transliterate[#1]} \type{{#2}}.
 The second argument takes the raw string in the original language that we want

tex/context/interface/third/t-transliterator.xml

           <cd:constant type="sk"/>
           <cd:constant type="hr"/>
         </cd:parameter>
+        <cd:parameter name="deficient_font">
+          <cd:constant type="yes"/>
+          <cd:constant type="no" default="yes"/>
+        </cd:parameter>
         <cd:parameter name="hinting">
           <cd:constant type="yes" default="yes"/>
           <cd:constant type="no"/>

tex/context/third/transliterator/t-transliterator.mkiv

   hyphenate=cz,
   mode=ru_old,
   sr_exceptions=\v!yes,
+  deficient_font=\v!no,
 ]
 
 %D Possible values for \type{mode} are by the time of this writing:
 %D \type{all}, \type{iso9_ocs}, \type{ocs}, \type{ocs_gla}, \type{ru_cz},
 %D \type{ocs_cz}, \type{gr} and \type{gr_n}.
 %D As not all fonts, even the expensive ones, support some of the most frequent
-%D unicode signs used in ISO~9 there are fallbacks for the transliterations of
-%D the weak and hard sign: \type{iso9_ocs_hack}, which is essentially
-%D \type{iso9_ocs}, and \type{ru_old_jer_hack}, which is essentially
-%D \type{ru_old}.  These two transliterate {\em ь} and {\em ъ} (both upper and
-%D lower case) to the more common, but non-ISO characters {\em '} and {\em ''}
+%D unicode signs used in ISO~9, there are fallbacks for the transliterations of
+%D the weak and hard sign.
+%D They work with the modes \type{iso9_ocs}, \type{all} and
+%D \type{ru_old} only and can be triggered by setting the
+%D variable \type{deficient_font} to the value {\em yes}.
+%D This will transliterate {\em ь} and {\em ъ} (both upper and
+%D lower case) to the more common, but non-ISO characters {\em ’} and {\em ”}
 %D respectively.
 %D Possible values for \type{hyphenate} are all valid \CONTEXT\ language codes; for an
 %D overview see \type{http://wiki.contextgarden.net/Language_Codes}.
     \setuptransliterate[#1]%
   \fi
     \language[\transliterateparameter{hyphenate}]%
-    \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}")}%
+    \ctxlua{
+      thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}"
+      thirddata.translit.transliterate("\transliterateparameter{mode}","\luaescapestring{#2}")
+    }%
   \egroup%
 }
 
   %\bgroup
     %\setuptransliterate[#1]%
     %\language[\transliterateparameter{hyphenate}]%
-    \ctxlua{thirddata.translit.transliterate("\transliterateparameter{mode}","#2")}%
+    \ctxlua{
+      thirddata.translit.deficient_font = "\transliterateparameter{deficient_font}"
+      thirddata.translit.transliterate("\transliterateparameter{mode}","#2")
+    }%
   %\egroup%
 }
 

tex/context/third/transliterator/trans_tables_bg.lua

 --                            Bulgarian                                      --
 --===========================================================================--
 
-local translit = thirddata.translit
+local translit  = thirddata.translit
+local pcache    = translit.parser_cache
+local lpegmatch = lpeg.match
 
 if not translit.done_bg then
     ---------------------------------------------------------------------------
     translit.done_bg = true
 end
 
-local P, Cs, lmatch = lpeg.P, lpeg.Cs, lpeg.match
-local addrules      = translit.addrules
-local utfchar       = translit.utfchar
+local P, Cs    = lpeg.P, lpeg.Cs
+local addrules = translit.addrules
+local utfchar  = translit.utfchar
 
-local memo = { }
-local function bulgarian (mode, text)
+local function bulgarian (mode)
     local bulgarian_parser
-    if memo[mode] then
-        return lmatch(memo[mode], text)
-    end
     if mode == "de" then
         local bg = translit.bg_upp + translit.bg_low
         local p_bg = addrules(bg)
         bulgarian_parser = Cs((p_bg / bg + utfchar)^0)
+    else
+        return nil
     end
-    memo[mode] = bulgarian_parser
-    return bulgarian_parser and lmatch(bulgarian_parser, text) or ""
+    return bulgarian_parser
 end
 
-translit.methods["bg_de"] = function (text) return bulgarian("de", text) end
+translit.methods["bg_de"] = function (text)
+    -- Reuse the Bulgarian parser from the shared cache; it is built and
+    -- stored only when the cache holds no entry for it yet.
+    local parser = pcache["bg_de"]
+    if not parser then
+        parser          = bulgarian("de")
+        pcache["bg_de"] = parser
+    end
+    -- Without a parser the result is the empty string.
+    if not parser then
+        return ""
+    end
+    return lpegmatch(parser, text) or ""
+end
 
+-- vim:ft=lua:sw=4:ts=4

tex/context/third/transliterator/trans_tables_gr.lua

 --                              Greek                                        --
 --===========================================================================--
 
-local translit = thirddata.translit
+local translit  = thirddata.translit
+local pcache    = translit.parser_cache
+local lpegmatch = lpeg.match
 
 -- Note that the Greek transliteration mapping isn't bijective so transliterated
 -- texts won't be reversible.  (Shouldn't be impossible to make one up using
             other     = Cs(p       / gr        ),
         }
 
-        --g:print()
-        text = g:match(text)
-        return text
+        return g
     end
 end
 
-translit.methods ["gr"]   = function (text) return greek("gr"  , text) end
-translit.methods ["gr_n"] = function (text) return greek("gr_n", text) end
+translit.methods["gr"] = function (text)
+    -- Transliterate `text` using the "gr" parser, which is built once by
+    -- greek("gr") and afterwards taken from the shared parser cache.
+    -- `p` has to be declared local: a bare assignment would create a
+    -- global variable visible to (and overwritable by) all other code.
+    local p = pcache["gr"]
+    if not p then
+        p = greek("gr")
+        pcache["gr"] = p
+    end
+    return lpegmatch(p, text)
+end
+
+translit.methods["gr_n"] = function (text)
+    -- Transliterate `text` using the "gr_n" parser, which is built once by
+    -- greek("gr_n") and afterwards taken from the shared parser cache.
+    -- `p` has to be declared local: a bare assignment would create a
+    -- global variable visible to (and overwritable by) all other code.
+    local p = pcache["gr_n"]
+    if not p then
+        p = greek("gr_n")
+        pcache["gr_n"] = p
+    end
+    return lpegmatch(p, text)
+end
+
+-- vim:ft=lua:sw=4:ts=4

tex/context/third/transliterator/trans_tables_iso9.lua

 --           ISO 9.1995(E) standardized transliteration for cyrillic         --
 --===========================================================================--
 
-local translit = thirddata.translit
+local translit  = thirddata.translit
+local pcache    = translit.parser_cache
+local lpegmatch = lpeg.match
 
 if not translit.done_iso9 then
     -----------------------------------------
     }
 
     translit.ru_jer_hack = translit.make_add_dict{
-    ["ь"] = "'",
-    ["Ь"] = "'",
-    ["ъ"] = "''",
-    ["Ъ"] = "''",
+    ["ь"] = "’",
+    ["Ь"] = "’",
+    ["ъ"] = "”",
+    ["Ъ"] = "”",
     }
 
     translit.tables["russian magkij / tverdyj znak hack"] = translit.ru_jer_hack
 --                              End Of Tables                                --
 --===========================================================================--
 
-
-local function iso9 (mode, text)
+local function iso9 (mode)
     local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs
     local addrules = translit.addrules
     local utfchar = translit.utfchar
                  + translit.non_ru_upp
                  + translit.non_ru_low
         end
-    elseif mode == "ru_old_jer_hack" then
-        iso9 = iso9
-             + translit.ru_old_upp
-             + translit.ru_old_low
-             + translit.ru_jer_hack
+        if translit.deficient_font == "yes" then
+            iso9 = iso9
+                + translit.ru_old_upp
+                + translit.ru_old_low
+                + translit.ru_jer_hack
+        end
     end
 
     local p_iso9 = addrules (iso9, p_iso9)
     local iso9_parser = Cs((p_iso9 / iso9 + utfchar)^0)
 
-    return iso9_parser:match(text)
+    return iso9_parser
 end
 
-translit.methods ["ru"]              = function (text) return iso9 ("all"            , text) end
-translit.methods ["all"]             = function (text) return iso9 ("all"            , text) end
-translit.methods ["ru_old"]          = function (text) return iso9 ("ru_old"         , text) end
-translit.methods ["ru_old_jer_hack"] = function (text) return iso9 ("ru_old_jer_hack", text) end
+translit.methods["all"] = function (text)
+    -- The current deficient_font value is part of the cache key, so a
+    -- separate parser is kept for each value of that setting.
+    local key    = "all" .. translit.deficient_font
+    local parser = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    -- Cache miss: build the parser, remember it, then apply it.
+    parser      = iso9("all")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+translit.methods["ru"] = translit.methods["all"]
+
+translit.methods["ru_old"] = function (text)
+    -- Transliterate `text` in "ru_old" mode. The current deficient_font
+    -- value is part of the cache key, so a separate parser is kept for
+    -- each value of that setting.
+    local pname = "ru_old" .. translit.deficient_font
+    local p = pcache[pname]
+    if not p then
+        -- Build the parser for the "ru_old" mode (not "all"), so that the
+        -- parser stored under the "ru_old" key really is the ru_old one.
+        p = iso9("ru_old")
+        pcache[pname] = p
+    end
+    return lpegmatch(p, text)
+end
+
+-- vim:ft=lua:sw=4:ts=4

tex/context/third/transliterator/trans_tables_scntfc.lua

 --                      Other transliterations                               --
 --===========================================================================--
 
-local translit = thirddata.translit
+local translit  = thirddata.translit
+local pcache    = translit.parser_cache
+local lpegmatch = lpeg.match
 
 -- The following are needed because ISO 9 does not cover old Slavonic
 -- characters that became obsolete before the advent of гражданский шрифт.
 --                              End Of Tables                                --
 --===========================================================================--
 
-local function scientific (mode, text)
+local function scientific (mode)
     local P, Cs = lpeg.P, lpeg.Cs
     local utfchar = translit.utfchar
     local addrules = translit.addrules
             + translit.ocs_add_low
             + translit.ocs_add_upp
 
-        if mode == "iso9_ocs_hack" then
+        if translit.deficient_font == "yes" then
             cyr = cyr + translit.ru_jer_hack
         end
 
         scientific_parser = Cs((p_cyr / cyr + utfchar)^0)
     end
 
-    return scientific_parser:match(text)
+    return scientific_parser
 end
 
 
-translit.methods ["iso9_ocs"]      = function (text) return scientific( "iso9_ocs"     , text ) end
-translit.methods ["iso9_ocs_hack"] = function (text) return scientific( "iso9_ocs_hack", text ) end
-translit.methods ["ocs"]           = function (text) return scientific( "ocs"          , text ) end
-translit.methods ["ocs_gla"]       = function (text) return scientific( "ocs_gla"      , text ) end
+translit.methods["iso9_ocs"] = function (text)
+    -- The current deficient_font value is part of the cache key, so a
+    -- separate parser is kept for each value of that setting.
+    local key    = "iso9_ocs" .. translit.deficient_font
+    local parser = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    -- Cache miss: build the parser, remember it, then apply it.
+    parser      = scientific("iso9_ocs")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+translit.methods["ocs"] = function (text)
+    -- Look up the cached parser for this mode; build and store it on a miss.
+    local parser = pcache["ocs"]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser        = scientific("ocs")
+    pcache["ocs"] = parser
+    return lpegmatch(parser, text)
+end
+
+translit.methods["ocs_gla"] = function (text)
+    -- Look up the cached parser for this mode; build and store it on a miss.
+    local parser = pcache["ocs_gla"]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    parser            = scientific("ocs_gla")
+    pcache["ocs_gla"] = parser
+    return lpegmatch(parser, text)
+end
+
+-- vim:ft=lua:ts=4:sw=4

tex/context/third/transliterator/trans_tables_sr.lua

 --                               Serbian                                     --
 --===========================================================================--
 
-local translit = thirddata.translit
+local translit  = thirddata.translit
+local pcache    = translit.parser_cache
+local lpegmatch = lpeg.match
 
 
 -- Special thanks to Mojca Miklavec and Arthur Reutenauer for their
 
 
 local t = translit
-local function sr (mode, text)
+local function sr (mode)
     local P, R, Cs = lpeg.P, lpeg.R, lpeg.Cs
     local utfchar  = translit.utfchar
     local modestr  = "p_" .. mode:match("to..$")
     trl_sr         = t[mode.."_upper"] + t[mode.."_lower"]
 
     -- transliteration from latin script requires macro handling … 
-    local _p_macro = P[[\]] * R("az", "AZ")^1
+    local _p_macro = P[[\]] * R("az", "AZ")^1 -- assuming standard catcodes
     local _p_sr    = translit.addrules (trl_sr, _p_sr) / trl_sr
     if translit.hinting then
         _p_sr = t.serbian_exceptions[modestr .. "_hint"] + _p_sr
         p_sr = Cs((_p_macro + _p_sr + utfchar)^0)
     end
 
-    return p_sr:match(text)
+    return p_sr
 end
 
-translit.methods ["sr_tolt"] = function (text) return sr( "sr_tolt", text ) end
-translit.methods ["sr_tocy"] = function (text) return sr( "sr_tocy", text ) end
+translit.methods["sr_tolt"] = function (text)
+    -- The cache key is the mode name followed by the string forms of the
+    -- current `hinting` and `sr_except` settings.
+    local key = "sr_tolt"
+             .. tostring(translit.hinting)
+             .. tostring(translit.sr_except)
+    local parser = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    -- Cache miss: build the parser, remember it, then apply it.
+    parser      = sr("sr_tolt")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+translit.methods["sr_tocy"] = function (text)
+    -- The cache key is the mode name followed by the string forms of the
+    -- current `hinting` and `sr_except` settings.
+    local key = "sr_tocy"
+             .. tostring(translit.hinting)
+             .. tostring(translit.sr_except)
+    local parser = pcache[key]
+    if parser then
+        return lpegmatch(parser, text)
+    end
+    -- Cache miss: build the parser, remember it, then apply it.
+    parser      = sr("sr_tocy")
+    pcache[key] = parser
+    return lpegmatch(parser, text)
+end
+
+-- vim:ft=lua:sw=4:ts=4

tex/context/third/transliterator/trans_tables_trsc.lua

 
 local function transcript (mode, text)
     local P, R, S, V, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs
-    local addrules = translit.addrules
-    local utfchar = translit.utfchar
+    local addrules       = translit.addrules
+    local utfchar        = translit.utfchar
 
     local trsc_parser, p_rules, capt, p_de
 
-    function tab_subst (s, ...)
+    local function tab_subst (s, ...)
         local p_tmp, tmp = nil, translit.make_add_dict{}
         for _,tab in ipairs(arg) do
             tmp = tmp + tab

tex/context/third/transliterator/transliterator.lua

 --------------------------------------------------------------------------------
 --
 
-thirddata          = thirddata or { }
-thirddata.translit = thirddata.translit or { }
-local translit     = thirddata.translit
-translit.tables    = translit.tables  or { }
-translit.methods   = translit.methods or { }
+thirddata               = thirddata or { }
+thirddata.translit      = thirddata.translit or { }
+local translit          = thirddata.translit
+translit.tables         = translit.tables  or { }
+translit.methods        = translit.methods or { }
+translit.deficient_font = "no"
+translit.parser_cache   = { }
 
 --------------------------------------------------------------------------------
 -- Predefining vowel lists
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.