Commits

Anonymous committed 9b84172

documented the formatter

Comments (0)

Files changed (1)

formatters/acceptor_f-mediawiki.lua

 --------------------------------------------------------------------------------
 --
 
+--- TODO: put the templates in table
+
+--[[ldx--
+<source>Formatting functions for WikiText</source>
+<p>The formatting is done in a rudimentary way only, leaving enough room for
+the user to define eir own setups. Therefore, everything with a couple of minor
+exceptions maps to ordinary <logo label="context"/> commands whose result can
+be controlled by specifying the respective setups at the beginning of the final
+document. Also, the generated code should be as intuitive to the normal user as
+possible: for instance, URL definitions will appear in their own section before
+<t>\starttext</t>, as will <t>\useexternalfigure</t> declarations.</p>
+--ldx]]--
+
 thirddata.acceptor.formatters.wikimedia = { }
 
 local aux         = thirddata.acceptor.aux
 local auxgsub     = aux.gsub
 local auxtrim     = aux.trim
 
-------------------------------------------------------------------------
---  Setup Templates
-------------------------------------------------------------------------
+
+--[[ldx--
+<p><em>Setup Templates</em>. Some basic declarations:</p>
+<ul>
+  <li>Colors (X11-style color names for CSS) are enabled.</li>
+  <li>Markup switches are defined.</li>
+</ul>
+<p>The markup switches are necessary because of Wikitext’s specifying only
+<em>font switches</em>, instead of different levels of logical emphases.
+Furthermore, those switches are supposed not to work as enclosing the
+highlighted content, but rather as essential font switches, which makes it hard
+to map them onto something like <logo label="context"/>’s <t>{\italic ...}</t>.
+To accommodate this requirement, a command <t>\togglewiki</t> is used instead of
+the ordinary multi-level emphasis that a <logo label="context"/> user would
+define for eirself.</p>
+--ldx]]--
 
 local setup_templates = {
 }
 ]]
 
 docsetups.generic[#docsetups.generic+1] = setup_templates.basic_formatting
-------------------------------------------------------------------------
---  Headings
-------------------------------------------------------------------------
+
+--[[ldx--
+<p><em>Headings</em>. Mapping Wiki headings onto the
+<t>\chapter</t>/<t>\section</t> kind.</p>
+--ldx]]--
+
 local headings = { }
 headings[1] = "part"
 headings[2] = "chapter"
   docmain[#docmain+1] = fmt(heading_template, headings[level], text)
 end
 
+
+--[[ldx--
+<p><em>Content processing</em>. Three steps are necessary in order for normal
+wiki text to be passed on to <logo label="context"/>:</p>
+<ol>
+  <li>pre-escaping characters that have no meaning in Wikitext but do have one
+      in <logo label="context"/> (literalization, if you will);</li>
+  <li>parsing inline markup, which also contains some non-text elements like
+      figures, that can themselves contain inline markup such as captions,
+      possibly leading to recursion;</li>
+  <li>post-escaping letters that are significant in Wikitext and <logo
+      label="context"/> as well.</li>
+</ol>
+<p>Some of these might at a later point be accomplished by the inline parser
+itself.</p>
+--ldx]]--
+
 --- Run the pre-escape parser over a string; yields the empty string when
 --- the parser produces no (truthy) result.
 local pre_escape = function (str)
   local escaped = parsers.p_pre_escape:match(str)
   if escaped then
     return escaped
   end
   return ""
 end
   text = tableconcat(text)
   return post_escape(text)
 end
---======================================================================
---  Tables
---======================================================================
+
+
+--[[ldx--
+<source>Tables</source>
+<p>Tables are blocks, and as such, naturally, processed by the block
+parser.</p>
+--ldx]]--
+
 do
   --- TODO: find a better way to handle paragraphs in tables
   local cell_template   = "  \\NC{}{\\framed[frame=off,align=flushleft,width=%.3f\\hsize]{%s}}\n"
     result = result .. rep(f, n)
     return result
   end
+
+  --[[ldx--
+  <p>The style is an example for how to nest an assignment list inside another
+  assignment list (i.e. the CSS parameters).</p>
+  <p>By far not all possible parameters are actually processed, as some don’t
+  have a meaning outside a web browser and others might interfere with the
+  postulate of configurability (see above). Others, like <t>text-align</t>,
+  influence the output only partially: for now, alignment is only respected
+  globally (for the whole table), and not on a per-cell basis.</p>
+  --ldx]]--
+
   --- examples: http://en.wikibooks.org/wiki/Editing_Wikitext/Tables#CSS_Styles
   --- basically a semicolon-delimited assignment list
+
   local semicolon = P";"
   local colon     = P":"
   local field     = C((1 - colon - semicolon)^1) / auxtrim
       local content
       if this then
         content = process_inline(this.content, "table")
-        --content = this.content
-        --content = pre_escape(this.content)
-        --content = lpegmatch(parsers.p_wiki_inline, content, 1, "table")
-        --content = tableconcat(content)
-        --content = post_escape(content)
       else
         content = "" -- TODO: handle rowspan from CSS opts
       end
     result = fmt(row_fmt, unpack(new_cells))
     return fmt(table_header_template, result)
   end
+
+  --[[ldx--
+  <p><t>formatter.table</t> is the actual table processor: it maps the
+  collected elements onto a simple <t>tabulate</t> environment.</p>
+  --ldx]]--
+
   formatter.table = function (tab)
     --table.print(tab)
     local rows, cells   = tablemaxn(tab), get_maxn_of_cells(tab) --tablemaxn(tab[1])
 
     local n_row, row = 1, tab[1]
     while row do
-      local new_row      = { }
+      local new_row = { }
       for n_cell=1, cells do
         local cell = row[n_cell]
         if cell then
           local new_cell   = pre_escape(cell.content)
+
+          --[[ldx--
+          <p>The inline parser has to be aware that it is called from inside a
+          table because of the image etc. elements that it might contain: these
+          shouldn’t be put into float environments if desired to be part of the
+          table ...</p>
+          --ldx]]--
+
           new_cell         = lpegmatch(p_wiki_inline, new_cell, 1, "table")
           local parameters = cell.parameters or "" -- TODO: use captured CSS
           --- TODO replace cct() with appropriate finalizer
 
 local utfcharacters = string.utfcharacters
 do
-  --local parse_prefix = function (pfx)
-    --local result = { }
-    --for char in utfcharacters(pfx) do
-      --result[#result+1] = char
-    --end
-    --return result
-  --end
   local item_template         = "\n%s\\item %s"
   local stopitemize_template  = "\n%s\\stopitemize"
   local startitemize_template = "\n%s\\startitemize[%s]"
             end
           end
         end -- if prefix
+
+        --[[ldx--
+        <p>The following exception handling concerns definition lists only. In
+        short: the first part of the definition up until the colon is allowed
+        to take a line of its own, with the actual defining part on the next
+        line. Therefore, whenever the latter part is missing we peek ahead to
+        the following line whether it is there (beginning with said colon).
+        If so, both parts together form an ordinary definition list entry and
+        we skip the next line; otherwise, the second part is treated as
+        empty.</p>
+        --ldx]]--
+
         if liststack[#liststack] == ";" then -- def list
           local definiendum, definiens = match(content, "^([^:]+)[ \t\v]*:?[ \t\v]*(.*)$")
           if definiens == "" then -- might be on next line
 end
 
 
---======================================================================
---  Inline Elements
---======================================================================
 
-------------------------------------------------------------------------
---  Links
-------------------------------------------------------------------------
+--[[ldx--
+<source>Inline Elements</source>
+<p>The following functions are to be called from inside the inline parser.</p>
+<p><em>Links</em>. Hyperlinks create a <t>\from</t> macro in situ and add a
+corresponding <t>\useURL</t> to the document header. Subsequent references to
+the same URL, as long as they use the same description, will refer to this
+declaration.</p>
+--ldx]]--
+
 local collected_links = { }
 local target_cnt      = 0
 
 local link_template = [=[\from[%s]]=]
 local link_setup    = [=[\useURL[%s][%s][][%s]]=]
 
+
+--[[ldx--
+<p><em>Internal links</em> are references to other pages on the same site. As
+such they work like some kind of shortcut, eliminating the need to specify the
+whole URL. They are, however, very specific to the site in question. I.e. an
+internal link on the <em>English</em> Wikipedia produces references to this
+language’s edition. (Compare interwiki links.) Therefore, the destination of an
+internal link depends on the site it is on, and this information has to be
+supplied externally.</p>
+--ldx]]--
+
 formatter.internal_link = function (data)
   local str = data[1]
   --table.print(data)
   return result
 end
 
-------------------------------------------------------------------------
+
+
+--[[ldx--
+<p><em>External links</em>. The Wikitext notation for ordinary URLs /
+hyperlinks.</p>
+--ldx]]--
 
 formatter.external_link = function (data)
   --table.print(data)
   return result
 end
 
-------------------------------------------------------------------------
---  Images
-------------------------------------------------------------------------
+
+
+--[[ldx--
+<p><em>Images</em> pose a couple of problems: they are referenced by the file
+name, which may refer to locations on <em>different servers</em>. Besides
+parsing the respective wikimedia page (how do <em>they</em> know the correct
+location in the first place?), there is no way to locate the real file URL
+other than guessing. So that’s what we’re going to do: loop through a couple of
+possible servers until the first one returns a valid image.</p>
+<p>Not only the server prefix but the file subdirectory itself is non-trivial
+to obtain: it involves hashing the file name and creating the path from the
+first two letters of the hash... luckily this procedure is server-independent,
+so one hashed directory name will work on any of the servers.</p>
+--ldx]]--
 
 local collected_images  = { }
 local image_cnt         = 0
 local height_template   = "/%dpx-%s"
 local size_template     = "/%dx%dpx-%s"
 
---- That this is needed is the final proof that wikimedia software is broken.
---local svg_thumbnail_exception = function (path, fname)
-  --aux.handle_tmp_dir(globals.image_tmp_dir)
-  --return aux.get_wp_image(path, fname)
---end
-
 local get_image_file = function (paths, fname)
   aux.handle_tmp_dir(globals.image_tmp_dir)
   return aux.image_exists_p(fname) or aux.get_wp_image(paths, fname)
     altroot .. fmt("%s/%s/%s", sub(hash, 1, 1), sub(hash, 1, 2), name),
     altroot .. fmt("%s/%s/%s", sub(Hash, 1, 1), sub(Hash, 1, 2), Name),
   }
-  --local enpath  =
-  --if parameters then
-    --if parameters.thumb then
-      --if find(name, "%.svg$") then
-        --return svg_thumbnail_exception(root .. path, name)
-      --else
-        --root = root .. "thumb/"
-        --if parameters.xsize and parameters.ysize then
-          --path = path .. fmt(size_template, parameters.ysize, parameters.xsize, name)
-        --elseif parameters.xsize then
-          ----path = path .. fmt(width_template, parameters.xsize, name)
-          --path = path .. fmt(height_template, parameters.xsize, name)
-        --else -- ysize
-          --path = path .. fmt(height_template, parameters.ysize, name)
-        --end
-      --end
-    --end
-  --end
-  --return root .. path
   local local_path = get_image_file(paths, name)
   return local_path
 end
   return tableconcat(pfloat, ","), tableconcat(pimage, ",")
 end
 
+
+--[[ldx--
+<p>An image ordinarily maps to <logo label="context"/>’s <t>\placefigure</t>
+macro, unless it appears inside a table. In the latter case the float
+environment is omitted and a plain <t>\externalfigure</t> reference is created.</p>
+--ldx]]--
+
 formatter.image_inline = function (data)
   --table.print(data)
   local name       = gsub(data.name, " ", "_")
   local caption    = parameters.caption or ""
   local float_setups, image_setups = process_image_parameters(parameters)
 
+
+  --[[ldx--
+  <p>Images are handled economically: the first time they are used they are
+  declared via <t>\useexternalfigure</t> (in the document header). Any actual
+  appearances they make in the document are actually references to this one
+  declaration.</p>
+  --ldx]]--
+
   local image_id     = file_url .. image_setups
   local image_name   = collected_images[image_id]
   if not image_name then
   return fmt(lame_excuse, data)
 end
 
-------------------------------------------------------------------------
---  Inline HTML
-------------------------------------------------------------------------
+
+--[[ldx--
+<p><em>Inline HTML</em>. Sadly, Wikitext allows and even encourages the use of
+HTML markup. Nothing that would bother you if your target backend is a web
+browser. Unfortunately, different uses may be severely limited by this
+“feature” -- because it would entail that a complete XML parser be implemented
+in order to correctly process Wikimarkup. We don’t have any aspirations to do
+so, so we’ll limit ourselves to respect a subset of HTML tags and just ignore
+the rest.</p>
+--ldx]]--
+
 local html_source_cnt    = 0
 local html_source_prefix = [[__html_source_no_]]
 
   --return [[{\iffalse]] .. data [[\fi}]] -- unsafe
   return ""
 end
-------------------------------------------------------------------------
---  html entities
-------------------------------------------------------------------------
---- Char entities get redirected to the entities table in char-ent.lua
---- (thanks for its existence, Hans!)
---- 
---- Decimal characters are mapped straight onto utfchar(), hex entities
---- have to be normalized first.
+
+--[[ldx--
+<p><em>HTML entities</em>. Thanks to Hans there is a huge (complete?) list of
+HTML character entities contained in <t>char-ent.lua</t>. This one is accessed
+with entities referenced by name.</p>
+<p>Entities referenced by their unicode slot are passed directly to
+<t>unicode.utf8.char</t>, with an intermediate conversion step only for
+hexadecimal numbers.</p>
+--ldx]]--
 
 --- Convert the hexadecimal digits of a numeric character entity into the
 --- corresponding character: the digits are prefixed with "0x", converted to
 --- a number, and handed to utf8char.
 formatter.hex_entity = function (hexnum)
   local codepoint = tonumber("0x" .. hexnum)
   return utf8char(codepoint)
 end
 
---======================================================================
---  Misc
---======================================================================
 
-------------------------------------------------------------------------
---  References
-------------------------------------------------------------------------
+--[[ldx--
+<source>Miscellaneous</source>
+<p><em>References</em>. References in Wikitext have the same function as
+footnotes in typeset text. With one exception: they are supposed to take their
+own section at the end of the document. Which would, in a printed document,
+make them as annoying to the reader as endnotes. For this reason they are
+treated as footnotes for the time being.</p>
+--ldx]]--
+
 --- TODO: optional endnotes (place at {{Reflist|2}} tag)
+---       maybe pass over the „references“ section as it’s redundant with footnotes?
 
 local references = { } -- named refs -> hash, other -> array
 
   return fmt(reference_footnote_template, target_name)
 end
 
-------------------------------------------------------------------------
---  Category links
-------------------------------------------------------------------------
---- These don’t actually create content but will be handed over the
---- document’s interaction settings.
 
+--[[ldx--
+<p><em>Category links</em> are a Wiki peculiarity, hard to interpret
+meaningfully for typesetting. The most reasonable use I could come up with is
+to pass them on to the keyword field of the PDF metadata.</p>
+--ldx]]--
 
 formatter.category_link = function (raw)
   local namespace, category, sortkey = raw.namespace, raw.category, raw.sortkey
   end
 end
 
-------------------------------------------------------------------------
---  Gallery blocks
-------------------------------------------------------------------------
---- Every image has to be on its separate line.
+
+--[[ldx--
+<p><em>Gallery blocks</em>. Galleries receive their layout dynamically
+depending on the horizontal size of the browser window, which would be hard to
+imitate in PDF output. For simplicity’s sake we will treat them as ordinary
+combinations.</p>
+--ldx]]--
 
 local combination_template = [[
 \placefigure[%s][%s]{%s}{%%
   return result
 end
 
---======================================================================
---  Templates
---======================================================================
 
---- Comment function may _not_ return nil because this would result in
---- an empty numerically indexed item in the document table, which in
---- turn would cause table.concat() to fail.
+--[[ldx--
+<p><em>Templates</em> are special, because each of them has its own special
+syntax, implemented via PHP plugins to the server’s wiki software. Bad thing,
+if you happen not to use PHP yourself. Therefore, the best we can do is to try
+to emulate/interpret the behaviour of a subset of those templates and ignore
+everything else.</p>
+--ldx]]--
+
 local ignore_excuse = [[{\iffalse Sorry, not implemented! Element: “%s ...” \fi}]]
 formatter.comment_ignore = function ()
   return ""
 formatter.comment = formatter.comment_ignore
 
 local template_handlers = { }
-------------------------------------------------------------------------
---  Infoboxes
-------------------------------------------------------------------------
+
+--[[ldx--
+<p><em>Infoboxes</em>. Those are the rectangular fields with basic data about
+the entity treated in a lemma, generally located to the right hand of the
+introductory remarks and TOC.</p>
+--ldx]]--
+
 do
   --- Format strings for infobox entries; each takes a single %s argument and
   --- wraps it in a frameless \framed box. The value template is left-aligned.
   --- The key template is right-aligned, underlined via bottomframe=on, and
   --- passes its argument through \Words.
   --- Fix: the key template's alignment option was misspelled "flushrigh",
   --- which is not a valid align value; it is now "flushright".
   local infobox_value_template = [[\framed[align=flushleft,frame=off]{%s}]]
   local infobox_key_template   = [[\framed[align=flushright,frame=off,bottomframe=on]{\Words{%s}}]]
   --docmain[#docmain+1] = block_ignore(content)
 end
 
---======================================================================
---  Finalizers (last stage of block processing)
---======================================================================
+
+--[[ldx--
+<source>Finalizers</source>
+<p>The code and strings below serve to complete certain stages of document
+processing, such as paragraph handling and concatenation of the document
+body and setups.</p>
+--ldx]]--
 
 local paragraph_template = [[
 \startparagraph