Commits

Jason McKesson committed 16cf6b1
  • Participants
  • Parent commits edf30a3

Comments (0)

Files changed (5)

File SLAXML/LICENSE.txt

+Copyright (c) 2013 Gavin Kistner
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

File SLAXML/README.md

+# SLAXML
+SLAXML is a pure-Lua SAX-like streaming XML parser. It is more robust than
+many (simpler) pattern-based parsers that exist ([such as mine][1]), properly
+supporting code like `<expr test="5 > 7" />`, CDATA nodes, comments, namespaces,
+and processing instructions.
+
+It is currently not a truly valid XML parser, however, as it allows certain XML that
+is syntactically-invalid (not well-formed) to be parsed without reporting an error.
+
+[1]: http://phrogz.net/lua/AKLOMParser.lua
+
+## Features
+
+* Pure Lua in a single file (two files if you use the DOM parser).
+* Streaming parser does a single pass through the input and reports what it sees along the way.
+* Supports processing instructions (`<?foo bar?>`).
+* Supports comments (`<!-- hello world -->`).
+* Supports CDATA sections (`<![CDATA[ whoa <xml> & other content as text ]]>`).
+* Supports namespaces, resolving prefixes to the proper namespace URI (`<foo xmlns="bar">` and `<wrap xmlns:bar="bar"><bar:kittens/></wrap>`).
+* Supports unescaped greater-than symbols in attribute content (a common failing for simpler pattern-based parsers).
+* Unescapes named XML entities (`&lt; &gt; &amp; &quot; &apos;`) and numeric entities (e.g. `&#10;`) in attributes and text nodes (but—properly—not in comments or CDATA). Properly handles edge cases like `&#38;amp;`.
+* Optionally ignore whitespace-only text nodes (as appear when indenting XML markup).
+* Includes a DOM parser that is both a convenient way to pull in XML to use as well as a nice example of using the streaming parser.
+* Does not add any keys to the global namespace.
+
+## Usage
+    local SLAXML = require 'slaxml'
+
+    local myxml = io.open('my.xml'):read('*all')
+
+    -- Specify as many/few of these as you like
+    parser = SLAXML:parser{
+      startElement = function(name,nsURI)       end, -- When "<foo" or "<x:foo" is seen
+      attribute    = function(name,value,nsURI) end, -- attribute found on current element
+      closeElement = function(name,nsURI)       end, -- When "</foo>" or "</x:foo>" or "/>" is seen
+      text         = function(text)             end, -- text and CDATA nodes
+      comment      = function(content)          end, -- comments
+      pi           = function(target,content)   end, -- processing instructions e.g. "<?yes mon?>"
+    }
+
+    -- Ignore whitespace-only text nodes and strip leading/trailing whitespace from text
+    -- (does not strip leading/trailing whitespace from CDATA)
+    parser:parse(myxml,{stripWhitespace=true})
+
+If you just want to see if it will parse your document correctly, you can simply do:
+
+    local SLAXML = require 'slaxml'
+    SLAXML:parse(myxml)
+
+…which will cause SLAXML to use its built-in callbacks that print the results as seen.
+
+## DOM Builder
+
+If you simply want to build tables from your XML, you can alternatively:
+
+    local SLAXML = require 'slaxdom' -- requires the slaxml.lua file; make sure you copy it also
+    local doc = SLAXML:dom(myxml)
+
+The returned table is a 'document' comprised of tables for elements, attributes, text nodes, comments, and processing instructions. See the following documentation for what each supports.
+
+### DOM Table Features
+
+* **Document** - the root table returned from the `SLAXML:dom()` method.
+  * <strong>`doc.type`</strong> : the string `"document"`
+  * <strong>`doc.name`</strong> : the string `"#doc"`
+  * <strong>`doc.kids`</strong> : an array table of child processing instructions, the root element, and comment nodes.
+  * <strong>`doc.root`</strong> : the root element for the document
+* **Element**
+  * <strong>`someEl.type`</strong> : the string `"element"`
+  * <strong>`someEl.name`</strong> : the string name of the element (without any namespace prefix)
+  * <strong>`someEl.nsURI`</strong> : the namespace URI for this element; `nil` if no namespace is applied
+  * <strong>`someEl.attr`</strong> : a table of attributes, indexed by name and index
+      * `local value = someEl.attr['attribute-name']` : any namespace prefix of the attribute is not part of the name
+      * `local someAttr = someEl.attr[1]` : a single attribute table (see below); useful for iterating all attributes of an element, or for disambiguating attributes with the same name in different namespaces
+  * <strong>`someEl.kids`</strong> : an array table of child elements, text nodes, comment nodes, and processing instructions
+  * <strong>`someEl.el`</strong> : an array table of child elements only
+  * <strong>`someEl.parent`</strong> : reference to the parent element or document table
+* **Attribute**
+  * <strong>`someAttr.type`</strong> : the string `"attribute"`
+  * <strong>`someAttr.name`</strong> : the name of the attribute (without any namespace prefix)
+  * <strong>`someAttr.value`</strong> : the string value of the attribute (with XML and numeric entities unescaped)
+  * <strong>`someAttr.nsURI`</strong> : the namespace URI for the attribute; `nil` if no namespace is applied
+  * <strong>`someAttr.parent`</strong> : reference to the parent element table
+* **Text** - for both CDATA and normal text nodes
+  * <strong>`someText.type`</strong> : the string `"text"`
+  * <strong>`someText.name`</strong> : the string `"#text"`
+  * <strong>`someText.value`</strong> : the string content of the text node (with XML and numeric entities unescaped for non-CDATA elements)
+  * <strong>`someText.parent`</strong> : reference to the parent element table
+* **Comment**
+  * <strong>`someComment.type`</strong> : the string `"comment"`
+  * <strong>`someComment.name`</strong> : the string `"#comment"`
+  * <strong>`someComment.value`</strong> : the string content of the comment
+  * <strong>`someComment.parent`</strong> : reference to the parent element or document table
+* **Processing Instruction**
+  * <strong>`somePI.type`</strong> : the string `"pi"`
+  * <strong>`somePI.name`</strong> : the string name of the PI, e.g. `<?foo …?>` has a name of `"foo"`
+  * <strong>`somePI.value`</strong> : the string content of the PI, i.e. everything but the name
+  * <strong>`somePI.parent`</strong> : reference to the parent element or document table
+
+### Finding Text for a DOM Element
+
+The following function can be used to calculate the "inner text" for an element:
+
+    function elementText(el)
+      local pieces = {}
+      for _,n in ipairs(el.kids) do
+        if n.type=='element' then pieces[#pieces+1] = elementText(n)
+        elseif n.type=='text' then pieces[#pieces+1] = n.value
+        end
+      end
+      return table.concat(pieces)
+    end
+
+    local xml  = [[<p>Hello <em>you crazy <b>World</b></em>!</p>]]
+    local para = SLAXML:dom(xml).root
+    print(elementText(para)) --> "Hello you crazy World!"
+
+### A Simpler DOM
+
+If you want the DOM tables to be simpler-to-serialize you can supply the `simple` option via:
+
+    local dom = SLAXML:dom(myXML,{ simple=true })
+
+In this case no table will have a `parent` attribute, elements will not have the `el` collection, and the `attr` collection will be a simple array (without values accessible directly via attribute name). In short, the output will be a strict hierarchy with no internal references to other tables, and all data represented in exactly one spot.
+
+
+## Known Limitations / TODO
+- Does not require or enforce well-formed XML. Certain syntax errors are
+  silently ignored and consumed. For example:
+  - `foo="yes & no"` is seen as a valid attribute
+  - `<root><child>` invokes two `startElement()` calls
+    but no `closeElement()` calls
+  - `<foo></bar>` invokes `startElement("foo")`
+    followed by `closeElement("bar")`
+- No support for custom entity expansion other than the standard XML
+  entities (`&lt; &gt; &quot; &apos; &amp;`) and numeric ASCII entities
+  (e.g. `&#10;`)
+- XML Declarations (`<?xml version="1.x"?>`) are incorrectly reported
+  as Processing Instructions
+- No support for DTDs
+- No support for extended (Unicode) characters in element/attribute names
+- No support for charset
+- No support for [XInclude](http://www.w3.org/TR/xinclude/)
+
+
+## History
+
+### v0.5.1 2013-Feb-18
++ `<foo xmlns="bar">` now directly generates `startElement("foo","bar")`
+  with no post callback for `namespace` required.
+
+### v0.5 2013-Feb-18
++ Use the `local SLAXML=require 'slaxml'` pattern to prevent any pollution
+  of the global namespace.
+
+### v0.4.3 2013-Feb-17
++ Bugfix to allow empty attributes, i.e. `foo=""`
++ `closeElement` no longer includes namespace prefix in the name, includes the nsURI
+
+### v0.4 2013-Feb-16
++ DOM adds `.parent` references
++ `SLAXML.ignoreWhitespace` is now `:parse(xml,{stripWhitespace=true})`
++ "simple" mode for DOM parsing
+
+### v0.3 2013-Feb-15
++ Support namespaces for elements and attributes
+  + `<foo xmlns="barURI">` will call `startElement("foo",nil)` followed by
+    `namespace("barURI")` (and then `attribute("xmlns","barURI",nil)`);
+    you must apply the namespace to your element after creation.
+  + Child elements without a namespace prefix that inherit a namespace will
+    receive `startElement("child","barURI")`
+  + `<xy:foo>` will call `startElement("foo","uri-for-xy")`
+  + `<foo xy:bar="yay">` will call `attribute("bar","yay","uri-for-xy")`
+  + Runtime errors are generated for any namespace prefix that cannot be resolved
++ Add (optional) DOM parser that validates hierarchy and supports namespaces
+
+### v0.2 2013-Feb-15
++ Supports expanding numeric entities e.g. `&#34;` -> `"`
++ Utility functions are local to parsing (not spamming the global namespace)
+
+### v0.1 2013-Feb-7
++ Option to ignore whitespace-only text nodes
++ Supports unescaped > in attributes
++ Supports CDATA
++ Supports Comments
++ Supports Processing Instructions
+
+
+## License
+Copyright © 2013 [Gavin Kistner](mailto:!@phrogz.net)
+
+Licensed under the [MIT License](http://opensource.org/licenses/MIT). See LICENSE.txt for more details.

File SLAXML/slaxdom.lua

+-- Optional parser that creates a flat DOM from parsing
+local SLAXML = require 'slaxml'
+--- Parse an XML string into a tree of plain Lua tables (a DOM).
+-- @param xml   the XML document, as a string
+-- @param opts  optional table. `simple=true` omits the `parent` references,
+--              the per-element `el` array and the name-keyed entries in
+--              `attr`. The same table is handed to the streaming parser, so
+--              its `stripWhitespace` option applies here as well.
+-- @return the document table (`type="document"`, `name="#doc"`, `kids`, `root`)
+-- Raises an error on a second root element, on an attribute reported outside
+-- an element, on a close tag that does not match the open element, and on
+-- text reported inside anything other than an element or the document.
+function SLAXML:dom(xml,opts)
+	if not opts then opts={} end
+	local rich = not opts.simple
+	local push, pop = table.insert, table.remove
+	local stack = {} -- currently open elements, innermost last
+	local doc = { type="document", name="#doc", kids={} }
+	local current = doc -- node that receives the next child
+	local builder = SLAXML:parser{
+		startElement = function(name,nsURI)
+			local el = { type="element", name=name, kids={}, el=rich and {} or nil, attr={}, nsURI=nsURI, parent=rich and current or nil }
+			if current==doc then
+				if doc.root then error(("Encountered element '%s' when the document already has a root '%s' element"):format(name,doc.root.name)) end
+				doc.root = el
+			end
+			push(current.kids,el)
+			-- Only rich elements carry an `el` array (the document never does)
+			if current.el then push(current.el,el) end
+			current = el
+			push(stack,el)
+		end,
+		attribute = function(name,value,nsURI)
+			if not current or current.type~="element" then error(("Encountered an attribute %s=%s but I wasn't inside an element"):format(name,value)) end
+			local attr = {type='attribute',name=name,nsURI=nsURI,value=value,parent=rich and current or nil}
+			-- Rich mode also exposes the value directly by attribute name
+			if rich then current.attr[name] = value end
+			push(current.attr,attr)
+		end,
+		closeElement = function(name)
+			if current.name~=name or current.type~="element" then error(("Received a close element notification for '%s' but was inside a '%s' %s"):format(name,current.name,current.type)) end
+			pop(stack)
+			-- Once the root element closes the stack is empty; fall back to the
+			-- document so that comments and processing instructions following
+			-- the root are attached to it instead of indexing a nil `current`.
+			current = stack[#stack] or doc
+		end,
+		text = function(value)
+			-- Text directly under the document (outside the root) is dropped
+			if current.type~='document' then
+				if current.type~="element" then error(("Received a text notification '%s' but was inside a %s"):format(value,current.type)) end
+				push(current.kids,{type='text',name='#text',value=value,parent=rich and current or nil})
+			end
+		end,
+		comment = function(value)
+			push(current.kids,{type='comment',name='#comment',value=value,parent=rich and current or nil})
+		end,
+		pi = function(name,value)
+			push(current.kids,{type='pi',name=name,value=value,parent=rich and current or nil})
+		end
+	}
+	builder:parse(xml,opts)
+	return doc
+end
+return SLAXML

File SLAXML/slaxml.lua

+--[=====================================================================[
+v0.5.1 Copyright © 2013 Gavin Kistner <!@phrogz.net>; MIT Licensed
+See http://github.com/Phrogz/SLAXML for details.
+--]=====================================================================]
+-- Module table. `_call` holds the built-in callbacks; each one simply prints
+-- a description of the construct it was handed.
+local SLAXML = {
+	VERSION = "0.5.1",
+	_call = {
+		-- Opening tag; the namespace URI, when present, follows in parentheses
+		startElement = function(name,nsURI)
+			local suffix = ""
+			if nsURI then suffix = " ("..nsURI..")" end
+			print(("<%s%s>"):format(name,suffix))
+		end,
+		-- Attribute, indented beneath its element, value shown quoted
+		attribute = function(name,value,nsURI)
+			local suffix = ""
+			if nsURI then suffix = " ("..nsURI..")" end
+			print(("  %s=%q%s"):format(name,value,suffix))
+		end,
+		-- Closing tag; the namespace URI is accepted but not printed
+		closeElement = function(name,nsURI)
+			print(("</%s>"):format(name))
+		end,
+		-- Text content, shown quoted
+		text = function(text)
+			print(("  text: %q"):format(text))
+		end,
+		-- Comment content
+		comment = function(content)
+			print(("<!-- %s -->"):format(content))
+		end,
+		-- Processing instruction: target followed by its content
+		pi = function(target,content)
+			print(("<?%s %s?>"):format(target,content))
+		end,
+	}
+}
+
+--- Create a parser object bound to a set of callbacks.
+-- @param callbacks  optional table of callback functions; when omitted the
+--                   receiver's own `_call` table is used instead
+-- @return a table with `_call` (the callbacks) and `parse` (SLAXML.parse)
+function SLAXML:parser(callbacks)
+	local instance = {}
+	instance._call = callbacks or self._call
+	instance.parse = SLAXML.parse
+	return instance
+end
+
+--- Parse an XML string in a single pass, reporting each construct found to
+-- the callbacks in `self._call`.
+--
+-- Callbacks used (each is optional and skipped when absent):
+--   startElement(name,nsURI), attribute(name,value,nsURI),
+--   closeElement(name,nsURI), text(text), comment(content), pi(target,content)
+-- For every opening tag `startElement` is invoked first, then `attribute`
+-- once per attribute, then `closeElement` if the tag was self-closing.
+--
+-- @param xml      the XML document, as a string
+-- @param options  optional table; `stripWhitespace=true` trims leading and
+--                 trailing whitespace from text nodes and drops text nodes
+--                 that become empty (CDATA content is never trimmed)
+-- Raises an error when a namespace prefix cannot be resolved, or when an
+-- opening tag contains something that is neither an attribute nor its close.
+function SLAXML:parse(xml,options)
+	if not options then options = { stripWhitespace=false } end
+
+	-- Cache references for maximum speed
+	local find, sub, gsub, char, push, pop = string.find, string.sub, string.gsub, string.char, table.insert, table.remove
+	-- `unpack` is a global in Lua 5.1 but lives in `table` from Lua 5.2 onwards
+	local unpack = unpack or table.unpack
+	-- Scratch variables shared by all the matcher functions below
+	local first, last, match1, match2, pos2, nsURI
+	local pos = 1              -- index of the next character to examine
+	local state = "text"       -- "text" (between tags) or "attributes" (inside an opening tag)
+	local textStart = 1        -- where the pending, not-yet-reported text begins
+	local currentElement={}    -- {name, nsURI} of the opening tag being read (table is re-used)
+	local currentAttributes={} -- attributes collected for the opening tag being read
+	local currentAttributeCt
+	local nsStack = {}         -- one table per open element: prefix -> URI, '!' -> default namespace
+
+	local entityMap  = { ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
+	-- Replacement for one `&...;` match: named entities come from entityMap,
+	-- numeric references (decimal, or hexadecimal such as `&#x41;`) become the
+	-- corresponding single byte. Anything else, including numeric references
+	-- that do not fit in one byte, is left exactly as written rather than
+	-- raising an error from string.char.
+	local function entitySwap(orig,n,s)
+		if entityMap[s] then return entityMap[s] end
+		if n=="#" then
+			-- Prefixing "0" turns the "x41" of a hexadecimal reference into "0x41"
+			local code = tonumber(s) or tonumber('0'..s)
+			if code and code>=0 and code<=255 and code%1==0 then return char(code) end
+		end
+		return orig
+	end
+	-- The parentheses discard gsub's second result (the substitution count)
+	-- so that it is not passed on to the `text` callback as an extra argument
+	local function unescape(str) return (gsub( str, '(&(#?)([%d%a]+);)', entitySwap )) end
+
+	-- Report the text between `textStart` and the construct that was just
+	-- matched at `first`, if there is any
+	local function finishText()
+		if first>textStart and self._call.text then
+			local text = sub(xml,textStart,first-1)
+			if options.stripWhitespace then
+				text = gsub(text,'^%s+','')
+				text = gsub(text,'%s+$','')
+				if #text==0 then text=nil end
+			end
+			if text then self._call.text(unescape(text)) end
+		end
+	end
+
+	-- Match a processing instruction such as <?target content?> at `pos`
+	local function findPI()
+		first, last, match1, match2 = find( xml, '^<%?([:%a_][:%w_.-]*) ?(.-)%?>', pos )
+		if first then
+			finishText()
+			if self._call.pi then self._call.pi(match1,match2) end
+			pos = last+1
+			textStart = pos
+			return true
+		end
+	end
+
+	-- Match a comment <!-- ... --> at `pos`
+	local function findComment()
+		first, last, match1 = find( xml, '^<!%-%-(.-)%-%->', pos )
+		if first then
+			finishText()
+			if self._call.comment then self._call.comment(match1) end
+			pos = last+1
+			textStart = pos
+			return true
+		end
+	end
+
+	-- Resolve a namespace prefix, searching from the innermost open element
+	-- outwards; raises an error if no enclosing element declared it
+	local function nsForPrefix(prefix)
+		for i=#nsStack,1,-1 do if nsStack[i][prefix] then return nsStack[i][prefix] end end
+		error(("Cannot find namespace for prefix %s"):format(prefix))
+	end
+
+	-- Match the start of an opening tag ("<name" or "<prefix:name") at `pos`.
+	-- The startElement callback is deferred until the tag's close is found.
+	local function startElement()
+		first, last, match1 = find( xml, '^<([%a_][%w_.-]*)', pos )
+		if first then
+			currentElement[2] = nil -- reset the nsURI, since this table is re-used
+			finishText()
+			pos = last+1
+			first,last,match2 = find(xml, '^:([%a_][%w_.-]*)', pos )
+			if first then
+				-- What was matched first was really the prefix
+				currentElement[1] = match2
+				currentElement[2] = nsForPrefix(match1)
+				match1 = match2
+				pos = last+1
+			else
+				currentElement[1] = match1
+				-- Unprefixed elements inherit the nearest default namespace
+				for i=#nsStack,1,-1 do if nsStack[i]['!'] then currentElement[2] = nsStack[i]['!']; break end end
+			end
+			currentAttributeCt = 0
+			push(nsStack,{})
+			return true
+		end
+	end
+
+	-- Match one attribute (name="value" or name='value') at `pos` and queue it.
+	-- xmlns declarations are also recorded in the namespace stack.
+	local function findAttribute()
+		first, last, match1 = find( xml, '^%s+([:%a_][:%w_.-]*)%s*=%s*', pos )
+		if first then
+			pos2 = last+1
+			first, last, match2 = find( xml, '^"([^<"]*)"', pos2 ) -- FIXME: disallow non-entity ampersands
+			if first then
+				pos = last+1
+				match2 = unescape(match2)
+			else
+				first, last, match2 = find( xml, "^'([^<']*)'", pos2 ) -- FIXME: disallow non-entity ampersands
+				if first then
+					pos = last+1
+					match2 = unescape(match2)
+				end
+			end
+		end
+		-- match1 is nil when no name was found, match2 when no quoted value followed
+		if match1 and match2 then
+			local currentAttribute = {match1,match2}
+			local prefix,name = string.match(match1,'^([^:]+):([^:]+)$')
+			if prefix then
+				if prefix=='xmlns' then
+					-- xmlns:foo="uri" declares prefix `foo` for this element and below
+					nsStack[#nsStack][name] = match2
+				else
+					currentAttribute[1] = name
+					currentAttribute[3] = nsForPrefix(prefix)
+				end
+			else
+				if match1=='xmlns' then
+					-- xmlns="uri" sets the default namespace, including for this element
+					nsStack[#nsStack]['!'] = match2
+					currentElement[2] = match2
+				end
+			end
+			currentAttributeCt = currentAttributeCt + 1
+			currentAttributes[currentAttributeCt] = currentAttribute
+			return true
+		end
+	end
+
+	-- Match a CDATA section at `pos`; its content is reported as text, verbatim
+	local function findCDATA()
+		first, last, match1 = find( xml, '^<!%[CDATA%[(.-)%]%]>', pos )
+		if first then
+			finishText()
+			if self._call.text then self._call.text(match1) end
+			pos = last+1
+			textStart = pos
+			return true
+		end
+	end
+
+	-- Match the end of an opening tag (">" or "/>") at `pos`, then fire the
+	-- deferred startElement and attribute callbacks
+	local function closeElement()
+		first, last, match1 = find( xml, '^%s*(/?)>', pos )
+		if first then
+			state = "text"
+			pos = last+1
+			textStart = pos
+
+			if self._call.startElement then self._call.startElement(unpack(currentElement)) end
+			if self._call.attribute then
+				for i=1,currentAttributeCt do self._call.attribute(unpack(currentAttributes[i])) end
+			end
+
+			if match1=="/" then
+				-- Self-closing tag: the element ends right away
+				pop(nsStack)
+				if self._call.closeElement then self._call.closeElement(unpack(currentElement)) end
+			end
+			return true
+		end
+	end
+
+	-- Match a closing tag ("</name>" or "</prefix:name>") at `pos`
+	local function findElementClose()
+		first, last, match1, match2 = find( xml, '^</([%a_][%w_.-]*)%s*>', pos )
+		if first then
+			nsURI = nil
+			for i=#nsStack,1,-1 do if nsStack[i]['!'] then nsURI = nsStack[i]['!']; break end end
+		else
+			-- Captures are swapped here so that match1 is always the local name
+			first, last, match2, match1 = find( xml, '^</([%a_][%w_.-]*):([%a_][%w_.-]*)%s*>', pos )
+			if first then nsURI = nsForPrefix(match2) end
+		end
+		if first then
+			finishText()
+			if self._call.closeElement then self._call.closeElement(match1,nsURI) end
+			pos = last+1
+			textStart = pos
+			pop(nsStack)
+			return true
+		end
+	end
+
+	-- `<=` so that the final character is examined too: the ">" that ends an
+	-- opening tag may be the very last character of the input
+	while pos<=#xml do
+		if state=="text" then
+			if not (findPI() or findComment() or findCDATA() or findElementClose()) then
+				if startElement() then
+					state = "attributes"
+				else
+					-- Plain text: skip to just before the next "<", or step over
+					-- a "<" that did not start anything recognised
+					first, last = find( xml, '^[^<]+', pos )
+					pos = (first and last or pos) + 1
+				end
+			end
+		elseif state=="attributes" then
+			if not findAttribute() then
+				if not closeElement() then
+					error("Was in an element and couldn't find attributes or the close.")
+				end
+			end
+		end
+	end
+end
+
+return SLAXML

File SLAXML/version.txt

+bb0c1097e803d4750b4821fe9cdcfd8cf0ed0e4c