Commits

Robert Brewer committed ca7086f

Another move. Tests pass.

Comments (0)

Files changed (8)

+.*.pyc

simpleparse/stt/TextTools/TextTools.py

 # Extra stuff useful in combination with the C functions
 #
 
-def replace(text,what,with,start=0,stop=None,
+def replace(text,what,with_,start=0,stop=None,
 
             SearchObject=TextSearch,join=join,joinlist=joinlist,tag=tag,
             string_replace=string.replace,type=type,
         what = so.match
     if stop is None:
         if start == 0 and len(what) < 2:
-            return string_replace(text,what,with)
+            return string_replace(text,what,with_)
         stop = len(text)
     t = ((text,sWordStart,so,+2),
          # Found something, replace and continue searching
-         (with,Skip+AppendTagobj,len(what),-1,-1),
+         (with_,Skip+AppendTagobj,len(what),-1,-1),
          # Rest of text
          (text,Move,ToEOF)
          )
 
 # Alternative (usually slower) versions using different techniques:
 
-def _replace2(text,what,with,start=0,stop=None,
+def _replace2(text,what,with_,start=0,stop=None,
 
               join=join,joinlist=joinlist,tag=tag,
               TextSearchType=TextSearchType,TextSearch=TextSearch):
 
     """Analogon to string.replace; returns a string with all occurrences
-       of what in text[start:stop] replaced by with.
+       of what in text[start:stop] replaced by with_.
        
        This version uses a one entry tag-table and a
        Boyer-Moore-Search-object.  what can be a string or a
         stop = len(text)
     if type(what) is not TextSearchType:
         what=TextSearch(what)
-    t = ((with,sFindWord,what,+1,+0),)
+    t = ((with_,sFindWord,what,+1,+0),)
     found,taglist,last = tag(text,t,start,stop)
     if not found: 
         return text
     return join(joinlist(text,taglist))
 
-def _replace3(text,what,with,
+def _replace3(text,what,with_,
 
               join=string.join,TextSearch=TextSearch,
               TextSearchType=TextSearchType):
     l = []
     x = 0
     for left,right in slices:
-        l.append(text[x:left] + with)
+        l.append(text[x:left] + with_)
         x = right
     l.append(text[x:])
     return join(l,'')
 
-def _replace4(text,what,with,
+def _replace4(text,what,with_,
 
               join=join,joinlist=joinlist,tag=tag,TextSearch=TextSearch,
               TextSearchType=TextSearchType):
         return text
     repl = [None]*len(slices)
     for i in range(len(slices)):
-        repl[i] = (with,)+slices[i]
+        repl[i] = (with_,)+slices[i]
     return join(joinlist(text,repl))
 
 def multireplace(text,replacements,start=0,stop=None,
         print 'Replacing strings'
         print '-'*72
         print
-        for what,with in (('m','M'),('mx','MX'),('mxText','MXTEXT'),
+        for what,with_ in (('m','M'),('mx','MX'),('mxText','MXTEXT'),
                           ('hmm','HMM'),('hmmm','HMM'),('hmhmm','HMM')):
-            print 'Replace "%s" with "%s"' % (what,with)
+            print 'Replace "%s" with "%s"' % (what,with_)
             t.start()
             for i in range(100):
-                rtext = string.replace(text,what,with)
+                rtext = string.replace(text,what,with_)
             print 'with string.replace:',t.stop(),'sec.'
             t.start()
             for i in range(100):
-                ttext = replace(text,what,with)
+                ttext = replace(text,what,with_)
             print 'with tag.replace:',t.stop(),'sec.'
             if ttext != rtext:
                 print 'results are NOT ok !'
                 mismatch(rtext,ttext)
             t.start()
             for i in range(100):
-                ttext = _replace2(text,what,with)
+                ttext = _replace2(text,what,with_)
             print 'with tag._replace2:',t.stop(),'sec.'
             if ttext != rtext:
                 print 'results are NOT ok !'
                 print rtext
             t.start()
             for i in range(100):
-                ttext = _replace3(text,what,with)
+                ttext = _replace3(text,what,with_)
             print 'with tag._replace3:',t.stop(),'sec.'
             if ttext != rtext:
                 print 'results are NOT ok !'
                 print rtext
             t.start()
             for i in range(100):
-                ttext = _replace4(text,what,with)
+                ttext = _replace4(text,what,with_)
             print 'with tag._replace4:',t.stop(),'sec.'
             if ttext != rtext:
                 print 'results are NOT ok !'

simpleparse/xml/.cvsignore

+*.pyc
+*.so

simpleparse/xml/__init__.py

+"""XML Parsing package
+
+At the moment it's really limited,
+but it does the basics, and the rest
+is mostly just a matter of fiddling
+about with Unicode and CharacterType
+support.  There is only very minimal
+support for Reference types, basically
+we note that a Reference exists, but
+don't do any further processing of it.
+"""

simpleparse/xml/xml_parser.py

+"""XML Parser based (loosely) on the XML Spec's EBNF
+
+This is a hand-coded parser based on the W3C's XML specification,
+there was a lot of busy-work rewriting to make the syntax agree,
+but also a number of significant structural changes required by
+the limitations of the SimpleParse engine, and the completely
+procedural definition of References in the XML spec (the References
+don't occur in most places they can occur, and they are seen as
+altering the buffer directly as soon as they are encountered; this
+isn't something that fits readily into the mx.TextTools engine).
+
+http://www.w3.org/TR/REC-xml#sec-references
+
+Major Deviations from Spec:
+	No support for the unicode-style character classes
+	No support for UTF-16 (or Unicode at all, for that matter)
+	No support for References that alter the production
+		being parsed, so you can't have a Reference to an
+		item "</this>and<this>" or similar non-structure-
+		respecting References.  References have
+		particular locations they can occur, and they are
+		just ignored elsewhere
+	No support for parsing the contents of References within
+		the primary parsing pass
+	No support for excluded start/end tags
+	Comments allowed in both tags and declarations (but not
+		inside content-specifiers).
+	Allows end tags of the form </>
+"""
+
+declaration = """
+
+# Simple (changable) literals
+# These should be chosen based on the encoding
+# of the file, which is actually embedded in the
+# file :(
+
+<S>          := [\x20\x09\x0D\x0A]+
+<letter>     := [a-zA-Z]
+<namestart>  := letter/[_:]
+<namechar>   := letter/digit/[-._:]
+
+
+# don't change for XML, but would change for SGML or HTML
+<Eq>         := '='
+<REFO>       := '&'
+<PREFO>      := '%'
+<REFC>       := ';'
+<PIO>        := '<?'
+<PIC>        := '?>'
+<STagO>      := '<'
+<STagC>      := '>'
+<ETagO>      := '</'
+<ETagC>      := '>'
+<EmptyElemTagC> := '/>'
+
+
+# an XML-comment, note that this follows
+# SGML semantics, so that you can embed comment_sets
+# in the middle of the various declarations...
+>Comment<     := "<!", comment_set,(S?,comment_set)*,S?,">"
+>comment_set<   := '--', xml_comment,'--'
+xml_comment         := -'--'*
+
+# whitespace in tag (including possible comment)
+>TS<             := (Comment/S)+
+
+
+# general structures
+AttValue       :=    ('"', (Reference/ -[&"] )*, '"') / (  "'", (Reference / -[&'])*, "'")
+
+# Names
+Name                := namestart, namechar*
+Names               := Name, (S,Name)*
+Nmtoken             := namechar+
+Nmtokens            := Nmtoken, (S,Nmtoken)*
+
+# processing instructions
+PI          := PIO, PITarget, S?, PIContent, PIC
+PIContent   := -PIC*
+PITarget    :=   ?-( [Xx],[Mm],[Ll]), Name
+
+
+## references
+	# character reference
+	CharRef              := REFO,'#',('x',hex)/(int),REFC
+	# entity reference
+	EntityRef            := REFO, Name, REFC
+	# parsed entity ref
+	PEReference          := PREFO, Name, REFC
+
+Reference    :=    EntityRef / CharRef
+
+Misc := Comment/S
+
+### PROLOG definitions...
+
+	prolog         :=    XMLDecl?, Misc*, (doctypedecl, Misc*)?
+	XMLDecl        :=    '<?xml', VersionInfo, EncodingDecl?, SDDecl?, TS?, '?>'
+	VersionInfo    :=    TS?, 'version', TS?, Eq, TS?, (('"',VersionNum,'"')/("'",VersionNum,"'"))
+	VersionNum     :=    [a-zA-Z0-9_.:-]+
+
+
+### Document-type declarations (DTDs)
+
+	doctypedecl    :=    '<!DOCTYPE', TS, Name, (TS, ExternalID)?, TS?,('[', (markupdecl / DeclSep)*, ']', TS?)?, '>'
+
+	DeclSep        :=    PEReference / S
+	markupdecl     :=    elementdecl / AttlistDecl / EntityDecl / NotationDecl / PI / Comment
+
+	EncodingDecl   :=    TS, 'encoding', Eq, (('"', EncName, '"') / ("'", EncName, "'") )
+	EncName        :=    [A-Za-z],[A-Za-z0-9._-]*
+	SDDecl         :=    TS, 'standalone', Eq, (("'", ('yes' / 'no'), "'") / ('"', ('yes' / 'no'), '"'))
+
+	ExternalID     :=    ('SYSTEM', TS?, SystemLiteral) / ('PUBLIC', TS?, PubidLiteral, TS?, SystemLiteral ) / PEReference
+	NDataDecl      :=    (TS, 'NDATA', TS, Name)/ (TS,PEReference,TS,(Name/ PEReference)?)
+
+	SystemLiteral  :=    ('"', -["]*, '"') / ("'", -[']*, "'") / PEReference
+	PubidLiteral   :=    ('"', [\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]*, '"') / ("'", [\x20\x0D\x0Aa-zA-Z0-9()+,./:=?;!*#@$_%-]*, "'") / PEReference
+
+	PublicID       :=    ('PUBLIC', TS, PubidLiteral) / PEReference
+
+
+### Element-type declarations
+	# hack to try and get PEReference parsing for the "normal case"
+	# where the PEReference doesn't change the production level, which
+	# seems to be suggested by the spec...
+	
+	elementdecl    :=    '<!ELEMENT', (
+		(TS, Name, TS, contentspec)/
+		elementdecl_pe
+	), TS?,'>'
+	
+	>elementdecl_pe< := (TS, PEReference, TS?, contentspec?)
+	
+	contentspec    :=    'EMPTY' / 'ANY' / Mixed / children
+	Mixed          :=    ('(', S?, '#PCDATA', (S?, '|', S?, (Name/PEReference))*, S?, ')*' ) /('(', S?, '#PCDATA', S?, ')')
+
+	repetition_specifier := ('?' / '*' / '+')?
+	children       :=    (choice / seq/ PEReference), repetition_specifier
+	cp             :=    (choice / seq / Name/ PEReference ), repetition_specifier
+	choice         :=    '(', S?, cp, ( S?, '|', S?, cp )+, S?, ')'
+	seq            :=    '(', S?, cp, ( S?, ',', S?, cp )*, S?, ')'
+
+
+### Attribute list declarations...
+	AttlistDecl    :=    '<!ATTLIST', TS, ((Name, AttDef*, TS?)/(PEReference, AttDef*, TS?)), '>'
+	AttDef         :=    TS, ((Name, TS, AttType, TS, DefaultDecl)/(PEReference, TS?, AttType?, TS?, DefaultDecl?))
+
+
+	AttType        :=    StringType / TokenizedType / EnumeratedType/ PEReference
+	StringType     :=    'CDATA'
+	TokenizedType  :=    'ID' / 'IDREF' / 'IDREFS' / 'ENTITY' / 'ENTITIES' / 'NMTOKEN' / 'NMTOKENS'
+	EnumeratedType :=    NotationType / Enumeration
+	NotationType   :=    'NOTATION', TS, ('(', NameOrList, ')')/PEReference
+	Enumeration    :=    '(', (NmTokenOrList/PEReference), ')'
+	
+	>NameOrList<    :=    S?, (Name/PEReference), (S?, '|', S?, (Name/PEReference))*, S?
+	>NmTokenOrList< :=    S?, (Nmtoken/PEReference), (S?, '|', S?, (Nmtoken/PEReference))*, S?
+
+
+	DefaultDecl    :=    '#REQUIRED' / '#IMPLIED' / ((('#FIXED', TS)/PEReference)?, (AttValue/PEReference)) / PEReference
+
+### Entity declarations
+	EntityDecl    :=    GEDecl / PEDecl
+	GEDecl        :=    '<!ENTITY', TS, ((Name, TS, EntityDef)/(PEReference,TS?,EntityDef?)), TS?, '>'
+	PEDecl        :=    '<!ENTITY', TS, '%', TS, ((Name, TS, PEDef)/(PEReference,TS?,PEDef?)), TS?, '>'
+	EntityDef     :=    EntityValue / (ExternalID, NDataDecl?) / PEReference
+	PEDef         :=    EntityValue / ExternalID / PEReference
+	EntityValue   :=    ('"', (PEReference / Reference / -[%&"])*, '"') /  ("'", (PEReference / Reference / -[%&'])*, "'")
+
+NotationDecl      :=    '<!NOTATION', TS, Name, TS, (ExternalID / PublicID), TS?, '>'
+
+### elements (nodes/tags/you-know :) )
+	# limitations in the SimpleParse engine mean that this
+	# particular structure will be basically useless...
+	element    :=    EmptyElemTag / (STag, content, ETag)
+
+	EmptyElemTag    :=    STagO, Name, (TS, Attribute)*, TS?, EmptyElemTagC
+	
+	STag       :=    STagO, Name, (TS, Attribute)*, TS?, STagC
+	ETag       :=    ETagO, Name?, TS?, ETagC
+
+	content    :=    (element / Reference / CDSect / PI / Comment / CharData)*
+
+	Attribute  :=    (Name, Eq, (AttValue/Reference))/(Reference,(Eq,(AttValue/Reference))?)
+
+	# general content of an element
+	CharData   :=    ( -[<&]+ / -(STag / EmptyElemTag / ETag / Reference / CDSect / PI / Comment) )+
+
+	# special non-parsed character data sections
+	CDSect     :=    CDStart, CData, CDEnd
+	<CDStart>  :=    '<![CDATA['
+	CData      :=    -CDEnd*
+	<CDEnd>    :=    ']]>'
+
+
+document       :=    prolog, element, Misc*
+"""
+from simpleparse.common import numbers, strings, chartypes

xml/.cvsignore

-*.pyc
-*.so

xml/__init__.py

-"""XML Parsing package
-
-At the moment it's really limited,
-but it does the basics, and the rest
-is mostly just a matter of fiddling
-about with Unicode and CharacterType
-support.  There is only very minimal
-support for Reference types, basically
-we note that a Reference exists, but
-don't do any further processing of it.
-"""

xml/xml_parser.py

-"""XML Parser based (loosely) on the XML Spec's EBNF
-
-This is a hand-coded parser based on the W3C's XML specification,
-there was a lot of busy-work rewriting to make the syntax agree,
-but also a number of significant structural changes required by
-the limitations of the SimpleParse engine, and the completely
-procedural definition of References in the XML spec (the References
-don't occur in most places they can occur, and they are seen as
-altering the buffer directly as soon as they are encountered; this
-isn't something that fits readily into the mx.TextTools engine).
-
-http://www.w3.org/TR/REC-xml#sec-references
-
-Major Deviations from Spec:
-	No support for the unicode-style character classes
-	No support for UTF-16 (or Unicode at all, for that matter)
-	No support for References that alter the production
-		being parsed, so you can't have a Reference to an
-		item "</this>and<this>" or similar non-structure-
-		respecting References.  References have
-		particular locations they can occur, and they are
-		just ignored elsewhere
-	No support for parsing the contents of References within
-		the primary parsing pass
-	No support for excluded start/end tags
-	Comments allowed in both tags and declarations (but not
-		inside content-specifiers).
-	Allows end tags of the form </>
-"""
-
-declaration = """
-
-# Simple (changable) literals
-# These should be chosen based on the encoding
-# of the file, which is actually embedded in the
-# file :(
-
-<S>          := [\x20\x09\x0D\x0A]+
-<letter>     := [a-zA-Z]
-<namestart>  := letter/[_:]
-<namechar>   := letter/digit/[-._:]
-
-
-# don't change for XML, but would change for SGML or HTML
-<Eq>         := '='
-<REFO>       := '&'
-<PREFO>      := '%'
-<REFC>       := ';'
-<PIO>        := '<?'
-<PIC>        := '?>'
-<STagO>      := '<'
-<STagC>      := '>'
-<ETagO>      := '</'
-<ETagC>      := '>'
-<EmptyElemTagC> := '/>'
-
-
-# an XML-comment, note that this follows
-# SGML semantics, so that you can embed comment_sets
-# in the middle of the various declarations...
->Comment<     := "<!", comment_set,(S?,comment_set)*,S?,">"
->comment_set<   := '--', xml_comment,'--'
-xml_comment         := -'--'*
-
-# whitespace in tag (including possible comment)
->TS<             := (Comment/S)+
-
-
-# general structures
-AttValue       :=    ('"', (Reference/ -[&"] )*, '"') / (  "'", (Reference / -[&'])*, "'")
-
-# Names
-Name                := namestart, namechar*
-Names               := Name, (S,Name)*
-Nmtoken             := namechar+
-Nmtokens            := Nmtoken, (S,Nmtoken)*
-
-# processing instructions
-PI          := PIO, PITarget, S?, PIContent, PIC
-PIContent   := -PIC*
-PITarget    :=   ?-( [Xx],[Mm],[Ll]), Name
-
-
-## references
-	# character reference
-	CharRef              := REFO,'#',('x',hex)/(int),REFC
-	# entity reference
-	EntityRef            := REFO, Name, REFC
-	# parsed entity ref
-	PEReference          := PREFO, Name, REFC
-
-Reference    :=    EntityRef / CharRef
-
-Misc := Comment/S
-
-### PROLOG definitions...
-
-	prolog         :=    XMLDecl?, Misc*, (doctypedecl, Misc*)?
-	XMLDecl        :=    '<?xml', VersionInfo, EncodingDecl?, SDDecl?, TS?, '?>'
-	VersionInfo    :=    TS?, 'version', TS?, Eq, TS?, (('"',VersionNum,'"')/("'",VersionNum,"'"))
-	VersionNum     :=    [a-zA-Z0-9_.:-]+
-
-
-### Document-type declarations (DTDs)
-
-	doctypedecl    :=    '<!DOCTYPE', TS, Name, (TS, ExternalID)?, TS?,('[', (markupdecl / DeclSep)*, ']', TS?)?, '>'
-
-	DeclSep        :=    PEReference / S
-	markupdecl     :=    elementdecl / AttlistDecl / EntityDecl / NotationDecl / PI / Comment
-
-	EncodingDecl   :=    TS, 'encoding', Eq, (('"', EncName, '"') / ("'", EncName, "'") )
-	EncName        :=    [A-Za-z],[A-Za-z0-9._-]*
-	SDDecl         :=    TS, 'standalone', Eq, (("'", ('yes' / 'no'), "'") / ('"', ('yes' / 'no'), '"'))
-
-	ExternalID     :=    ('SYSTEM', TS?, SystemLiteral) / ('PUBLIC', TS?, PubidLiteral, TS?, SystemLiteral ) / PEReference
-	NDataDecl      :=    (TS, 'NDATA', TS, Name)/ (TS,PEReference,TS,(Name/ PEReference)?)
-
-	SystemLiteral  :=    ('"', -["]*, '"') / ("'", -[']*, "'") / PEReference
-	PubidLiteral   :=    ('"', [\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]*, '"') / ("'", [\x20\x0D\x0Aa-zA-Z0-9()+,./:=?;!*#@$_%-]*, "'") / PEReference
-
-	PublicID       :=    ('PUBLIC', TS, PubidLiteral) / PEReference
-
-
-### Element-type declarations
-	# hack to try and get PEReference parsing for the "normal case"
-	# where the PEReference doesn't change the production level, which
-	# seems to be suggested by the spec...
-	
-	elementdecl    :=    '<!ELEMENT', (
-		(TS, Name, TS, contentspec)/
-		elementdecl_pe
-	), TS?,'>'
-	
-	>elementdecl_pe< := (TS, PEReference, TS?, contentspec?)
-	
-	contentspec    :=    'EMPTY' / 'ANY' / Mixed / children
-	Mixed          :=    ('(', S?, '#PCDATA', (S?, '|', S?, (Name/PEReference))*, S?, ')*' ) /('(', S?, '#PCDATA', S?, ')')
-
-	repetition_specifier := ('?' / '*' / '+')?
-	children       :=    (choice / seq/ PEReference), repetition_specifier
-	cp             :=    (choice / seq / Name/ PEReference ), repetition_specifier
-	choice         :=    '(', S?, cp, ( S?, '|', S?, cp )+, S?, ')'
-	seq            :=    '(', S?, cp, ( S?, ',', S?, cp )*, S?, ')'
-
-
-### Attribute list declarations...
-	AttlistDecl    :=    '<!ATTLIST', TS, ((Name, AttDef*, TS?)/(PEReference, AttDef*, TS?)), '>'
-	AttDef         :=    TS, ((Name, TS, AttType, TS, DefaultDecl)/(PEReference, TS?, AttType?, TS?, DefaultDecl?))
-
-
-	AttType        :=    StringType / TokenizedType / EnumeratedType/ PEReference
-	StringType     :=    'CDATA'
-	TokenizedType  :=    'ID' / 'IDREF' / 'IDREFS' / 'ENTITY' / 'ENTITIES' / 'NMTOKEN' / 'NMTOKENS'
-	EnumeratedType :=    NotationType / Enumeration
-	NotationType   :=    'NOTATION', TS, ('(', NameOrList, ')')/PEReference
-	Enumeration    :=    '(', (NmTokenOrList/PEReference), ')'
-	
-	>NameOrList<    :=    S?, (Name/PEReference), (S?, '|', S?, (Name/PEReference))*, S?
-	>NmTokenOrList< :=    S?, (Nmtoken/PEReference), (S?, '|', S?, (Nmtoken/PEReference))*, S?
-
-
-	DefaultDecl    :=    '#REQUIRED' / '#IMPLIED' / ((('#FIXED', TS)/PEReference)?, (AttValue/PEReference)) / PEReference
-
-### Entity declarations
-	EntityDecl    :=    GEDecl / PEDecl
-	GEDecl        :=    '<!ENTITY', TS, ((Name, TS, EntityDef)/(PEReference,TS?,EntityDef?)), TS?, '>'
-	PEDecl        :=    '<!ENTITY', TS, '%', TS, ((Name, TS, PEDef)/(PEReference,TS?,PEDef?)), TS?, '>'
-	EntityDef     :=    EntityValue / (ExternalID, NDataDecl?) / PEReference
-	PEDef         :=    EntityValue / ExternalID / PEReference
-	EntityValue   :=    ('"', (PEReference / Reference / -[%&"])*, '"') /  ("'", (PEReference / Reference / -[%&'])*, "'")
-
-NotationDecl      :=    '<!NOTATION', TS, Name, TS, (ExternalID / PublicID), TS?, '>'
-
-### elements (nodes/tags/you-know :) )
-	# limitations in the SimpleParse engine mean that this
-	# particular structure will be basically useless...
-	element    :=    EmptyElemTag / (STag, content, ETag)
-
-	EmptyElemTag    :=    STagO, Name, (TS, Attribute)*, TS?, EmptyElemTagC
-	
-	STag       :=    STagO, Name, (TS, Attribute)*, TS?, STagC
-	ETag       :=    ETagO, Name?, TS?, ETagC
-
-	content    :=    (element / Reference / CDSect / PI / Comment / CharData)*
-
-	Attribute  :=    (Name, Eq, (AttValue/Reference))/(Reference,(Eq,(AttValue/Reference))?)
-
-	# general content of an element
-	CharData   :=    ( -[<&]+ / -(STag / EmptyElemTag / ETag / Reference / CDSect / PI / Comment) )+
-
-	# special non-parsed character data sections
-	CDSect     :=    CDStart, CData, CDEnd
-	<CDStart>  :=    '<![CDATA['
-	CData      :=    -CDEnd*
-	<CDEnd>    :=    ']]>'
-
-
-document       :=    prolog, element, Misc*
-"""
-from simpleparse.common import numbers, strings, chartypes