Commits

Jeff Kistler committed 0ce5179

Work to date, including a working parser.

  • Participants

Comments (0)

Files changed (10)

+syntax: glob
+*.pyc
+*.pyo
+*~
+parser.out
+parsetab.py
+.installed.cfg
+bin
+build
+develop-eggs
+dist
+downloads
+eggs
+parts
+attic
+docs/_build
+MANIFEST
+src/*.egg-info
+
+syntax: regexp
+.*\#.*\#$
+
+CSS3Selectors
+--------------
+
+Copyright (c) 2010 Jeff Kistler
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of this project nor names of its contributors may 
+      be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ''AS IS''
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+=============
+CSS3Selectors
+=============
+
+CSS3Selectors is a CSS3 selectors parsing library.

File bootstrap.py

+##############################################################################
+#
+# Copyright (c) 2006 Zope Corporation and Contributors.
+# All Rights Reserved.
+#
+# This software is subject to the provisions of the Zope Public License,
+# Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+# FOR A PARTICULAR PURPOSE.
+#
+##############################################################################
+"""Bootstrap a buildout-based project
+
+Simply run this script in a directory containing a buildout.cfg.
+The script accepts buildout command-line options, so you can
+use the -c option to specify an alternate configuration file.
+
+$Id$
+"""
+
+import os, shutil, sys, tempfile, urllib2
+
+# Scratch directory that receives the eggs installed while bootstrapping; it
+# is removed again by the last statement of this script.
+tmpeggs = tempfile.mkdtemp()
+
+is_jython = sys.platform.startswith('java')
+
+try:
+    import pkg_resources
+except ImportError:
+    # pkg_resources is not importable: download ez_setup.py, execute it in the
+    # private namespace `ez`, and call its use_setuptools() with the scratch
+    # directory before retrying the import.
+    # NOTE(review): this executes code fetched over plain HTTP with no
+    # integrity check -- confirm that this is acceptable.
+    ez = {}
+    exec urllib2.urlopen('http://peak.telecommunity.com/dist/ez_setup.py'
+                         ).read() in ez
+    ez['use_setuptools'](to_dir=tmpeggs, download_delay=0)
+
+    import pkg_resources
+
+# quote() wraps arguments containing a space in double quotes on win32 and is
+# the identity function everywhere else.
+if sys.platform == 'win32':
+    def quote(c):
+        if ' ' in c:
+            return '"%s"' % c # work around spawn lamosity on windows
+        else:
+            return c
+else:
+    def quote (c):
+        return c
+
+# Code run by the child interpreter, and the set of currently active
+# distributions.
+cmd = 'from setuptools.command.easy_install import main; main()'
+ws  = pkg_resources.working_set
+
+# An optional leading "--version X" pins the zc.buildout requirement; all
+# remaining arguments are handed to buildout followed by 'bootstrap'.
+if len(sys.argv) > 2 and sys.argv[1] == '--version':
+    VERSION = ' == %s' % sys.argv[2]
+    args = sys.argv[3:] + ['bootstrap']
+else:
+    VERSION = ''
+    args = sys.argv[1:] + ['bootstrap']
+
+# Run easy_install in a child interpreter, with PYTHONPATH pointing at the
+# location of the setuptools distribution, to install zc.buildout into the
+# scratch directory.  A non-zero exit status fails the assertion.
+# NOTE(review): the install itself happens inside the `assert` expression, so
+# it would be skipped entirely if the script were run with `python -O`.
+if is_jython:
+    import subprocess
+
+    assert subprocess.Popen([sys.executable] + ['-c', quote(cmd), '-mqNxd',
+           quote(tmpeggs), 'zc.buildout' + VERSION],
+           env=dict(os.environ,
+               PYTHONPATH=
+               ws.find(pkg_resources.Requirement.parse('setuptools')).location
+               ),
+           ).wait() == 0
+
+else:
+    assert os.spawnle(
+        os.P_WAIT, sys.executable, quote (sys.executable),
+        '-c', quote (cmd), '-mqNxd', quote (tmpeggs), 'zc.buildout' + VERSION,
+        dict(os.environ,
+            PYTHONPATH=
+            ws.find(pkg_resources.Requirement.parse('setuptools')).location
+            ),
+        ) == 0
+
+# Make the freshly installed eggs importable, run buildout with the collected
+# arguments, then delete the scratch directory.
+ws.add_entry(tmpeggs)
+ws.require('zc.buildout' + VERSION)
+import zc.buildout.buildout
+zc.buildout.buildout.main(args)
+shutil.rmtree(tmpeggs)

File buildout.cfg

+[buildout]
+parts = 
+     python
+#     ipython
+develop = .
+eggs =
+     ply
+
+[python]
+recipe = zc.recipe.egg
+interpreter = python
+eggs =
+    css3selectors
+    ${buildout:eggs}
+
+[ipython]
+recipe = zc.recipe.egg
+eggs = 
+    ipython
+    css3selectors
+    ${buildout:eggs}
+scripts = ipython
+
+[docs]
+recipe = collective.recipe.sphinxbuilder
+source = ${buildout:directory}/docs
+build = ${buildout:directory}/docs/_build
+import os
+from distutils.core import setup
+
+def read(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
+# Distribution metadata.  The single package `css3selectors` is looked up
+# under the `src` directory, and the long description is the text of the
+# README file located next to this script.
+setup(
+    name = 'css3selectors',
+    version = '0.1a',
+    license = 'BSD',
+    description = 'A CSS3 selectors parsing library.',
+    long_description = read('README'),
+    author = 'Jeff Kistler',
+    author_email = 'jeff@jeffkistler.com',
+    packages = ['css3selectors',],
+    package_dir = {'': 'src'},
+    classifiers = [
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Developers',
+        'License :: OSI Approved :: BSD License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Internet :: WWW/HTTP',
+    ]
+)

File src/css3selectors/__init__.py

Empty file added.

File src/css3selectors/lexer.py

+"""
+The lexical grammar used to generate the tokenizer.
+"""
+import re
+
+from ply import lex
+
+def r_nongroup(rx):
+    return ur'(?:' + rx + ur')'
+
+def r_or(*rxs):
+    return r_nongroup(ur'|'.join([r_nongroup(x) for x in rxs]))
+
+def r_star(rx):
+    return r_nongroup(rx) + ur'*'
+
+def r_plus(rx):
+    return r_nongroup(rx) + ur'+'
+
+def r_opt(rx):
+    return r_nongroup(rx) + ur'?'
+
+softsp         = r_opt(r_or(ur'\r\n', ur'[ \t\r\n\f]'))
+
+s              = r_plus(ur'[ \t\r\n\f]')
+w              = r_opt(s)
+nl             = ur'\n|\r\n|\r|\f'
+
+h              = ur'[0-9a-fA-F]'
+nonascii       = ur'[^\0-\177]'
+unicode        = ur'\\' + h + ur'{1,6}' + softsp
+escape         = r_or(unicode, ur'\\[^\r\n\f0-9a-fA-F]')
+nmstart        = r_or(ur'[_a-zA-Z]', nonascii, escape)
+nmchar         = r_or(ur'[_a-zA-Z0-9-]', nonascii, escape)
+string1        = ur'"%s"' % r_star(r_or(ur'[^\n\r\f\\"]',
+                                       ur'\\' + nl,
+                                       escape))
+string2        = r"'%s'" % r_star(r_or(ur"[^\n\r\f\\']",
+                                       ur'\\' + nl,
+                                       escape))
+invalid1       = ur'"%s' % r_star(r_or(ur'[^\n\r\f\\"]',
+                                      ur'\\'+nl,
+                                      escape))
+invalid2       = r"'%s" % r_star(r_or(ur"[^\n\r\f\\']",
+                                      ur'\\'+nl,
+                                      escape))
+
+comment        = ur'\/\*[^*]*\*+(?:[^/][^*]*\*+)*\/'
+comment = ur'\/\*' + r_star(ur'[^*]') + r_plus(ur'\*') + \
+          r_star(ur'[^/]' + r_star(ur'[^*]') + r_plus(ur'\*')) + \
+          ur'\/'
+
+ident          = r_opt(ur'-') + nmstart + r_star(nmchar)
+name           = r_plus(nmchar)
+
+num            = r_or(r_star(ur'[0-9]') + ur'\.' + r_plus(ur'[0-9]'),
+                      r_plus(ur'[0-9]'))
+string         = r_or(string1, string2)
+invalid        = r_or(invalid1, invalid2)
+url            = r_star(r_or(ur'[!#$%&*-~]', nonascii, escape))
+
+
+def letter(c):
+    return r_or(c.lower(),
+                ur'\\0{0,4}' +
+                r_or(hex(ord(c.upper()))[2:], hex(ord(c.lower()))[2:]) +
+                softsp)
+
+# Per-letter patterns built with letter().  Of these, the token rules in this
+# module use only N, O and T (for the NOT token).
+D = letter(u'D')
+E = letter(u'E')
+N = letter(u'N')
+O = letter(u'O')
+T = letter(u'T')
+V = letter(u'V')
+
+# Names of the token types produced by the lexer; the parser module imports
+# this tuple.  The commented-out names have their rules disabled below.
+tokens = (
+    'S',
+    'INCLUDES',
+    'DASHMATCH',
+    'PREFIXMATCH',
+    'SUFFIXMATCH',
+    'SUBSTRINGMATCH',
+    'IDENT',
+    'STRING',
+    'FUNCTION',
+    'NUMBER',
+    'HASH',
+    'PLUS',
+    'GREATER',
+    'COMMA',
+    'TILDE',
+    'NOT',
+#    'ATKEYWORD',
+#    'INVALID',
+#    'PERCENTAGE',
+    'DIMENSION',
+#    'CDO',
+#    'CDC',
+#    'COMMENT',
+)
+
+literals = '[]-*|):.'
+
+t_S = s
+t_INCLUDES = ur'~='
+t_DASHMATCH = ur'\|='
+t_PREFIXMATCH = ur'\^='
+t_SUFFIXMATCH = ur'\$='
+t_SUBSTRINGMATCH = ur'\*='
+t_IDENT = ident
+t_STRING = string
+t_FUNCTION = ident + ur'\('
+t_NUMBER = num
+t_HASH = ur'\#' + name
+t_PLUS = w + ur'\+'
+t_GREATER = w + ur'>'
+t_COMMA = w + ur','
+t_TILDE = w + ur'~'
+t_NOT = ur':' + N + O + T + ur'\('
+#t_ATKEYWORD = ur'@' + ident
+#t_INVALID = invalid
+#t_PERCENTAGE = num + ur'%'
+t_DIMENSION = num + ident
+#t_CDO = ur'<!--'
+#t_CDC = ur'-->'
+t_ignore_COMMENT = comment
+
+def t_error(t):
+    # Lexer error hook: report the offending input on stdout, then skip one
+    # character so that lexing can continue.
+    print "ILLEGAL TOKEN '%s'" % t.value
+    t.lexer.skip(1)
+
+# Module-level lexer, built at import time from the rules above with the
+# re.UNICODE flag.
+lexer = lex.lex(reflags=re.UNICODE)

File src/css3selectors/parser.py

+"""
+Production rules used to generate the parser.
+"""
+from ply import yacc
+
+from lexer import tokens
+import selectors
+
+def p_selectors_group(p):
+    '''
+    selectors_group : selector
+                    | selectors_group COMMA spaces selector
+    '''
+    if len(p) == 2:
+        p[0] = [p[1]]
+    else:
+        p[0] = p[1] + [p[4]]
+
+def p_selector(p):
+    '''
+    selector : simple_selector_sequence
+             | simple_selector_sequence combinator_sequence
+
+    '''
+    # The result is the list of right-hand-side values: the simple selector
+    # sequence, followed by the combinator_sequence value when one is present.
+    p[0] = p[1:]
+
+def p_combinator_sequence(p):
+    '''
+    combinator_sequence : combinator_sequence combinator simple_selector_sequence
+                        | combinator simple_selector_sequence
+                        | empty
+    '''
+    if len(p) == 3:
+        selector = p[2]
+        selector.combinator = selectors.STRING_TO_COMBINATOR_TYPE[p[1].strip()]
+        p[0] = selector
+    else:
+        p[0] = None
+
+def p_combinator(p):
+    '''
+    combinator : PLUS spaces
+               | GREATER spaces
+               | TILDE spaces
+               | S spaces
+    '''
+    # The value is the text of the combinator token; the trailing spaces are
+    # dropped.
+    p[0] = p[1]
+
+def p_simple_selector_sequence(p):
+    '''
+    simple_selector_sequence : element specifier_list
+                             | specifier_list
+    '''
+    # If p[1] is a list, then we have no element
+    if len(p) == 3:
+        p[0] = selectors.Selector(namespace=p[1][0], tag=p[1][1], specifiers=p[2])
+    else:
+        p[0] = selectors.Selector(specifiers=p[1])
+
+def p_specifier_list(p):
+    '''
+    specifier_list : specifier_list specifier
+                   | specifier
+                   | empty
+    '''
+    p[0] = filter(lambda x: bool(x), p[1:])
+
+def p_specifier(p):
+    '''
+    specifier : id_specifier
+              | class
+              | attrib
+              | pseudo
+              | negation
+    '''
+    # Pass the value of whichever alternative matched straight through.
+    p[0] = p[1]
+
+def p_element(p):
+    '''
+    element : type_selector
+            | universal
+    '''
+    # Both alternatives produce a (namespace, name) pair; pass it through.
+    p[0] = p[1]
+
+def p_type_selector(p):
+    '''
+    type_selector : element_name
+                  | namespace_prefix element_name
+    '''
+    # p[0] = selectors.Element(p[2], p[1])
+    if len(p) == 3:
+        p[0] = (p[1], p[2])
+    else:
+        p[0] = (None, p[1])
+
+def p_namespace_prefix(p):
+    '''
+    namespace_prefix : '|'
+                     | IDENT '|'
+                     | '*' '|'
+    '''
+    if len(p) == 2:
+        p[0] = p[1]
+    else:
+        p[0] = u'*'
+
+def p_element_name(p):
+    '''
+    element_name : IDENT
+    '''
+    # The element name is the text of the IDENT token.
+    p[0] = p[1]
+
+def p_universal(p):
+    '''
+    universal : '*' 
+              | namespace_prefix '*'
+    '''
+    if len(p) == 3:
+        p[0] = (p[1], p[2])
+    else:
+        p[0] = (None, p[1])
+
+def p_class(p):
+    '''
+    class : '.' IDENT
+    '''
+    # A class selector is represented as a LIST_MATCH on the "class"
+    # attribute, with the identifier as the value.
+    p[0] = selectors.AttributeSpecifier(name=u"class", value=p[2], match=selectors.LIST_MATCH)
+
+def p_attrib(p):
+    '''
+    attrib : '[' spaces IDENT spaces attrib_match ']'
+           | '[' spaces namespace_prefix IDENT spaces attrib_match ']'
+    '''
+    if len(p) == 6:
+        match = selectors.STRING_TO_MATCH_TYPE.get(p[5])
+        if match:
+            value, match_type = match
+        else:
+            value, match_type = None, None
+        p[0] = selectors.AttributeSpecifier(name=p[3], value=value, match=match_type)
+    else:
+        match = p[6]
+        if match:
+            value, match_type = match
+        else:
+            value, match_type = None, None
+        p[0] = selectors.AttributeSpecifier(namespace=p[3], name=p[4], value=value, match=match_type)
+
+def p_attrib_match(p):
+    '''
+    attrib_match : PREFIXMATCH spaces attrib_val spaces
+                 | SUFFIXMATCH spaces attrib_val spaces
+                 | SUBSTRINGMATCH spaces attrib_val spaces
+                 | '=' spaces attrib_val spaces
+                 | INCLUDES spaces attrib_val spaces
+                 | DASHMATCH spaces attrib_val spaces
+                 | empty
+    '''
+    if len(p) > 1:
+        p[0] = (p[1], p[3])
+    else:
+        p[0] = None
+
+def p_attrib_val(p):
+    '''
+    attrib_val : IDENT
+               | STRING
+    '''
+    # The value is the token text exactly as lexed.
+    p[0] = p[1]
+
+def p_pseudo(p):
+    '''
+    pseudo : pseudo_prefix IDENT
+           | pseudo_prefix functional_pseudo
+    '''
+    if not isinstance(p[2], selectors.PseudoSpecifier):
+        p[0] = selectors.PseudoSpecifier(p[2])
+    else:
+        p[0] = p[2]
+
+def p_pseudo_prefix(p):
+    '''
+    pseudo_prefix : ':'
+                  | ':' ':'
+    '''
+    # Join the matched colons into u':' or u'::'.
+    p[0] = u''.join(p[1:])
+
+def p_functional_pseudo(p):
+    '''
+    functional_pseudo : FUNCTION spaces expression ')'
+    '''
+    p[0] = selectors.PseudoSpecifier(p[1], p[3])
+
+def p_expression(p):
+    '''
+    expression : expression term spaces
+               | term spaces
+    '''
+    p[0] = p[1]
+
+def p_term(p):
+    '''
+    term : PLUS
+         | '-'
+         | DIMENSION
+         | NUMBER
+         | STRING
+         | IDENT
+    '''
+    # Each alternative is a single token; its value is passed through.
+    p[0] = p[1]
+
+def p_negation(p):
+    '''
+    negation : NOT spaces negation_arg spaces ')'
+    '''
+    # Wrap the selector built for the argument (symbol 3).
+    p[0] = selectors.NegationSpecifier(p[3])
+
+def p_negation_arg(p):
+    '''
+    negation_arg : type_selector 
+                 | universal
+                 | id_specifier
+                 | class
+                 | attrib
+                 | pseudo
+    '''
+    if isinstance(p[1], selectors.Specifier):
+        p[0] = selectors.Selector(specifiers=[p[1]])
+    else:
+        p[0] = selectors.Selector(tag=p[1][1], namespace=p[1][0])
+
+def p_id_specifier(p):
+    '''
+    id_specifier : HASH
+    '''
+    # An ID selector is an EXACT_MATCH on the "id" attribute; [1:] drops the
+    # leading '#' of the HASH token.
+    p[0] = selectors.AttributeSpecifier(name=u"id", value=p[1][1:], match=selectors.EXACT_MATCH)
+
+# Helpers
+
+def p_spaces(p):
+    '''
+    spaces : spaces S
+           | S
+           | empty
+    '''
+    p[0] = p[1] and u' '
+
+def p_empty(p):
+    '''
+    empty :
+    '''
+    # The empty production yields an empty unicode string.
+    p[0] = u''
+
+def p_error(p):
+    # Parser error hook: report the offending value on stdout.
+    # NOTE(review): presumably `p` is None when the error occurs at the end of
+    # the input -- confirm against the PLY documentation.
+    print "Parse error '%s'" % p
+
+# Start symbol of the grammar, and the module-level parser built at import
+# time.
+start = 'selectors_group'
+parser = yacc.yacc()

File src/css3selectors/selectors.py

+# Combinator types, and the mapping from a combinator's text to its type.
+DESCENDANT_COMBINATOR = 1
+CHILD_COMBINATOR = 2
+ADJACENT_SIBLING_COMBINATOR = 3
+GENERAL_SIBLING_COMBINATOR = 4
+
+# The descendant combinator is keyed by a single space.
+STRING_TO_COMBINATOR_TYPE = {
+    u' ': DESCENDANT_COMBINATOR,
+    u'>': CHILD_COMBINATOR,
+    u'+': ADJACENT_SIBLING_COMBINATOR,
+    u'~': GENERAL_SIBLING_COMBINATOR,
+}
+
+
+class Selector(object):
+    """
+    A CSS3 selector.
+    """
+    def __init__(self, tag=None, namespace=None, combinator=None, specifiers=None):
+        self.tag = tag or '*'
+        self.namespace = namespace
+        self.combinator = combinator
+        self.specifiers = specifiers
+
+# Values of the `type` class attribute on the Specifier subclasses.
+ATTRIBUTE_SPECIFIER = 1
+PSEUDO_SPECIFIER = 2
+NEGATION_SPECIFIER = 3
+
+class Specifier(object):
+    """
+    Base class for selector specifiers.
+    """
+    pass
+
+# Attribute match types.
+EXISTS_MATCH = 1
+EXACT_MATCH = 2
+LIST_MATCH = 3
+PREFIX_MATCH = 4
+SUFFIX_MATCH = 5
+CONTAINS_MATCH = 6
+HYPHEN_MATCH = 7
+
+# Mapping from attribute operator text to match type.  EXISTS_MATCH has no
+# operator and therefore no entry.
+STRING_TO_MATCH_TYPE = {
+    u'=': EXACT_MATCH,
+    u'~=': LIST_MATCH,
+    u'^=': PREFIX_MATCH,
+    u'$=': SUFFIX_MATCH,
+    u'*=': CONTAINS_MATCH,
+    u'|=': HYPHEN_MATCH,
+}
+
+class AttributeSpecifier(Specifier):
+    """
+    An expression that matches against an attribute of an element as described in section 6.3
+    """
+    type = ATTRIBUTE_SPECIFIER
+    def __init__(self, namespace=None, name=None, value=None, match=None):
+        self.namespace = namespace or '*'
+        self.name = name
+        self.value = value
+        self.match = match
+
+class PseudoSpecifier(Specifier):
+    """
+    A pseudo-class or element specifier as described in section 6.6
+    """
+    type = PSEUDO_SPECIFIER
+    def __init__(self, name=None, expression=None):
+        self.name = name
+        self.expression = expression
+
+class NegationSpecifier(Specifier):
+    """
+    The negation pseudo-class specifier as described in section 6.6.7
+    """
+    type = NEGATION_SPECIFIER
+    def __init__(self, selector):
+        self.selector = selector