diff --git a/pyulib/setup.py b/pyulib/setup.py index 70c905a..0af8d30 100755 --- a/pyulib/setup.py +++ b/pyulib/setup.py @@ -88,6 +88,7 @@ addp('ulib.ext.tarfile', ['README.txt']) addp('ulib.ext.web') addp('ulib.ext.web.wsgiserver', ['LICENSE.txt']) addp('ulib.ext.web.contrib') +addp('ulib.ext.xpath') addp('ulib.formats') addp('ulib.gae') addp('ulib.json') diff --git a/pyulib/src/ulib/ext/xpath/README.rst b/pyulib/src/ulib/ext/xpath/README.rst new file mode 100644 index 0000000..0425286 --- /dev/null +++ b/pyulib/src/ulib/ext/xpath/README.rst @@ -0,0 +1,297 @@ +:mod:`xpath` --- XPath Queries For DOM Trees +============================================ +The :mod:`xpath` module is a pure Python implementation of the XPath query +language, operating on DOM documents. It supports most of XPath 1.0, with +the following exceptions: + +* The namespace axis is not supported. +* The ``round()`` function rounds toward 0, not towards positive infinity. + +The following XPath 2.0 features are supported: + +* A default namespace may be supplied in the expression context. +* Node tests may have a wildcard namespace. (e.g., ``*:name``.) + +This module provides the following functions for evaluating XPath expressions: + +.. function:: find(expr, node, [\**kwargs]) + + Evaluate the XPath expression *expr* with *node* as the context node, + and return: + + * ``True`` or ``False``, when the expression has a boolean result. + * A :class:`float`, when the expression has a numeric result. + * A :class:`unicode`, when the expression has a string result. + * A list of :class:`xml.dom.Node`, when the expression has a + node-set result. + +.. function:: findnode(expr, node, [\**kwargs]) + + Evaluate the XPath expression *expr* with *node* as the context node, + and return a single node. If the result of the expression is a non-empty + node-set, return the first node in the set. If the result is an empty + node-set, return ``None``. 
If the result is not a node-set, raise + :exc:`XPathTypeError`. + +.. function:: findvalue(expr, node, [\**kwargs]) + + Evaluate the XPath expression *expr* with *node* as the context node, + and return the string-value of the result. If the result is an empty + node-set, return ``None`` instead. + +.. function:: findvalues(expr, node, [\**kwargs]) + + Evaluate the XPath expression *expr* with *node* as the context node, + and return a list of the string-values of the resulting node-set. If + the result is not a node-set, raise :exc:`XPathTypeError`. + +The above functions take the following optional keyword arguments +defining the evaluation context: + +*context* + A :class:`XPathContext` object containing the evaluation context. It + is legal to supply both a context object and additional arguments + extending its contents. + +*default_namespace* + The default namespace URI, which will be used for any unqualified name + in the XPath expression. + +*namespaces* + A mapping of prefixes to namespace URIs. + +*variables* + A mapping of variable names to values. To map a variable in a specific + namespace, use a two element tuple of the (namespace URI, name) as the key. + +Additional keyword arguments will be used as variable bindings. 
+ +Basic Queries +------------- +The examples in this section use this XML document: :: + + + + + + +Select the ``item`` elements in a document: :: + + >>> xpath.find('//item', doc) + [, ] + +Select the ``name`` attribute of the first item element (note that this returns +a list of Attr nodes): :: + + >>> xpath.find('//item[1]/@name', doc) + [] + +Select the string-value of the ``name`` attribute of the last item element: :: + + >>> xpath.findvalue('//item[last()]/@name', doc) + u'parrot' + +Select the first item element with a ``name`` attribute that starts with "p": :: + + >>> xpath.findnode('//item[starts-with(@name,"p")]', doc) + + +Namespaces +---------- +The examples in this section use this XML document: :: + + + python + parrot + + +The *namespaces* argument to the evaluation functions provides a dictionary +of prefixes to namespace URIs. Prefixed QNames in expressions will be +expanded according to this mapping. + +To select the string-values of the ``item`` elements in the +"\http://circus.example.org/" namespace: :: + + >>> xpath.findvalues('//prefix:item', doc, + ... namespaces={'prefix':'http://circus.example.org/'}) + [u'parrot'] + +The *default_namespace* argument provides a namespace URI that will be +used for any unprefixed QName appearing in a position where an element +name is expected. (Default namespaces are a feature of XPath 2.0.) + +To select the string-values of the ``item`` elements in the +"\http://flying.example.org/" namespace: :: + + >>> xpath.findvalues('//item', doc, + ... default_namespace='http://flying.example.org/') + [u'python'] + +When a *default_namespace* argument is not provided, the default namespace +is that of the document element. When a *namespaces* argument is not +provided, the prefix declarations consist of all prefixes defined on the +document element. 
+ +To select the string values of all the ``item`` elements: :: + + >>> xpath.findvalues('//item | //circus:item', doc) + [u'python', u'parrot'] + +The :mod:`xpath` module supports wildcard matches against both the prefix +and local name. (XPath 1.0 only supports wildcard matches against the local +name; XPath 2.0 adds support for wildcard matches against the prefix.) + +To select all children of the document element, regardless of namespace: :: + + >>> xpath.find('/*:*/*:*', doc) + [, ] + +Variables +--------- +The examples in this section use this XML document: :: + + + python + parrot + + +XPath variables may be passed to the evaluation functions as keyword +arguments: :: + + >>> xpath.findvalue('//item[@id = $id]', doc, id=2) + u'parrot' + +It is also possible to pass a dictionary of variables to an evaluation +function with the *variables* keyword argument: :: + + >>> xpath.findvalue('//item[@id = $id]', doc, variables={'id':1}) + u'python' + +To define a variable within a specific namespace, use a tuple of +``(namespace-URI, local-name)`` as the key in the variable dictionary: :: + + >>> variables = { ('http://python.example.org/', 'id') : 1 } + >>> namespaces = { 'python' : 'http://python.example.org/' } + >>> xpath.findvalue('//item[@id = $python:id]', doc, + ... variables=variables, namespaces=namespaces) + u'python' + +Compiled Expression Objects +--------------------------- +.. class:: XPath(expr) + + An expression object which contains a compiled form of the XPath + expression *expr*. + + Under most circumstances, it is not necessary to directly use this class, + since the :func:`find` et al. functions cache compiled expressions. + + .. method:: find(node, [\**kwargs]) + findnode(node, [\**kwargs]) + findvalue(node, [\**kwargs]) + findvalues(node, [\**kwargs]) + + These methods are identical to the functions of the same name. 
+ +Create and use a compiled expression: :: + + >>> expr = xpath.XPath('//text()') + >>> print expr + /descendant-or-self::node()/child::text() + >>> expr.find(doc) + [] + +Expression Context Objects +-------------------------- +.. class:: XPathContext([document,] [\**kwargs]) + + The static context of an XPath expression. Context objects may be + created with the same keyword arguments accepted by the expression + evaluation functions. + + The *document* argument may contain a DOM node. If provided, the + default namespace and namespace declarations will be initialized from + the document element of this node. + + The context contains the following attributes and methods: + + .. attribute:: default_namespace + + The default namespace URI. + + .. attribute:: namespaces + + The mapping of prefixes to namespace URIs. + + .. attribute:: variables + + The mapping of variables to values. The keys of this map may + be either strings for variables with no namespace, or + (namespaceURI, name) tuples for variables contained in a + namespace. + + .. method:: find(expr, node, [\**kwargs]) + findnode(expr, node, [\**kwargs]) + findvalue(expr, node, [\**kwargs]) + findvalues(expr, node, [\**kwargs]) + + Evaluate *expr* in the context with *node* as the context node. + *expr* may be either a string or a :class:`XPath` object. + +Create and use an evaluation context: :: + + >>> context = xpath.XPathContext() + >>> context.namespaces['py'] = 'http://python.example.org/' + >>> context.variables['min'] = 4 + >>> context.findvalues('//item[@id>=$min and @id<=$max]', doc, max=6) + [u'4', u'5', u'6'] + +Exceptions +---------- +This module defines the following exceptions: + +.. exception:: XPathError + + Base exception class used for all XPath exceptions. + +.. exception:: XPathNotImplementedError + + Raised when an XPath expression contains a feature of XPath which + has not been implemented. + +.. exception:: XPathParseError + + Raised when an XPath expression could not be parsed. + +.. 
exception:: XPathTypeError + + Raised when an XPath expression is found to contain a type error. + For example, the expression "string()/node()" contains a type error + because the "string()" function does not return a node-set. + +.. exception:: XPathUnknownFunctionError + + Raised when an XPath expression contains a function that has no + binding in the expression context. + +.. exception:: XPathUnknownPrefixError + + Raised when an XPath expression contains a QName with a namespace + prefix that has no corresponding namespace declaration in the expression + context. + +.. exception:: XPathUnknownVariableError + + Raised when an XPath expression contains a variable that has no + binding in the expression context. + +References +---------- +.. seealso:: + + `XML Path Language (XPath) Version 1.0 `_ + The W3C recommendation upon which this module is based. + + `XML Path Language (XPath) 2.0 `_ + Second version of XPath, mostly unsupported by this module. diff --git a/pyulib/src/ulib/ext/xpath/__init__.py b/pyulib/src/ulib/ext/xpath/__init__.py new file mode 100644 index 0000000..6cfabb2 --- /dev/null +++ b/pyulib/src/ulib/ext/xpath/__init__.py @@ -0,0 +1,23 @@ +import exceptions + +from _xpath import api, XPathContext, XPath +from exceptions import * + +__all__ = ['find', 'findnode', 'findvalue', 'findvalues', 'XPathContext', 'XPath'] +__all__.extend((x for x in dir(exceptions) if not x.startswith('_'))) + +@api +def find(expr, node, **kwargs): + return XPath.get(expr).find(node, **kwargs) + +@api +def findnode(expr, node, **kwargs): + return XPath.get(expr).findnode(node, **kwargs) + +@api +def findvalue(expr, node, **kwargs): + return XPath.get(expr).findvalue(node, **kwargs) + +@api +def findvalues(expr, node, **kwargs): + return XPath.get(expr).findvalues(node, **kwargs) diff --git a/pyulib/src/ulib/ext/xpath/_xpath.py b/pyulib/src/ulib/ext/xpath/_xpath.py new file mode 100644 index 0000000..d1c48bc --- /dev/null +++ b/pyulib/src/ulib/ext/xpath/_xpath.py @@ 
-0,0 +1,143 @@ +import expr as E +import parser as P +import yappsrt as Y + +from exceptions import * + +def api(f): + """Decorator for functions and methods that are part of the external + module API and that can throw XPathError exceptions. + + The call stack for these exceptions can be very large, and not very + interesting to the user. This decorator rethrows XPathErrors to + trim the stack. + + """ + def api_function(*args, **kwargs): + try: + return f(*args, **kwargs) + except XPathError, e: + raise e + api_function.__name__ = f.__name__ + api_function.__doc__ = f.__doc__ + return api_function + +class XPathContext(object): + def __init__(self, document=None, **kwargs): + self.default_namespace = None + self.namespaces = {} + self.variables = {} + + if document is not None: + if document.nodeType != document.DOCUMENT_NODE: + document = document.ownerDocument + if document.documentElement is not None: + attrs = document.documentElement.attributes + for attr in (attrs.item(i) for i in xrange(attrs.length)): + if attr.name == 'xmlns': + self.default_namespace = attr.value + elif attr.name.startswith('xmlns:'): + self.namespaces[attr.name[6:]] = attr.value + + self.update(**kwargs) + + def clone(self): + dup = XPathContext() + dup.default_namespace = self.default_namespace + dup.namespaces.update(self.namespaces) + dup.variables.update(self.variables) + return dup + + def update(self, default_namespace=None, namespaces=None, + variables=None, **kwargs): + if default_namespace is not None: + self.default_namespace = default_namespace + if namespaces is not None: + self.namespaces = dict(namespaces) + if variables is not None: + self.variables = dict(variables) + self.variables.update(kwargs) + + @api + def find(self, expr, node, **kwargs): + return XPath.get(expr).find(node, context=self, **kwargs) + + @api + def findnode(self, expr, node, **kwargs): + return XPath.get(expr).findnode(node, context=self, **kwargs) + + @api + def findvalue(self, expr, node, **kwargs): + return 
XPath.get(expr).findvalue(node, context=self, **kwargs) + + @api + def findvalues(self, expr, node, **kwargs): + return XPath.get(expr).findvalues(node, context=self, **kwargs) + +class XPath(): + _max_cache = 100 + _cache = {} + + def __init__(self, expr): + """Init docs. + """ + try: + parser = P.XPath(P.XPathScanner(str(expr))) + self.expr = parser.XPath() + except Y.SyntaxError, e: + raise XPathParseError(str(expr), e.pos, e.msg) + + @classmethod + def get(cls, s): + if isinstance(s, cls): + return s + try: + return cls._cache[s] + except KeyError: + if len(cls._cache) > cls._max_cache: + cls._cache.clear() + expr = cls(s) + cls._cache[s] = expr + return expr + + @api + def find(self, node, context=None, **kwargs): + if context is None: + context = XPathContext(node, **kwargs) + elif kwargs: + context = context.clone() + context.update(**kwargs) + return self.expr.evaluate(node, 1, 1, context) + + @api + def findnode(self, node, context=None, **kwargs): + result = self.find(node, context, **kwargs) + if not E.nodesetp(result): + raise XPathTypeError("expression is not a node-set") + if len(result) == 0: + return None + return result[0] + + @api + def findvalue(self, node, context=None, **kwargs): + result = self.find(node, context, **kwargs) + if E.nodesetp(result): + if len(result) == 0: + return None + result = E.string(result) + return result + + @api + def findvalues(self, node, context=None, **kwargs): + result = self.find(node, context, **kwargs) + if not E.nodesetp(result): + raise XPathTypeError("expression is not a node-set") + return [E.string_value(x) for x in result] + + def __repr__(self): + return '%s.%s(%s)' % (self.__class__.__module__, + self.__class__.__name__, + repr(str(self.expr))) + + def __str__(self): + return str(self.expr) diff --git a/pyulib/src/ulib/ext/xpath/exceptions.py b/pyulib/src/ulib/ext/xpath/exceptions.py new file mode 100644 index 0000000..1597670 --- /dev/null +++ b/pyulib/src/ulib/ext/xpath/exceptions.py @@ -0,0 +1,49 @@ 
+ +class XPathError(Exception): + """Base exception class used for all XPath exceptions.""" + +class XPathNotImplementedError(XPathError): + """Raised when an XPath expression contains a feature of XPath which + has not been implemented. + + """ + +class XPathParseError(XPathError): + """Raised when an XPath expression could not be parsed.""" + + def __init__(self, expr, pos, message): + XPathError.__init__(self) + self.expr = expr + self.pos = pos + self.message = message + + def __str__(self): + return ("Syntax error:\n" + + self.expr.replace("\n", " ") + "\n" + + ("-" * self.pos) + "^") + +class XPathTypeError(XPathError): + """Raised when an XPath expression is found to contain a type error. + For example, the expression "string()/node()" contains a type error + because the "string()" function does not return a node-set. + + """ + +class XPathUnknownFunctionError(XPathError): + """Raised when an XPath expression contains a function that has no + binding in the expression context. + + """ + +class XPathUnknownPrefixError(XPathError): + """Raised when an XPath expression contains a QName with a namespace + prefix that has no corresponding namespace declaration in the expression + context. + + """ + +class XPathUnknownVariableError(XPathError): + """Raised when an XPath expression contains a variable that has no + binding in the expression context. + + """ diff --git a/pyulib/src/ulib/ext/xpath/expr.py b/pyulib/src/ulib/ext/xpath/expr.py new file mode 100644 index 0000000..4f7550a --- /dev/null +++ b/pyulib/src/ulib/ext/xpath/expr.py @@ -0,0 +1,896 @@ +from __future__ import division +from itertools import * +import math +import operator +import re +import xml.dom +import weakref + +from exceptions import * + + +# +# Data model functions. 
+# + +def string_value(node): + """Compute the string-value of a node.""" + if (node.nodeType == node.DOCUMENT_NODE or + node.nodeType == node.ELEMENT_NODE): + s = u'' + for n in axes['descendant'](node): + if n.nodeType == n.TEXT_NODE: + s += n.data + return s + + elif node.nodeType == node.ATTRIBUTE_NODE: + return node.value + + elif (node.nodeType == node.PROCESSING_INSTRUCTION_NODE or + node.nodeType == node.COMMENT_NODE or + node.nodeType == node.TEXT_NODE): + return node.data + +def document_order(node): + """Compute a document order value for the node. + + cmp(document_order(a), document_order(b)) will return -1, 0, or 1 if + a is before, identical to, or after b in the document respectively. + + We represent document order as a list of sibling indexes. That is, + the third child of the document node has an order of [2]. The first + child of that node has an order of [2,0]. + + Attributes have a sibling index of -1 (coming before all children of + their node) and are further ordered by name--e.g., [2,0,-1,'href']. + + """ + + # Attributes: parent-order + [-1, attribute-name] + if node.nodeType == node.ATTRIBUTE_NODE: + order = document_order(node.ownerElement) + order.extend((-1, node.name)) + return order + + # The document root (hopefully): [] + if node.parentNode is None: + return [] + + # Determine which child this is of its parent. + sibpos = 0 + sib = node + while sib.previousSibling is not None: + sibpos += 1 + sib = sib.previousSibling + + # Order: parent-order + [sibling-position] + order = document_order(node.parentNode) + order.append(sibpos) + return order + +# +# Type functions, operating on the various XPath types. 
+# +# Internally, we use the following representations: +# nodeset - list of DOM tree nodes in document order +# string - str or unicode +# boolean - bool +# number - int or float +# + +def nodeset(v): + """Convert a value to a nodeset.""" + if not nodesetp(v): + raise XPathTypeError("value is not a node-set") + return v + +def nodesetp(v): + """Return true iff 'v' is a node-set.""" + if isinstance(v, list): + return True + +def string(v): + """Convert a value to a string.""" + if nodesetp(v): + if not v: + return u'' + return string_value(v[0]) + elif numberp(v): + if v == float('inf'): + return u'Infinity' + elif v == float('-inf'): + return u'-Infinity' + elif v != v: + return u'NaN' + elif int(v) == v and v <= 0xffffffff: + v = int(v) + return unicode(v) + elif booleanp(v): + return u'true' if v else u'false' + return v + +def stringp(v): + """Return true iff 'v' is a string.""" + return isinstance(v, basestring) + +def boolean(v): + """Convert a value to a boolean.""" + if nodesetp(v): + return len(v) > 0 + elif numberp(v): + if v == 0 or v != v: + return False + return True + elif stringp(v): + return v != '' + return v + +def booleanp(v): + """Return true iff 'v' is a boolean.""" + return isinstance(v, bool) + +def number(v): + """Convert a value to a number.""" + if nodesetp(v): + v = string(v) + try: + return float(v) + except ValueError: + return float('NaN') + +def numberp(v): + """Return true iff 'v' is a number.""" + return (not(isinstance(v, bool)) and + (isinstance(v, int) or isinstance(v, float))) + +class Expr(object): + """Abstract base class for XPath expressions.""" + + def evaluate(self, node, pos, size, context): + """Evaluate the expression. + + The context node, context position, and context size are passed as + arguments. + + Returns an XPath value: a nodeset, string, boolean, or number. 
+ + """ + +class BinaryOperatorExpr(Expr): + """Base class for all binary operators.""" + + def __init__(self, op, left, right): + self.op = op + self.left = left + self.right = right + + def evaluate(self, node, pos, size, context): + # Subclasses either override evaluate() or implement operate(). + return self.operate(self.left.evaluate(node, pos, size, context), + self.right.evaluate(node, pos, size, context)) + + def __str__(self): + return '(%s %s %s)' % (self.left, self.op, self.right) + +class AndExpr(BinaryOperatorExpr): + """ and """ + + def evaluate(self, node, pos, size, context): + # Convert each operand with boolean() first; 'and' still short-circuits. + return (boolean(self.left.evaluate(node, pos, size, context)) and + boolean(self.right.evaluate(node, pos, size, context))) + +class OrExpr(BinaryOperatorExpr): + """ or """ + + def evaluate(self, node, pos, size, context): + # Convert each operand with boolean() first; 'or' still short-circuits. + return (boolean(self.left.evaluate(node, pos, size, context)) or + boolean(self.right.evaluate(node, pos, size, context))) + +class EqualityExpr(BinaryOperatorExpr): + """ = , != , etc.""" + + operators = { + '=' : operator.eq, + '!=' : operator.ne, + '<=' : operator.le, + '<' : operator.lt, + '>=' : operator.ge, + '>' : operator.gt, + } + + def operate(self, a, b): + if nodesetp(a): + for node in a: + if self.operate(string_value(node), b): + return True + return False + + if nodesetp(b): + for node in b: + if self.operate(a, string_value(node)): + return True + return False + + if self.op in ('=', '!='): + if booleanp(a) or booleanp(b): + convert = boolean + elif numberp(a) or numberp(b): + convert = number + else: + convert = string + else: + convert = number + + a, b = convert(a), convert(b) + return self.operators[self.op](a, b) + +def divop(x, y): + try: + return x / y + except ZeroDivisionError: + if x == 0 or x != x: + return float('nan') + if x < 0: + return float('-inf') + return float('inf') + +class
ArithmeticalExpr(BinaryOperatorExpr): + """ + , - , etc.""" + + # Note that we must use math.fmod for the correct modulo semantics. + operators = { + '+' : operator.add, + '-' : operator.sub, + '*' : operator.mul, + 'div' : divop, + 'mod' : math.fmod + } + + def operate(self, a, b): + return self.operators[self.op](number(a), number(b)) + +class UnionExpr(BinaryOperatorExpr): + """ | """ + + def operate(self, a, b): + if not nodesetp(a) or not nodesetp(b): + raise XPathTypeError("union operand is not a node-set") + + # Need to sort the result to preserve document order. + return sorted(set(chain(a, b)), key=document_order) + +class NegationExpr(Expr): + """- """ + + def __init__(self, expr): + self.expr = expr + + def evaluate(self, node, pos, size, context): + return -number(self.expr.evaluate(node, pos, size, context)) + + def __str__(self): + return '(-%s)' % self.expr + +class LiteralExpr(Expr): + """Literals--either numbers or strings.""" + + def __init__(self, literal): + self.literal = literal + + def evaluate(self, node, pos, size, context): + return self.literal + + def __str__(self): + if stringp(self.literal): + if "'" in self.literal: + return '"%s"' % self.literal + else: + return "'%s'" % self.literal + return string(self.literal) + +class VariableReference(Expr): + """Variable references.""" + + def __init__(self, prefix, name): + self.prefix = prefix + self.name = name + + def evaluate(self, node, pos, size, context): + try: + if self.prefix is not None: + try: + namespaceURI = context.namespaces[self.prefix] + except KeyError: + raise XPathUnknownPrefixError(self.prefix) + return context.variables[(namespaceURI, self.name)] + else: + return context.variables[self.name] + except KeyError: + raise XPathUnknownVariableError(str(self)) + + def __str__(self): + if self.prefix is None: + return '$%s' % self.name + else: + return '$%s:%s' % (self.prefix, self.name) + +class Function(Expr): + """Functions.""" + + def __init__(self, name, args): + self.name 
= name + self.args = args + self.evaluate = getattr(self, 'f_%s' % name.replace('-', '_'), None) + if self.evaluate is None: + raise XPathUnknownFunctionError, 'unknown function "%s()"' % name + + if len(self.args) < self.evaluate.minargs: + raise XPathTypeError, 'too few arguments for "%s()"' % name + if (self.evaluate.maxargs is not None and + len(self.args) > self.evaluate.maxargs): + raise XPathTypeError, 'too many arguments for "%s()"' % name + + # + # XPath functions are implemented by methods of the Function class. + # + # A method implementing an XPath function is decorated with the function + # decorator, and receives the evaluated function arguments as positional + # parameters. + # + + def function(minargs, maxargs, implicit=False, first=False, convert=None): + """Function decorator. + + minargs -- Minimum number of arguments taken by the function. + maxargs -- Maximum number of arguments taken by the function. + implicit -- True for functions which operate on a nodeset consisting + of the current context node when passed no argument. + (e.g., string() and number().) + convert -- When non-None, a function used to filter function arguments. 
+ """ + def decorator(f): + def new_f(self, node, pos, size, context): + if implicit and len(self.args) == 0: + args = [[node]] + else: + args = [x.evaluate(node, pos, size, context) + for x in self.args] + if first: + args[0] = nodeset(args[0]) + if len(args[0]) > 0: + args[0] = args[0][0] + else: + args[0] = None + if convert is not None: + args = [convert(x) for x in args] + return f(self, node, pos, size, context, *args) + + new_f.minargs = minargs + new_f.maxargs = maxargs + new_f.__name__ = f.__name__ + new_f.__doc__ = f.__doc__ + return new_f + return decorator + + # Node Set Functions + + @function(0, 0) + def f_last(self, node, pos, size, context): + return size + + @function(0, 0) + def f_position(self, node, pos, size, context): + return pos + + @function(1, 1, convert=nodeset) + def f_count(self, node, pos, size, context, nodes): + return len(nodes) + + @function(1, 1) + def f_id(self, node, pos, size, context, arg): + if nodesetp(arg): + ids = (string_value(x) for x in arg) + else: + ids = [string(arg)] + if node.nodeType != node.DOCUMENT_NODE: + node = node.ownerDocument + return list(filter(None, (node.getElementById(id) for id in ids))) + + @function(0, 1, implicit=True, first=True) + def f_local_name(self, node, pos, size, context, argnode): + if argnode is None: + return '' + if (argnode.nodeType == argnode.ELEMENT_NODE or + argnode.nodeType == argnode.ATTRIBUTE_NODE): + return argnode.localName + elif argnode.nodeType == argnode.PROCESSING_INSTRUCTION_NODE: + return argnode.target + return '' + + @function(0, 1, implicit=True, first=True) + def f_namespace_uri(self, node, pos, size, context, argnode): + if argnode is None: + return '' + return argnode.namespaceURI + + @function(0, 1, implicit=True, first=True) + def f_name(self, node, pos, size, context, argnode): + if argnode is None: + return '' + if argnode.nodeType == argnode.ELEMENT_NODE: + return argnode.tagName + elif argnode.nodeType == argnode.ATTRIBUTE_NODE: + return argnode.name + elif 
argnode.nodeType == argnode.PROCESSING_INSTRUCTION_NODE: + return argnode.target + return '' + + # String Functions + + @function(0, 1, implicit=True, convert=string) + def f_string(self, node, pos, size, context, arg): + return arg + + @function(2, None, convert=string) + def f_concat(self, node, pos, size, context, *args): + return ''.join((x for x in args)) + + @function(2, 2, convert=string) + def f_starts_with(self, node, pos, size, context, a, b): + return a.startswith(b) + + @function(2, 2, convert=string) + def f_contains(self, node, pos, size, context, a, b): + return b in a + + @function(2, 2, convert=string) + def f_substring_before(self, node, pos, size, context, a, b): + try: + return a[0:a.index(b)] + except ValueError: + return '' + + @function(2, 2, convert=string) + def f_substring_after(self, node, pos, size, context, a, b): + try: + return a[a.index(b)+len(b):] + except ValueError: + return '' + + @function(2, 3) + def f_substring(self, node, pos, size, context, s, start, count=None): + s = string(s) + start = round(number(start)) + if start != start: + # Catch NaN + return '' + + if count is None: + end = len(s) + 1 + else: + end = start + round(number(count)) + if end != end: + # Catch NaN + return '' + if end > len(s): + end = len(s)+1 + + if start < 1: + start = 1 + if start > len(s): + return '' + if end <= start: + return '' + return s[int(start)-1:int(end)-1] + + @function(0, 1, implicit=True, convert=string) + def f_string_length(self, node, pos, size, context, s): + return len(s) + + @function(0, 1, implicit=True, convert=string) + def f_normalize_space(self, node, pos, size, context, s): + return re.sub(r'\s+', ' ', s.strip()) + + @function(3, 3, convert=lambda x: unicode(string(x))) + def f_translate(self, node, pos, size, context, s, source, target): + # str.translate() and unicode.translate() are completely different. + # The translate() arguments are coerced to unicode. 
#
# XPath axes.
#

# Dictionary of all axis functions, keyed by XPath axis name
# (for example 'child' or 'descendant-or-self').
axes = {}

def axisfn(reverse=False, principal_node_type=xml.dom.Node.ELEMENT_NODE):
    """Axis function decorator.

    An axis function will take a node as an argument and return a sequence
    over the nodes along an XPath axis.  Axis functions have two extra
    attributes indicating the axis direction and principal node type.

    reverse -- stored on the function as ``reverse``.
    principal_node_type -- stored on the function as
        ``principal_node_type``; defaults to ELEMENT_NODE.

    The decorated function is returned (not wrapped); its ``__name__`` is
    rewritten so that underscores become hyphens.
    """
    def decorate(f):
        # XPath axis names are hyphenated, Python identifiers cannot be:
        # following_sibling -> 'following-sibling'.
        f.__name__ = f.__name__.replace('_', '-')
        f.reverse = reverse
        f.principal_node_type = principal_node_type
        return f
    return decorate

def make_axes():
    """Define functions to walk each of the possible XPath axes.

    Every axis function defined here is stored in the module-level 'axes'
    dictionary under its (hyphenated) ``__name__``.  Nothing is returned.
    """

    @axisfn()
    def child(node):
        return node.childNodes

    @axisfn()
    def descendant(node):
        for kid in node.childNodes:
            for n in descendant_or_self(kid):
                yield n

    @axisfn()
    def parent(node):
        if node.parentNode is not None:
            yield node.parentNode

    @axisfn(reverse=True)
    def ancestor(node):
        while node.parentNode is not None:
            node = node.parentNode
            yield node

    @axisfn()
    def following_sibling(node):
        while node.nextSibling is not None:
            node = node.nextSibling
            yield node

    @axisfn(reverse=True)
    def preceding_sibling(node):
        while node.previousSibling is not None:
            node = node.previousSibling
            yield node

    @axisfn()
    def following(node):
        # Walk up the ancestor chain; at each level emit the subtrees of
        # all later siblings.
        while node is not None:
            while node.nextSibling is not None:
                node = node.nextSibling
                for n in descendant_or_self(node):
                    yield n
            node = node.parentNode

    @axisfn(reverse=True)
    def preceding(node):
        # Mirror image of 'following': earlier siblings' subtrees, each
        # emitted back to front.
        while node is not None:
            while node.previousSibling is not None:
                node = node.previousSibling
                # Could be more efficient here.
                for n in reversed(list(descendant_or_self(node))):
                    yield n
            node = node.parentNode

    @axisfn(principal_node_type=xml.dom.Node.ATTRIBUTE_NODE)
    def attribute(node):
        attrs = node.attributes
        if attrs is None:
            return ()
        # range() rather than xrange(): attribute maps are small and
        # range() exists on every Python version.
        return (attrs.item(i) for i in range(attrs.length))

    @axisfn()
    def namespace(node):
        raise XPathNotImplementedError("namespace axis is not implemented")

    @axisfn()
    def self(node):
        yield node

    @axisfn()
    def descendant_or_self(node):
        yield node
        for kid in node.childNodes:
            for n in descendant_or_self(kid):
                yield n

    @axisfn(reverse=True)
    def ancestor_or_self(node):
        return chain([node], ancestor(node))

    # Register the axes explicitly instead of scanning locals(), so that a
    # helper variable added to this function later is never mistaken for
    # an axis.
    for axis in (child, descendant, parent, ancestor, following_sibling,
                 preceding_sibling, following, preceding, attribute,
                 namespace, self, descendant_or_self, ancestor_or_self):
        axes[axis.__name__] = axis

make_axes()
class AbsolutePathExpr(Expr):
    """Absolute location paths.

    Evaluation starts from the document node that owns the context node.
    'path' is the expression evaluated from there, or None for a path that
    selects the document node itself.
    """

    def __init__(self, path):
        self.path = path

    def evaluate(self, node, pos, size, context):
        root = node
        if root.nodeType != root.DOCUMENT_NODE:
            root = root.ownerDocument
        if self.path is not None:
            # The document node is evaluated as the only item in its
            # context: position 1 of 1.
            return self.path.evaluate(root, 1, 1, context)
        return [root]

    def __str__(self):
        return '/%s' % (self.path or '')

class PathExpr(Expr):
    """Location path expressions.

    'steps' is the ordered list of step expressions making up the path.
    """

    def __init__(self, steps):
        self.steps = steps

    def evaluate(self, node, pos, size, context):
        # Only the first step sees the caller's context.  Its value may be
        # anything when it is the sole step, but must be a node-set when
        # further steps follow.
        current = self.steps[0].evaluate(node, pos, size, context)
        if len(self.steps) > 1 and not nodesetp(current):
            raise XPathTypeError("path step is not a node-set")

        # Each later step runs once per node produced by the previous
        # step; the per-node results are merged into one node-set.
        for step in self.steps[1:]:
            merged = []
            total = len(current)
            for index, item in enumerate(current):
                produced = step.evaluate(item, index + 1, total, context)
                if not nodesetp(produced):
                    raise XPathTypeError("path step is not a node-set")
                merge_into_nodeset(merged, produced)
            current = merged

        return current

    def __str__(self):
        return '/'.join([str(step) for step in self.steps])

class PredicateList(Expr):
    """A list of predicates.

    Predicates are handled as an expression wrapping the expression
    filtered by the predicates.

    'axis' names the axis whose direction decides the order in which
    positions are counted while filtering.
    """
    def __init__(self, expr, predicates, axis='child'):
        self.predicates = predicates
        self.expr = expr
        self.axis = axes[axis]

    def evaluate(self, node, pos, size, context):
        candidates = self.expr.evaluate(node, pos, size, context)
        if not nodesetp(candidates):
            raise XPathTypeError("predicate input is not a node-set")

        # On a reverse axis, positions are counted from the other end, so
        # flip the list for filtering and flip it back afterwards.
        if self.axis.reverse:
            candidates.reverse()

        for predicate in self.predicates:
            total = len(candidates)
            kept = []
            for index, candidate in enumerate(candidates):
                position = index + 1
                outcome = predicate.evaluate(candidate, position, total,
                                             context)
                # A numeric predicate selects by position; anything else
                # selects by its boolean value.
                if numberp(outcome):
                    selected = (outcome == position)
                else:
                    selected = boolean(outcome)
                if selected:
                    kept.append(candidate)
            candidates = kept

        if self.axis.reverse:
            candidates.reverse()

        return candidates

    def __str__(self):
        inner = str(self.expr)
        if '/' in inner:
            inner = '(%s)' % inner
        suffix = ''.join(['[%s]' % predicate for predicate in self.predicates])
        return inner + suffix

class AxisStep(Expr):
    """One step in a location path expression.

    'axis' is the axis name looked up in 'axes'; 'test' is the node test
    applied to every node on that axis (any node when omitted).  The
    'predicates' argument is accepted but not used by this class.
    """

    def __init__(self, axis, test=None, predicates=None):
        if test is not None:
            chosen = test
        else:
            chosen = AnyKindTest()
        self.axis = axes[axis]
        self.test = chosen

    def evaluate(self, node, pos, size, context):
        axis = self.axis
        found = [n for n in axis(node) if self.test.match(n, axis, context)]

        # Reverse axes yield nodes back to front; flip the result.
        if axis.reverse:
            found.reverse()

        return found

    def __str__(self):
        return '%s::%s' % (self.axis.__name__, self.test)
class Test(object):
    """Abstract base class for node tests."""

    def match(self, node, axis, context):
        """Return True if 'node' matches the test along 'axis'.

        Subclasses must override this; the base implementation raises
        NotImplementedError so that a missing override cannot silently
        behave like "never matches".
        """
        raise NotImplementedError(
            '%s does not implement match()' % self.__class__.__name__)

class NameTest(Test):
    """Node test that matches nodes by namespace and local name.

    prefix -- namespace prefix, '*' for any namespace, or None when the
        name is unprefixed.
    localpart -- local name, or '*' for any local name.
    """

    def __init__(self, prefix, localpart):
        self.prefix = prefix
        self.localName = localpart
        # An unprefixed '*' is stored with a wildcard prefix as well, so
        # it matches names in any namespace.
        if self.prefix is None and self.localName == '*':
            self.prefix = '*'

    def match(self, node, axis, context):
        """Return True if 'node' has the axis' principal node type and its
        namespace and local name satisfy this test.

        Raises XPathUnknownPrefixError when the prefix is not present in
        context.namespaces.
        """
        if node.nodeType != axis.principal_node_type:
            return False

        if self.prefix != '*':
            if self.prefix is not None:
                try:
                    namespaceURI = context.namespaces[self.prefix]
                except KeyError:
                    raise XPathUnknownPrefixError(self.prefix)
            elif axis.principal_node_type == node.ELEMENT_NODE:
                # Unprefixed names pick up the default namespace only on
                # axes whose principal node type is element.
                namespaceURI = context.default_namespace
            else:
                namespaceURI = None
            if namespaceURI != node.namespaceURI:
                return False

        return self.localName == '*' or self.localName == node.localName

    def __str__(self):
        if self.prefix is not None:
            return '%s:%s' % (self.prefix, self.localName)
        else:
            return self.localName

class PITest(Test):
    """Node test for processing instructions, optionally with a given
    target name."""

    def __init__(self, name=None):
        self.name = name

    def match(self, node, axis, context):
        return (node.nodeType == node.PROCESSING_INSTRUCTION_NODE and
                (self.name is None or node.target == self.name))

    def __str__(self):
        if self.name is None:
            name = ''
        elif "'" in self.name:
            # Use double quotes when the name itself contains a single
            # quote.
            name = '"%s"' % self.name
        else:
            name = "'%s'" % self.name
        return 'processing-instruction(%s)' % name

class CommentTest(Test):
    """Node test matching comment nodes."""

    def match(self, node, axis, context):
        return node.nodeType == node.COMMENT_NODE

    def __str__(self):
        return 'comment()'

class TextTest(Test):
    """Node test matching text nodes."""

    def match(self, node, axis, context):
        return node.nodeType == node.TEXT_NODE

    def __str__(self):
        return 'text()'

class AnyKindTest(Test):
    """Node test matching every node."""

    def match(self, node, axis, context):
        return True

    def __str__(self):
        return 'node()'
file mode 100644 index 0000000..df75bb1 --- /dev/null +++ b/pyulib/src/ulib/ext/xpath/parser.g @@ -0,0 +1,252 @@ +import expr as X +from yappsrt import * + +%% + +parser XPath: + option: 'no-support-module' + + ignore: r'\s+' + token END: r'$' + + token FORWARD_AXIS_NAME: + r'child|descendant-or-self|attribute|self|descendant|following-sibling|following|namespace' + token REVERSE_AXIS_NAME: + r'parent|preceding-sibling|preceding|ancestor-or-self|ancestor' + + # Dire hack here, since yapps2 has only one token of lookahead: NCNAME + # does not match when followed by a open paren. + token NCNAME: r'[a-zA-Z_][a-zA-Z0-9_\-\.\w]*(?!\()' + token FUNCNAME: r'[a-zA-Z_][a-zA-Z0-9_\-\.\w]*' + + token DQUOTE: r'\"(?:[^\"])*\"' + token SQUOTE: r"\'(?:[^\'])*\'" + token NUMBER: r'((\.[0-9]+)|([0-9]+(\.[0-9]*)?))([eE][\+\-]?[0-9]+)?' + token EQ_COMP: r'\!?\=' + token REL_COMP: r'[\<\>]\=?' + token ADD_COMP: r'[\+\-]' + token MUL_COMP: r'\*|div|mod' + + rule XPath: + Expr END {{ return Expr }} + + rule Expr: + OrExpr {{ return OrExpr }} + + rule OrExpr: + AndExpr {{ Expr = AndExpr }} + ( + r'or' AndExpr + {{ Expr = X.OrExpr('or', Expr, AndExpr) }} + )* {{ return Expr }} + + rule AndExpr: + EqualityExpr {{ Expr = EqualityExpr }} + ( + r'and' EqualityExpr + {{ Expr = X.AndExpr('and', Expr, EqualityExpr) }} + )* {{ return Expr }} + + rule EqualityExpr: + RelationalExpr {{ Expr = RelationalExpr }} + ( + EQ_COMP + RelationalExpr + {{ Expr = X.EqualityExpr(EQ_COMP, Expr, RelationalExpr) }} + )* {{ return Expr }} + + rule RelationalExpr: + AdditiveExpr {{ Expr = AdditiveExpr }} + ( + REL_COMP + AdditiveExpr + {{ Expr = X.EqualityExpr(REL_COMP, Expr, AdditiveExpr) }} + )* {{ return Expr }} + + rule AdditiveExpr: + MultiplicativeExpr {{ Expr = MultiplicativeExpr }} + ( + ADD_COMP + MultiplicativeExpr + {{ Expr = X.ArithmeticalExpr(ADD_COMP, Expr, MultiplicativeExpr) }} + )* {{ return Expr }} + + rule MultiplicativeExpr: + UnionExpr {{ Expr = UnionExpr }} + ( + MUL_COMP + UnionExpr + {{ 
Expr = X.ArithmeticalExpr(MUL_COMP, Expr, UnionExpr) }} + )* {{ return Expr }} + + rule UnionExpr: + UnaryExpr {{ Expr = UnaryExpr }} + ( + '\|' UnaryExpr + {{ Expr = X.UnionExpr('|', Expr, UnaryExpr) }} + )* {{ return Expr }} + + rule UnaryExpr: + r'\-' ValueExpr {{ return X.NegationExpr(ValueExpr) }} + | ValueExpr {{ return ValueExpr }} + + rule ValueExpr: + PathExpr {{ return PathExpr }} + + rule PathExpr: + r'\/' {{ path = None }} + [ + RelativePathExpr {{ path = RelativePathExpr }} + ] {{ return X.AbsolutePathExpr(path) }} + | r'\/\/' RelativePathExpr + {{ step = X.AxisStep('descendant-or-self') }} + {{ RelativePathExpr.steps.insert(0, step) }} + {{ return X.AbsolutePathExpr(RelativePathExpr) }} + | RelativePathExpr {{ return RelativePathExpr }} + + rule RelativePathExpr: + StepExpr {{ steps = [StepExpr] }} + ( + ( + r'\/' + | r'\/\/' + {{ steps.append(X.AxisStep('descendant-or-self')) }} + ) + StepExpr {{ steps.append(StepExpr) }} + )* + {{ return X.PathExpr(steps) }} + + rule StepExpr: + AxisStep {{ return AxisStep }} + | FilterExpr {{ return FilterExpr }} + + rule AxisStep: + ( + ForwardStep {{ step = ForwardStep }} + | ReverseStep {{ step = ReverseStep }} + ) {{ expr = X.AxisStep(*step) }} + [ + PredicateList + {{ expr = X.PredicateList(expr, PredicateList, step[0]) }} + ] + {{ return expr }} + + rule ForwardStep: + ForwardAxis NodeTest {{ return [ForwardAxis, NodeTest] }} + | AbbrevForwardStep {{ return AbbrevForwardStep }} + + rule ForwardAxis: + FORWARD_AXIS_NAME r'::' {{ return FORWARD_AXIS_NAME }} + + rule AbbrevForwardStep: + {{ axis = 'child' }} + [ + r'@' {{ axis = 'attribute' }} + ] + NodeTest {{ return [axis, NodeTest] }} + + rule ReverseStep: + ReverseAxis NodeTest {{ return [ReverseAxis, NodeTest] }} + | AbbrevReverseStep {{ return AbbrevReverseStep }} + + rule ReverseAxis: + REVERSE_AXIS_NAME r'::' {{ return REVERSE_AXIS_NAME }} + + rule AbbrevReverseStep: + r'\.\.' 
{{ return ['parent', None] }} + + rule NodeTest: + KindTest {{ return KindTest }} + | NameTest {{ return NameTest }} + + rule NameTest: + # We also support the XPath 2.0 :*. + {{ prefix = None }} + WildcardOrNCName {{ localpart = WildcardOrNCName }} + [ + r':' WildcardOrNCName {{ prefix = localpart }} + {{ localpart = WildcardOrNCName }} + ] + {{ return X.NameTest(prefix, localpart) }} + + rule WildcardOrNCName: + r'\*' {{ return '*' }} + | NCNAME {{ return NCNAME }} + + rule FilterExpr: + PrimaryExpr + [ + PredicateList + {{ PrimaryExpr = X.PredicateList(PrimaryExpr,PredicateList) }} + ] {{ return PrimaryExpr }} + + rule PredicateList: + Predicate {{ predicates = [Predicate] }} + ( + Predicate {{ predicates.append(Predicate) }} + )* {{ return predicates }} + + rule Predicate: + r'\[' Expr r'\]' {{ return Expr }} + + rule PrimaryExpr: + Literal {{ return X.LiteralExpr(Literal) }} + | VariableReference {{ return VariableReference }} + | r'\(' Expr r'\)' {{ return Expr }} + | ContextItemExpr {{ return ContextItemExpr }} + | FunctionCall {{ return FunctionCall }} + + rule VariableReference: + r'\$' QName + {{ return X.VariableReference(*QName) }} + + rule ContextItemExpr: + r'\.' 
{{ return X.AxisStep('self') }} + + rule FunctionCall: + FUNCNAME r'\(' {{ args = [] }} + [ + Expr {{ args.append(Expr) }} + ( + r'\,' Expr {{ args.append(Expr) }} + )* + ] r'\)' {{ return X.Function(FUNCNAME, args) }} + + rule KindTest: + PITest {{ return PITest }} + | CommentTest {{ return CommentTest }} + | TextTest {{ return TextTest }} + | AnyKindTest {{ return AnyKindTest }} + + rule PITest: + r'processing-instruction' {{ name = None }} + r'\(' [ + NCNAME {{ name = NCNAME }} + | StringLiteral {{ name = StringLiteral }} + ] r'\)' {{ return X.PITest(name) }} + + rule CommentTest: + r'comment' r'\(' r'\)' {{ return X.CommentTest() }} + + rule TextTest: + r'text' r'\(' r'\)' {{ return X.TextTest() }} + + rule AnyKindTest: + r'node' r'\(' r'\)' {{ return X.AnyKindTest() }} + + rule Literal: + NumericLiteral {{ return NumericLiteral }} + | StringLiteral {{ return StringLiteral }} + + rule NumericLiteral: + NUMBER {{ return float(NUMBER) }} + + rule StringLiteral: + DQUOTE {{ return DQUOTE[1:-1] }} + | SQUOTE {{ return SQUOTE[1:-1] }} + + rule QName: + NCNAME {{ name = NCNAME }} + [ + r'\:' NCNAME {{ return (name, NCNAME) }} + ] {{ return (None, name) }} diff --git a/pyulib/src/ulib/ext/xpath/parser.py b/pyulib/src/ulib/ext/xpath/parser.py new file mode 100644 index 0000000..bb673f9 --- /dev/null +++ b/pyulib/src/ulib/ext/xpath/parser.py @@ -0,0 +1,420 @@ +import expr as X +from yappsrt import * + + +from string import * +import re + +class XPathScanner(Scanner): + patterns = [ + ("r'\\:'", re.compile('\\:')), + ("r'node'", re.compile('node')), + ("r'text'", re.compile('text')), + ("r'comment'", re.compile('comment')), + ("r'processing-instruction'", re.compile('processing-instruction')), + ("r'\\,'", re.compile('\\,')), + ("r'\\.'", re.compile('\\.')), + ("r'\\$'", re.compile('\\$')), + ("r'\\)'", re.compile('\\)')), + ("r'\\('", re.compile('\\(')), + ("r'\\]'", re.compile('\\]')), + ("r'\\['", re.compile('\\[')), + ("r'\\*'", re.compile('\\*')), + ("r':'", 
re.compile(':')), + ("r'\\.\\.'", re.compile('\\.\\.')), + ("r'@'", re.compile('@')), + ("r'::'", re.compile('::')), + ("r'\\/\\/'", re.compile('\\/\\/')), + ("r'\\/'", re.compile('\\/')), + ("r'\\-'", re.compile('\\-')), + ("'\\|'", re.compile('\\|')), + ("r'and'", re.compile('and')), + ("r'or'", re.compile('or')), + ('\\s+', re.compile('\\s+')), + ('END', re.compile('$')), + ('FORWARD_AXIS_NAME', re.compile('child|descendant-or-self|attribute|self|descendant|following-sibling|following|namespace')), + ('REVERSE_AXIS_NAME', re.compile('parent|preceding-sibling|preceding|ancestor-or-self|ancestor')), + ('NCNAME', re.compile('[a-zA-Z_][a-zA-Z0-9_\\-\\.\\w]*(?!\\()')), + ('FUNCNAME', re.compile('[a-zA-Z_][a-zA-Z0-9_\\-\\.\\w]*')), + ('DQUOTE', re.compile('\\"(?:[^\\"])*\\"')), + ('SQUOTE', re.compile("\\'(?:[^\\'])*\\'")), + ('NUMBER', re.compile('((\\.[0-9]+)|([0-9]+(\\.[0-9]*)?))([eE][\\+\\-]?[0-9]+)?')), + ('EQ_COMP', re.compile('\\!?\\=')), + ('REL_COMP', re.compile('[\\<\\>]\\=?')), + ('ADD_COMP', re.compile('[\\+\\-]')), + ('MUL_COMP', re.compile('\\*|div|mod')), + ] + def __init__(self, str): + Scanner.__init__(self,None,['\\s+'],str) + +class XPath(Parser): + def XPath(self): + Expr = self.Expr() + END = self._scan('END') + return Expr + + def Expr(self): + OrExpr = self.OrExpr() + return OrExpr + + def OrExpr(self): + AndExpr = self.AndExpr() + Expr = AndExpr + while self._peek("r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == "r'or'": + self._scan("r'or'") + AndExpr = self.AndExpr() + Expr = X.OrExpr('or', Expr, AndExpr) + return Expr + + def AndExpr(self): + EqualityExpr = self.EqualityExpr() + Expr = EqualityExpr + while self._peek("r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == "r'and'": + self._scan("r'and'") + EqualityExpr = self.EqualityExpr() + Expr = X.AndExpr('and', Expr, EqualityExpr) + return Expr + + def EqualityExpr(self): + RelationalExpr = self.RelationalExpr() + Expr = RelationalExpr + while self._peek('EQ_COMP', "r'and'", "r'or'", 
'END', "r'\\]'", "r'\\)'", "r'\\,'") == 'EQ_COMP': + EQ_COMP = self._scan('EQ_COMP') + RelationalExpr = self.RelationalExpr() + Expr = X.EqualityExpr(EQ_COMP, Expr, RelationalExpr) + return Expr + + def RelationalExpr(self): + AdditiveExpr = self.AdditiveExpr() + Expr = AdditiveExpr + while self._peek('REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == 'REL_COMP': + REL_COMP = self._scan('REL_COMP') + AdditiveExpr = self.AdditiveExpr() + Expr = X.EqualityExpr(REL_COMP, Expr, AdditiveExpr) + return Expr + + def AdditiveExpr(self): + MultiplicativeExpr = self.MultiplicativeExpr() + Expr = MultiplicativeExpr + while self._peek('ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == 'ADD_COMP': + ADD_COMP = self._scan('ADD_COMP') + MultiplicativeExpr = self.MultiplicativeExpr() + Expr = X.ArithmeticalExpr(ADD_COMP, Expr, MultiplicativeExpr) + return Expr + + def MultiplicativeExpr(self): + UnionExpr = self.UnionExpr() + Expr = UnionExpr + while self._peek('MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == 'MUL_COMP': + MUL_COMP = self._scan('MUL_COMP') + UnionExpr = self.UnionExpr() + Expr = X.ArithmeticalExpr(MUL_COMP, Expr, UnionExpr) + return Expr + + def UnionExpr(self): + UnaryExpr = self.UnaryExpr() + Expr = UnaryExpr + while self._peek("'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == "'\\|'": + self._scan("'\\|'") + UnaryExpr = self.UnaryExpr() + Expr = X.UnionExpr('|', Expr, UnaryExpr) + return Expr + + def UnaryExpr(self): + _token_ = self._peek("r'\\-'", "r'\\/'", "r'\\/\\/'", "r'\\('", 'FORWARD_AXIS_NAME', "r'@'", 'REVERSE_AXIS_NAME', "r'\\.\\.'", "r'\\$'", "r'\\.'", 'FUNCNAME', 'NUMBER', 'DQUOTE', 'SQUOTE', "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') + if _token_ == "r'\\-'": + self._scan("r'\\-'") + ValueExpr = self.ValueExpr() + 
return X.NegationExpr(ValueExpr) + else: + ValueExpr = self.ValueExpr() + return ValueExpr + + def ValueExpr(self): + PathExpr = self.PathExpr() + return PathExpr + + def PathExpr(self): + _token_ = self._peek("r'\\/'", "r'\\/\\/'", "r'\\('", 'FORWARD_AXIS_NAME', "r'@'", 'REVERSE_AXIS_NAME', "r'\\.\\.'", "r'\\$'", "r'\\.'", 'FUNCNAME', 'NUMBER', 'DQUOTE', 'SQUOTE', "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') + if _token_ == "r'\\/'": + self._scan("r'\\/'") + path = None + if self._peek("r'\\('", 'FORWARD_AXIS_NAME', "r'@'", 'REVERSE_AXIS_NAME', "r'\\.\\.'", "r'\\$'", "r'\\.'", 'FUNCNAME', 'NUMBER', 'DQUOTE', 'SQUOTE', "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME', "'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") not in ["'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'"]: + RelativePathExpr = self.RelativePathExpr() + path = RelativePathExpr + return X.AbsolutePathExpr(path) + elif _token_ == "r'\\/\\/'": + self._scan("r'\\/\\/'") + RelativePathExpr = self.RelativePathExpr() + step = X.AxisStep('descendant-or-self') + RelativePathExpr.steps.insert(0, step) + return X.AbsolutePathExpr(RelativePathExpr) + else: + RelativePathExpr = self.RelativePathExpr() + return RelativePathExpr + + def RelativePathExpr(self): + StepExpr = self.StepExpr() + steps = [StepExpr] + while self._peek("r'\\/'", "r'\\/\\/'", "'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") in ["r'\\/'", "r'\\/\\/'"]: + _token_ = self._peek("r'\\/'", "r'\\/\\/'") + if _token_ == "r'\\/'": + self._scan("r'\\/'") + else:# == "r'\\/\\/'" + self._scan("r'\\/\\/'") + steps.append(X.AxisStep('descendant-or-self')) + StepExpr = self.StepExpr() + steps.append(StepExpr) + return X.PathExpr(steps) + + def StepExpr(self): + _token_ = self._peek("r'\\('", 
'FORWARD_AXIS_NAME', "r'@'", 'REVERSE_AXIS_NAME', "r'\\.\\.'", "r'\\$'", "r'\\.'", 'FUNCNAME', 'NUMBER', 'DQUOTE', 'SQUOTE', "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') + if _token_ not in ["r'\\('", "r'\\$'", "r'\\.'", 'FUNCNAME', 'NUMBER', 'DQUOTE', 'SQUOTE']: + AxisStep = self.AxisStep() + return AxisStep + else: + FilterExpr = self.FilterExpr() + return FilterExpr + + def AxisStep(self): + _token_ = self._peek('FORWARD_AXIS_NAME', "r'@'", 'REVERSE_AXIS_NAME', "r'\\.\\.'", "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') + if _token_ not in ['REVERSE_AXIS_NAME', "r'\\.\\.'"]: + ForwardStep = self.ForwardStep() + step = ForwardStep + else:# in ['REVERSE_AXIS_NAME', "r'\\.\\.'"] + ReverseStep = self.ReverseStep() + step = ReverseStep + expr = X.AxisStep(*step) + if self._peek("r'\\['", "r'\\/'", "r'\\/\\/'", "'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == "r'\\['": + PredicateList = self.PredicateList() + expr = X.PredicateList(expr, PredicateList, step[0]) + return expr + + def ForwardStep(self): + _token_ = self._peek('FORWARD_AXIS_NAME', "r'@'", "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') + if _token_ == 'FORWARD_AXIS_NAME': + ForwardAxis = self.ForwardAxis() + NodeTest = self.NodeTest() + return [ForwardAxis, NodeTest] + else: + AbbrevForwardStep = self.AbbrevForwardStep() + return AbbrevForwardStep + + def ForwardAxis(self): + FORWARD_AXIS_NAME = self._scan('FORWARD_AXIS_NAME') + self._scan("r'::'") + return FORWARD_AXIS_NAME + + def AbbrevForwardStep(self): + axis = 'child' + if self._peek("r'@'", "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') == "r'@'": + self._scan("r'@'") + axis = 'attribute' + NodeTest = self.NodeTest() + return [axis, NodeTest] + + def ReverseStep(self): + _token_ = self._peek('REVERSE_AXIS_NAME', "r'\\.\\.'") + if 
_token_ == 'REVERSE_AXIS_NAME': + ReverseAxis = self.ReverseAxis() + NodeTest = self.NodeTest() + return [ReverseAxis, NodeTest] + else:# == "r'\\.\\.'" + AbbrevReverseStep = self.AbbrevReverseStep() + return AbbrevReverseStep + + def ReverseAxis(self): + REVERSE_AXIS_NAME = self._scan('REVERSE_AXIS_NAME') + self._scan("r'::'") + return REVERSE_AXIS_NAME + + def AbbrevReverseStep(self): + self._scan("r'\\.\\.'") + return ['parent', None] + + def NodeTest(self): + _token_ = self._peek("r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') + if _token_ not in ["r'\\*'", 'NCNAME']: + KindTest = self.KindTest() + return KindTest + else:# in ["r'\\*'", 'NCNAME'] + NameTest = self.NameTest() + return NameTest + + def NameTest(self): + prefix = None + WildcardOrNCName = self.WildcardOrNCName() + localpart = WildcardOrNCName + if self._peek("r':'", "r'\\['", "r'\\/'", "r'\\/\\/'", "'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == "r':'": + self._scan("r':'") + WildcardOrNCName = self.WildcardOrNCName() + prefix = localpart + localpart = WildcardOrNCName + return X.NameTest(prefix, localpart) + + def WildcardOrNCName(self): + _token_ = self._peek("r'\\*'", 'NCNAME') + if _token_ == "r'\\*'": + self._scan("r'\\*'") + return '*' + else:# == 'NCNAME' + NCNAME = self._scan('NCNAME') + return NCNAME + + def FilterExpr(self): + PrimaryExpr = self.PrimaryExpr() + if self._peek("r'\\['", "r'\\/'", "r'\\/\\/'", "'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == "r'\\['": + PredicateList = self.PredicateList() + PrimaryExpr = X.PredicateList(PrimaryExpr,PredicateList) + return PrimaryExpr + + def PredicateList(self): + Predicate = self.Predicate() + predicates = [Predicate] + while self._peek("r'\\['", "r'\\/'", "r'\\/\\/'", "'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", 
"r'\\,'") == "r'\\['": + Predicate = self.Predicate() + predicates.append(Predicate) + return predicates + + def Predicate(self): + self._scan("r'\\['") + Expr = self.Expr() + self._scan("r'\\]'") + return Expr + + def PrimaryExpr(self): + _token_ = self._peek("r'\\('", "r'\\$'", "r'\\.'", 'FUNCNAME', 'NUMBER', 'DQUOTE', 'SQUOTE') + if _token_ not in ["r'\\('", "r'\\$'", "r'\\.'", 'FUNCNAME']: + Literal = self.Literal() + return X.LiteralExpr(Literal) + elif _token_ == "r'\\$'": + VariableReference = self.VariableReference() + return VariableReference + elif _token_ == "r'\\('": + self._scan("r'\\('") + Expr = self.Expr() + self._scan("r'\\)'") + return Expr + elif _token_ == "r'\\.'": + ContextItemExpr = self.ContextItemExpr() + return ContextItemExpr + else:# == 'FUNCNAME' + FunctionCall = self.FunctionCall() + return FunctionCall + + def VariableReference(self): + self._scan("r'\\$'") + QName = self.QName() + return X.VariableReference(*QName) + + def ContextItemExpr(self): + self._scan("r'\\.'") + return X.AxisStep('self') + + def FunctionCall(self): + FUNCNAME = self._scan('FUNCNAME') + self._scan("r'\\('") + args = [] + if self._peek("r'\\,'", "r'\\)'", "r'\\-'", "r'\\/'", "r'\\/\\/'", "r'\\('", 'FORWARD_AXIS_NAME', "r'@'", 'REVERSE_AXIS_NAME', "r'\\.\\.'", "r'\\$'", "r'\\.'", 'FUNCNAME', 'NUMBER', 'DQUOTE', 'SQUOTE', "r'processing-instruction'", "r'comment'", "r'text'", "r'node'", "r'\\*'", 'NCNAME') not in ["r'\\,'", "r'\\)'"]: + Expr = self.Expr() + args.append(Expr) + while self._peek("r'\\,'", "r'\\)'") == "r'\\,'": + self._scan("r'\\,'") + Expr = self.Expr() + args.append(Expr) + self._scan("r'\\)'") + return X.Function(FUNCNAME, args) + + def KindTest(self): + _token_ = self._peek("r'processing-instruction'", "r'comment'", "r'text'", "r'node'") + if _token_ == "r'processing-instruction'": + PITest = self.PITest() + return PITest + elif _token_ == "r'comment'": + CommentTest = self.CommentTest() + return CommentTest + elif _token_ == "r'text'": + 
TextTest = self.TextTest() + return TextTest + else:# == "r'node'" + AnyKindTest = self.AnyKindTest() + return AnyKindTest + + def PITest(self): + self._scan("r'processing-instruction'") + name = None + self._scan("r'\\('") + if self._peek('NCNAME', "r'\\)'", 'DQUOTE', 'SQUOTE') != "r'\\)'": + _token_ = self._peek('NCNAME', 'DQUOTE', 'SQUOTE') + if _token_ == 'NCNAME': + NCNAME = self._scan('NCNAME') + name = NCNAME + else:# in ['DQUOTE', 'SQUOTE'] + StringLiteral = self.StringLiteral() + name = StringLiteral + self._scan("r'\\)'") + return X.PITest(name) + + def CommentTest(self): + self._scan("r'comment'") + self._scan("r'\\('") + self._scan("r'\\)'") + return X.CommentTest() + + def TextTest(self): + self._scan("r'text'") + self._scan("r'\\('") + self._scan("r'\\)'") + return X.TextTest() + + def AnyKindTest(self): + self._scan("r'node'") + self._scan("r'\\('") + self._scan("r'\\)'") + return X.AnyKindTest() + + def Literal(self): + _token_ = self._peek('NUMBER', 'DQUOTE', 'SQUOTE') + if _token_ == 'NUMBER': + NumericLiteral = self.NumericLiteral() + return NumericLiteral + else:# in ['DQUOTE', 'SQUOTE'] + StringLiteral = self.StringLiteral() + return StringLiteral + + def NumericLiteral(self): + NUMBER = self._scan('NUMBER') + return float(NUMBER) + + def StringLiteral(self): + _token_ = self._peek('DQUOTE', 'SQUOTE') + if _token_ == 'DQUOTE': + DQUOTE = self._scan('DQUOTE') + return DQUOTE[1:-1] + else:# == 'SQUOTE' + SQUOTE = self._scan('SQUOTE') + return SQUOTE[1:-1] + + def QName(self): + NCNAME = self._scan('NCNAME') + name = NCNAME + if self._peek("r'\\:'", "r'\\['", "r'\\/'", "r'\\/\\/'", "'\\|'", 'MUL_COMP', 'ADD_COMP', 'REL_COMP', 'EQ_COMP', "r'and'", "r'or'", 'END', "r'\\]'", "r'\\)'", "r'\\,'") == "r'\\:'": + self._scan("r'\\:'") + NCNAME = self._scan('NCNAME') + return (name, NCNAME) + return (None, name) + + +def parse(rule, text): + P = XPath(XPathScanner(text)) + return wrap_error_reporter(P, rule) + +if __name__ == '__main__': + from sys import 
class SyntaxError(Exception):
    """When we run into an unexpected token, this is the exception to use.

    Note that inside this module the name shadows the built-in
    SyntaxError.

    pos -- character offset in the input where scanning failed (-1 when
        unknown).
    msg -- human readable description.
    """
    def __init__(self, pos=-1, msg="Bad Token"):
        Exception.__init__(self)
        self.pos = pos
        self.msg = msg

    def __repr__(self):
        if self.pos < 0:
            return "#"
        return "SyntaxError[@ char %s: %s]" % (repr(self.pos), self.msg)

class NoMoreTokens(Exception):
    """Another exception object, for when we run out of tokens"""
    pass

class Scanner(object):
    """Regex-driven tokenizer used by generated parsers.

    Tokens are stored as tuples (start, end, terminal name, matched text).
    """

    def __init__(self, patterns, ignore, input):
        """Patterns is [(terminal,regex)...]
        Ignore is [terminal,...];
        Input is a string"""
        self.tokens = []
        self.restrictions = []
        self.input = input
        self.pos = 0
        self.ignore = ignore
        # The stored patterns are pairs (terminal, compiled regex).  If
        # the patterns variable passed in to the constructor is None, we
        # assume that the class already has a proper .patterns list
        # constructed.
        if patterns is not None:
            self.patterns = []
            for k, r in patterns:
                self.patterns.append((k, re.compile(r)))

    def token(self, i, restrict=0):
        """Get the i'th token, and if i is one past the end, then scan
        for another token; restrict is a list of tokens that
        are allowed, or 0 for any token.

        Raises NoMoreTokens when token i is not available, and
        NotImplementedError when an already scanned token is requested
        with a terminal that was not allowed when it was scanned.
        """
        if i == len(self.tokens):
            self.scan(restrict)
        if i < len(self.tokens):
            # Make sure the restriction is more restricted
            if restrict and self.restrictions[i]:
                for r in restrict:
                    if r not in self.restrictions[i]:
                        raise NotImplementedError("Unimplemented: restriction set changed")
            return self.tokens[i]
        raise NoMoreTokens()

    def __repr__(self):
        """Print the last 10 tokens that have been scanned in"""
        output = ''
        for t in self.tokens[-10:]:
            # NOTE(review): spacing inside this format string is kept as
            # found; confirm against the upstream yapps runtime.
            output = '%s\n (@%s) %s = %s' % (output, t[0], t[2], repr(t[3]))
        return output

    def scan(self, restrict):
        """Should scan another token and add it to the list, self.tokens,
        and add the restriction to self.restrictions

        Raises SyntaxError when nothing matches at the current position.
        """
        # Keep looking for a token, ignoring any in self.ignore
        while True:
            # Search the patterns for the longest match, with earlier
            # tokens in the list having preference
            best_match = -1
            best_pat = '(error)'
            for p, regexp in self.patterns:
                # Skip terminals that are neither allowed by the
                # restriction nor ignorable.
                if restrict and p not in restrict and p not in self.ignore:
                    continue
                m = regexp.match(self.input, self.pos)
                if m and len(m.group(0)) > best_match:
                    # We got a match that's better than the previous one
                    best_pat = p
                    best_match = len(m.group(0))

            # If we didn't find anything, raise an error
            if best_pat == '(error)' and best_match < 0:
                msg = "Bad Token"
                if restrict:
                    # str.join instead of string.join(): the module-level
                    # function does not exist on Python 3.
                    msg = "Trying to find one of " + ", ".join(restrict)
                raise SyntaxError(self.pos, msg)

            if best_pat in self.ignore:
                # This token should be ignored ..
                self.pos = self.pos + best_match
                continue

            # We found something that isn't to be ignored: record it.
            token = (self.pos, self.pos + best_match, best_pat,
                     self.input[self.pos:self.pos + best_match])
            self.pos = self.pos + best_match
            # Only add this token if it's not in the list
            # (to prevent looping)
            if not self.tokens or token != self.tokens[-1]:
                self.tokens.append(token)
                self.restrictions.append(restrict)
            return

class Parser(object):
    """Base class for generated parsers: a cursor over a Scanner."""

    def __init__(self, scanner):
        self._scanner = scanner
        self._pos = 0

    def _peek(self, *types):
        """Returns the token type for lookahead; if there are any args
        then the list of args is the set of token types to allow"""
        tok = self._scanner.token(self._pos, types)
        return tok[2]

    def _scan(self, type):
        """Returns the matched text, and moves to the next token

        Raises SyntaxError when the current token is not of 'type'.
        """
        tok = self._scanner.token(self._pos, [type])
        if tok[2] != type:
            raise SyntaxError(tok[0], 'Trying to find ' + type)
        self._pos = 1 + self._pos
        return tok[3]
def wrap_error_reporter(parser, rule):
    """Run the parser method named 'rule' and report parse errors.

    parser -- object with a method called 'rule' and a '_scanner'
        attribute.
    rule -- name of the method to call.

    Returns whatever the rule method returns.  When the rule raises
    SyntaxError or NoMoreTokens, a message is printed to standard output
    and None is returned instead; other exceptions propagate.
    """
    # Local import: only needed to fetch the exception being handled.
    import sys

    try:
        return getattr(parser, rule)()
    except SyntaxError:
        # sys.exc_info() is used instead of "except X, e" so that this
        # source parses on both Python 2 and Python 3.
        err = sys.exc_info()[1]
        input = parser._scanner.input
        try:
            print_error(input, err, parser._scanner)
        except ImportError:
            # Single-argument print(...) produces the same output whether
            # print is a statement or a function.
            print('Syntax Error %s on line %s'
                  % (err.msg, 1 + input[:err.pos].count('\n')))
    except NoMoreTokens:
        print('Could not complete parsing; stopped around here:')
        print(parser._scanner)
    return None