nutools/lib/ulib/support/python/xpath/expr.py

904 lines
27 KiB
Python
Raw Normal View History

from __future__ import division
from itertools import *
import math
import operator
import re
import xml.dom
import weakref
from xpath.exceptions import *
import xpath
#
# Data model functions.
#
def string_value(node):
    """Return the XPath string-value of a DOM node.

    Document and element nodes yield the concatenated character data of
    all their text and CDATA-section descendants, in the order produced
    by the 'descendant' axis.  Attribute nodes yield their value;
    processing instructions, comments, text nodes and CDATA sections
    yield their own data.  Any other node type yields None.
    """
    kind = node.nodeType
    if kind in (node.DOCUMENT_NODE, node.ELEMENT_NODE):
        pieces = []
        for descendant in axes['descendant'](node):
            if descendant.nodeType == descendant.TEXT_NODE:
                pieces.append(descendant.data)
            elif descendant.nodeType == descendant.CDATA_SECTION_NODE:
                pieces.append(descendant.nodeValue)
        return u''.join(pieces)
    if kind == node.ATTRIBUTE_NODE:
        return node.value
    if kind in (node.PROCESSING_INSTRUCTION_NODE,
                node.COMMENT_NODE,
                node.TEXT_NODE):
        return node.data
    if kind == node.CDATA_SECTION_NODE:
        return node.nodeValue
    # Node types without a defined string-value fall through.
    return None
def document_order(node):
    """Return a sort key that places nodes in document order.

    The key is the list of sibling indexes met on the way down from the
    document root to the node: the root itself is [], its third child is
    [2], and the first child of that child is [2, 0].  Comparing two
    keys with the ordinary list ordering tells which node comes first.

    An attribute is keyed as its owner element's key followed by -1 and
    the attribute name (e.g. [2, 0, -1, 'href']), so attributes sort
    before every child of their element and by name among themselves.
    """
    # The key is collected leaf-first and reversed at the end.
    reversed_key = []
    current = node
    if current.nodeType == current.ATTRIBUTE_NODE:
        reversed_key.extend((current.name, -1))
        current = current.ownerElement
    # Climb until the parentless node (normally the document) is reached.
    while current.parentNode is not None:
        index = 0
        sibling = current.previousSibling
        while sibling is not None:
            index += 1
            sibling = sibling.previousSibling
        reversed_key.append(index)
        current = current.parentNode
    reversed_key.reverse()
    return reversed_key
#
# Type functions, operating on the various XPath types.
#
# Internally, we use the following representations:
# nodeset - list of DOM tree nodes in document order
# string - str or unicode
# boolean - bool
# number - int or float
#
def nodeset(v):
    """Return 'v' unchanged after checking that it is a node-set.

    No other XPath type can be converted to a node-set, so anything that
    fails nodesetp() raises XPathTypeError.
    """
    if not nodesetp(v):
        # Call-style raise: the "raise Class, value" statement form is a
        # syntax error on Python 3, and the call form matches the other
        # raises in this module.
        raise XPathTypeError("value is not a node-set")
    return v
def nodesetp(v):
    """Return True iff 'v' is a node-set, otherwise False.

    Node-sets are represented as Python lists of DOM nodes, so this is a
    plain list check.  The result is always a real bool (previously a
    non-list fell off the end of the function and produced None).
    """
    return isinstance(v, list)
def string(v):
    """Convert an XPath value to its string form.

    A node-set becomes the string-value of its first node, or the empty
    string when it has no nodes.  Numbers are spelled 'Infinity',
    '-Infinity' or 'NaN' for the special values; integral values no
    larger than 0xffffffff are printed without a fractional part.
    Booleans become 'true' or 'false'.  Anything else (normally a
    string already) is returned unchanged.
    """
    if nodesetp(v):
        return string_value(v[0]) if v else u''
    if numberp(v):
        if v == float('inf'):
            return u'Infinity'
        if v == float('-inf'):
            return u'-Infinity'
        if str(v) == 'nan':
            return u'NaN'
        if int(v) == v and v <= 0xffffffff:
            # Integral value: drop the trailing ".0".
            return unicode(int(v))
        return unicode(v)
    if booleanp(v):
        return u'true' if v else u'false'
    return v
def stringp(v):
    """Tell whether 'v' is an XPath string (any basestring instance)."""
    is_string = isinstance(v, basestring)
    return is_string
def boolean(v):
    """Convert an XPath value to a boolean.

    A node-set is true when it contains at least one node, a number when
    it is neither zero nor NaN, and a string when it is not empty.  Any
    other value (normally a boolean already) is returned unchanged.
    """
    if nodesetp(v):
        return len(v) > 0
    if numberp(v):
        # "v == v" is false only for NaN.
        return v != 0 and v == v
    if stringp(v):
        return v != ''
    return v
def booleanp(v):
    """Tell whether 'v' is an XPath boolean (a Python bool)."""
    is_boolean = isinstance(v, bool)
    return is_boolean
# Lexical form XPath 1.0 accepts when converting a string to a number:
# optional whitespace, an optional minus sign, then digits with an optional
# fraction (or a fraction alone), then optional whitespace.
_XPATH_NUMBER_RE = re.compile(
    r'[ \t\r\n]*-?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)[ \t\r\n]*$')


def number(v):
    """Convert an XPath value to a number (a float).

    A node-set is first converted to a string as string() does.  A
    string is converted only when it has the XPath 1.0 numeric form
    described above; every other string gives NaN.  Python's float()
    alone is too permissive here: it also accepts spellings such as
    'inf', 'nan', '1e3' and '+5', which XPath requires to be NaN.
    Booleans and numbers are passed straight to float().
    """
    if nodesetp(v):
        v = string(v)
    if stringp(v) and not _XPATH_NUMBER_RE.match(v):
        return float('NaN')
    try:
        return float(v)
    except ValueError:
        return float('NaN')
def numberp(v):
    """Tell whether 'v' is an XPath number.

    Numbers are Python ints or floats.  bool is a subclass of int, so
    booleans are explicitly excluded.
    """
    return isinstance(v, (int, float)) and not isinstance(v, bool)
class Expr(object):
    """Base type shared by every node of an XPath expression tree."""

    def evaluate(self, node, pos, size, context):
        """Compute the value of this expression.

        'node', 'pos' and 'size' are the context node, the context
        position and the context size; 'context' is passed through to
        sub-expressions.

        Subclasses return an XPath value: a node-set, a string, a
        boolean or a number.  This base implementation does nothing.
        """
class BinaryOperatorExpr(Expr):
    """Shared machinery for expressions of the form <left> <op> <right>."""

    def __init__(self, op, left, right):
        self.op, self.left, self.right = op, left, right

    def evaluate(self, node, pos, size, context):
        """Evaluate both operands, left first, and combine them.

        Subclasses either provide operate(a, b) or replace evaluate()
        entirely.
        """
        combine = self.operate
        lhs = self.left.evaluate(node, pos, size, context)
        rhs = self.right.evaluate(node, pos, size, context)
        return combine(lhs, rhs)

    def __str__(self):
        return '(%s %s %s)' % (self.left, self.op, self.right)
class AndExpr(BinaryOperatorExpr):
    """<x> and <y>"""

    def evaluate(self, node, pos, size, context):
        """Return the XPath conjunction of both operands.

        Each operand is converted with boolean() before it is tested.
        (A misplaced parenthesis used to apply Python truthiness to the
        raw left operand, so a NaN left operand was treated as true.)
        The right operand is only evaluated when the left one is true,
        as XPath boolean operators short-circuit.
        """
        if not boolean(self.left.evaluate(node, pos, size, context)):
            return False
        return boolean(self.right.evaluate(node, pos, size, context))
class OrExpr(BinaryOperatorExpr):
    """<x> or <y>"""

    def evaluate(self, node, pos, size, context):
        """Return the XPath disjunction of both operands.

        Each operand is converted with boolean() before it is tested.
        (A misplaced parenthesis used to apply Python truthiness to the
        raw left operand, so a NaN left operand made the whole
        expression false without looking at the right operand.)
        The right operand is only evaluated when the left one is false,
        as XPath boolean operators short-circuit.
        """
        if boolean(self.left.evaluate(node, pos, size, context)):
            return True
        return boolean(self.right.evaluate(node, pos, size, context))
class EqualityExpr(BinaryOperatorExpr):
    """<x> = <y>, <x> != <y>, etc."""

    # Maps each XPath comparison operator to its Python implementation.
    operators = {
        '=' : operator.eq,
        '!=' : operator.ne,
        '<=' : operator.le,
        '<' : operator.lt,
        '>=' : operator.ge,
        '>' : operator.gt,
    }

    def operate(self, a, b):
        """Compare two XPath values with this expression's operator.

        Rules applied, in order:

        * A node-set compared with a boolean is first converted with
          boolean(), so an empty node-set equals false.  (Previously the
          node-set was iterated, which made any comparison against an
          empty node-set false.)
        * Otherwise a node-set operand makes the comparison true when it
          holds for the string-value of at least one of its nodes.
        * For '=' and '!=', both operands are converted to boolean if
          either is a boolean, else to number if either is a number,
          else to string.
        * The relational operators always compare numbers.
        """
        if nodesetp(a) and booleanp(b):
            a = boolean(a)
        elif booleanp(a) and nodesetp(b):
            b = boolean(b)

        if nodesetp(a):
            return any(self.operate(string_value(n), b) for n in a)
        if nodesetp(b):
            return any(self.operate(a, string_value(n)) for n in b)

        if self.op in ('=', '!='):
            if booleanp(a) or booleanp(b):
                convert = boolean
            elif numberp(a) or numberp(b):
                convert = number
            else:
                convert = string
        else:
            convert = number
        a, b = convert(a), convert(b)
        return self.operators[self.op](a, b)
def divop(x, y):
    """Divide x by y following the IEEE 754 rules used by XPath 'div'.

    Python raises ZeroDivisionError where XPath expects a result, so
    division by zero is resolved here:

    * 0 div 0 and NaN div 0 give NaN;
    * any other x div 0 gives an infinity whose sign is the product of
      the signs of x and of the zero divisor (so 1 div -0 is -Infinity).
    """
    try:
        return x / y
    except ZeroDivisionError:
        if x == 0 or x != x:
            return float('nan')
        # copysign() exposes the sign of a negative zero, which a plain
        # "y < 0" comparison cannot see.
        negative = (x < 0) != (math.copysign(1.0, y) < 0)
        return float('-inf') if negative else float('inf')
def _modop(x, y):
    """Return the XPath 'mod' of x and y.

    math.fmod gives the truncating remainder XPath asks for (the result
    takes the sign of the dividend), but it raises ValueError for a zero
    divisor or an infinite dividend; XPath defines both cases as NaN.
    """
    try:
        return math.fmod(x, y)
    except ValueError:
        return float('nan')


class ArithmeticalExpr(BinaryOperatorExpr):
    """<x> + <y>, <x> - <y>, etc."""

    # Maps each XPath arithmetic operator to its implementation.  'div'
    # and 'mod' use helpers that return the IEEE 754 result where the
    # plain Python operation would raise.
    operators = {
        '+' : operator.add,
        '-' : operator.sub,
        '*' : operator.mul,
        'div' : divop,
        'mod' : _modop
    }

    def operate(self, a, b):
        """Convert both operands with number() and apply the operator."""
        return self.operators[self.op](number(a), number(b))
class UnionExpr(BinaryOperatorExpr):
    """<x> | <y>"""

    def operate(self, a, b):
        """Return the union of two node-sets, in document order.

        Raises XPathTypeError when either operand is not a node-set.
        """
        if not (nodesetp(a) and nodesetp(b)):
            raise XPathTypeError("union operand is not a node-set")
        # Collapse duplicates first, then restore document order.
        distinct = set(a)
        distinct.update(b)
        merged = list(distinct)
        merged.sort(key=document_order)
        return merged
class NegationExpr(Expr):
    """- <x>"""

    def __init__(self, expr):
        self.expr = expr

    def evaluate(self, node, pos, size, context):
        """Return the arithmetic negation of the operand, as a number."""
        operand = number(self.expr.evaluate(node, pos, size, context))
        return -operand

    def __str__(self):
        return '(-%s)' % (self.expr,)
class LiteralExpr(Expr):
    """Literals--either numbers or strings."""

    def __init__(self, literal):
        self.literal = literal

    def evaluate(self, node, pos, size, context):
        """Return the literal value; the context is ignored."""
        return self.literal

    def __str__(self):
        value = self.literal
        if not stringp(value):
            return string(value)
        # Use double quotes only when the text itself has an apostrophe.
        quote = '"' if "'" in value else "'"
        return '%s%s%s' % (quote, value, quote)
class VariableReference(Expr):
    """Variable references."""

    def __init__(self, prefix, name):
        self.prefix = prefix
        self.name = name

    def evaluate(self, node, pos, size, context):
        """Look the variable up in context.variables.

        Unprefixed variables are keyed by name; prefixed ones by the
        pair (namespace URI, name), the URI coming from
        context.namespaces.  Raises XPathUnknownPrefixError for an
        undeclared prefix and XPathUnknownVariableError for a missing
        variable.
        """
        try:
            if self.prefix is None:
                key = self.name
            else:
                try:
                    uri = context.namespaces[self.prefix]
                except KeyError:
                    raise XPathUnknownPrefixError(self.prefix)
                key = (uri, self.name)
            return context.variables[key]
        except KeyError:
            raise XPathUnknownVariableError(str(self))

    def __str__(self):
        if self.prefix is not None:
            return '$%s:%s' % (self.prefix, self.name)
        return '$%s' % self.name
def _xpath_round(n):
    """Round 'n' to an integer following the XPath 1.0 round() rules.

    NaN and the infinities are returned unchanged.  Ties go toward
    positive infinity (2.5 -> 3.0, -1.5 -> -1.0).  The sign of a zero
    result follows the argument, so -0.0 and every value in [-0.5, 0)
    give -0.0.  The result is always a float.
    """
    if n != n or n in (float('inf'), float('-inf')):
        return n
    lower = math.floor(n)
    # n - lower lies in [0, 1) and is computed exactly, which avoids the
    # precision loss of the usual floor(n + 0.5) shortcut.
    rounded = lower + 1 if n - lower >= 0.5 else lower
    if rounded == 0:
        return math.copysign(0.0, n)
    return float(rounded)


class Function(Expr):
    """Functions.

    An instance represents one call to an XPath core library function.
    The constructor finds the method named 'f_<name>' (hyphens in the
    XPath name become underscores) and installs it as the instance's
    evaluate().
    """

    def __init__(self, name, args):
        """Bind the call to its implementation and check the arity.

        name -- XPath function name, e.g. 'substring-before'.
        args -- List of argument expressions.

        Raises XPathUnknownFunctionError when no implementation exists,
        and XPathTypeError when the number of arguments is outside the
        range declared by the implementation.
        """
        self.name = name
        self.args = args
        self.evaluate = getattr(self, 'f_%s' % name.replace('-', '_'), None)
        # Call-style raises: the "raise Class, value" statement form is
        # a syntax error on Python 3.
        if self.evaluate is None:
            raise XPathUnknownFunctionError('unknown function "%s()"' % name)
        if len(self.args) < self.evaluate.minargs:
            raise XPathTypeError('too few arguments for "%s()"' % name)
        if (self.evaluate.maxargs is not None and
            len(self.args) > self.evaluate.maxargs):
            raise XPathTypeError('too many arguments for "%s()"' % name)

    #
    # XPath functions are implemented by methods of the Function class.
    #
    # A method implementing an XPath function is decorated with the function
    # decorator, and receives the evaluated function arguments as positional
    # parameters.
    #

    def function(minargs, maxargs, implicit=False, first=False, convert=None):
        """Function decorator.

        minargs -- Minimum number of arguments taken by the function.
        maxargs -- Maximum number of arguments taken by the function,
                   or None when there is no upper limit.
        implicit -- True for functions which operate on a nodeset consisting
                    of the current context node when passed no argument.
                    (e.g., string() and number().)
        first -- True for functions whose first argument must be a
                 node-set and which work on its first node only; the
                 method receives that node, or None for an empty set.
        convert -- When non-None, a function used to filter function arguments.
        """
        def decorator(f):
            def new_f(self, node, pos, size, context):
                if implicit and len(self.args) == 0:
                    args = [[node]]
                else:
                    args = [x.evaluate(node, pos, size, context)
                            for x in self.args]
                if first:
                    args[0] = nodeset(args[0])
                    if len(args[0]) > 0:
                        args[0] = args[0][0]
                    else:
                        args[0] = None
                if convert is not None:
                    args = [convert(x) for x in args]
                return f(self, node, pos, size, context, *args)

            # Read back by __init__ for the argument-count check.
            new_f.minargs = minargs
            new_f.maxargs = maxargs
            new_f.__name__ = f.__name__
            new_f.__doc__ = f.__doc__
            return new_f
        return decorator

    # Node Set Functions

    @function(0, 0)
    def f_last(self, node, pos, size, context):
        return size

    @function(0, 0)
    def f_position(self, node, pos, size, context):
        return pos

    @function(1, 1, convert=nodeset)
    def f_count(self, node, pos, size, context, nodes):
        return len(nodes)

    @function(1, 1)
    def f_id(self, node, pos, size, context, arg):
        # A node-set contributes the string-value of each of its nodes;
        # any other argument is converted with string().
        if nodesetp(arg):
            values = [string_value(x) for x in arg]
        else:
            values = [string(arg)]
        if node.nodeType != node.DOCUMENT_NODE:
            node = node.ownerDocument
        # Each value is a whitespace-separated list of IDs.  The result
        # is a node-set: no duplicates, in document order.
        found = []
        for value in values:
            for token in (value or u'').split():
                element = node.getElementById(token)
                if element is not None and element not in found:
                    found.append(element)
        found.sort(key=document_order)
        return found

    @function(0, 1, implicit=True, first=True)
    def f_local_name(self, node, pos, size, context, argnode):
        if argnode is None:
            return ''
        if (argnode.nodeType == argnode.ELEMENT_NODE or
            argnode.nodeType == argnode.ATTRIBUTE_NODE):
            return argnode.localName
        elif argnode.nodeType == argnode.PROCESSING_INSTRUCTION_NODE:
            return argnode.target
        return ''

    @function(0, 1, implicit=True, first=True)
    def f_namespace_uri(self, node, pos, size, context, argnode):
        if argnode is None:
            return ''
        # DOM nodes outside any namespace report None; XPath wants the
        # empty string.
        return argnode.namespaceURI or ''

    @function(0, 1, implicit=True, first=True)
    def f_name(self, node, pos, size, context, argnode):
        if argnode is None:
            return ''
        if argnode.nodeType == argnode.ELEMENT_NODE:
            return argnode.tagName
        elif argnode.nodeType == argnode.ATTRIBUTE_NODE:
            return argnode.name
        elif argnode.nodeType == argnode.PROCESSING_INSTRUCTION_NODE:
            return argnode.target
        return ''

    # String Functions

    @function(0, 1, implicit=True, convert=string)
    def f_string(self, node, pos, size, context, arg):
        return arg

    @function(2, None, convert=string)
    def f_concat(self, node, pos, size, context, *args):
        return ''.join(args)

    @function(2, 2, convert=string)
    def f_starts_with(self, node, pos, size, context, a, b):
        return a.startswith(b)

    @function(2, 2, convert=string)
    def f_contains(self, node, pos, size, context, a, b):
        return b in a

    @function(2, 2, convert=string)
    def f_substring_before(self, node, pos, size, context, a, b):
        try:
            return a[0:a.index(b)]
        except ValueError:
            return ''

    @function(2, 2, convert=string)
    def f_substring_after(self, node, pos, size, context, a, b):
        try:
            return a[a.index(b)+len(b):]
        except ValueError:
            return ''

    @function(2, 3)
    def f_substring(self, node, pos, size, context, s, start, count=None):
        # Positions are 1-based.  Start and length are rounded with the
        # XPath round() rules; a NaN anywhere selects nothing.
        s = string(s)
        start = _xpath_round(number(start))
        if start != start:
            # Catch NaN
            return ''
        if count is None:
            end = len(s) + 1
        else:
            end = start + _xpath_round(number(count))
            if end != end:
                # Catch NaN
                return ''
            if end > len(s):
                end = len(s)+1
        if start < 1:
            start = 1
        if start > len(s):
            return ''
        if end <= start:
            return ''
        return s[int(start)-1:int(end)-1]

    @function(0, 1, implicit=True, convert=string)
    def f_string_length(self, node, pos, size, context, s):
        return len(s)

    @function(0, 1, implicit=True, convert=string)
    def f_normalize_space(self, node, pos, size, context, s):
        return re.sub(r'\s+', ' ', s.strip())

    @function(3, 3, convert=lambda x: unicode(string(x)))
    def f_translate(self, node, pos, size, context, s, source, target):
        # str.translate() and unicode.translate() are completely different.
        # The translate() arguments are coerced to unicode.
        table = {}
        # The first occurrence of a source character wins.
        for schar, tchar in izip(source, target):
            schar = ord(schar)
            if schar not in table:
                table[schar] = tchar
        # Source characters with no counterpart in target are deleted.
        if len(source) > len(target):
            for schar in source[len(target):]:
                schar = ord(schar)
                if schar not in table:
                    table[schar] = None
        return s.translate(table)

    # Boolean functions

    @function(1, 1, convert=boolean)
    def f_boolean(self, node, pos, size, context, b):
        return b

    @function(1, 1, convert=boolean)
    def f_not(self, node, pos, size, context, b):
        return not b

    @function(0, 0)
    def f_true(self, node, pos, size, context):
        return True

    @function(0, 0)
    def f_false(self, node, pos, size, context):
        return False

    @function(1, 1, convert=string)
    def f_lang(self, node, pos, size, context, s):
        s = s.lower()
        for n in axes['ancestor-or-self'](node):
            if n.nodeType == n.ELEMENT_NODE and n.hasAttribute('xml:lang'):
                lang = n.getAttribute('xml:lang').lower()
                if s == lang or lang.startswith(s + u'-'):
                    return True
                # Only the nearest xml:lang declaration counts.
                break
        return False

    # Number functions

    @function(0, 1, implicit=True, convert=number)
    def f_number(self, node, pos, size, context, n):
        return n

    @function(1, 1, convert=nodeset)
    def f_sum(self, node, pos, size, context, nodes):
        return sum((number(string_value(x)) for x in nodes))

    @function(1, 1, convert=number)
    def f_floor(self, node, pos, size, context, n):
        return math.floor(n)

    @function(1, 1, convert=number)
    def f_ceiling(self, node, pos, size, context, n):
        return math.ceil(n)

    @function(1, 1, convert=number)
    def f_round(self, node, pos, size, context, n):
        # The builtin round() breaks ties away from zero on Python 2 and
        # to even on Python 3; XPath breaks them toward +Infinity and
        # keeps negative zero.
        return _xpath_round(n)

    def __str__(self):
        return '%s(%s)' % (self.name, ', '.join((str(x) for x in self.args)))
#
# XPath axes.
#
# Dictionary of all axis functions, keyed by XPath axis name (for example
# 'child' or 'ancestor-or-self').  It is filled in by make_axes().
axes = {}
def axisfn(reverse=False, principal_node_type=xml.dom.Node.ELEMENT_NODE):
    """Build a decorator that marks a function as an XPath axis.

    An axis function takes a node and returns a sequence over the nodes
    found along one axis.  The decorator renames the function to its
    XPath spelling (underscores become hyphens) and attaches two
    attributes to it: 'reverse', the direction of the axis, and
    'principal_node_type', the principal node type of the axis.
    """
    def decorate(f):
        xpath_name = f.__name__.replace('_', '-')
        f.principal_node_type = principal_node_type
        f.reverse = reverse
        f.__name__ = xpath_name
        return f
    return decorate
def make_axes():
    """Create one function per XPath axis and register it in 'axes'.

    Every function takes the context node and returns an iterable over
    the nodes on that axis.  Functions marked reverse=True produce their
    nodes in reverse document order.
    """

    @axisfn()
    def child(node):
        return node.childNodes

    @axisfn()
    def descendant(node):
        for kid in node.childNodes:
            for found in descendant_or_self(kid):
                yield found

    @axisfn()
    def parent(node):
        up = node.parentNode
        if up is not None:
            yield up

    @axisfn(reverse=True)
    def ancestor(node):
        current = node.parentNode
        while current is not None:
            yield current
            current = current.parentNode

    @axisfn()
    def following_sibling(node):
        current = node.nextSibling
        while current is not None:
            yield current
            current = current.nextSibling

    @axisfn(reverse=True)
    def preceding_sibling(node):
        current = node.previousSibling
        while current is not None:
            yield current
            current = current.previousSibling

    @axisfn()
    def following(node):
        # For the node and then each ancestor in turn, walk the later
        # siblings together with their subtrees.
        current = node
        while current is not None:
            while current.nextSibling is not None:
                current = current.nextSibling
                for found in descendant_or_self(current):
                    yield found
            current = current.parentNode

    @axisfn(reverse=True)
    def preceding(node):
        # Mirror image of 'following'; each subtree is emitted backwards.
        current = node
        while current is not None:
            while current.previousSibling is not None:
                current = current.previousSibling
                # Could be more efficient here.
                subtree = list(descendant_or_self(current))
                for found in reversed(subtree):
                    yield found
            current = current.parentNode

    @axisfn(principal_node_type=xml.dom.Node.ATTRIBUTE_NODE)
    def attribute(node):
        attrs = node.attributes
        if attrs is None:
            return ()
        return (attrs.item(i) for i in xrange(attrs.length))

    @axisfn()
    def namespace(node):
        raise XPathNotImplementedError("namespace axis is not implemented")

    @axisfn()
    def self(node):
        yield node

    @axisfn()
    def descendant_or_self(node):
        yield node
        for kid in node.childNodes:
            for found in descendant_or_self(kid):
                yield found

    @axisfn(reverse=True)
    def ancestor_or_self(node):
        return chain([node], ancestor(node))

    # Register each axis under its XPath name, which axisfn stored in
    # the function's __name__.
    for axis in (child, descendant, parent, ancestor, following_sibling,
                 preceding_sibling, following, preceding, attribute,
                 namespace, self, descendant_or_self, ancestor_or_self):
        axes[axis.__name__] = axis


make_axes()
def merge_into_nodeset(target, source):
    """Add the nodes of 'source' to 'target' in place, keeping order.

    Both node-sets must already be in document order.  Nodes that are
    already in 'target' are not added again.  Nothing is returned.
    """
    if not target:
        target.extend(source)
        return
    fresh = [n for n in source if n not in target]
    if not fresh:
        return
    # Appending is enough when every new node comes after the current
    # last node; otherwise the combined list has to be re-sorted.  (The
    # opposite case, all new nodes before the first target node, is rare
    # enough in practice not to be worth a special check.)
    already_ordered = document_order(target[-1]) < document_order(fresh[0])
    target.extend(fresh)
    if not already_ordered:
        target.sort(key=document_order)
class AbsolutePathExpr(Expr):
    """Absolute location paths."""

    def __init__(self, path):
        self.path = path

    def evaluate(self, node, pos, size, context):
        """Evaluate the path starting from the document node.

        With no path at all (the expression '/'), the result is a
        node-set holding just the document node.
        """
        if node.nodeType == node.DOCUMENT_NODE:
            root = node
        else:
            root = node.ownerDocument
        if self.path is None:
            return [root]
        return self.path.evaluate(root, 1, 1, context)

    def __str__(self):
        return '/%s' % (self.path or '')
class PathExpr(Expr):
    """Location path expressions."""

    def __init__(self, steps):
        self.steps = steps

    def evaluate(self, node, pos, size, context):
        """Evaluate the steps from left to right.

        The first step runs in the caller's context.  A single-step path
        may produce any value; as soon as there are further steps, every
        step must produce a node-set or XPathTypeError is raised.
        """
        head, tail = self.steps[0], self.steps[1:]
        result = head.evaluate(node, pos, size, context)
        if tail and not nodesetp(result):
            raise XPathTypeError("path step is not a node-set")
        # Each later step runs once per node of the previous result; the
        # partial results are merged in document order.
        for step in tail:
            total = len(result)
            aggregate = []
            for index, item in enumerate(result):
                nodes = step.evaluate(item, index + 1, total, context)
                if not nodesetp(nodes):
                    raise XPathTypeError("path step is not a node-set")
                merge_into_nodeset(aggregate, nodes)
            result = aggregate
        return result

    def __str__(self):
        return '/'.join([str(step) for step in self.steps])
class PredicateList(Expr):
    """An expression filtered by a sequence of predicates.

    The predicate list wraps the expression it filters.
    """

    def __init__(self, expr, predicates, axis='child'):
        self.predicates = predicates
        self.expr = expr
        self.axis = axes[axis]

    def evaluate(self, node, pos, size, context):
        """Evaluate the wrapped expression and apply each predicate.

        Raises XPathTypeError when the wrapped expression does not
        produce a node-set.
        """
        result = self.expr.evaluate(node, pos, size, context)
        if not nodesetp(result):
            raise XPathTypeError("predicate input is not a node-set")
        # On a reverse axis, positions count backwards through the
        # document, so the filtering is done on the flipped list.
        if self.axis.reverse:
            result.reverse()
        for pred in self.predicates:
            total = len(result)
            kept = []
            position = 0
            for candidate in result:
                position += 1
                r = pred.evaluate(candidate, position, total, context)
                # A numeric predicate selects the node at that position;
                # anything else selects by its boolean value.
                if numberp(r):
                    selected = r == position
                else:
                    selected = boolean(r)
                if selected:
                    kept.append(candidate)
            result = kept
        if self.axis.reverse:
            result.reverse()
        return result

    def __str__(self):
        s = str(self.expr)
        if '/' in s:
            s = '(%s)' % s
        return s + ''.join(['[%s]' % x for x in self.predicates])
class AxisStep(Expr):
    """One step in a location path expression."""

    def __init__(self, axis, test=None, predicates=None):
        # NOTE(review): 'predicates' is accepted but not stored or used.
        self.axis = axes[axis]
        self.test = AnyKindTest() if test is None else test

    def evaluate(self, node, pos, size, context):
        """Return the nodes on the axis that pass the node test.

        The result is in document order: nodes from a reverse axis are
        flipped before they are returned.
        """
        axis = self.axis
        match = [n for n in axis(node) if self.test.match(n, axis, context)]
        if axis.reverse:
            match.reverse()
        return match

    def __str__(self):
        return '%s::%s' % (self.axis.__name__, self.test)
#
# Node tests.
#
class Test(object):
    """Interface shared by the node tests."""

    def match(self, node, axis, context):
        """Tell whether 'node' passes this test when reached via 'axis'.

        This base implementation does nothing.
        """
class NameTest(object):
    """Node test matching nodes by namespace and local name.

    Either part may be the wildcard '*'.  A bare '*' test is stored with
    the prefix '*' as well, so that it matches in every namespace.
    """

    def __init__(self, prefix, localpart):
        self.prefix = prefix
        self.localName = localpart
        if self.prefix is None and self.localName == '*':
            self.prefix = '*'

    def _expected_namespace(self, node, axis, context):
        """Return the namespace URI a matching node must have.

        A prefix is resolved through context.namespaces and raises
        XPathUnknownPrefixError when it is not declared.  Without a
        prefix, element tests use context.default_namespace and every
        other test expects no namespace.
        """
        if self.prefix is not None:
            try:
                return context.namespaces[self.prefix]
            except KeyError:
                raise XPathUnknownPrefixError(self.prefix)
        if axis.principal_node_type == node.ELEMENT_NODE:
            return context.default_namespace
        return None

    def match(self, node, axis, context):
        # Only nodes of the axis' principal type can match a name test.
        if node.nodeType != axis.principal_node_type:
            return False
        if self.prefix != '*':
            expected = self._expected_namespace(node, axis, context)
            if expected != node.namespaceURI:
                return False
        return self.localName == '*' or self.localName == node.localName

    def __str__(self):
        if self.prefix is None:
            return self.localName
        return '%s:%s' % (self.prefix, self.localName)
class PITest(object):
    """Node test for processing instructions, optionally by target."""

    def __init__(self, name=None):
        self.name = name

    def match(self, node, axis, context):
        if node.nodeType != node.PROCESSING_INSTRUCTION_NODE:
            return False
        # Without a name, every processing instruction matches.
        return self.name is None or node.target == self.name

    def __str__(self):
        if self.name is None:
            shown = ''
        else:
            # Use double quotes only when the name has an apostrophe.
            quote = '"' if "'" in self.name else "'"
            shown = quote + self.name + quote
        return 'processing-instruction(%s)' % shown
class CommentTest(object):
    """Node test that accepts comment nodes only."""

    def match(self, node, axis, context):
        is_comment = node.nodeType == node.COMMENT_NODE
        return is_comment

    def __str__(self):
        return 'comment()'
class TextTest(object):
    """Node test that accepts text nodes and CDATA sections."""

    def match(self, node, axis, context):
        return node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)

    def __str__(self):
        return 'text()'
class AnyKindTest(object):
    """Node test that accepts every node, whatever its type."""

    def match(self, node, axis, context):
        accepted = True
        return accepted

    def __str__(self):
        return 'node()'