TreePath.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. """
  2. A simple XPath-like language for tree traversal.
  3. This works by creating a filter chain of generator functions. Each
  4. function selects a part of the expression, e.g. a child node, a
  5. specific descendant or a node that holds an attribute.
  6. """
  7. from __future__ import absolute_import
  8. import re
  9. import operator
  10. path_tokenizer = re.compile(
  11. "("
  12. "'[^']*'|\"[^\"]*\"|"
  13. "//?|"
  14. "\(\)|"
  15. "==?|"
  16. "[/.*\[\]\(\)@])|"
  17. "([^/\[\]\(\)@=\s]+)|"
  18. "\s+"
  19. ).findall
  20. def iterchildren(node, attr_name):
  21. # returns an iterable of all child nodes of that name
  22. child = getattr(node, attr_name)
  23. if child is not None:
  24. if type(child) is list:
  25. return child
  26. else:
  27. return [child]
  28. else:
  29. return ()
  30. def _get_first_or_none(it):
  31. try:
  32. try:
  33. _next = it.next
  34. except AttributeError:
  35. return next(it)
  36. else:
  37. return _next()
  38. except StopIteration:
  39. return None
  40. def type_name(node):
  41. return node.__class__.__name__.split('.')[-1]
  42. def parse_func(next, token):
  43. name = token[1]
  44. token = next()
  45. if token[0] != '(':
  46. raise ValueError("Expected '(' after function name '%s'" % name)
  47. predicate = handle_predicate(next, token)
  48. return name, predicate
  49. def handle_func_not(next, token):
  50. """
  51. not(...)
  52. """
  53. name, predicate = parse_func(next, token)
  54. def select(result):
  55. for node in result:
  56. if _get_first_or_none(predicate([node])) is None:
  57. yield node
  58. return select
  59. def handle_name(next, token):
  60. """
  61. /NodeName/
  62. or
  63. func(...)
  64. """
  65. name = token[1]
  66. if name in functions:
  67. return functions[name](next, token)
  68. def select(result):
  69. for node in result:
  70. for attr_name in node.child_attrs:
  71. for child in iterchildren(node, attr_name):
  72. if type_name(child) == name:
  73. yield child
  74. return select
  75. def handle_star(next, token):
  76. """
  77. /*/
  78. """
  79. def select(result):
  80. for node in result:
  81. for name in node.child_attrs:
  82. for child in iterchildren(node, name):
  83. yield child
  84. return select
  85. def handle_dot(next, token):
  86. """
  87. /./
  88. """
  89. def select(result):
  90. return result
  91. return select
  92. def handle_descendants(next, token):
  93. """
  94. //...
  95. """
  96. token = next()
  97. if token[0] == "*":
  98. def iter_recursive(node):
  99. for name in node.child_attrs:
  100. for child in iterchildren(node, name):
  101. yield child
  102. for c in iter_recursive(child):
  103. yield c
  104. elif not token[0]:
  105. node_name = token[1]
  106. def iter_recursive(node):
  107. for name in node.child_attrs:
  108. for child in iterchildren(node, name):
  109. if type_name(child) == node_name:
  110. yield child
  111. for c in iter_recursive(child):
  112. yield c
  113. else:
  114. raise ValueError("Expected node name after '//'")
  115. def select(result):
  116. for node in result:
  117. for child in iter_recursive(node):
  118. yield child
  119. return select
  120. def handle_attribute(next, token):
  121. token = next()
  122. if token[0]:
  123. raise ValueError("Expected attribute name")
  124. name = token[1]
  125. value = None
  126. try:
  127. token = next()
  128. except StopIteration:
  129. pass
  130. else:
  131. if token[0] == '=':
  132. value = parse_path_value(next)
  133. readattr = operator.attrgetter(name)
  134. if value is None:
  135. def select(result):
  136. for node in result:
  137. try:
  138. attr_value = readattr(node)
  139. except AttributeError:
  140. continue
  141. if attr_value is not None:
  142. yield attr_value
  143. else:
  144. def select(result):
  145. for node in result:
  146. try:
  147. attr_value = readattr(node)
  148. except AttributeError:
  149. continue
  150. if attr_value == value:
  151. yield attr_value
  152. return select
  153. def parse_path_value(next):
  154. token = next()
  155. value = token[0]
  156. if value:
  157. if value[:1] == "'" or value[:1] == '"':
  158. return value[1:-1]
  159. try:
  160. return int(value)
  161. except ValueError:
  162. pass
  163. elif token[1].isdigit():
  164. return int(token[1])
  165. else:
  166. name = token[1].lower()
  167. if name == 'true':
  168. return True
  169. elif name == 'false':
  170. return False
  171. raise ValueError("Invalid attribute predicate: '%s'" % value)
  172. def handle_predicate(next, token):
  173. token = next()
  174. selector = []
  175. while token[0] != ']':
  176. selector.append( operations[token[0]](next, token) )
  177. try:
  178. token = next()
  179. except StopIteration:
  180. break
  181. else:
  182. if token[0] == "/":
  183. token = next()
  184. if not token[0] and token[1] == 'and':
  185. return logical_and(selector, handle_predicate(next, token))
  186. def select(result):
  187. for node in result:
  188. subresult = iter((node,))
  189. for select in selector:
  190. subresult = select(subresult)
  191. predicate_result = _get_first_or_none(subresult)
  192. if predicate_result is not None:
  193. yield node
  194. return select
  195. def logical_and(lhs_selects, rhs_select):
  196. def select(result):
  197. for node in result:
  198. subresult = iter((node,))
  199. for select in lhs_selects:
  200. subresult = select(subresult)
  201. predicate_result = _get_first_or_none(subresult)
  202. subresult = iter((node,))
  203. if predicate_result is not None:
  204. for result_node in rhs_select(subresult):
  205. yield node
  206. return select
  207. operations = {
  208. "@": handle_attribute,
  209. "": handle_name,
  210. "*": handle_star,
  211. ".": handle_dot,
  212. "//": handle_descendants,
  213. "[": handle_predicate,
  214. }
  215. functions = {
  216. 'not' : handle_func_not
  217. }
  218. def _build_path_iterator(path):
  219. # parse pattern
  220. stream = iter([ (special,text)
  221. for (special,text) in path_tokenizer(path)
  222. if special or text ])
  223. try:
  224. _next = stream.next
  225. except AttributeError:
  226. # Python 3
  227. def _next():
  228. return next(stream)
  229. token = _next()
  230. selector = []
  231. while 1:
  232. try:
  233. selector.append(operations[token[0]](_next, token))
  234. except StopIteration:
  235. raise ValueError("invalid path")
  236. try:
  237. token = _next()
  238. if token[0] == "/":
  239. token = _next()
  240. except StopIteration:
  241. break
  242. return selector
  243. # main module API
  244. def iterfind(node, path):
  245. selector_chain = _build_path_iterator(path)
  246. result = iter((node,))
  247. for select in selector_chain:
  248. result = select(result)
  249. return result
  250. def find_first(node, path):
  251. return _get_first_or_none(iterfind(node, path))
  252. def find_all(node, path):
  253. return list(iterfind(node, path))