_elementpath.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. #
  2. # ElementTree
  3. # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
  4. #
  5. # limited xpath support for element trees
  6. #
  7. # history:
  8. # 2003-05-23 fl created
  9. # 2003-05-28 fl added support for // etc
  10. # 2003-08-27 fl fixed parsing of periods in element names
  11. # 2007-09-10 fl new selection engine
  12. # 2007-09-12 fl fixed parent selector
  13. # 2007-09-13 fl added iterfind; changed findall to return a list
  14. # 2007-11-30 fl added namespaces support
  15. # 2009-10-30 fl added child element value filter
  16. #
  17. # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
  18. #
  19. # fredrik@pythonware.com
  20. # http://www.pythonware.com
  21. #
  22. # --------------------------------------------------------------------
  23. # The ElementTree toolkit is
  24. #
  25. # Copyright (c) 1999-2009 by Fredrik Lundh
  26. #
  27. # By obtaining, using, and/or copying this software and/or its
  28. # associated documentation, you agree that you have read, understood,
  29. # and will comply with the following terms and conditions:
  30. #
  31. # Permission to use, copy, modify, and distribute this software and
  32. # its associated documentation for any purpose and without fee is
  33. # hereby granted, provided that the above copyright notice appears in
  34. # all copies, and that both that copyright notice and this permission
  35. # notice appear in supporting documentation, and that the name of
  36. # Secret Labs AB or the author not be used in advertising or publicity
  37. # pertaining to distribution of the software without specific, written
  38. # prior permission.
  39. #
  40. # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  41. # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  42. # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  43. # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  44. # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  45. # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  46. # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  47. # OF THIS SOFTWARE.
  48. # --------------------------------------------------------------------
  49. ##
  50. # Implementation module for XPath support. There's usually no reason
  51. # to import this module directly; the <b>ElementTree</b> does this for
  52. # you, if needed.
  53. ##
  54. import re
  55. xpath_tokenizer_re = re.compile(
  56. "("
  57. "'[^']*'|\"[^\"]*\"|"
  58. "::|"
  59. "//?|"
  60. r"\.\.|"
  61. r"\(\)|"
  62. r"[/.*:\[\]\(\)@=])|"
  63. r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
  64. r"\s+"
  65. )
  66. def xpath_tokenizer(pattern, namespaces=None):
  67. default_namespace = namespaces.get(None) if namespaces else None
  68. for token in xpath_tokenizer_re.findall(pattern):
  69. tag = token[1]
  70. if tag and tag[0] != "{":
  71. if ":" in tag:
  72. prefix, uri = tag.split(":", 1)
  73. try:
  74. if not namespaces:
  75. raise KeyError
  76. yield token[0], "{%s}%s" % (namespaces[prefix], uri)
  77. except KeyError:
  78. raise SyntaxError("prefix %r not found in prefix map" % prefix)
  79. elif default_namespace:
  80. yield token[0], "{%s}%s" % (default_namespace, tag)
  81. else:
  82. yield token
  83. else:
  84. yield token
  85. def prepare_child(next, token):
  86. tag = token[1]
  87. def select(result):
  88. for elem in result:
  89. for e in elem.iterchildren(tag):
  90. yield e
  91. return select
  92. def prepare_star(next, token):
  93. def select(result):
  94. for elem in result:
  95. for e in elem.iterchildren('*'):
  96. yield e
  97. return select
  98. def prepare_self(next, token):
  99. def select(result):
  100. return result
  101. return select
  102. def prepare_descendant(next, token):
  103. token = next()
  104. if token[0] == "*":
  105. tag = "*"
  106. elif not token[0]:
  107. tag = token[1]
  108. else:
  109. raise SyntaxError("invalid descendant")
  110. def select(result):
  111. for elem in result:
  112. for e in elem.iterdescendants(tag):
  113. yield e
  114. return select
  115. def prepare_parent(next, token):
  116. def select(result):
  117. for elem in result:
  118. parent = elem.getparent()
  119. if parent is not None:
  120. yield parent
  121. return select
  122. def prepare_predicate(next, token):
  123. # FIXME: replace with real parser!!! refs:
  124. # http://effbot.org/zone/simple-iterator-parser.htm
  125. # http://javascript.crockford.com/tdop/tdop.html
  126. signature = []
  127. predicate = []
  128. while 1:
  129. token = next()
  130. if token[0] == "]":
  131. break
  132. if token[0] and token[0][:1] in "'\"":
  133. token = "'", token[0][1:-1]
  134. signature.append(token[0] or "-")
  135. predicate.append(token[1])
  136. signature = "".join(signature)
  137. # use signature to determine predicate type
  138. if signature == "@-":
  139. # [@attribute] predicate
  140. key = predicate[1]
  141. def select(result):
  142. for elem in result:
  143. if elem.get(key) is not None:
  144. yield elem
  145. return select
  146. if signature == "@-='":
  147. # [@attribute='value']
  148. key = predicate[1]
  149. value = predicate[-1]
  150. def select(result):
  151. for elem in result:
  152. if elem.get(key) == value:
  153. yield elem
  154. return select
  155. if signature == "-" and not re.match(r"-?\d+$", predicate[0]):
  156. # [tag]
  157. tag = predicate[0]
  158. def select(result):
  159. for elem in result:
  160. for _ in elem.iterchildren(tag):
  161. yield elem
  162. break
  163. return select
  164. if signature == "-='" and not re.match(r"-?\d+$", predicate[0]):
  165. # [tag='value']
  166. tag = predicate[0]
  167. value = predicate[-1]
  168. def select(result):
  169. for elem in result:
  170. for e in elem.iterchildren(tag):
  171. if "".join(e.itertext()) == value:
  172. yield elem
  173. break
  174. return select
  175. if signature == "-" or signature == "-()" or signature == "-()-":
  176. # [index] or [last()] or [last()-index]
  177. if signature == "-":
  178. # [index]
  179. index = int(predicate[0]) - 1
  180. if index < 0:
  181. if index == -1:
  182. raise SyntaxError(
  183. "indices in path predicates are 1-based, not 0-based")
  184. else:
  185. raise SyntaxError("path index >= 1 expected")
  186. else:
  187. if predicate[0] != "last":
  188. raise SyntaxError("unsupported function")
  189. if signature == "-()-":
  190. try:
  191. index = int(predicate[2]) - 1
  192. except ValueError:
  193. raise SyntaxError("unsupported expression")
  194. else:
  195. index = -1
  196. def select(result):
  197. for elem in result:
  198. parent = elem.getparent()
  199. if parent is None:
  200. continue
  201. try:
  202. # FIXME: what if the selector is "*" ?
  203. elems = list(parent.iterchildren(elem.tag))
  204. if elems[index] is elem:
  205. yield elem
  206. except IndexError:
  207. pass
  208. return select
  209. raise SyntaxError("invalid predicate")
  210. ops = {
  211. "": prepare_child,
  212. "*": prepare_star,
  213. ".": prepare_self,
  214. "..": prepare_parent,
  215. "//": prepare_descendant,
  216. "[": prepare_predicate,
  217. }
  218. # --------------------------------------------------------------------
  219. _cache = {}
  220. def _build_path_iterator(path, namespaces):
  221. """compile selector pattern"""
  222. if path[-1:] == "/":
  223. path += "*" # implicit all (FIXME: keep this?)
  224. cache_key = (path,)
  225. if namespaces:
  226. if '' in namespaces:
  227. raise ValueError("empty namespace prefix must be passed as None, not the empty string")
  228. if None in namespaces:
  229. cache_key += (namespaces[None],) + tuple(sorted(
  230. item for item in namespaces.items() if item[0] is not None))
  231. else:
  232. cache_key += tuple(sorted(namespaces.items()))
  233. try:
  234. return _cache[cache_key]
  235. except KeyError:
  236. pass
  237. if len(_cache) > 100:
  238. _cache.clear()
  239. if path[:1] == "/":
  240. raise SyntaxError("cannot use absolute path on element")
  241. stream = iter(xpath_tokenizer(path, namespaces))
  242. try:
  243. _next = stream.next
  244. except AttributeError:
  245. # Python 3
  246. _next = stream.__next__
  247. try:
  248. token = _next()
  249. except StopIteration:
  250. raise SyntaxError("empty path expression")
  251. selector = []
  252. while 1:
  253. try:
  254. selector.append(ops[token[0]](_next, token))
  255. except StopIteration:
  256. raise SyntaxError("invalid path")
  257. try:
  258. token = _next()
  259. if token[0] == "/":
  260. token = _next()
  261. except StopIteration:
  262. break
  263. _cache[cache_key] = selector
  264. return selector
  265. ##
  266. # Iterate over the matching nodes
  267. def iterfind(elem, path, namespaces=None):
  268. selector = _build_path_iterator(path, namespaces)
  269. result = iter((elem,))
  270. for select in selector:
  271. result = select(result)
  272. return result
  273. ##
  274. # Find first matching object.
  275. def find(elem, path, namespaces=None):
  276. it = iterfind(elem, path, namespaces)
  277. try:
  278. return next(it)
  279. except StopIteration:
  280. return None
  281. ##
  282. # Find all matching objects.
  283. def findall(elem, path, namespaces=None):
  284. return list(iterfind(elem, path, namespaces))
  285. ##
  286. # Find text for first matching object.
  287. def findtext(elem, path, default=None, namespaces=None):
  288. el = find(elem, path, namespaces)
  289. if el is None:
  290. return default
  291. else:
  292. return el.text or ''