sax.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
"""
SAX-based adapter to copy trees from/to the Python standard library.

Use the `ElementTreeContentHandler` class to build an ElementTree from
SAX events.

Use the `ElementTreeProducer` class or the `saxify()` function to fire
the SAX events of an ElementTree against a SAX ContentHandler.

See https://lxml.de/sax.html
"""
  9. from xml.sax.handler import ContentHandler
  10. from lxml import etree
  11. from lxml.etree import ElementTree, SubElement
  12. from lxml.etree import Comment, ProcessingInstruction
  13. class SaxError(etree.LxmlError):
  14. """General SAX error.
  15. """
  16. pass
  17. def _getNsTag(tag):
  18. if tag[0] == '{':
  19. return tuple(tag[1:].split('}', 1))
  20. else:
  21. return (None, tag)
  22. class ElementTreeContentHandler(ContentHandler):
  23. """Build an lxml ElementTree from SAX events.
  24. """
  25. def __init__(self, makeelement=None):
  26. ContentHandler.__init__(self)
  27. self._root = None
  28. self._root_siblings = []
  29. self._element_stack = []
  30. self._default_ns = None
  31. self._ns_mapping = { None : [None] }
  32. self._new_mappings = {}
  33. if makeelement is None:
  34. makeelement = etree.Element
  35. self._makeelement = makeelement
  36. def _get_etree(self):
  37. "Contains the generated ElementTree after parsing is finished."
  38. return ElementTree(self._root)
  39. etree = property(_get_etree, doc=_get_etree.__doc__)
  40. def setDocumentLocator(self, locator):
  41. pass
  42. def startDocument(self):
  43. pass
  44. def endDocument(self):
  45. pass
  46. def startPrefixMapping(self, prefix, uri):
  47. self._new_mappings[prefix] = uri
  48. try:
  49. self._ns_mapping[prefix].append(uri)
  50. except KeyError:
  51. self._ns_mapping[prefix] = [uri]
  52. if prefix is None:
  53. self._default_ns = uri
  54. def endPrefixMapping(self, prefix):
  55. ns_uri_list = self._ns_mapping[prefix]
  56. ns_uri_list.pop()
  57. if prefix is None:
  58. self._default_ns = ns_uri_list[-1]
  59. def _buildTag(self, ns_name_tuple):
  60. ns_uri, local_name = ns_name_tuple
  61. if ns_uri:
  62. el_tag = "{%s}%s" % ns_name_tuple
  63. elif self._default_ns:
  64. el_tag = "{%s}%s" % (self._default_ns, local_name)
  65. else:
  66. el_tag = local_name
  67. return el_tag
  68. def startElementNS(self, ns_name, qname, attributes=None):
  69. el_name = self._buildTag(ns_name)
  70. if attributes:
  71. attrs = {}
  72. try:
  73. iter_attributes = attributes.iteritems()
  74. except AttributeError:
  75. iter_attributes = attributes.items()
  76. for name_tuple, value in iter_attributes:
  77. if name_tuple[0]:
  78. attr_name = "{%s}%s" % name_tuple
  79. else:
  80. attr_name = name_tuple[1]
  81. attrs[attr_name] = value
  82. else:
  83. attrs = None
  84. element_stack = self._element_stack
  85. if self._root is None:
  86. element = self._root = \
  87. self._makeelement(el_name, attrs, self._new_mappings)
  88. if self._root_siblings and hasattr(element, 'addprevious'):
  89. for sibling in self._root_siblings:
  90. element.addprevious(sibling)
  91. del self._root_siblings[:]
  92. else:
  93. element = SubElement(element_stack[-1], el_name,
  94. attrs, self._new_mappings)
  95. element_stack.append(element)
  96. self._new_mappings.clear()
  97. def processingInstruction(self, target, data):
  98. pi = ProcessingInstruction(target, data)
  99. if self._root is None:
  100. self._root_siblings.append(pi)
  101. else:
  102. self._element_stack[-1].append(pi)
  103. def endElementNS(self, ns_name, qname):
  104. element = self._element_stack.pop()
  105. el_tag = self._buildTag(ns_name)
  106. if el_tag != element.tag:
  107. raise SaxError("Unexpected element closed: " + el_tag)
  108. def startElement(self, name, attributes=None):
  109. if attributes:
  110. attributes = dict(
  111. [((None, k), v) for k, v in attributes.items()]
  112. )
  113. self.startElementNS((None, name), name, attributes)
  114. def endElement(self, name):
  115. self.endElementNS((None, name), name)
  116. def characters(self, data):
  117. last_element = self._element_stack[-1]
  118. try:
  119. # if there already is a child element, we must append to its tail
  120. last_element = last_element[-1]
  121. last_element.tail = (last_element.tail or '') + data
  122. except IndexError:
  123. # otherwise: append to the text
  124. last_element.text = (last_element.text or '') + data
  125. ignorableWhitespace = characters
  126. class ElementTreeProducer(object):
  127. """Produces SAX events for an element and children.
  128. """
  129. def __init__(self, element_or_tree, content_handler):
  130. try:
  131. element = element_or_tree.getroot()
  132. except AttributeError:
  133. element = element_or_tree
  134. self._element = element
  135. self._content_handler = content_handler
  136. from xml.sax.xmlreader import AttributesNSImpl as attr_class
  137. self._attr_class = attr_class
  138. self._empty_attributes = attr_class({}, {})
  139. def saxify(self):
  140. self._content_handler.startDocument()
  141. element = self._element
  142. if hasattr(element, 'getprevious'):
  143. siblings = []
  144. sibling = element.getprevious()
  145. while getattr(sibling, 'tag', None) is ProcessingInstruction:
  146. siblings.append(sibling)
  147. sibling = sibling.getprevious()
  148. for sibling in siblings[::-1]:
  149. self._recursive_saxify(sibling, {})
  150. self._recursive_saxify(element, {})
  151. if hasattr(element, 'getnext'):
  152. sibling = element.getnext()
  153. while getattr(sibling, 'tag', None) is ProcessingInstruction:
  154. self._recursive_saxify(sibling, {})
  155. sibling = sibling.getnext()
  156. self._content_handler.endDocument()
  157. def _recursive_saxify(self, element, prefixes):
  158. content_handler = self._content_handler
  159. tag = element.tag
  160. if tag is Comment or tag is ProcessingInstruction:
  161. if tag is ProcessingInstruction:
  162. content_handler.processingInstruction(
  163. element.target, element.text)
  164. if element.tail:
  165. content_handler.characters(element.tail)
  166. return
  167. new_prefixes = []
  168. build_qname = self._build_qname
  169. attribs = element.items()
  170. if attribs:
  171. attr_values = {}
  172. attr_qnames = {}
  173. for attr_ns_name, value in attribs:
  174. attr_ns_tuple = _getNsTag(attr_ns_name)
  175. attr_values[attr_ns_tuple] = value
  176. attr_qnames[attr_ns_tuple] = build_qname(
  177. attr_ns_tuple[0], attr_ns_tuple[1], prefixes, new_prefixes)
  178. sax_attributes = self._attr_class(attr_values, attr_qnames)
  179. else:
  180. sax_attributes = self._empty_attributes
  181. ns_uri, local_name = _getNsTag(tag)
  182. qname = build_qname(ns_uri, local_name, prefixes, new_prefixes)
  183. for prefix, uri in new_prefixes:
  184. content_handler.startPrefixMapping(prefix, uri)
  185. content_handler.startElementNS((ns_uri, local_name),
  186. qname, sax_attributes)
  187. if element.text:
  188. content_handler.characters(element.text)
  189. for child in element:
  190. self._recursive_saxify(child, prefixes)
  191. content_handler.endElementNS((ns_uri, local_name), qname)
  192. for prefix, uri in new_prefixes:
  193. content_handler.endPrefixMapping(prefix)
  194. if element.tail:
  195. content_handler.characters(element.tail)
  196. def _build_qname(self, ns_uri, local_name, prefixes, new_prefixes):
  197. if ns_uri is None:
  198. return local_name
  199. try:
  200. prefix = prefixes[ns_uri]
  201. except KeyError:
  202. prefix = prefixes[ns_uri] = 'ns%02d' % len(prefixes)
  203. new_prefixes.append( (prefix, ns_uri) )
  204. return prefix + ':' + local_name
  205. def saxify(element_or_tree, content_handler):
  206. """One-shot helper to generate SAX events from an XML tree and fire
  207. them against a SAX ContentHandler.
  208. """
  209. return ElementTreeProducer(element_or_tree, content_handler).saxify()