html5parser.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. """
  2. An interface to html5lib that mimics the lxml.html interface.
  3. """
  4. import sys
  5. import string
  6. from html5lib import HTMLParser as _HTMLParser
  7. from html5lib.treebuilders.etree_lxml import TreeBuilder
  8. from lxml import etree
  9. from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE, Element
  # python3 compatibility
  # ``basestring`` only exists on Python 2; probe for it and fall back to the
  # pair of Python 3 string types when the name is missing.
  try:
      basestring
  except NameError:
      _strings = (bytes, str)
  else:
      _strings = basestring
  15. try:
  16. from urllib2 import urlopen
  17. except ImportError:
  18. from urllib.request import urlopen
  19. try:
  20. from urlparse import urlparse
  21. except ImportError:
  22. from urllib.parse import urlparse
  class HTMLParser(_HTMLParser):
      """An html5lib HTML parser with lxml as tree."""

      def __init__(self, strict=False, **kwargs):
          """Initialise the html5lib parser with the lxml tree builder.

          ``strict`` and any further keyword arguments are handed to the
          html5lib ``HTMLParser`` constructor unchanged; ``tree`` is always
          the ``TreeBuilder`` imported at the top of this module.
          """
          _HTMLParser.__init__(
              self,
              strict=strict,
              tree=TreeBuilder,
              **kwargs
          )
  # XHTML support is optional: the wrapper class and its default instance are
  # only defined when html5lib can provide ``XHTMLParser``.
  try:
      from html5lib import XHTMLParser as _XHTMLParser
  except ImportError:
      # Nothing to wrap; ``XHTMLParser`` and ``xhtml_parser`` stay undefined.
      pass
  else:
      class XHTMLParser(_XHTMLParser):
          """An html5lib XHTML Parser with lxml as tree."""

          def __init__(self, strict=False, **kwargs):
              """Initialise the html5lib XHTML parser with the lxml tree builder.

              ``strict`` and any further keyword arguments are handed to the
              html5lib ``XHTMLParser`` constructor unchanged.
              """
              _XHTMLParser.__init__(
                  self,
                  strict=strict,
                  tree=TreeBuilder,
                  **kwargs
              )

      # Module-level default XHTML parser instance.
      xhtml_parser = XHTMLParser()
  def _find_tag(tree, tag):
      """Look up ``tag`` in ``tree``, with or without the XHTML namespace.

      ``tree.find`` is tried with the bare tag name first. Only if that
      yields ``None`` is the lookup repeated with the name qualified by
      ``XHTML_NAMESPACE``; the result of that second lookup is returned
      as-is (so it may be ``None``).
      """
      match = tree.find(tag)
      if match is None:
          qualified = '{%s}%s' % (XHTML_NAMESPACE, tag)
          return tree.find(qualified)
      return match
  def document_fromstring(html, guess_charset=True, parser=None):
      """Parse a string holding a whole HTML document; return its root element.

      :param html: the document source, either a byte string or a text
          string.
      :param guess_charset: forwarded to html5lib as ``useChardet`` when
          ``html`` is a byte string. It is not forwarded for text input,
          where there is no encoding left to guess.
      :param parser: parser object to use; defaults to the module-level
          ``html_parser``.
      :raises TypeError: if ``html`` is not a string.
      """
      if not isinstance(html, _strings):
          raise TypeError('string required')
      if parser is None:
          parser = html_parser

      options = {}
      if isinstance(html, bytes):
          # Only undecoded input can have its charset guessed. html5lib 1.0
          # and later raise TypeError if ``useChardet`` is passed together
          # with text input, so the option is left out in that case.
          options['useChardet'] = guess_charset
      return parser.parse(html, **options).getroot()
  def fragments_fromstring(html, no_leading_text=False,
                           guess_charset=False, parser=None):
      """Parse several HTML elements, returning a list of elements.

      The first item in the list may be a string. If ``no_leading_text`` is
      true, then it is an error if there is non-whitespace leading text, a
      whitespace-only leading string is dropped, and the result is always a
      list of only elements.

      If ``guess_charset`` is true and ``html`` is a byte string rather than
      text, the ``chardet`` library will perform charset guessing on the
      string. The flag is not forwarded for text input.

      :param parser: parser object to use; defaults to the module-level
          ``html_parser``.
      :raises TypeError: if ``html`` is not a string.
      :raises etree.ParserError: if ``no_leading_text`` is true and the
          fragment starts with non-whitespace text.
      """
      if not isinstance(html, _strings):
          raise TypeError('string required')
      if parser is None:
          parser = html_parser

      options = {}
      if isinstance(html, bytes):
          # Only undecoded input can have its charset guessed. html5lib 1.0
          # and later raise TypeError if ``useChardet`` is passed together
          # with text input, so the option is left out in that case.
          options['useChardet'] = guess_charset
      children = parser.parseFragment(html, 'div', **options)

      if children and isinstance(children[0], _strings):
          if no_leading_text:
              if children[0].strip():
                  raise etree.ParserError('There is leading text: %r' %
                                          children[0])
              # Whitespace-only leading text is silently discarded.
              del children[0]
      return children
  def fragment_fromstring(html, create_parent=False,
                          guess_charset=False, parser=None):
      """Parse exactly one HTML element out of ``html``.

      Without ``create_parent`` the input must contain a single element,
      optionally surrounded by whitespace; anything else raises
      ``etree.ParserError``.

      With a true ``create_parent`` the parsed content is wrapped in a new
      parent element, and leading or trailing text is allowed. If
      ``create_parent`` is a string it is used as the parent's tag name,
      otherwise the parent is a ``div``.

      :raises TypeError: if ``html`` is not a string.
      """
      if not isinstance(html, _strings):
          raise TypeError('string required')

      wrap = bool(create_parent)
      nodes = fragments_fromstring(
          html, guess_charset=guess_charset, parser=parser,
          no_leading_text=not wrap)

      if wrap:
          parent_tag = create_parent if isinstance(create_parent, _strings) else 'div'
          container = Element(parent_tag)
          if nodes and isinstance(nodes[0], _strings):
              # Leading text becomes the text of the new parent element.
              container.text = nodes[0]
              del nodes[0]
          container.extend(nodes)
          return container

      if not nodes:
          raise etree.ParserError('No elements found')
      if len(nodes) > 1:
          raise etree.ParserError('Multiple elements found')

      element = nodes[0]
      trailing = element.tail
      if trailing and trailing.strip():
          raise etree.ParserError('Element followed by text: %r' % trailing)
      # Whatever tail is left is whitespace only; drop it.
      element.tail = None
      return element
  def fromstring(html, guess_charset=True, parser=None):
      """Parse ``html`` and return a single element or a whole document.

      The input is always parsed as a document first; the function then
      decides, without knowing whether a fragment or a document was passed,
      which part of the result to hand back:

      * the document root, if the input starts with ``<html`` or a doctype,
        or if the parsed ``<head>`` has children;
      * the only child of ``<body>``, if the body holds exactly one element
        and no surrounding non-whitespace text;
      * otherwise the ``<body>`` element itself, retagged as ``div`` or
        ``span``.

      :raises TypeError: if ``html`` is not a string.
      """
      if not isinstance(html, _strings):
          raise TypeError('string required')
      document = document_fromstring(html, parser=parser,
                                     guess_charset=guess_charset)

      # Input opening with a doctype or an <html> tag is a full document.
      prefix = html[:50]
      if isinstance(prefix, bytes):
          # Decode so the comparison below works on text in Python 3. ASCII
          # (with replacement) is enough for the characters being matched.
          prefix = prefix.decode('ascii', 'replace')
      prefix = prefix.lstrip().lower()
      if prefix.startswith(('<html', '<!doctype')):
          return document

      # A non-empty head also means a full document was given.
      if len(_find_tag(document, 'head')):
          return document

      container = _find_tag(document, 'body')
      if len(container) == 1:
          # A lone element with only whitespace around it was probably the
          # whole input, so return just that element.
          only_child = container[0]
          leading = container.text
          trailing = only_child.tail
          if not (leading and leading.strip()) and not (trailing and trailing.strip()):
              return only_child

      # The body now stands for a run of tags/text. Reuse it as a neutral
      # container, because <body> itself implies too much structure.
      container.tag = 'div' if _contains_block_level_tag(container) else 'span'
      return container
  def parse(filename_url_or_file, guess_charset=True, parser=None):
      """Parse a filename, URL, or file-like object into an HTML document tree.

      Note: this returns a tree, not an element. Use
      ``parse(...).getroot()`` to get the document root.

      :param filename_url_or_file: a string (treated as a URL if
          ``_looks_like_url`` says so, otherwise as a file path) or an
          already opened file-like object.
      :param guess_charset: forwarded to the parser as ``useChardet``.
      :param parser: parser object to use; defaults to the module-level
          ``html_parser``.
      """
      if parser is None:
          parser = html_parser

      if not isinstance(filename_url_or_file, _strings):
          # A caller-supplied file object stays under the caller's control;
          # it is parsed but not closed here.
          return parser.parse(filename_url_or_file, useChardet=guess_charset)

      if _looks_like_url(filename_url_or_file):
          fp = urlopen(filename_url_or_file)
      else:
          fp = open(filename_url_or_file, 'rb')
      try:
          return parser.parse(fp, useChardet=guess_charset)
      finally:
          # This handle was opened here, so release it here as well.
          # try/finally instead of ``with`` because the Python 2 ``urllib2``
          # response object is not a context manager.
          fp.close()
  def _looks_like_url(str):
      """Tell whether ``str`` should be treated as a URL rather than a path.

      The decision is based on the scheme reported by ``urlparse``: no
      scheme means "not a URL". On Windows a scheme consisting of a single
      ASCII letter is taken to be a drive letter of an absolute path, so
      that is not a URL either. Everything else counts as a URL.
      """
      scheme = urlparse(str)[0]
      if not scheme:
          return False
      # e.g. the "c" in a Windows path such as "c:\\page.html"
      is_drive_letter = (
          sys.platform == 'win32'
          and len(scheme) == 1
          and scheme in string.ascii_letters
      )
      return not is_drive_letter
  # Default parser instance. The parsing functions in this module fall back
  # to it whenever they are called with ``parser=None``.
  html_parser = HTMLParser()