# test_html5lib.py (2.9 KB)
  1. """Tests to ensure that the html5lib tree builder generates good trees."""
  2. import warnings
  3. try:
  4. from bs4.builder import HTML5TreeBuilder
  5. HTML5LIB_PRESENT = True
  6. except ImportError, e:
  7. HTML5LIB_PRESENT = False
  8. from bs4.element import SoupStrainer
  9. from bs4.testing import (
  10. HTML5TreeBuilderSmokeTest,
  11. SoupTest,
  12. skipIf,
  13. )
  14. @skipIf(
  15. not HTML5LIB_PRESENT,
  16. "html5lib seems not to be present, not testing its tree builder.")
  17. class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
  18. """See ``HTML5TreeBuilderSmokeTest``."""
  19. @property
  20. def default_builder(self):
  21. return HTML5TreeBuilder()
  22. def test_soupstrainer(self):
  23. # The html5lib tree builder does not support SoupStrainers.
  24. strainer = SoupStrainer("b")
  25. markup = "<p>A <b>bold</b> statement.</p>"
  26. with warnings.catch_warnings(record=True) as w:
  27. soup = self.soup(markup, parse_only=strainer)
  28. self.assertEqual(
  29. soup.decode(), self.document_for(markup))
  30. self.assertTrue(
  31. "the html5lib tree builder doesn't support parse_only" in
  32. str(w[0].message))
  33. def test_correctly_nested_tables(self):
  34. """html5lib inserts <tbody> tags where other parsers don't."""
  35. markup = ('<table id="1">'
  36. '<tr>'
  37. "<td>Here's another table:"
  38. '<table id="2">'
  39. '<tr><td>foo</td></tr>'
  40. '</table></td>')
  41. self.assertSoupEquals(
  42. markup,
  43. '<table id="1"><tbody><tr><td>Here\'s another table:'
  44. '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
  45. '</td></tr></tbody></table>')
  46. self.assertSoupEquals(
  47. "<table><thead><tr><td>Foo</td></tr></thead>"
  48. "<tbody><tr><td>Bar</td></tr></tbody>"
  49. "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
  50. def test_xml_declaration_followed_by_doctype(self):
  51. markup = '''<?xml version="1.0" encoding="utf-8"?>
  52. <!DOCTYPE html>
  53. <html>
  54. <head>
  55. </head>
  56. <body>
  57. <p>foo</p>
  58. </body>
  59. </html>'''
  60. soup = self.soup(markup)
  61. # Verify that we can reach the <p> tag; this means the tree is connected.
  62. self.assertEqual(b"<p>foo</p>", soup.p.encode())
  63. def test_reparented_markup(self):
  64. markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
  65. soup = self.soup(markup)
  66. self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
  67. self.assertEqual(2, len(soup.find_all('p')))
  68. def test_reparented_markup_ends_with_whitespace(self):
  69. markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
  70. soup = self.soup(markup)
  71. self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
  72. self.assertEqual(2, len(soup.find_all('p')))