# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""

from __future__ import unicode_literals

import re
import string

import six

from bleach._vendor.html5lib import (
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import constants
from bleach._vendor.html5lib.constants import (
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
from bleach._vendor.html5lib.filters.base import Filter
from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
from bleach._vendor.html5lib._inputstream import HTMLInputStream
from bleach._vendor.html5lib.serializer import HTMLSerializer
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib._trie import Trie

#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes['StartTag'],
    constants.tokenTypes['EndTag'],
    constants.tokenTypes['EmptyTag'],
}

CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']

#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = [
    'a',
    'abbr',
    'address',
    'area',
    'article',
    'aside',
    'audio',
    'b',
    'base',
    'bdi',
    'bdo',
    'blockquote',
    'body',
    'br',
    'button',
    'canvas',
    'caption',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'datalist',
    'dd',
    'del',
    'details',
    'dfn',
    'dialog',
    'div',
    'dl',
    'dt',
    'em',
    'embed',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'header',
    'hgroup',
    'hr',
    'html',
    'i',
    'iframe',
    'img',
    'input',
    'ins',
    'kbd',
    'keygen',
    'label',
    'legend',
    'li',
    'link',
    'map',
    'mark',
    'menu',
    'meta',
    'meter',
    'nav',
    'noscript',
    'object',
    'ol',
    'optgroup',
    'option',
    'output',
    'p',
    'param',
    'picture',
    'pre',
    'progress',
    'q',
    'rp',
    'rt',
    'ruby',
    's',
    'samp',
    'script',
    'section',
    'select',
    'slot',
    'small',
    'source',
    'span',
    'strong',
    'style',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'template',
    'textarea',
    'tfoot',
    'th',
    'thead',
    'time',
    'title',
    'tr',
    'track',
    'u',
    'ul',
    'var',
    'video',
    'wbr',
]

class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since the last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last <, which marked an open tag state.

    """
    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        c = self._inner_stream.char()
        # char() can return None if EOF, so ignore that
        if c:
            self._buffer.append(c)
        return c

    def charsUntil(self, characters, opposite=False):
        chars = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(list(chars))
        return chars

    def unget(self, char):
        if self._buffer:
            self._buffer.pop(-1)
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since the last '<'

        Since the buffer starts at the last '<', as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return six.text_type('').join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ['<']
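
# Illustrative behavior (editorial doctest-style sketch, not part of the
# original module):
#
#   >>> stream = InputStreamWithMemory(HTMLInputStream('<em class="x">'))
#   >>> stream.char()        # tagOpenState() sees this '<' ...
#   '<'
#   >>> stream.start_tag()   # ... and resets the buffer to just '<'
#   >>> stream.charsUntil('>')
#   'em class="x"'
#   >>> stream.get_tag()     # raw, unnormalized text since the last '<'
#   '<em class="x"'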

class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""
    def __init__(self, consume_entities=False, **kwargs):
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
                     token['type'] in TAG_TOKEN_TYPES and
                     token.get('data'))):
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token['data'] = [
                        item for item in token['data']
                        if ('"' not in item[0] and
                            "'" not in item[0] and
                            '<' not in item[0])
                    ]
                    last_error_token = None
                    yield token

                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                       self.parser.tags is not None and
                       token['data'].lower().strip() not in self.parser.tags)):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop, so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token['data'] = self.stream.get_tag()
                    token['type'] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token['type'] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token['type'] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            yield last_error_token
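
    # Editorial note (illustrative, not in the original module): holding back
    # last_error_token lets us rewrite a (ParseError, token) pair. For
    # example, with parser.tags = ['b'], input like '</ foo>' makes html5lib
    # emit an 'expected-closing-tag-but-got-char' ParseError followed by a
    # comment-ish token; since 'foo' is not an allowed tag, __iter__ re-emits
    # the raw text captured by InputStreamWithMemory (roughly '</ foo>') as a
    # Characters token so the sanitizer escapes it instead of dropping it.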

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, by the time this gets called, the tokenizer has already
        # consumed an &, so we put that back in the stream.
        if fromAttribute:
            self.currentToken['data'][-1][1] += '&'

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        token = self.currentToken

        if ((self.parser.tags is not None and
             token['type'] in TAG_TOKEN_TYPES and
             token['name'].lower() not in self.parser.tags)):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ''

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {
                'type': CHARACTERS_TYPE,
                'data': new_data
            }

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()

class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""
    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        self.tags = [tag.lower() for tag in tags] if tags is not None else None
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
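
# Illustrative usage (editorial sketch, not part of the original module;
# Bleach's sanitizer constructs the parser in roughly this way):
#
#   parser = BleachHTMLParser(
#       tags=['b', 'i'],          # allowed tags
#       strip=False,              # escape (rather than strip) disallowed tags
#       consume_entities=False,   # leave &entity; untouched for the serializer
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment('<b>ok</b> <blink>nope</blink>')
#
# The disallowed <blink> start/end tags come out of the tokenizer as plain
# Characters tokens (see emitCurrentToken above), so a downstream sanitizer
# can escape them instead of treating them as markup.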

def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == '#':
        if value[1] in ('x', 'X'):
            return six.unichr(int(value[2:], 16))
        return six.unichr(int(value[1:], 10))

    return ENTITIES.get(value, None)
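
# For example (editorial doctest-style sketch, not in the original module):
#
#   >>> convert_entity('amp')       # named entity
#   '&'
#   >>> convert_entity('#60')       # decimal numeric entity
#   '<'
#   >>> convert_entity('#x61')      # hex numeric entity
#   'a'
#   >>> convert_entity('nosuch') is None
#   True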

def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    if '&' not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith('&'):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return ''.join(new_text)
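
# For example (editorial doctest-style sketch, not in the original module):
#
#   >>> convert_entities('tag soup &amp; web &#x68;tml')
#   'tag soup & web html'
#   >>> convert_entities('&xx; stays, since it matches no entity')
#   '&xx; stays, since it matches no entity'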

def match_entity(stream):
    """Returns the first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    """
    # Nix the & at the beginning
    if stream[0] != '&':
        raise ValueError('Stream should begin with "&"')

    stream = stream[1:]

    stream = list(stream)
    possible_entity = ''
    end_characters = '<&=;' + string.whitespace

    # Handle number entities
    if stream and stream[0] == '#':
        possible_entity = '#'
        stream.pop(0)

        if stream and stream[0] in ('x', 'X'):
            allowed = '0123456789abcdefABCDEF'
            possible_entity += stream.pop(0)
        else:
            allowed = '0123456789'

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                break
            possible_entity += c

        if possible_entity and stream and stream[0] == ';':
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ';':
        return possible_entity

    return None
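
# For example (editorial doctest-style sketch, not in the original module):
#
#   >>> match_entity('&amp; more text')
#   'amp'
#   >>> match_entity('&#x27;')
#   '#x27'
#   >>> match_entity('&amp more text') is None   # no ';', so ambiguous
#   True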

AMP_SPLIT_RE = re.compile('(&)')


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    for i, part in enumerate(AMP_SPLIT_RE.split(text)):
        if i == 0:
            yield part
        elif i % 2 == 0:
            yield '&' + part
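
# For example (editorial doctest-style sketch, not in the original module):
#
#   >>> list(next_possible_entity('fish &amp; chips &#38; peas'))
#   ['fish ', '&amp; chips ', '&#38; peas']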

class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expects the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        stoken = stoken.replace('&amp;', '&')

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith('&'):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield '&' + entity + ';'

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2:]
                    if part:
                        yield part
                    continue

            yield part.replace('&', '&amp;')

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == '>':
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == '=':
                    after_equals = True

                yield stoken

            else:
                if stoken.startswith('<'):
                    in_tag = True

                yield stoken
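
# Illustrative end-to-end usage (editorial sketch, not part of the original
# module; Bleach's Cleaner wires these pieces together similarly):
#
#   parser = BleachHTMLParser(
#       tags=['a'], strip=False, consume_entities=False,
#       namespaceHTMLElements=False,
#   )
#   dom = parser.parseFragment('<a href="?a=1&b=2">x &amp; y</a>')
#   walker = getTreeWalker('etree')
#   serializer = BleachHTMLSerializer(
#       quote_attr_values='always',
#       omit_optional_tags=False,
#       resolve_entities=False,
#   )
#   html = serializer.render(walker(dom))
#
# The bare & in the href value gets escaped to &amp;, while the existing
# &amp; entity in the text is left alone by escape_base_amp() above.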