# flake8: noqa
"""
Shim module between Bleach and html5lib. This makes it easier to upgrade the
html5lib library without having to change a lot of code.
"""
from __future__ import unicode_literals

import re
import string

import six

from bleach._vendor.html5lib import (
    HTMLParser,
    getTreeWalker,
)
from bleach._vendor.html5lib import constants
from bleach._vendor.html5lib.constants import (
    namespaces,
    prefixes,
)
from bleach._vendor.html5lib.constants import _ReparseException as ReparseException
from bleach._vendor.html5lib.filters.base import Filter
from bleach._vendor.html5lib.filters.sanitizer import allowed_protocols
from bleach._vendor.html5lib.filters.sanitizer import Filter as SanitizerFilter
from bleach._vendor.html5lib._inputstream import HTMLInputStream
from bleach._vendor.html5lib.serializer import HTMLSerializer
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib._trie import Trie
#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes[token_name]
    for token_name in ('StartTag', 'EndTag', 'EmptyTag')
}
CHARACTERS_TYPE = constants.tokenTypes['Characters']
PARSEERROR_TYPE = constants.tokenTypes['ParseError']
#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = (
    'a abbr address area article aside audio b base bdi bdo blockquote body '
    'br button canvas caption cite code col colgroup data datalist dd del '
    'details dfn dialog div dl dt em embed fieldset figcaption figure footer '
    'form h1 h2 h3 h4 h5 h6 head header hgroup hr html i iframe img input '
    'ins kbd keygen label legend li link map mark menu meta meter nav '
    'noscript object ol optgroup option output p param picture pre progress '
    'q rp rt ruby s samp script section select slot small source span strong '
    'style sub summary sup table tbody td template textarea tfoot th thead '
    'time title tr track u ul var video wbr'
).split()
class InputStreamWithMemory(object):
    """Wraps an HTMLInputStream to remember characters since last <

    This wraps existing HTMLInputStream classes to keep track of the stream
    since the last < which marked an open tag state.

    """

    def __init__(self, inner_stream):
        self._inner_stream = inner_stream
        # These two never need history tracking, so delegate them directly
        self.reset = inner_stream.reset
        self.position = inner_stream.position
        # Characters seen since the last open-tag "<"
        self._buffer = []

    @property
    def errors(self):
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        return self._inner_stream.changeEncoding

    def char(self):
        """Read a single character, recording it in the tag history."""
        next_char = self._inner_stream.char()
        if not next_char:
            # char() can return None if EOF, so ignore that
            return next_char
        self._buffer.append(next_char)
        return next_char

    def charsUntil(self, characters, opposite=False):
        """Read a run of characters, recording them in the tag history."""
        chunk = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._buffer.extend(chunk)
        return chunk

    def unget(self, char):
        """Push a character back, dropping it from the tag history."""
        if self._buffer:
            self._buffer.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Returns the stream history since last '<'

        Since the buffer starts at the last '<' as as seen by tagOpenState(),
        we know that everything from that point to when this method is called
        is the "tag" that is being tokenized.

        """
        return u''.join(self._buffer)

    def start_tag(self):
        """Resets stream history to just '<'

        This gets called by tagOpenState() which marks a '<' that denotes an
        open tag. Any time we see that, we reset the buffer.

        """
        self._buffer = ['<']
class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        # :arg consume_entities: if True, behave like the stock html5lib
        #     tokenizer; if False (the default), leave character entities
        #     unconsumed in the stream (see consumeEntity below)
        super(BleachHTMLTokenizer, self).__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

    def __iter__(self):
        """Yield tokens, pairing each ParseError with the token that follows it

        html5lib emits a ParseError token just before the token it had
        trouble with. We buffer each ParseError until the next token arrives
        so the pair can be fixed up together; order of the checks below is
        significant.
        """
        last_error_token = None

        for token in super(BleachHTMLTokenizer, self).__iter__():
            if last_error_token is not None:
                if ((last_error_token['data'] == 'invalid-character-in-attribute-name' and
                     token['type'] in TAG_TOKEN_TYPES and
                     token.get('data'))):
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    # NOTE(review): token['data'] appears to be a list of
                    # (attribute name, value) pairs per html5lib -- confirm.
                    token['data'] = [
                        item for item in token['data']
                        if ('"' not in item[0] and
                            "'" not in item[0] and
                            '<' not in item[0])
                    ]
                    last_error_token = None
                    yield token

                elif ((last_error_token['data'] == 'expected-closing-tag-but-got-char' and
                       self.parser.tags is not None and
                       token['data'].lower().strip() not in self.parser.tags)):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token['data'] = self.stream.get_tag()
                    token['type'] = CHARACTERS_TYPE

                    last_error_token = None
                    yield token

                elif token['type'] == PARSEERROR_TYPE:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    # Neither special case applies: emit both tokens unchanged
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token['type'] == PARSEERROR_TYPE:
                last_error_token = token
                continue

            yield token

        # Flush a trailing ParseError that had no following token
        if last_error_token:
            yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Override html5lib's entity consumption to optionally be a no-op"""
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super(BleachHTMLTokenizer, self).consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken['data'][-1][1] += '&'

        else:
            self.tokenQueue.append({"type": CHARACTERS_TYPE, "data": '&'})

    def tagOpenState(self):
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super(BleachHTMLTokenizer, self).tagOpenState()

    def emitCurrentToken(self):
        """Emit the current token, converting disallowed tags to Characters"""
        token = self.currentToken

        if ((self.parser.tags is not None and
             token['type'] in TAG_TOKEN_TYPES and
             token['name'].lower() not in self.parser.tags)):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                # If we're stripping the token, we just throw in an empty
                # string token.
                new_data = ''

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {
                'type': CHARACTERS_TYPE,
                'data': new_data
            }

            self.currentToken = new_token
            self.tokenQueue.append(new_token)
            # Return to the data state so tokenizing continues normally
            self.state = self.dataState
            return

        super(BleachHTMLTokenizer, self).emitCurrentToken()
class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: list of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        # Normalize the allowed-tag list to lowercase for comparisons
        if tags is None:
            self.tags = None
        else:
            self.tags = [tag.lower() for tag in tags]
        self.strip = strip
        self.consume_entities = consume_entities
        super(BleachHTMLParser, self).__init__(**kwargs)

    def _parse(self, stream, innerHTML=False, container='div', scripting=True, **kwargs):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream,
            consume_entities=self.consume_entities,
            parser=self,
            **kwargs
        )
        self.reset()

        try:
            self.mainLoop()
        except ReparseException:
            # The tokenizer asked for a restart (e.g. encoding change):
            # reset state and run the loop once more
            self.reset()
            self.mainLoop()
def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity or is a malformed/out-of-range
        numeric entity

    """
    if value[0] == '#':
        # A bare "#" has no digits to convert; match_entity can produce this
        # (e.g. for "&#z;"), and value[1] below would raise IndexError
        if len(value) < 2:
            return None

        try:
            if value[1] in ('x', 'X'):
                return six.unichr(int(value[2:], 16))
            return six.unichr(int(value[1:], 10))
        except (ValueError, OverflowError):
            # int() raises ValueError on empty/invalid digit strings (e.g.
            # "#x"); unichr() raises ValueError/OverflowError for code points
            # outside the valid Unicode range. Treat all of these as
            # non-convertible rather than crashing.
            return None

    return ENTITIES.get(value, None)
def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    # Fast path: nothing that could be an entity
    if '&' not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith('&'):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    remainder = part[len(entity) + 2:]
                    # Keep whatever text followed the ";". The original code
                    # tested the always-truthy ``part`` here (harmlessly
                    # appending '' when there was no remainder); test the
                    # remainder itself instead.
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return ''.join(new_text)
def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with
    a ";". This ignores ambiguous character entities that have no ";" at the
    end.

    :arg stream: the character stream

    :returns: ``None`` or the entity string without "&" or ";"

    :raises ValueError: if the stream doesn't begin with "&"

    """
    # Nix the & at the beginning
    if stream[0] != '&':
        raise ValueError('Stream should begin with "&"')
    stream = stream[1:]

    # Work on a list so we can pop characters off the front
    stream = list(stream)
    possible_entity = ''
    end_characters = '<&=;' + string.whitespace

    # Handle number entities
    if stream and stream[0] == '#':
        possible_entity = '#'
        stream.pop(0)

        if stream and stream[0] in ('x', 'X'):
            allowed = '0123456789abcdefABCDEF'
            possible_entity += stream.pop(0)
        else:
            allowed = '0123456789'

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while stream and stream[0] not in end_characters:
            c = stream.pop(0)
            if c not in allowed:
                # Stop at the first non-digit; note the popped character is
                # discarded, so the ";" check below looks at the char after it
                break
            possible_entity += c

        # Only a match if something was collected and a ";" terminates it
        if possible_entity and stream and stream[0] == ';':
            return possible_entity
        return None

    # Handle character entities
    while stream and stream[0] not in end_characters:
        c = stream.pop(0)
        # NOTE(review): the trie is checked against the prefix *before*
        # appending c, so this lags the stream by one character -- confirm
        # this leniency is intended
        if not ENTITIES_TRIE.has_keys_with_prefix(possible_entity):
            break
        possible_entity += c

    if possible_entity and stream and stream[0] == ';':
        return possible_entity
    return None
#: Regexp that splits on "&" while keeping the "&" delimiters (capturing group)
AMP_SPLIT_RE = re.compile('(&)')


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    pieces = AMP_SPLIT_RE.split(text)
    # Text before the first "&" passes through unchanged
    yield pieces[0]
    # Odd indexes hold the "&" delimiters themselves; re-attach each "&" to
    # the even-indexed piece of text that followed it
    for index in range(2, len(pieces), 2):
        yield '&' + pieces[index]
class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """
    # per the HTMLSerializer.__init__ docstring:
    #
    #     Whether to escape characters that need to be
    #     escaped within normal elements within rcdata elements such as
    #     style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        #
        # NOTE: the source had been entity-mangled to replace('&', '&'),
        # which is a no-op; the intended replacement is '&amp;' -> '&'.
        stoken = stoken.replace('&amp;', '&')

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith('&'):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield '&' + entity + ';'

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2:]
                    if part:
                        yield part
                    continue

            # Bare ampersand: escape it (the mangled source's
            # replace('&', '&') did nothing here)
            yield part.replace('&', '&amp;')

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        in_tag = False
        after_equals = False

        for stoken in super(BleachHTMLSerializer, self).serialize(treewalker, encoding):
            if in_tag:
                if stoken == '>':
                    in_tag = False

                elif after_equals:
                    # The token right after "=" (unless it's the quote
                    # character) is the attribute value--escape its ampersands
                    if stoken != '"':
                        for part in self.escape_base_amp(stoken):
                            yield part

                        after_equals = False
                        continue

                elif stoken == '=':
                    after_equals = True

                yield stoken

            else:
                if stoken.startswith('<'):
                    in_tag = True
                yield stoken
|