# sanitizer.py

from __future__ import unicode_literals
from itertools import chain
import re

import six
from six.moves.urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
ALLOWED_TAGS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'blockquote',
    'code',
    'em',
    'i',
    'li',
    'ol',
    'strong',
    'ul',
]

#: Map of allowed attributes by tag
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title'],
    'abbr': ['title'],
    'acronym': ['title'],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

#: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
INVISIBLE_CHARACTERS = ''.join(
    [chr(c) for c in chain(range(0, 9), range(11, 13), range(14, 32))]
)

#: Regexp for characters that are invisible
INVISIBLE_CHARACTERS_RE = re.compile(
    '[' + INVISIBLE_CHARACTERS + ']',
    re.UNICODE
)

#: String to replace invisible characters with. This can be a character, a
#: string, or even a function that takes a Python re matchobj
INVISIBLE_REPLACEMENT_CHAR = '?'
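
# A minimal sketch of the function form mentioned above (``mark_invisible``
# is a hypothetical name, not part of this module):
#
#   def mark_invisible(matchobj):
#       # Render each invisible character as its escaped codepoint.
#       return '\\x%02x' % ord(matchobj.group(0))
#
#   INVISIBLE_CHARACTERS_RE.sub(mark_invisible, '\x00abc')  # => '\\x00abc'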


class Cleaner(object):
    """Cleaner for cleaning HTML fragments of malicious content

    This cleaner is a security-focused function whose sole purpose is to remove
    malicious content from a string such that it can be displayed as content in
    a web page.

    To use::

        from bleach.sanitizer import Cleaner

        cleaner = Cleaner()

        for text in all_the_yucky_things:
            sanitized = cleaner.clean(text)

    .. Note::

       This cleaner is not designed to be used for transforming content for
       use in non-web-page contexts.

    .. Warning::

       This cleaner is not thread-safe--the html parser has internal state.
       Create a separate cleaner per thread!

    """
    def __init__(self, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
                 styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
                 strip_comments=True, filters=None):
        """Initializes a Cleaner

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip: whether or not to strip disallowed elements

        :arg bool strip_comments: whether or not to strip HTML comments

        :arg list filters: list of html5lib Filter classes to pass streamed content through

            .. seealso:: http://html5lib.readthedocs.io/en/latest/movingparts.html#filters

            .. Warning::

               Using filters changes the output of ``bleach.Cleaner.clean``.
               Make sure the way the filters change the output is secure.

        """
        self.tags = tags
        self.attributes = attributes
        self.styles = styles
        self.protocols = protocols
        self.strip = strip
        self.strip_comments = strip_comments
        self.filters = filters or []

        self.parser = html5lib_shim.BleachHTMLParser(
            tags=self.tags,
            strip=self.strip,
            consume_entities=False,
            namespaceHTMLElements=False
        )
        self.walker = html5lib_shim.getTreeWalker('etree')
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values='always',
            omit_optional_tags=False,
            escape_lt_in_attrs=True,

            # We want to leave entities as they are without escaping or
            # resolving or expanding
            resolve_entities=False,

            # Bleach has its own sanitizer, so don't use the html5lib one
            sanitize=False,

            # Bleach sanitizer alphabetizes already, so don't use the html5lib one
            alphabetical_attributes=False,
        )

    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return ''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
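
# A usage sketch for the ``filters`` hook above. It assumes
# ``bleach.linkifier.LinkifyFilter`` as shipped with bleach; the tag and
# attribute choices are illustrative only:
#
#   from functools import partial
#   from bleach.linkifier import LinkifyFilter
#   from bleach.sanitizer import Cleaner
#
#   cleaner = Cleaner(
#       tags=['a', 'p'],
#       attributes={'a': ['href']},
#       filters=[partial(LinkifyFilter, skip_tags=['pre'])],
#   )
#   cleaner.clean('<p>see http://example.com</p>')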


def attribute_filter_factory(attributes):
    """Generates attribute filter function for the given attributes value

    The attributes value can take one of several shapes. This returns a filter
    function appropriate to the attributes value. One nice thing about this is
    that there's less if/then shenanigans in the ``allow_token`` method.

    """
    if callable(attributes):
        return attributes

    if isinstance(attributes, dict):
        def _attr_filter(tag, attr, value):
            if tag in attributes:
                attr_val = attributes[tag]
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                if attr in attr_val:
                    return True

            if '*' in attributes:
                attr_val = attributes['*']
                if callable(attr_val):
                    return attr_val(tag, attr, value)

                return attr in attr_val

            return False

        return _attr_filter

    if isinstance(attributes, list):
        def _attr_filter(tag, attr, value):
            return attr in attributes

        return _attr_filter

    raise ValueError('attributes needs to be a callable, a list or a dict')
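
# A sketch of the three accepted shapes (the values and the ``only_https``
# callable are illustrative, not library defaults):
#
#   attribute_filter_factory(['href', 'title'])                 # list: any allowed tag
#   attribute_filter_factory({'a': ['href'], '*': ['title']})   # dict; '*' is a wildcard tag
#
#   def only_https(tag, attr, value):
#       return attr == 'href' and value.startswith('https:')
#   attribute_filter_factory(only_https)                        # callable: full control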


class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    """html5lib Filter that sanitizes text

    This filter can be used anywhere html5lib filters can be used.

    """
    def __init__(self, source, attributes=ALLOWED_ATTRIBUTES,
                 strip_disallowed_elements=False, strip_html_comments=True,
                 **kwargs):
        """Creates a BleachSanitizerFilter instance

        :arg Treewalker source: stream

        :arg list tags: allowed list of tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
            defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

        :arg list styles: allowed list of css styles; defaults to
            ``bleach.sanitizer.ALLOWED_STYLES``

        :arg list protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

        :arg bool strip_disallowed_elements: whether or not to strip disallowed
            elements

        :arg bool strip_html_comments: whether or not to strip HTML comments

        """
        self.attr_filter = attribute_filter_factory(attributes)
        self.strip_disallowed_elements = strip_disallowed_elements
        self.strip_html_comments = strip_html_comments

        return super(BleachSanitizerFilter, self).__init__(source, **kwargs)

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
            ret = self.sanitize_token(token)

            if not ret:
                continue

            if isinstance(ret, list):
                for subtoken in ret:
                    yield subtoken
            else:
                yield ret

    def merge_characters(self, token_iterator):
        """Merge consecutive Characters tokens in a stream"""
        characters_buffer = []

        for token in token_iterator:
            if characters_buffer:
                if token['type'] == 'Characters':
                    characters_buffer.append(token)
                    continue
                else:
                    # Merge all the characters tokens together into one and then
                    # operate on it.
                    new_token = {
                        'data': ''.join([char_token['data'] for char_token in characters_buffer]),
                        'type': 'Characters'
                    }
                    characters_buffer = []
                    yield new_token

            elif token['type'] == 'Characters':
                characters_buffer.append(token)
                continue

            yield token

        new_token = {
            'data': ''.join([char_token['data'] for char_token in characters_buffer]),
            'type': 'Characters'
        }
        yield new_token

    def __iter__(self):
        return self.merge_characters(self.sanitize_stream(html5lib_shim.Filter.__iter__(self)))
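
    # A sketch of what merge_characters does to a token stream (token dicts
    # abbreviated; illustrative only):
    #
    #   [{'type': 'Characters', 'data': 'a'},
    #    {'type': 'Characters', 'data': 'b'},
    #    {'type': 'StartTag', 'name': 'em', ...}]
    #
    # becomes:
    #
    #   [{'type': 'Characters', 'data': 'ab'},
    #    {'type': 'StartTag', 'name': 'em', ...},
    #    {'type': 'Characters', 'data': ''}]   # final flush of the (empty) buffer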

    def sanitize_token(self, token):
        """Sanitize a token either by HTML-encoding or dropping.

        Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag':
        ['attribute', 'pairs'], 'tag': callable}.

        Here callable is a function that takes the tag, attribute name, and
        attribute value and returns True or False.

        Also gives the option to strip tags instead of encoding.

        :arg dict token: token to sanitize

        :returns: token or list of tokens

        """
        token_type = token['type']
        if token_type in ['StartTag', 'EndTag', 'EmptyTag']:
            if token['name'] in self.allowed_elements:
                return self.allow_token(token)

            elif self.strip_disallowed_elements:
                return None

            else:
                if 'data' in token:
                    # Alphabetize the attributes before calling .disallowed_token()
                    # so that the resulting string is stable
                    token['data'] = alphabetize_attributes(token['data'])
                return self.disallowed_token(token)

        elif token_type == 'Comment':
            if not self.strip_html_comments:
                return token
            else:
                return None

        elif token_type == 'Characters':
            return self.sanitize_characters(token)

        else:
            return token

    def sanitize_characters(self, token):
        """Handles Characters tokens

        Our overridden tokenizer doesn't do anything with entities. However,
        that means that the serializer will convert all ``&`` in Characters
        tokens to ``&amp;``.

        Since we don't want that, we extract entities here and convert them to
        Entity tokens so the serializer will let them be.

        :arg token: the Characters token to work on

        :returns: a list of tokens

        """
        data = token.get('data', '')

        if not data:
            return token

        data = INVISIBLE_CHARACTERS_RE.sub(INVISIBLE_REPLACEMENT_CHAR, data)
        token['data'] = data

        # If there isn't a & in the data, we can return now
        if '&' not in data:
            return token

        new_tokens = []

        # For each possible entity that starts with a "&", we try to extract an
        # actual entity and re-tokenize accordingly
        for part in html5lib_shim.next_possible_entity(data):
            if not part:
                continue

            if part.startswith('&'):
                entity = html5lib_shim.match_entity(part)
                if entity is not None:
                    if entity == 'amp':
                        # LinkifyFilter can't match urls across token boundaries
                        # which is problematic with &amp; since that shows up in
                        # querystrings all the time. This special-cases &amp;
                        # and converts it to a & and sticks it in as a
                        # Characters token. It'll get merged with surrounding
                        # tokens in the BleachSanitizerFilter.__iter__ and
                        # escaped in the serializer.
                        new_tokens.append({'type': 'Characters', 'data': '&'})
                    else:
                        new_tokens.append({'type': 'Entity', 'name': entity})

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    remainder = part[len(entity) + 2:]
                    if remainder:
                        new_tokens.append({'type': 'Characters', 'data': remainder})
                    continue

            new_tokens.append({'type': 'Characters', 'data': part})

        return new_tokens
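
    # A sketch of the re-tokenizing above (token dicts abbreviated;
    # illustrative only): Characters data 'a &gt; b &amp; c' comes out
    # roughly as:
    #
    #   [{'type': 'Characters', 'data': 'a '},
    #    {'type': 'Entity', 'name': 'gt'},
    #    {'type': 'Characters', 'data': ' b '},
    #    {'type': 'Characters', 'data': '&'},   # the &amp; special case
    #    {'type': 'Characters', 'data': ' c'}]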

    def sanitize_uri_value(self, value, allowed_protocols):
        """Checks a uri value to see if it's allowed

        :arg value: the uri value to sanitize
        :arg allowed_protocols: list of allowed protocols

        :returns: allowed value or None

        """
        # NOTE(willkg): This transforms the value into one that's easier to
        # match and verify, but shouldn't get returned since it's vastly
        # different than the original value.

        # Convert all character entities in the value
        new_value = html5lib_shim.convert_entities(value)

        # Nix backtick, space characters, and control characters
        new_value = re.sub(
            r"[`\000-\040\177-\240\s]+",
            '',
            new_value
        )

        # Remove REPLACEMENT characters
        new_value = new_value.replace('\ufffd', '')

        # Lowercase it--this breaks the value, but makes it easier to match
        # against
        new_value = new_value.lower()

        try:
            # Drop attributes with uri values that have protocols that aren't
            # allowed
            parsed = urlparse(new_value)
        except ValueError:
            # URI is impossible to parse, therefore it's not allowed
            return None

        if parsed.scheme:
            # If urlparse found a scheme, check that
            if parsed.scheme in allowed_protocols:
                return value

        else:
            # Allow uris that are just an anchor
            if new_value.startswith('#'):
                return value

            # Handle protocols that urlparse doesn't recognize like "myprotocol"
            if ':' in new_value and new_value.split(':')[0] in allowed_protocols:
                return value

            # If there's no protocol/scheme specified, then assume it's "http"
            # and see if that's allowed
            if 'http' in allowed_protocols:
                return value

        return None
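
    # Roughly, with allowed_protocols=['http', 'https'] (illustrative only):
    #
    #   sanitize_uri_value('https://example.com/', ...)   # returned as-is
    #   sanitize_uri_value('#section-2', ...)             # allowed: just an anchor
    #   sanitize_uri_value('/relative/path', ...)         # allowed: no scheme, 'http' allowed
    #   sanitize_uri_value('javascript:alert(1)', ...)    # None: scheme not allowed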

    def allow_token(self, token):
        """Handles the case where we're allowing the tag"""
        if 'data' in token:
            # Loop through all the attributes and drop the ones that are not
            # allowed, are unsafe or break other rules. Additionally, fix
            # attribute values that need fixing.
            #
            # At the end of this loop, we have the final set of attributes
            # we're keeping.
            attrs = {}
            for namespaced_name, val in token['data'].items():
                namespace, name = namespaced_name

                # Drop attributes that are not explicitly allowed
                #
                # NOTE(willkg): We pass in the attribute name--not a namespaced
                # name.
                if not self.attr_filter(token['name'], name, val):
                    continue

                # Drop attributes with uri values that use a disallowed protocol
                # Sanitize attributes with uri values
                if namespaced_name in self.attr_val_is_uri:
                    new_value = self.sanitize_uri_value(val, self.allowed_protocols)
                    if new_value is None:
                        continue
                    val = new_value

                # Drop values in svg attrs with non-local IRIs
                if namespaced_name in self.svg_attr_val_allows_ref:
                    new_val = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                     ' ',
                                     unescape(val))
                    new_val = new_val.strip()
                    if not new_val:
                        continue

                    else:
                        # Replace the val with the unescaped version because
                        # it's an IRI
                        val = new_val

                # Drop href and xlink:href attr for svg elements with non-local IRIs
                if (None, token['name']) in self.svg_allow_local_href:
                    if namespaced_name in [
                            (None, 'href'), (html5lib_shim.namespaces['xlink'], 'href')
                    ]:
                        if re.search(r'^\s*[^#\s]', val):
                            continue

                # If it's a style attribute, sanitize it
                if namespaced_name == (None, 'style'):
                    val = self.sanitize_css(val)

                # At this point, we want to keep the attribute, so add it in
                attrs[namespaced_name] = val

            token['data'] = alphabetize_attributes(attrs)

        return token

    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
            token["data"] = "</%s>" % token["name"]

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
            attrs = []
            for (ns, name), v in token["data"].items():
                # If we end up with a namespace, but no name, switch them so we
                # have a valid name to use.
                if ns and not name:
                    ns, name = name, ns

                # Figure out namespaced name if the namespace is appropriate
                # and exists; if the ns isn't in prefixes, then drop it.
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
                    namespaced_name = '%s:%s' % (html5lib_shim.prefixes[ns], name)

                attrs.append(' %s="%s"' % (
                    namespaced_name,
                    # NOTE(willkg): HTMLSerializer escapes attribute values
                    # already, so if we do it here (like HTMLSerializer does),
                    # then we end up double-escaping.
                    v)
                )
            token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))

        else:
            token["data"] = "<%s>" % token["name"]

        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"

        token["type"] = "Characters"

        del token["name"]
        return token
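
    # When stripping is off, a disallowed tag is rewritten as text and later
    # escaped by the serializer. Roughly (illustrative only):
    #
    #   <script src="evil.js">  ->  Characters token '<script src="evil.js">'
    #                           ->  serialized as &lt;script src="evil.js"&gt;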

    def sanitize_css(self, style):
        """Sanitizes css in style tags"""
        # Convert entities in the style so that it can be parsed as CSS
        style = html5lib_shim.convert_entities(style)

        # Drop any url values before we do anything else
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # The gauntlet of sanitization

        # Validate the css in the style tag and if it's not valid, then drop
        # the whole thing.
        parts = style.split(';')
        gauntlet = re.compile(
            r"""^(                # consider a style attribute value as composed of:
            [/:,#%!.\s\w]         # a single allowed character
            |\w-\w                # 3 characters in the form \w-\w
            |'[\s\w]+'\s*         # a single quoted string of [\s\w]+ with trailing space
            |"[\s\w]+"            # a double quoted string of [\s\w]+
            |\([\d,%\.\s]+\)      # a parenthesized string of digits, commas, periods,
                                  # percent signs, or whitespace, e.g. from 'color: hsl(30,100%,50%)'
            )*$""",
            flags=re.U | re.VERBOSE
        )
        for part in parts:
            if not gauntlet.match(part):
                return ''

        if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r'([-\w]+)\s*:\s*([^:;]*)', style):
            if not value:
                continue

            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')

            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
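
    # Roughly, with allowed_css_properties=['color'] (illustrative only):
    #
    #   sanitize_css('color: red; position: fixed')  # => 'color: red;'
    #   sanitize_css('color: expression(alert(1))')  # => '' (fails the gauntlet)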