123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- """
- NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
- """
- from bleach import (
- ALLOWED_ATTRIBUTES,
- ALLOWED_STYLES,
- ALLOWED_TAGS,
- clean,
- )
- from traitlets import (
- Any,
- Bool,
- List,
- Set,
- Unicode,
- )
- from .base import Preprocessor
class SanitizeHTML(Preprocessor):
    """
    Preprocessor that sanitizes notebook cells with Bleach.

    Raw and markdown cell sources are passed through ``bleach.clean``.
    For code cells, each output's ``data`` mapping is filtered: keys listed
    in ``safe_output_keys`` are kept untouched, keys listed in
    ``sanitized_output_types`` are cleaned with Bleach, and every other key
    is removed.
    """

    # Bleach config.
    attributes = Any(
        config=True,
        default_value=ALLOWED_ATTRIBUTES,
        help="Allowed HTML tag attributes",
    )
    tags = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_TAGS,
        help="List of HTML tags to allow",
    )
    styles = List(
        Unicode(),
        config=True,
        default_value=ALLOWED_STYLES,
        help="Allowed CSS styles if <style> tag is whitelisted"
    )
    strip = Bool(
        config=True,
        default_value=False,
        help="If True, remove unsafe markup entirely instead of escaping"
    )
    strip_comments = Bool(
        config=True,
        default_value=True,
        help="If True, strip comments from escaped HTML",
    )

    # Display data config.
    safe_output_keys = Set(
        config=True,
        default_value={
            'metadata',  # Not a mimetype per-se, but expected and safe.
            'text/plain',
            'text/latex',
            'application/json',
            'image/png',
            'image/jpeg',
        },
        help="Cell output mimetypes to render without modification",
    )
    sanitized_output_types = Set(
        config=True,
        default_value={
            'text/html',
            'text/markdown',
        },
        help="Cell output types to display after escaping with Bleach.",
    )

    def preprocess_cell(self, cell, resources, cell_index):
        """
        Sanitize potentially-dangerous contents of the cell.

        Cell Types:
          raw:
            Sanitize literal HTML
          markdown:
            Sanitize literal HTML
          code:
            Sanitize outputs that could result in code execution

        Cells of any other type are returned unmodified.

        Parameters
        ----------
        cell : notebook cell
            The cell to sanitize; it is modified in place.
        resources : dict
            Passed through unchanged.
        cell_index : int
            Unused by this preprocessor.

        Returns
        -------
        tuple
            ``(cell, resources)``, always, so callers can unpack the result
            regardless of the cell type.
        """
        if cell.cell_type in ('raw', 'markdown'):
            # Sanitize all raw cells anyway.
            # Only ones with the text/html mimetype should be emitted
            # but erring on the side of safety maybe.
            cell.source = self.sanitize_html_tags(cell.source)
        elif cell.cell_type == 'code':
            cell.outputs = self.sanitize_code_outputs(cell.outputs)
        # Always return the pair: previously an unrecognized cell type fell
        # through and implicitly returned None, which breaks tuple unpacking
        # in the caller.
        return cell, resources

    def sanitize_code_outputs(self, outputs):
        """
        Sanitize code cell outputs.

        'stream' and 'error' outputs are left untouched. For every other
        output, each key of its ``data`` mapping is handled as follows:

        - keys in ``safe_output_keys`` are kept as-is;
        - keys in ``sanitized_output_types`` are run through
          `sanitize_html_tags`;
        - all remaining keys (e.g. 'text/javascript') are deleted.

        The outputs are modified in place and the same list is returned.
        """
        for output in outputs:
            # Skipped untouched; the original author notes these are plain
            # text with nothing to escape.
            if output['output_type'] in ('stream', 'error'):
                continue

            data = output.data
            to_remove = []
            for key in data:
                if key in self.safe_output_keys:
                    continue
                elif key in self.sanitized_output_types:
                    self.log.info("Sanitizing %s", key)
                    data[key] = self.sanitize_html_tags(data[key])
                else:
                    # Mark key for removal. (Python doesn't allow deletion of
                    # keys from a dict during iteration)
                    to_remove.append(key)

            for key in to_remove:
                self.log.info("Removing %s", key)
                del data[key]
        return outputs

    def sanitize_html_tags(self, html_str):
        """
        Sanitize a string containing raw HTML tags.

        Delegates to ``bleach.clean`` using this preprocessor's configured
        ``tags``, ``attributes``, ``styles``, ``strip`` and
        ``strip_comments`` traits, and returns the cleaned string.
        """
        return clean(
            html_str,
            tags=self.tags,
            attributes=self.attributes,
            styles=self.styles,
            strip=self.strip,
            strip_comments=self.strip_comments,
        )
|