sanitize.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. """
  2. NBConvert Preprocessor for sanitizing HTML rendering of notebooks.
  3. """
  4. from bleach import (
  5. ALLOWED_ATTRIBUTES,
  6. ALLOWED_STYLES,
  7. ALLOWED_TAGS,
  8. clean,
  9. )
  10. from traitlets import (
  11. Any,
  12. Bool,
  13. List,
  14. Set,
  15. Unicode,
  16. )
  17. from .base import Preprocessor
  18. class SanitizeHTML(Preprocessor):
  19. # Bleach config.
  20. attributes = Any(
  21. config=True,
  22. default_value=ALLOWED_ATTRIBUTES,
  23. help="Allowed HTML tag attributes",
  24. )
  25. tags = List(
  26. Unicode(),
  27. config=True,
  28. default_value=ALLOWED_TAGS,
  29. help="List of HTML tags to allow",
  30. )
  31. styles = List(
  32. Unicode(),
  33. config=True,
  34. default_value=ALLOWED_STYLES,
  35. help="Allowed CSS styles if <style> tag is whitelisted"
  36. )
  37. strip = Bool(
  38. config=True,
  39. default_value=False,
  40. help="If True, remove unsafe markup entirely instead of escaping"
  41. )
  42. strip_comments = Bool(
  43. config=True,
  44. default_value=True,
  45. help="If True, strip comments from escaped HTML",
  46. )
  47. # Display data config.
  48. safe_output_keys = Set(
  49. config=True,
  50. default_value={
  51. 'metadata', # Not a mimetype per-se, but expected and safe.
  52. 'text/plain',
  53. 'text/latex',
  54. 'application/json',
  55. 'image/png',
  56. 'image/jpeg',
  57. },
  58. help="Cell output mimetypes to render without modification",
  59. )
  60. sanitized_output_types = Set(
  61. config=True,
  62. default_value={
  63. 'text/html',
  64. 'text/markdown',
  65. },
  66. help="Cell output types to display after escaping with Bleach.",
  67. )
  68. def preprocess_cell(self, cell, resources, cell_index):
  69. """
  70. Sanitize potentially-dangerous contents of the cell.
  71. Cell Types:
  72. raw:
  73. Sanitize literal HTML
  74. markdown:
  75. Sanitize literal HTML
  76. code:
  77. Sanitize outputs that could result in code execution
  78. """
  79. if cell.cell_type == 'raw':
  80. # Sanitize all raw cells anyway.
  81. # Only ones with the text/html mimetype should be emitted
  82. # but erring on the side of safety maybe.
  83. cell.source = self.sanitize_html_tags(cell.source)
  84. return cell, resources
  85. elif cell.cell_type == 'markdown':
  86. cell.source = self.sanitize_html_tags(cell.source)
  87. return cell, resources
  88. elif cell.cell_type == 'code':
  89. cell.outputs = self.sanitize_code_outputs(cell.outputs)
  90. return cell, resources
  91. def sanitize_code_outputs(self, outputs):
  92. """
  93. Sanitize code cell outputs.
  94. Removes 'text/javascript' fields from display_data outputs, and
  95. runs `sanitize_html_tags` over 'text/html'.
  96. """
  97. for output in outputs:
  98. # These are always ascii, so nothing to escape.
  99. if output['output_type'] in ('stream', 'error'):
  100. continue
  101. data = output.data
  102. to_remove = []
  103. for key in data:
  104. if key in self.safe_output_keys:
  105. continue
  106. elif key in self.sanitized_output_types:
  107. self.log.info("Sanitizing %s" % key)
  108. data[key] = self.sanitize_html_tags(data[key])
  109. else:
  110. # Mark key for removal. (Python doesn't allow deletion of
  111. # keys from a dict during iteration)
  112. to_remove.append(key)
  113. for key in to_remove:
  114. self.log.info("Removing %s" % key)
  115. del data[key]
  116. return outputs
  117. def sanitize_html_tags(self, html_str):
  118. """
  119. Sanitize a string containing raw HTML tags.
  120. """
  121. return clean(
  122. html_str,
  123. tags=self.tags,
  124. attributes=self.attributes,
  125. styles=self.styles,
  126. strip=self.strip,
  127. strip_comments=self.strip_comments,
  128. )