extractoutput.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. """A preprocessor that extracts all of the outputs from the
  2. notebook file. The extracted outputs are returned in the 'resources' dictionary.
  3. """
  4. # Copyright (c) IPython Development Team.
  5. # Distributed under the terms of the Modified BSD License.
  6. from textwrap import dedent
  7. from binascii import a2b_base64
  8. import sys
  9. import os
  10. import json
  11. from mimetypes import guess_extension
  12. from traitlets import Unicode, Set
  13. from .base import Preprocessor
  14. if sys.version_info < (3,):
  15. text_type = basestring
  16. else:
  17. text_type = str
  18. def guess_extension_without_jpe(mimetype):
  19. """
  20. This function fixes a problem with '.jpe' extensions
  21. of jpeg images which are then not recognised by latex.
  22. For any other case, the function works in the same way
  23. as mimetypes.guess_extension
  24. """
  25. ext = guess_extension(mimetype)
  26. if ext==".jpe":
  27. ext=".jpeg"
  28. return ext
  29. def platform_utf_8_encode(data):
  30. if isinstance(data, text_type):
  31. if sys.platform == 'win32':
  32. data = data.replace('\n', '\r\n')
  33. data = data.encode('utf-8')
  34. return data
  35. class ExtractOutputPreprocessor(Preprocessor):
  36. """
  37. Extracts all of the outputs from the notebook file. The extracted
  38. outputs are returned in the 'resources' dictionary.
  39. """
  40. output_filename_template = Unicode(
  41. "{unique_key}_{cell_index}_{index}{extension}"
  42. ).tag(config=True)
  43. extract_output_types = Set(
  44. {'image/png', 'image/jpeg', 'image/svg+xml', 'application/pdf'}
  45. ).tag(config=True)
  46. def preprocess_cell(self, cell, resources, cell_index):
  47. """
  48. Apply a transformation on each cell,
  49. Parameters
  50. ----------
  51. cell : NotebookNode cell
  52. Notebook cell being processed
  53. resources : dictionary
  54. Additional resources used in the conversion process. Allows
  55. preprocessors to pass variables into the Jinja engine.
  56. cell_index : int
  57. Index of the cell being processed (see base.py)
  58. """
  59. #Get the unique key from the resource dict if it exists. If it does not
  60. #exist, use 'output' as the default. Also, get files directory if it
  61. #has been specified
  62. unique_key = resources.get('unique_key', 'output')
  63. output_files_dir = resources.get('output_files_dir', None)
  64. #Make sure outputs key exists
  65. if not isinstance(resources['outputs'], dict):
  66. resources['outputs'] = {}
  67. #Loop through all of the outputs in the cell
  68. for index, out in enumerate(cell.get('outputs', [])):
  69. if out.output_type not in {'display_data', 'execute_result'}:
  70. continue
  71. if 'text/html' in out.data:
  72. out['data']['text/html'] = dedent(out['data']['text/html'])
  73. #Get the output in data formats that the template needs extracted
  74. for mime_type in self.extract_output_types:
  75. if mime_type in out.data:
  76. data = out.data[mime_type]
  77. # Binary files are base64-encoded, SVG is already XML
  78. if mime_type in {'image/png', 'image/jpeg', 'application/pdf'}:
  79. # data is b64-encoded as text (str, unicode),
  80. # we want the original bytes
  81. data = a2b_base64(data)
  82. elif mime_type == 'application/json' or not isinstance(data, text_type):
  83. # Data is either JSON-like and was parsed into a Python
  84. # object according to the spec, or data is for sure
  85. # JSON. In the latter case we want to go extra sure that
  86. # we enclose a scalar string value into extra quotes by
  87. # serializing it properly.
  88. if isinstance(data, bytes) and not isinstance(data, text_type):
  89. # In python 3 we need to guess the encoding in this
  90. # instance. Some modules that return raw data like
  91. # svg can leave the data in byte form instead of str
  92. data = data.decode('utf-8')
  93. data = platform_utf_8_encode(json.dumps(data))
  94. else:
  95. # All other text_type data will fall into this path
  96. data = platform_utf_8_encode(data)
  97. ext = guess_extension_without_jpe(mime_type)
  98. if ext is None:
  99. ext = '.' + mime_type.rsplit('/')[-1]
  100. if out.metadata.get('filename', ''):
  101. filename = out.metadata['filename']
  102. if not filename.endswith(ext):
  103. filename+=ext
  104. else:
  105. filename = self.output_filename_template.format(
  106. unique_key=unique_key,
  107. cell_index=cell_index,
  108. index=index,
  109. extension=ext)
  110. # On the cell, make the figure available via
  111. # cell.outputs[i].metadata.filenames['mime/type']
  112. # where
  113. # cell.outputs[i].data['mime/type'] contains the data
  114. if output_files_dir is not None:
  115. filename = os.path.join(output_files_dir, filename)
  116. out.metadata.setdefault('filenames', {})
  117. out.metadata['filenames'][mime_type] = filename
  118. if filename in resources['outputs']:
  119. raise ValueError(
  120. "Your outputs have filename metadata associated "
  121. "with them. Nbconvert saves these outputs to "
  122. "external files using this filename metadata. "
  123. "Filenames need to be unique across the notebook, "
  124. "or images will be overwritten. The filename {} is "
  125. "associated with more than one output. The second "
  126. "output associated with this filename is in cell "
  127. "{}.".format(filename, cell_index)
  128. )
  129. #In the resources, make the figure available via
  130. # resources['outputs']['filename'] = data
  131. resources['outputs'][filename] = data
  132. return cell, resources