excel.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. from __future__ import absolute_import
  2. # Copyright (c) 2010-2019 openpyxl
  3. """Read an xlsx file into Python"""
  4. # Python stdlib imports
  5. from zipfile import ZipFile, ZIP_DEFLATED, BadZipfile
  6. from sys import exc_info
  7. from io import BytesIO
  8. import os.path
  9. import warnings
  10. # compatibility imports
  11. from openpyxl.compat import unicode, file
  12. from openpyxl.pivot.table import TableDefinition
  13. # Allow blanket setting of KEEP_VBA for testing
  14. try:
  15. from ..tests import KEEP_VBA
  16. except ImportError:
  17. KEEP_VBA = False
  18. # package imports
  19. from openpyxl.utils.exceptions import InvalidFileException
  20. from openpyxl.xml.constants import (
  21. ARC_SHARED_STRINGS,
  22. ARC_CORE,
  23. ARC_CONTENT_TYPES,
  24. ARC_WORKBOOK,
  25. ARC_THEME,
  26. COMMENTS_NS,
  27. SHARED_STRINGS,
  28. EXTERNAL_LINK,
  29. XLTM,
  30. XLTX,
  31. XLSM,
  32. XLSX,
  33. )
  34. from openpyxl.cell import MergedCell
  35. from openpyxl.comments.comment_sheet import CommentSheet
  36. from .strings import read_string_table
  37. from .workbook import WorkbookParser
  38. from openpyxl.styles.stylesheet import apply_stylesheet
  39. from openpyxl.packaging.core import DocumentProperties
  40. from openpyxl.packaging.manifest import Manifest, Override
  41. from openpyxl.packaging.relationship import (
  42. RelationshipList,
  43. get_dependents,
  44. get_rels_path,
  45. )
  46. from openpyxl.worksheet._read_only import ReadOnlyWorksheet
  47. from openpyxl.worksheet._reader import WorksheetReader
  48. from openpyxl.chartsheet import Chartsheet
  49. from openpyxl.worksheet.table import Table
  50. from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
  51. from openpyxl.xml.functions import fromstring
  52. from .drawings import find_images
  53. SUPPORTED_FORMATS = ('.xlsx', '.xlsm', '.xltx', '.xltm')
  54. def _validate_archive(filename):
  55. """
  56. Does a first check whether filename is a string or a file-like
  57. object. If it is a string representing a filename, a check is done
  58. for supported formats by checking the given file-extension. If the
  59. file-extension is not in SUPPORTED_FORMATS an InvalidFileException
  60. will raised. Otherwise the filename (resp. file-like object) will
  61. forwarded to zipfile.ZipFile returning a ZipFile-Instance.
  62. """
  63. is_file_like = hasattr(filename, 'read')
  64. if not is_file_like:
  65. file_format = os.path.splitext(filename)[-1].lower()
  66. if file_format not in SUPPORTED_FORMATS:
  67. if file_format == '.xls':
  68. msg = ('openpyxl does not support the old .xls file format, '
  69. 'please use xlrd to read this file, or convert it to '
  70. 'the more recent .xlsx file format.')
  71. elif file_format == '.xlsb':
  72. msg = ('openpyxl does not support binary format .xlsb, '
  73. 'please convert this file to .xlsx format if you want '
  74. 'to open it with openpyxl')
  75. else:
  76. msg = ('openpyxl does not support %s file format, '
  77. 'please check you can open '
  78. 'it with Excel first. '
  79. 'Supported formats are: %s') % (file_format,
  80. ','.join(SUPPORTED_FORMATS))
  81. raise InvalidFileException(msg)
  82. archive = ZipFile(filename, 'r')
  83. return archive
  84. def _find_workbook_part(package):
  85. workbook_types = [XLTM, XLTX, XLSM, XLSX]
  86. for ct in workbook_types:
  87. part = package.find(ct)
  88. if part:
  89. return part
  90. # some applications reassign the default for application/xml
  91. defaults = set((p.ContentType for p in package.Default))
  92. workbook_type = defaults & set(workbook_types)
  93. if workbook_type:
  94. return Override("/" + ARC_WORKBOOK, workbook_type.pop())
  95. raise IOError("File contains no valid workbook part")
  96. class ExcelReader:
  97. """
  98. Read an Excel package and dispatch the contents to the relevant modules
  99. """
  100. def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
  101. data_only=False, keep_links=True):
  102. self.archive = _validate_archive(fn)
  103. self.valid_files = self.archive.namelist()
  104. self.read_only = read_only
  105. self.keep_vba = keep_vba
  106. self.data_only = data_only
  107. self.keep_links = keep_links
  108. self.shared_strings = []
  109. def read_manifest(self):
  110. src = self.archive.read(ARC_CONTENT_TYPES)
  111. root = fromstring(src)
  112. self.package = Manifest.from_tree(root)
  113. def read_strings(self):
  114. ct = self.package.find(SHARED_STRINGS)
  115. if ct is not None:
  116. strings_path = ct.PartName[1:]
  117. with self.archive.open(strings_path,) as src:
  118. self.shared_strings = read_string_table(src)
  119. def read_workbook(self):
  120. wb_part = _find_workbook_part(self.package)
  121. self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links)
  122. self.parser.parse()
  123. wb = self.parser.wb
  124. wb._sheets = []
  125. wb._data_only = self.data_only
  126. wb._read_only = self.read_only
  127. wb.template = wb_part.ContentType in (XLTX, XLTM)
  128. # If are going to preserve the vba then attach a copy of the archive to the
  129. # workbook so that is available for the save.
  130. if self.keep_vba:
  131. wb.vba_archive = ZipFile(BytesIO(), 'a', ZIP_DEFLATED)
  132. for name in self.valid_files:
  133. wb.vba_archive.writestr(name, self.archive.read(name))
  134. if self.read_only:
  135. wb._archive = self.archive
  136. self.wb = wb
  137. def read_properties(self):
  138. if ARC_CORE in self.valid_files:
  139. src = fromstring(self.archive.read(ARC_CORE))
  140. self.wb.properties = DocumentProperties.from_tree(src)
  141. def read_theme(self):
  142. if ARC_THEME in self.valid_files:
  143. self.wb.loaded_theme = self.archive.read(ARC_THEME)
  144. def read_chartsheet(self, sheet, rel):
  145. sheet_path = rel.target
  146. rels_path = get_rels_path(sheet_path)
  147. rels = []
  148. if rels_path in self.valid_files:
  149. rels = get_dependents(self.archive, rels_path)
  150. with self.archive.open(sheet_path, "r") as src:
  151. xml = src.read()
  152. node = fromstring(xml)
  153. cs = Chartsheet.from_tree(node)
  154. cs._parent = self.wb
  155. cs.title = sheet.name
  156. self.wb._add_sheet(cs)
  157. drawings = rels.find(SpreadsheetDrawing._rel_type)
  158. for rel in drawings:
  159. charts, images = find_images(self.archive, rel.target)
  160. for c in charts:
  161. cs.add_chart(c)
  162. def read_worksheets(self):
  163. comment_warning = """Cell '{0}':{1} is part of a merged range but has a comment which will be removed because merged cells cannot contain any data."""
  164. for sheet, rel in self.parser.find_sheets():
  165. if rel.target not in self.valid_files:
  166. continue
  167. if "chartsheet" in rel.Type:
  168. self.read_chartsheet(sheet, rel)
  169. continue
  170. rels_path = get_rels_path(rel.target)
  171. rels = RelationshipList()
  172. if rels_path in self.valid_files:
  173. rels = get_dependents(self.archive, rels_path)
  174. if self.read_only:
  175. ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target, self.shared_strings)
  176. self.wb._sheets.append(ws)
  177. continue
  178. else:
  179. fh = self.archive.open(rel.target)
  180. ws = self.wb.create_sheet(sheet.name)
  181. ws._rels = rels
  182. ws_parser = WorksheetReader(ws, fh, self.shared_strings, self.data_only)
  183. ws_parser.bind_all()
  184. # assign any comments to cells
  185. for r in rels.find(COMMENTS_NS):
  186. src = self.archive.read(r.target)
  187. comment_sheet = CommentSheet.from_tree(fromstring(src))
  188. for ref, comment in comment_sheet.comments:
  189. try:
  190. ws[ref].comment = comment
  191. except AttributeError:
  192. c = ws[ref]
  193. if isinstance(c, MergedCell):
  194. warnings.warn(comment_warning.format(ws.title, c.coordinate))
  195. continue
  196. # preserve link to VML file if VBA
  197. if self.wb.vba_archive and ws.legacy_drawing:
  198. ws.legacy_drawing = rels[ws.legacy_drawing].target
  199. for t in ws_parser.tables:
  200. src = self.archive.read(t)
  201. xml = fromstring(src)
  202. table = Table.from_tree(xml)
  203. ws.add_table(table)
  204. drawings = rels.find(SpreadsheetDrawing._rel_type)
  205. for rel in drawings:
  206. charts, images = find_images(self.archive, rel.target)
  207. for c in charts:
  208. ws.add_chart(c, c.anchor)
  209. for im in images:
  210. ws.add_image(im, im.anchor)
  211. pivot_rel = rels.find(TableDefinition.rel_type)
  212. for r in pivot_rel:
  213. pivot_path = r.Target
  214. src = self.archive.read(pivot_path)
  215. tree = fromstring(src)
  216. pivot = TableDefinition.from_tree(tree)
  217. pivot.cache = self.parser.pivot_caches[pivot.cacheId]
  218. ws.add_pivot(pivot)
  219. ws.sheet_state = sheet.state
  220. def read(self):
  221. self.read_manifest()
  222. self.read_strings()
  223. self.read_workbook()
  224. self.read_properties()
  225. self.read_theme()
  226. apply_stylesheet(self.archive, self.wb)
  227. self.read_worksheets()
  228. self.parser.assign_names()
  229. if not self.read_only:
  230. self.archive.close()
  231. def load_workbook(filename, read_only=False, keep_vba=KEEP_VBA,
  232. data_only=False, keep_links=True):
  233. """Open the given filename and return the workbook
  234. :param filename: the path to open or a file-like object
  235. :type filename: string or a file-like object open in binary mode c.f., :class:`zipfile.ZipFile`
  236. :param read_only: optimised for reading, content cannot be edited
  237. :type read_only: bool
  238. :param keep_vba: preseve vba content (this does NOT mean you can use it)
  239. :type keep_vba: bool
  240. :param guess_types: guess cell content type and do not read it from the file
  241. :type guess_types: bool
  242. :param data_only: controls whether cells with formulae have either the formula (default) or the value stored the last time Excel read the sheet
  243. :type data_only: bool
  244. :param keep_links: whether links to external workbooks should be preserved. The default is True
  245. :type keep_links: bool
  246. :rtype: :class:`openpyxl.workbook.Workbook`
  247. .. note::
  248. When using lazy load, all worksheets will be :class:`openpyxl.worksheet.iter_worksheet.IterableWorksheet`
  249. and the returned workbook will be read-only.
  250. """
  251. reader = ExcelReader(filename, read_only, keep_vba,
  252. data_only, keep_links)
  253. reader.read()
  254. return reader.wb