book.py 56 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420
  1. # Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd
  2. # This module is part of the xlrd package, which is released under a
  3. # BSD-style licence.
  4. from __future__ import print_function
  5. from .timemachine import *
  6. from .biffh import *
  7. import struct; unpack = struct.unpack
  8. import sys
  9. import time
  10. from . import sheet
  11. from . import compdoc
  12. from .formula import *
  13. from . import formatting
  14. if sys.version.startswith("IronPython"):
  15. # print >> sys.stderr, "...importing encodings"
  16. import encodings
  17. empty_cell = sheet.empty_cell # for exposure to the world ...
  18. DEBUG = 0
  19. USE_FANCY_CD = 1
  20. TOGGLE_GC = 0
  21. import gc
  22. # gc.set_debug(gc.DEBUG_STATS)
  23. try:
  24. import mmap
  25. MMAP_AVAILABLE = 1
  26. except ImportError:
  27. MMAP_AVAILABLE = 0
  28. USE_MMAP = MMAP_AVAILABLE
  29. MY_EOF = 0xF00BAAA # not a 16-bit number
  30. SUPBOOK_UNK, SUPBOOK_INTERNAL, SUPBOOK_EXTERNAL, SUPBOOK_ADDIN, SUPBOOK_DDEOLE = range(5)
  31. SUPPORTED_VERSIONS = (80, 70, 50, 45, 40, 30, 21, 20)
  32. _code_from_builtin_name = {
  33. "Consolidate_Area": "\x00",
  34. "Auto_Open": "\x01",
  35. "Auto_Close": "\x02",
  36. "Extract": "\x03",
  37. "Database": "\x04",
  38. "Criteria": "\x05",
  39. "Print_Area": "\x06",
  40. "Print_Titles": "\x07",
  41. "Recorder": "\x08",
  42. "Data_Form": "\x09",
  43. "Auto_Activate": "\x0A",
  44. "Auto_Deactivate": "\x0B",
  45. "Sheet_Title": "\x0C",
  46. "_FilterDatabase": "\x0D",
  47. }
  48. builtin_name_from_code = {}
  49. code_from_builtin_name = {}
  50. for _bin, _bic in _code_from_builtin_name.items():
  51. _bin = UNICODE_LITERAL(_bin)
  52. _bic = UNICODE_LITERAL(_bic)
  53. code_from_builtin_name[_bin] = _bic
  54. builtin_name_from_code[_bic] = _bin
  55. del _bin, _bic, _code_from_builtin_name
  56. def open_workbook_xls(filename=None,
  57. logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP,
  58. file_contents=None,
  59. encoding_override=None,
  60. formatting_info=False, on_demand=False, ragged_rows=False,
  61. ):
  62. t0 = time.clock()
  63. if TOGGLE_GC:
  64. orig_gc_enabled = gc.isenabled()
  65. if orig_gc_enabled:
  66. gc.disable()
  67. bk = Book()
  68. try:
  69. bk.biff2_8_load(
  70. filename=filename, file_contents=file_contents,
  71. logfile=logfile, verbosity=verbosity, use_mmap=use_mmap,
  72. encoding_override=encoding_override,
  73. formatting_info=formatting_info,
  74. on_demand=on_demand,
  75. ragged_rows=ragged_rows,
  76. )
  77. t1 = time.clock()
  78. bk.load_time_stage_1 = t1 - t0
  79. biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
  80. if not biff_version:
  81. raise XLRDError("Can't determine file's BIFF version")
  82. if biff_version not in SUPPORTED_VERSIONS:
  83. raise XLRDError(
  84. "BIFF version %s is not supported"
  85. % biff_text_from_num[biff_version]
  86. )
  87. bk.biff_version = biff_version
  88. if biff_version <= 40:
  89. # no workbook globals, only 1 worksheet
  90. if on_demand:
  91. fprintf(bk.logfile,
  92. "*** WARNING: on_demand is not supported for this Excel version.\n"
  93. "*** Setting on_demand to False.\n")
  94. bk.on_demand = on_demand = False
  95. bk.fake_globals_get_sheet()
  96. elif biff_version == 45:
  97. # worksheet(s) embedded in global stream
  98. bk.parse_globals()
  99. if on_demand:
  100. fprintf(bk.logfile, "*** WARNING: on_demand is not supported for this Excel version.\n"
  101. "*** Setting on_demand to False.\n")
  102. bk.on_demand = on_demand = False
  103. else:
  104. bk.parse_globals()
  105. bk._sheet_list = [None for sh in bk._sheet_names]
  106. if not on_demand:
  107. bk.get_sheets()
  108. bk.nsheets = len(bk._sheet_list)
  109. if biff_version == 45 and bk.nsheets > 1:
  110. fprintf(bk.logfile,
  111. "*** WARNING: Excel 4.0 workbook (.XLW) file contains %d worksheets.\n"
  112. "*** Book-level data will be that of the last worksheet.\n",
  113. bk.nsheets
  114. )
  115. if TOGGLE_GC:
  116. if orig_gc_enabled:
  117. gc.enable()
  118. t2 = time.clock()
  119. bk.load_time_stage_2 = t2 - t1
  120. except:
  121. bk.release_resources()
  122. raise
  123. # normal exit
  124. if not on_demand:
  125. bk.release_resources()
  126. return bk
  127. ##
  128. # For debugging: dump the file's BIFF records in char & hex.
  129. # @param filename The path to the file to be dumped.
  130. # @param outfile An open file, to which the dump is written.
  131. # @param unnumbered If true, omit offsets (for meaningful diffs).
  132. def dump(filename, outfile=sys.stdout, unnumbered=False):
  133. bk = Book()
  134. bk.biff2_8_load(filename=filename, logfile=outfile, )
  135. biff_dump(bk.mem, bk.base, bk.stream_len, 0, outfile, unnumbered)
  136. ##
  137. # For debugging and analysis: summarise the file's BIFF records.
  138. # I.e. produce a sorted file of (record_name, count).
  139. # @param filename The path to the file to be summarised.
  140. # @param outfile An open file, to which the summary is written.
  141. def count_records(filename, outfile=sys.stdout):
  142. bk = Book()
  143. bk.biff2_8_load(filename=filename, logfile=outfile, )
  144. biff_count_records(bk.mem, bk.base, bk.stream_len, outfile)
  145. ##
  146. # Information relating to a named reference, formula, macro, etc.
  147. # <br /> -- New in version 0.6.0
  148. # <br /> -- <i>Name information is <b>not</b> extracted from files older than
  149. # Excel 5.0 (Book.biff_version < 50)</i>
  150. class Name(BaseObject):
  151. _repr_these = ['stack']
  152. book = None # parent
  153. ##
  154. # 0 = Visible; 1 = Hidden
  155. hidden = 0
  156. ##
  157. # 0 = Command macro; 1 = Function macro. Relevant only if macro == 1
  158. func = 0
  159. ##
  160. # 0 = Sheet macro; 1 = VisualBasic macro. Relevant only if macro == 1
  161. vbasic = 0
  162. ##
  163. # 0 = Standard name; 1 = Macro name
  164. macro = 0
  165. ##
  166. # 0 = Simple formula; 1 = Complex formula (array formula or user defined)<br />
  167. # <i>No examples have been sighted.</i>
  168. complex = 0
  169. ##
  170. # 0 = User-defined name; 1 = Built-in name
  171. # (common examples: Print_Area, Print_Titles; see OOo docs for full list)
  172. builtin = 0
  173. ##
  174. # Function group. Relevant only if macro == 1; see OOo docs for values.
  175. funcgroup = 0
  176. ##
  177. # 0 = Formula definition; 1 = Binary data<br /> <i>No examples have been sighted.</i>
  178. binary = 0
  179. ##
  180. # The index of this object in book.name_obj_list
  181. name_index = 0
  182. ##
  183. # A Unicode string. If builtin, decoded as per OOo docs.
  184. name = UNICODE_LITERAL("")
  185. ##
  186. # An 8-bit string.
  187. raw_formula = b''
  188. ##
  189. # -1: The name is global (visible in all calculation sheets).<br />
  190. # -2: The name belongs to a macro sheet or VBA sheet.<br />
  191. # -3: The name is invalid.<br />
  192. # 0 <= scope < book.nsheets: The name is local to the sheet whose index is scope.
  193. scope = -1
  194. ##
  195. # The result of evaluating the formula, if any.
  196. # If no formula, or evaluation of the formula encountered problems,
  197. # the result is None. Otherwise the result is a single instance of the
  198. # Operand class.
  199. #
  200. result = None
  201. ##
  202. # This is a convenience method for the frequent use case where the name
  203. # refers to a single cell.
  204. # @return An instance of the Cell class.
  205. # @throws XLRDError The name is not a constant absolute reference
  206. # to a single cell.
  207. def cell(self):
  208. res = self.result
  209. if res:
  210. # result should be an instance of the Operand class
  211. kind = res.kind
  212. value = res.value
  213. if kind == oREF and len(value) == 1:
  214. ref3d = value[0]
  215. if (0 <= ref3d.shtxlo == ref3d.shtxhi - 1
  216. and ref3d.rowxlo == ref3d.rowxhi - 1
  217. and ref3d.colxlo == ref3d.colxhi - 1):
  218. sh = self.book.sheet_by_index(ref3d.shtxlo)
  219. return sh.cell(ref3d.rowxlo, ref3d.colxlo)
  220. self.dump(self.book.logfile,
  221. header="=== Dump of Name object ===",
  222. footer="======= End of dump =======",
  223. )
  224. raise XLRDError("Not a constant absolute reference to a single cell")
  225. ##
  226. # This is a convenience method for the use case where the name
  227. # refers to one rectangular area in one worksheet.
  228. # @param clipped If true (the default), the returned rectangle is clipped
  229. # to fit in (0, sheet.nrows, 0, sheet.ncols) -- it is guaranteed that
  230. # 0 <= rowxlo <= rowxhi <= sheet.nrows and that the number of usable rows
  231. # in the area (which may be zero) is rowxhi - rowxlo; likewise for columns.
  232. # @return a tuple (sheet_object, rowxlo, rowxhi, colxlo, colxhi).
  233. # @throws XLRDError The name is not a constant absolute reference
  234. # to a single area in a single sheet.
  235. def area2d(self, clipped=True):
  236. res = self.result
  237. if res:
  238. # result should be an instance of the Operand class
  239. kind = res.kind
  240. value = res.value
  241. if kind == oREF and len(value) == 1: # only 1 reference
  242. ref3d = value[0]
  243. if 0 <= ref3d.shtxlo == ref3d.shtxhi - 1: # only 1 usable sheet
  244. sh = self.book.sheet_by_index(ref3d.shtxlo)
  245. if not clipped:
  246. return sh, ref3d.rowxlo, ref3d.rowxhi, ref3d.colxlo, ref3d.colxhi
  247. rowxlo = min(ref3d.rowxlo, sh.nrows)
  248. rowxhi = max(rowxlo, min(ref3d.rowxhi, sh.nrows))
  249. colxlo = min(ref3d.colxlo, sh.ncols)
  250. colxhi = max(colxlo, min(ref3d.colxhi, sh.ncols))
  251. assert 0 <= rowxlo <= rowxhi <= sh.nrows
  252. assert 0 <= colxlo <= colxhi <= sh.ncols
  253. return sh, rowxlo, rowxhi, colxlo, colxhi
  254. self.dump(self.book.logfile,
  255. header="=== Dump of Name object ===",
  256. footer="======= End of dump =======",
  257. )
  258. raise XLRDError("Not a constant absolute reference to a single area in a single sheet")
  259. ##
  260. # Contents of a "workbook".
  261. # <p>WARNING: You don't call this class yourself. You use the Book object that
  262. # was returned when you called xlrd.open_workbook("myfile.xls").</p>
  263. class Book(BaseObject):
  264. ##
  265. # The number of worksheets present in the workbook file.
  266. # This information is available even when no sheets have yet been loaded.
  267. nsheets = 0
  268. ##
  269. # Which date system was in force when this file was last saved.<br />
  270. # 0 => 1900 system (the Excel for Windows default).<br />
  271. # 1 => 1904 system (the Excel for Macintosh default).<br />
  272. datemode = 0 # In case it's not specified in the file.
  273. ##
  274. # Version of BIFF (Binary Interchange File Format) used to create the file.
  275. # Latest is 8.0 (represented here as 80), introduced with Excel 97.
  276. # Earliest supported by this module: 2.0 (represented as 20).
  277. biff_version = 0
  278. ##
  279. # List containing a Name object for each NAME record in the workbook.
  280. # <br /> -- New in version 0.6.0
  281. name_obj_list = []
  282. ##
  283. # An integer denoting the character set used for strings in this file.
  284. # For BIFF 8 and later, this will be 1200, meaning Unicode; more precisely, UTF_16_LE.
  285. # For earlier versions, this is used to derive the appropriate Python encoding
  286. # to be used to convert to Unicode.
  287. # Examples: 1252 -> 'cp1252', 10000 -> 'mac_roman'
  288. codepage = None
  289. ##
  290. # The encoding that was derived from the codepage.
  291. encoding = None
  292. ##
  293. # A tuple containing the (telephone system) country code for:<br />
  294. # [0]: the user-interface setting when the file was created.<br />
  295. # [1]: the regional settings.<br />
  296. # Example: (1, 61) meaning (USA, Australia).
  297. # This information may give a clue to the correct encoding for an unknown codepage.
  298. # For a long list of observed values, refer to the OpenOffice.org documentation for
  299. # the COUNTRY record.
  300. countries = (0, 0)
  301. ##
  302. # What (if anything) is recorded as the name of the last user to save the file.
  303. user_name = UNICODE_LITERAL('')
  304. ##
  305. # A list of Font class instances, each corresponding to a FONT record.
  306. # <br /> -- New in version 0.6.1
  307. font_list = []
  308. ##
  309. # A list of XF class instances, each corresponding to an XF record.
  310. # <br /> -- New in version 0.6.1
  311. xf_list = []
  312. ##
  313. # A list of Format objects, each corresponding to a FORMAT record, in
  314. # the order that they appear in the input file.
  315. # It does <i>not</i> contain builtin formats.
  316. # If you are creating an output file using (for example) pyExcelerator,
  317. # use this list.
  318. # The collection to be used for all visual rendering purposes is format_map.
  319. # <br /> -- New in version 0.6.1
  320. format_list = []
  321. ##
  322. # The mapping from XF.format_key to Format object.
  323. # <br /> -- New in version 0.6.1
  324. format_map = {}
  325. ##
  326. # This provides access via name to the extended format information for
  327. # both built-in styles and user-defined styles.<br />
  328. # It maps <i>name</i> to (<i>built_in</i>, <i>xf_index</i>), where:<br />
  329. # <i>name</i> is either the name of a user-defined style,
  330. # or the name of one of the built-in styles. Known built-in names are
  331. # Normal, RowLevel_1 to RowLevel_7,
  332. # ColLevel_1 to ColLevel_7, Comma, Currency, Percent, "Comma [0]",
  333. # "Currency [0]", Hyperlink, and "Followed Hyperlink".<br />
  334. # <i>built_in</i> 1 = built-in style, 0 = user-defined<br />
  335. # <i>xf_index</i> is an index into Book.xf_list.<br />
  336. # References: OOo docs s6.99 (STYLE record); Excel UI Format/Style
  337. # <br /> -- New in version 0.6.1; since 0.7.4, extracted only if
  338. # open_workbook(..., formatting_info=True)
  339. style_name_map = {}
  340. ##
  341. # This provides definitions for colour indexes. Please refer to the
  342. # above section "The Palette; Colour Indexes" for an explanation
  343. # of how colours are represented in Excel.<br />
  344. # Colour indexes into the palette map into (red, green, blue) tuples.
  345. # "Magic" indexes e.g. 0x7FFF map to None.
  346. # <i>colour_map</i> is what you need if you want to render cells on screen or in a PDF
  347. # file. If you are writing an output XLS file, use <i>palette_record</i>.
  348. # <br /> -- New in version 0.6.1. Extracted only if open_workbook(..., formatting_info=True)
  349. colour_map = {}
  350. ##
  351. # If the user has changed any of the colours in the standard palette, the XLS
  352. # file will contain a PALETTE record with 56 (16 for Excel 4.0 and earlier)
  353. # RGB values in it, and this list will be e.g. [(r0, b0, g0), ..., (r55, b55, g55)].
  354. # Otherwise this list will be empty. This is what you need if you are
  355. # writing an output XLS file. If you want to render cells on screen or in a PDF
  356. # file, use colour_map.
  357. # <br /> -- New in version 0.6.1. Extracted only if open_workbook(..., formatting_info=True)
  358. palette_record = []
  359. ##
  360. # Time in seconds to extract the XLS image as a contiguous string (or mmap equivalent).
  361. load_time_stage_1 = -1.0
  362. ##
  363. # Time in seconds to parse the data from the contiguous string (or mmap equivalent).
  364. load_time_stage_2 = -1.0
  365. ##
  366. # @return A list of all sheets in the book.
  367. # All sheets not already loaded will be loaded.
  368. def sheets(self):
  369. for sheetx in xrange(self.nsheets):
  370. if not self._sheet_list[sheetx]:
  371. self.get_sheet(sheetx)
  372. return self._sheet_list[:]
  373. ##
  374. # @param sheetx Sheet index in range(nsheets)
  375. # @return An object of the Sheet class
  376. def sheet_by_index(self, sheetx):
  377. return self._sheet_list[sheetx] or self.get_sheet(sheetx)
  378. ##
  379. # @param sheet_name Name of sheet required
  380. # @return An object of the Sheet class
  381. def sheet_by_name(self, sheet_name):
  382. try:
  383. sheetx = self._sheet_names.index(sheet_name)
  384. except ValueError:
  385. raise XLRDError('No sheet named <%r>' % sheet_name)
  386. return self.sheet_by_index(sheetx)
  387. ##
  388. # @return A list of the names of all the worksheets in the workbook file.
  389. # This information is available even when no sheets have yet been loaded.
  390. def sheet_names(self):
  391. return self._sheet_names[:]
  392. ##
  393. # @param sheet_name_or_index Name or index of sheet enquired upon
  394. # @return true if sheet is loaded, false otherwise
  395. # <br /> -- New in version 0.7.1
  396. def sheet_loaded(self, sheet_name_or_index):
  397. if isinstance(sheet_name_or_index, int):
  398. sheetx = sheet_name_or_index
  399. else:
  400. try:
  401. sheetx = self._sheet_names.index(sheet_name_or_index)
  402. except ValueError:
  403. raise XLRDError('No sheet named <%r>' % sheet_name_or_index)
  404. return bool(self._sheet_list[sheetx])
  405. ##
  406. # @param sheet_name_or_index Name or index of sheet to be unloaded.
  407. # <br /> -- New in version 0.7.1
  408. def unload_sheet(self, sheet_name_or_index):
  409. if isinstance(sheet_name_or_index, int):
  410. sheetx = sheet_name_or_index
  411. else:
  412. try:
  413. sheetx = self._sheet_names.index(sheet_name_or_index)
  414. except ValueError:
  415. raise XLRDError('No sheet named <%r>' % sheet_name_or_index)
  416. self._sheet_list[sheetx] = None
  417. ##
  418. # This method has a dual purpose. You can call it to release
  419. # memory-consuming objects and (possibly) a memory-mapped file
  420. # (mmap.mmap object) when you have finished loading sheets in
  421. # on_demand mode, but still require the Book object to examine the
  422. # loaded sheets. It is also called automatically (a) when open_workbook
  423. # raises an exception and (b) if you are using a "with" statement, when
  424. # the "with" block is exited. Calling this method multiple times on the
  425. # same object has no ill effect.
  426. def release_resources(self):
  427. self._resources_released = 1
  428. if hasattr(self.mem, "close"):
  429. # must be a mmap.mmap object
  430. self.mem.close()
  431. self.mem = None
  432. if hasattr(self.filestr, "close"):
  433. self.filestr.close()
  434. self.filestr = None
  435. self._sharedstrings = None
  436. self._rich_text_runlist_map = None
  437. def __enter__(self):
  438. return self
  439. def __exit__(self, exc_type, exc_value, exc_tb):
  440. self.release_resources()
  441. # return false
  442. ##
  443. # A mapping from (lower_case_name, scope) to a single Name object.
  444. # <br /> -- New in version 0.6.0
  445. name_and_scope_map = {}
  446. ##
  447. # A mapping from lower_case_name to a list of Name objects. The list is
  448. # sorted in scope order. Typically there will be one item (of global scope)
  449. # in the list.
  450. # <br /> -- New in version 0.6.0
  451. name_map = {}
  452. def __init__(self):
  453. self._sheet_list = []
  454. self._sheet_names = []
  455. self._sheet_visibility = [] # from BOUNDSHEET record
  456. self.nsheets = 0
  457. self._sh_abs_posn = [] # sheet's absolute position in the stream
  458. self._sharedstrings = []
  459. self._rich_text_runlist_map = {}
  460. self.raw_user_name = False
  461. self._sheethdr_count = 0 # BIFF 4W only
  462. self.builtinfmtcount = -1 # unknown as yet. BIFF 3, 4S, 4W
  463. self.initialise_format_info()
  464. self._all_sheets_count = 0 # includes macro & VBA sheets
  465. self._supbook_count = 0
  466. self._supbook_locals_inx = None
  467. self._supbook_addins_inx = None
  468. self._all_sheets_map = [] # maps an all_sheets index to a calc-sheets index (or -1)
  469. self._externsheet_info = []
  470. self._externsheet_type_b57 = []
  471. self._extnsht_name_from_num = {}
  472. self._sheet_num_from_name = {}
  473. self._extnsht_count = 0
  474. self._supbook_types = []
  475. self._resources_released = 0
  476. self.addin_func_names = []
  477. self.name_obj_list = []
  478. self.colour_map = {}
  479. self.palette_record = []
  480. self.xf_list = []
  481. self.style_name_map = {}
  482. self.mem = b''
  483. self.filestr = b''
  484. def biff2_8_load(self, filename=None, file_contents=None,
  485. logfile=sys.stdout, verbosity=0, use_mmap=USE_MMAP,
  486. encoding_override=None,
  487. formatting_info=False,
  488. on_demand=False,
  489. ragged_rows=False,
  490. ):
  491. # DEBUG = 0
  492. self.logfile = logfile
  493. self.verbosity = verbosity
  494. self.use_mmap = use_mmap and MMAP_AVAILABLE
  495. self.encoding_override = encoding_override
  496. self.formatting_info = formatting_info
  497. self.on_demand = on_demand
  498. self.ragged_rows = ragged_rows
  499. if not file_contents:
  500. with open(filename, "rb") as f:
  501. f.seek(0, 2) # EOF
  502. size = f.tell()
  503. f.seek(0, 0) # BOF
  504. if size == 0:
  505. raise XLRDError("File size is 0 bytes")
  506. if self.use_mmap:
  507. self.filestr = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ)
  508. self.stream_len = size
  509. else:
  510. self.filestr = f.read()
  511. self.stream_len = len(self.filestr)
  512. else:
  513. self.filestr = file_contents
  514. self.stream_len = len(file_contents)
  515. self.base = 0
  516. if self.filestr[:8] != compdoc.SIGNATURE:
  517. # got this one at the antique store
  518. self.mem = self.filestr
  519. else:
  520. cd = compdoc.CompDoc(self.filestr, logfile=self.logfile)
  521. if USE_FANCY_CD:
  522. for qname in ['Workbook', 'Book']:
  523. self.mem, self.base, self.stream_len = \
  524. cd.locate_named_stream(UNICODE_LITERAL(qname))
  525. if self.mem: break
  526. else:
  527. raise XLRDError("Can't find workbook in OLE2 compound document")
  528. else:
  529. for qname in ['Workbook', 'Book']:
  530. self.mem = cd.get_named_stream(UNICODE_LITERAL(qname))
  531. if self.mem: break
  532. else:
  533. raise XLRDError("Can't find workbook in OLE2 compound document")
  534. self.stream_len = len(self.mem)
  535. del cd
  536. if self.mem is not self.filestr:
  537. if hasattr(self.filestr, "close"):
  538. self.filestr.close()
  539. self.filestr = b''
  540. self._position = self.base
  541. if DEBUG:
  542. print("mem: %s, base: %d, len: %d" % (type(self.mem), self.base, self.stream_len), file=self.logfile)
  543. def initialise_format_info(self):
  544. # needs to be done once per sheet for BIFF 4W :-(
  545. self.format_map = {}
  546. self.format_list = []
  547. self.xfcount = 0
  548. self.actualfmtcount = 0 # number of FORMAT records seen so far
  549. self._xf_index_to_xl_type_map = {0: XL_CELL_NUMBER}
  550. self._xf_epilogue_done = 0
  551. self.xf_list = []
  552. self.font_list = []
  553. def get2bytes(self):
  554. pos = self._position
  555. buff_two = self.mem[pos:pos+2]
  556. lenbuff = len(buff_two)
  557. self._position += lenbuff
  558. if lenbuff < 2:
  559. return MY_EOF
  560. lo, hi = buff_two
  561. return (BYTES_ORD(hi) << 8) | BYTES_ORD(lo)
  562. def get_record_parts(self):
  563. pos = self._position
  564. mem = self.mem
  565. code, length = unpack('<HH', mem[pos:pos+4])
  566. pos += 4
  567. data = mem[pos:pos+length]
  568. self._position = pos + length
  569. return (code, length, data)
  570. def get_record_parts_conditional(self, reqd_record):
  571. pos = self._position
  572. mem = self.mem
  573. code, length = unpack('<HH', mem[pos:pos+4])
  574. if code != reqd_record:
  575. return (None, 0, b'')
  576. pos += 4
  577. data = mem[pos:pos+length]
  578. self._position = pos + length
  579. return (code, length, data)
  580. def get_sheet(self, sh_number, update_pos=True):
  581. if self._resources_released:
  582. raise XLRDError("Can't load sheets after releasing resources.")
  583. if update_pos:
  584. self._position = self._sh_abs_posn[sh_number]
  585. _unused_biff_version = self.getbof(XL_WORKSHEET)
  586. # assert biff_version == self.biff_version ### FAILS
  587. # Have an example where book is v7 but sheet reports v8!!!
  588. # It appears to work OK if the sheet version is ignored.
  589. # Confirmed by Daniel Rentz: happens when Excel does "save as"
  590. # creating an old version file; ignore version details on sheet BOF.
  591. sh = sheet.Sheet(self,
  592. self._position,
  593. self._sheet_names[sh_number],
  594. sh_number,
  595. )
  596. sh.read(self)
  597. self._sheet_list[sh_number] = sh
  598. return sh
  599. def get_sheets(self):
  600. # DEBUG = 0
  601. if DEBUG: print("GET_SHEETS:", self._sheet_names, self._sh_abs_posn, file=self.logfile)
  602. for sheetno in xrange(len(self._sheet_names)):
  603. if DEBUG: print("GET_SHEETS: sheetno =", sheetno, self._sheet_names, self._sh_abs_posn, file=self.logfile)
  604. self.get_sheet(sheetno)
  605. def fake_globals_get_sheet(self): # for BIFF 4.0 and earlier
  606. formatting.initialise_book(self)
  607. fake_sheet_name = UNICODE_LITERAL('Sheet 1')
  608. self._sheet_names = [fake_sheet_name]
  609. self._sh_abs_posn = [0]
  610. self._sheet_visibility = [0] # one sheet, visible
  611. self._sheet_list.append(None) # get_sheet updates _sheet_list but needs a None beforehand
  612. self.get_sheets()
  613. def handle_boundsheet(self, data):
  614. # DEBUG = 1
  615. bv = self.biff_version
  616. self.derive_encoding()
  617. if DEBUG:
  618. fprintf(self.logfile, "BOUNDSHEET: bv=%d data %r\n", bv, data);
  619. if bv == 45: # BIFF4W
  620. #### Not documented in OOo docs ...
  621. # In fact, the *only* data is the name of the sheet.
  622. sheet_name = unpack_string(data, 0, self.encoding, lenlen=1)
  623. visibility = 0
  624. sheet_type = XL_BOUNDSHEET_WORKSHEET # guess, patch later
  625. if len(self._sh_abs_posn) == 0:
  626. abs_posn = self._sheetsoffset + self.base
  627. # Note (a) this won't be used
  628. # (b) it's the position of the SHEETHDR record
  629. # (c) add 11 to get to the worksheet BOF record
  630. else:
  631. abs_posn = -1 # unknown
  632. else:
  633. offset, visibility, sheet_type = unpack('<iBB', data[0:6])
  634. abs_posn = offset + self.base # because global BOF is always at posn 0 in the stream
  635. if bv < BIFF_FIRST_UNICODE:
  636. sheet_name = unpack_string(data, 6, self.encoding, lenlen=1)
  637. else:
  638. sheet_name = unpack_unicode(data, 6, lenlen=1)
  639. if DEBUG or self.verbosity >= 2:
  640. fprintf(self.logfile,
  641. "BOUNDSHEET: inx=%d vis=%r sheet_name=%r abs_posn=%d sheet_type=0x%02x\n",
  642. self._all_sheets_count, visibility, sheet_name, abs_posn, sheet_type)
  643. self._all_sheets_count += 1
  644. if sheet_type != XL_BOUNDSHEET_WORKSHEET:
  645. self._all_sheets_map.append(-1)
  646. descr = {
  647. 1: 'Macro sheet',
  648. 2: 'Chart',
  649. 6: 'Visual Basic module',
  650. }.get(sheet_type, 'UNKNOWN')
  651. if DEBUG or self.verbosity >= 1:
  652. fprintf(self.logfile,
  653. "NOTE *** Ignoring non-worksheet data named %r (type 0x%02x = %s)\n",
  654. sheet_name, sheet_type, descr)
  655. else:
  656. snum = len(self._sheet_names)
  657. self._all_sheets_map.append(snum)
  658. self._sheet_names.append(sheet_name)
  659. self._sh_abs_posn.append(abs_posn)
  660. self._sheet_visibility.append(visibility)
  661. self._sheet_num_from_name[sheet_name] = snum
  662. def handle_builtinfmtcount(self, data):
  663. ### N.B. This count appears to be utterly useless.
  664. # DEBUG = 1
  665. builtinfmtcount = unpack('<H', data[0:2])[0]
  666. if DEBUG: fprintf(self.logfile, "BUILTINFMTCOUNT: %r\n", builtinfmtcount)
  667. self.builtinfmtcount = builtinfmtcount
  668. def derive_encoding(self):
  669. if self.encoding_override:
  670. self.encoding = self.encoding_override
  671. elif self.codepage is None:
  672. if self.biff_version < 80:
  673. fprintf(self.logfile,
  674. "*** No CODEPAGE record, no encoding_override: will use 'ascii'\n")
  675. self.encoding = 'ascii'
  676. else:
  677. self.codepage = 1200 # utf16le
  678. if self.verbosity >= 2:
  679. fprintf(self.logfile, "*** No CODEPAGE record; assuming 1200 (utf_16_le)\n")
  680. else:
  681. codepage = self.codepage
  682. if codepage in encoding_from_codepage:
  683. encoding = encoding_from_codepage[codepage]
  684. elif 300 <= codepage <= 1999:
  685. encoding = 'cp' + str(codepage)
  686. else:
  687. encoding = 'unknown_codepage_' + str(codepage)
  688. if DEBUG or (self.verbosity and encoding != self.encoding) :
  689. fprintf(self.logfile, "CODEPAGE: codepage %r -> encoding %r\n", codepage, encoding)
  690. self.encoding = encoding
  691. if self.codepage != 1200: # utf_16_le
  692. # If we don't have a codec that can decode ASCII into Unicode,
  693. # we're well & truly stuffed -- let the punter know ASAP.
  694. try:
  695. _unused = unicode(b'trial', self.encoding)
  696. except BaseException as e:
  697. fprintf(self.logfile,
  698. "ERROR *** codepage %r -> encoding %r -> %s: %s\n",
  699. self.codepage, self.encoding, type(e).__name__.split(".")[-1], e)
  700. raise
  701. if self.raw_user_name:
  702. strg = unpack_string(self.user_name, 0, self.encoding, lenlen=1)
  703. strg = strg.rstrip()
  704. # if DEBUG:
  705. # print "CODEPAGE: user name decoded from %r to %r" % (self.user_name, strg)
  706. self.user_name = strg
  707. self.raw_user_name = False
  708. return self.encoding
  709. def handle_codepage(self, data):
  710. # DEBUG = 0
  711. codepage = unpack('<H', data[0:2])[0]
  712. self.codepage = codepage
  713. self.derive_encoding()
  714. def handle_country(self, data):
  715. countries = unpack('<HH', data[0:4])
  716. if self.verbosity: print("Countries:", countries, file=self.logfile)
  717. # Note: in BIFF7 and earlier, country record was put (redundantly?) in each worksheet.
  718. assert self.countries == (0, 0) or self.countries == countries
  719. self.countries = countries
  720. def handle_datemode(self, data):
  721. datemode = unpack('<H', data[0:2])[0]
  722. if DEBUG or self.verbosity:
  723. fprintf(self.logfile, "DATEMODE: datemode %r\n", datemode)
  724. assert datemode in (0, 1)
  725. self.datemode = datemode
  726. def handle_externname(self, data):
  727. blah = DEBUG or self.verbosity >= 2
  728. if self.biff_version >= 80:
  729. option_flags, other_info =unpack("<HI", data[:6])
  730. pos = 6
  731. name, pos = unpack_unicode_update_pos(data, pos, lenlen=1)
  732. extra = data[pos:]
  733. if self._supbook_types[-1] == SUPBOOK_ADDIN:
  734. self.addin_func_names.append(name)
  735. if blah:
  736. fprintf(self.logfile,
  737. "EXTERNNAME: sbktype=%d oflags=0x%04x oinfo=0x%08x name=%r extra=%r\n",
  738. self._supbook_types[-1], option_flags, other_info, name, extra)
  739. def handle_externsheet(self, data):
  740. self.derive_encoding() # in case CODEPAGE record missing/out of order/wrong
  741. self._extnsht_count += 1 # for use as a 1-based index
  742. blah1 = DEBUG or self.verbosity >= 1
  743. blah2 = DEBUG or self.verbosity >= 2
  744. if self.biff_version >= 80:
  745. num_refs = unpack("<H", data[0:2])[0]
  746. bytes_reqd = num_refs * 6 + 2
  747. while len(data) < bytes_reqd:
  748. if blah1:
  749. fprintf(
  750. self.logfile,
  751. "INFO: EXTERNSHEET needs %d bytes, have %d\n",
  752. bytes_reqd, len(data),
  753. )
  754. code2, length2, data2 = self.get_record_parts()
  755. if code2 != XL_CONTINUE:
  756. raise XLRDError("Missing CONTINUE after EXTERNSHEET record")
  757. data += data2
  758. pos = 2
  759. for k in xrange(num_refs):
  760. info = unpack("<HHH", data[pos:pos+6])
  761. ref_recordx, ref_first_sheetx, ref_last_sheetx = info
  762. self._externsheet_info.append(info)
  763. pos += 6
  764. if blah2:
  765. fprintf(
  766. self.logfile,
  767. "EXTERNSHEET(b8): k = %2d, record = %2d, first_sheet = %5d, last sheet = %5d\n",
  768. k, ref_recordx, ref_first_sheetx, ref_last_sheetx,
  769. )
  770. else:
  771. nc, ty = unpack("<BB", data[:2])
  772. if blah2:
  773. print("EXTERNSHEET(b7-):", file=self.logfile)
  774. hex_char_dump(data, 0, len(data), fout=self.logfile)
  775. msg = {
  776. 1: "Encoded URL",
  777. 2: "Current sheet!!",
  778. 3: "Specific sheet in own doc't",
  779. 4: "Nonspecific sheet in own doc't!!",
  780. }.get(ty, "Not encoded")
  781. print(" %3d chars, type is %d (%s)" % (nc, ty, msg), file=self.logfile)
  782. if ty == 3:
  783. sheet_name = unicode(data[2:nc+2], self.encoding)
  784. self._extnsht_name_from_num[self._extnsht_count] = sheet_name
  785. if blah2: print(self._extnsht_name_from_num, file=self.logfile)
  786. if not (1 <= ty <= 4):
  787. ty = 0
  788. self._externsheet_type_b57.append(ty)
  789. def handle_filepass(self, data):
  790. if self.verbosity >= 2:
  791. logf = self.logfile
  792. fprintf(logf, "FILEPASS:\n")
  793. hex_char_dump(data, 0, len(data), base=0, fout=logf)
  794. if self.biff_version >= 80:
  795. kind1, = unpack('<H', data[:2])
  796. if kind1 == 0: # weak XOR encryption
  797. key, hash_value = unpack('<HH', data[2:])
  798. fprintf(logf,
  799. 'weak XOR: key=0x%04x hash=0x%04x\n',
  800. key, hash_value)
  801. elif kind1 == 1:
  802. kind2, = unpack('<H', data[4:6])
  803. if kind2 == 1: # BIFF8 standard encryption
  804. caption = "BIFF8 std"
  805. elif kind2 == 2:
  806. caption = "BIFF8 strong"
  807. else:
  808. caption = "** UNKNOWN ENCRYPTION METHOD **"
  809. fprintf(logf, "%s\n", caption)
  810. raise XLRDError("Workbook is encrypted")
  811. def handle_name(self, data):
  812. blah = DEBUG or self.verbosity >= 2
  813. bv = self.biff_version
  814. if bv < 50:
  815. return
  816. self.derive_encoding()
  817. # print
  818. # hex_char_dump(data, 0, len(data), fout=self.logfile)
  819. (
  820. option_flags, kb_shortcut, name_len, fmla_len, extsht_index, sheet_index,
  821. menu_text_len, description_text_len, help_topic_text_len, status_bar_text_len,
  822. ) = unpack("<HBBHHH4B", data[0:14])
  823. nobj = Name()
  824. nobj.book = self ### CIRCULAR ###
  825. name_index = len(self.name_obj_list)
  826. nobj.name_index = name_index
  827. self.name_obj_list.append(nobj)
  828. nobj.option_flags = option_flags
  829. for attr, mask, nshift in (
  830. ('hidden', 1, 0),
  831. ('func', 2, 1),
  832. ('vbasic', 4, 2),
  833. ('macro', 8, 3),
  834. ('complex', 0x10, 4),
  835. ('builtin', 0x20, 5),
  836. ('funcgroup', 0xFC0, 6),
  837. ('binary', 0x1000, 12),
  838. ):
  839. setattr(nobj, attr, (option_flags & mask) >> nshift)
  840. macro_flag = " M"[nobj.macro]
  841. if bv < 80:
  842. internal_name, pos = unpack_string_update_pos(data, 14, self.encoding, known_len=name_len)
  843. else:
  844. internal_name, pos = unpack_unicode_update_pos(data, 14, known_len=name_len)
  845. nobj.extn_sheet_num = extsht_index
  846. nobj.excel_sheet_index = sheet_index
  847. nobj.scope = None # patched up in the names_epilogue() method
  848. if blah:
  849. fprintf(
  850. self.logfile,
  851. "NAME[%d]:%s oflags=%d, name_len=%d, fmla_len=%d, extsht_index=%d, sheet_index=%d, name=%r\n",
  852. name_index, macro_flag, option_flags, name_len,
  853. fmla_len, extsht_index, sheet_index, internal_name)
  854. name = internal_name
  855. if nobj.builtin:
  856. name = builtin_name_from_code.get(name, "??Unknown??")
  857. if blah: print(" builtin: %s" % name, file=self.logfile)
  858. nobj.name = name
  859. nobj.raw_formula = data[pos:]
  860. nobj.basic_formula_len = fmla_len
  861. nobj.evaluated = 0
  862. if blah:
  863. nobj.dump(
  864. self.logfile,
  865. header="--- handle_name: name[%d] ---" % name_index,
  866. footer="-------------------",
  867. )
  868. def names_epilogue(self):
  869. blah = self.verbosity >= 2
  870. f = self.logfile
  871. if blah:
  872. print("+++++ names_epilogue +++++", file=f)
  873. print("_all_sheets_map", REPR(self._all_sheets_map), file=f)
  874. print("_extnsht_name_from_num", REPR(self._extnsht_name_from_num), file=f)
  875. print("_sheet_num_from_name", REPR(self._sheet_num_from_name), file=f)
  876. num_names = len(self.name_obj_list)
  877. for namex in range(num_names):
  878. nobj = self.name_obj_list[namex]
  879. # Convert from excel_sheet_index to scope.
  880. # This is done here because in BIFF7 and earlier, the
  881. # BOUNDSHEET records (from which _all_sheets_map is derived)
  882. # come after the NAME records.
  883. if self.biff_version >= 80:
  884. sheet_index = nobj.excel_sheet_index
  885. if sheet_index == 0:
  886. intl_sheet_index = -1 # global
  887. elif 1 <= sheet_index <= len(self._all_sheets_map):
  888. intl_sheet_index = self._all_sheets_map[sheet_index-1]
  889. if intl_sheet_index == -1: # maps to a macro or VBA sheet
  890. intl_sheet_index = -2 # valid sheet reference but not useful
  891. else:
  892. # huh?
  893. intl_sheet_index = -3 # invalid
  894. elif 50 <= self.biff_version <= 70:
  895. sheet_index = nobj.extn_sheet_num
  896. if sheet_index == 0:
  897. intl_sheet_index = -1 # global
  898. else:
  899. sheet_name = self._extnsht_name_from_num[sheet_index]
  900. intl_sheet_index = self._sheet_num_from_name.get(sheet_name, -2)
  901. nobj.scope = intl_sheet_index
  902. for namex in range(num_names):
  903. nobj = self.name_obj_list[namex]
  904. # Parse the formula ...
  905. if nobj.macro or nobj.binary: continue
  906. if nobj.evaluated: continue
  907. evaluate_name_formula(self, nobj, namex, blah=blah)
  908. if self.verbosity >= 2:
  909. print("---------- name object dump ----------", file=f)
  910. for namex in range(num_names):
  911. nobj = self.name_obj_list[namex]
  912. nobj.dump(f, header="--- name[%d] ---" % namex)
  913. print("--------------------------------------", file=f)
  914. #
  915. # Build some dicts for access to the name objects
  916. #
  917. name_and_scope_map = {} # (name.lower(), scope): Name_object
  918. name_map = {} # name.lower() : list of Name_objects (sorted in scope order)
  919. for namex in range(num_names):
  920. nobj = self.name_obj_list[namex]
  921. name_lcase = nobj.name.lower()
  922. key = (name_lcase, nobj.scope)
  923. if key in name_and_scope_map and self.verbosity:
  924. fprintf(f, 'Duplicate entry %r in name_and_scope_map\n', key)
  925. name_and_scope_map[key] = nobj
  926. sort_data = (nobj.scope, namex, nobj)
  927. # namex (a temp unique ID) ensures the Name objects will not
  928. # be compared (fatal in py3)
  929. if name_lcase in name_map:
  930. name_map[name_lcase].append(sort_data)
  931. else:
  932. name_map[name_lcase] = [sort_data]
  933. for key in name_map.keys():
  934. alist = name_map[key]
  935. alist.sort()
  936. name_map[key] = [x[2] for x in alist]
  937. self.name_and_scope_map = name_and_scope_map
  938. self.name_map = name_map
  939. def handle_obj(self, data):
  940. # Not doing much handling at all.
  941. # Worrying about embedded (BOF ... EOF) substreams is done elsewhere.
  942. # DEBUG = 1
  943. obj_type, obj_id = unpack('<HI', data[4:10])
  944. # if DEBUG: print "---> handle_obj type=%d id=0x%08x" % (obj_type, obj_id)
  945. def handle_supbook(self, data):
  946. # aka EXTERNALBOOK in OOo docs
  947. self._supbook_types.append(None)
  948. blah = DEBUG or self.verbosity >= 2
  949. if blah:
  950. print("SUPBOOK:", file=self.logfile)
  951. hex_char_dump(data, 0, len(data), fout=self.logfile)
  952. num_sheets = unpack("<H", data[0:2])[0]
  953. if blah: print("num_sheets = %d" % num_sheets, file=self.logfile)
  954. sbn = self._supbook_count
  955. self._supbook_count += 1
  956. if data[2:4] == b"\x01\x04":
  957. self._supbook_types[-1] = SUPBOOK_INTERNAL
  958. self._supbook_locals_inx = self._supbook_count - 1
  959. if blah:
  960. print("SUPBOOK[%d]: internal 3D refs; %d sheets" % (sbn, num_sheets), file=self.logfile)
  961. print(" _all_sheets_map", self._all_sheets_map, file=self.logfile)
  962. return
  963. if data[0:4] == b"\x01\x00\x01\x3A":
  964. self._supbook_types[-1] = SUPBOOK_ADDIN
  965. self._supbook_addins_inx = self._supbook_count - 1
  966. if blah: print("SUPBOOK[%d]: add-in functions" % sbn, file=self.logfile)
  967. return
  968. url, pos = unpack_unicode_update_pos(data, 2, lenlen=2)
  969. if num_sheets == 0:
  970. self._supbook_types[-1] = SUPBOOK_DDEOLE
  971. if blah: fprintf(self.logfile, "SUPBOOK[%d]: DDE/OLE document = %r\n", sbn, url)
  972. return
  973. self._supbook_types[-1] = SUPBOOK_EXTERNAL
  974. if blah: fprintf(self.logfile, "SUPBOOK[%d]: url = %r\n", sbn, url)
  975. sheet_names = []
  976. for x in range(num_sheets):
  977. try:
  978. shname, pos = unpack_unicode_update_pos(data, pos, lenlen=2)
  979. except struct.error:
  980. # #### FIX ME ####
  981. # Should implement handling of CONTINUE record(s) ...
  982. if self.verbosity:
  983. print((
  984. "*** WARNING: unpack failure in sheet %d of %d in SUPBOOK record for file %r"
  985. % (x, num_sheets, url)
  986. ), file=self.logfile)
  987. break
  988. sheet_names.append(shname)
  989. if blah: fprintf(self.logfile, " sheetx=%d namelen=%d name=%r (next pos=%d)\n", x, len(shname), shname, pos)
  990. def handle_sheethdr(self, data):
  991. # This a BIFF 4W special.
  992. # The SHEETHDR record is followed by a (BOF ... EOF) substream containing
  993. # a worksheet.
  994. # DEBUG = 1
  995. self.derive_encoding()
  996. sheet_len = unpack('<i', data[:4])[0]
  997. sheet_name = unpack_string(data, 4, self.encoding, lenlen=1)
  998. sheetno = self._sheethdr_count
  999. assert sheet_name == self._sheet_names[sheetno]
  1000. self._sheethdr_count += 1
  1001. BOF_posn = self._position
  1002. posn = BOF_posn - 4 - len(data)
  1003. if DEBUG: fprintf(self.logfile, 'SHEETHDR %d at posn %d: len=%d name=%r\n', sheetno, posn, sheet_len, sheet_name)
  1004. self.initialise_format_info()
  1005. if DEBUG: print('SHEETHDR: xf epilogue flag is %d' % self._xf_epilogue_done, file=self.logfile)
  1006. self._sheet_list.append(None) # get_sheet updates _sheet_list but needs a None beforehand
  1007. self.get_sheet(sheetno, update_pos=False)
  1008. if DEBUG: print('SHEETHDR: posn after get_sheet() =', self._position, file=self.logfile)
  1009. self._position = BOF_posn + sheet_len
  1010. def handle_sheetsoffset(self, data):
  1011. # DEBUG = 0
  1012. posn = unpack('<i', data)[0]
  1013. if DEBUG: print('SHEETSOFFSET:', posn, file=self.logfile)
  1014. self._sheetsoffset = posn
  1015. def handle_sst(self, data):
  1016. # DEBUG = 1
  1017. if DEBUG:
  1018. print("SST Processing", file=self.logfile)
  1019. t0 = time.time()
  1020. nbt = len(data)
  1021. strlist = [data]
  1022. uniquestrings = unpack('<i', data[4:8])[0]
  1023. if DEBUG or self.verbosity >= 2:
  1024. fprintf(self.logfile, "SST: unique strings: %d\n", uniquestrings)
  1025. while 1:
  1026. code, nb, data = self.get_record_parts_conditional(XL_CONTINUE)
  1027. if code is None:
  1028. break
  1029. nbt += nb
  1030. if DEBUG >= 2:
  1031. fprintf(self.logfile, "CONTINUE: adding %d bytes to SST -> %d\n", nb, nbt)
  1032. strlist.append(data)
  1033. self._sharedstrings, rt_runlist = unpack_SST_table(strlist, uniquestrings)
  1034. if self.formatting_info:
  1035. self._rich_text_runlist_map = rt_runlist
  1036. if DEBUG:
  1037. t1 = time.time()
  1038. print("SST processing took %.2f seconds" % (t1 - t0, ), file=self.logfile)
  1039. def handle_writeaccess(self, data):
  1040. DEBUG = 0
  1041. if self.biff_version < 80:
  1042. if not self.encoding:
  1043. self.raw_user_name = True
  1044. self.user_name = data
  1045. return
  1046. strg = unpack_string(data, 0, self.encoding, lenlen=1)
  1047. else:
  1048. strg = unpack_unicode(data, 0, lenlen=2)
  1049. if DEBUG: fprintf(self.logfile, "WRITEACCESS: %d bytes; raw=%s %r\n", len(data), self.raw_user_name, strg)
  1050. strg = strg.rstrip()
  1051. self.user_name = strg
  1052. def parse_globals(self):
  1053. # DEBUG = 0
  1054. # no need to position, just start reading (after the BOF)
  1055. formatting.initialise_book(self)
  1056. while 1:
  1057. rc, length, data = self.get_record_parts()
  1058. if DEBUG: print("parse_globals: record code is 0x%04x" % rc, file=self.logfile)
  1059. if rc == XL_SST:
  1060. self.handle_sst(data)
  1061. elif rc == XL_FONT or rc == XL_FONT_B3B4:
  1062. self.handle_font(data)
  1063. elif rc == XL_FORMAT: # XL_FORMAT2 is BIFF <= 3.0, can't appear in globals
  1064. self.handle_format(data)
  1065. elif rc == XL_XF:
  1066. self.handle_xf(data)
  1067. elif rc == XL_BOUNDSHEET:
  1068. self.handle_boundsheet(data)
  1069. elif rc == XL_DATEMODE:
  1070. self.handle_datemode(data)
  1071. elif rc == XL_CODEPAGE:
  1072. self.handle_codepage(data)
  1073. elif rc == XL_COUNTRY:
  1074. self.handle_country(data)
  1075. elif rc == XL_EXTERNNAME:
  1076. self.handle_externname(data)
  1077. elif rc == XL_EXTERNSHEET:
  1078. self.handle_externsheet(data)
  1079. elif rc == XL_FILEPASS:
  1080. self.handle_filepass(data)
  1081. elif rc == XL_WRITEACCESS:
  1082. self.handle_writeaccess(data)
  1083. elif rc == XL_SHEETSOFFSET:
  1084. self.handle_sheetsoffset(data)
  1085. elif rc == XL_SHEETHDR:
  1086. self.handle_sheethdr(data)
  1087. elif rc == XL_SUPBOOK:
  1088. self.handle_supbook(data)
  1089. elif rc == XL_NAME:
  1090. self.handle_name(data)
  1091. elif rc == XL_PALETTE:
  1092. self.handle_palette(data)
  1093. elif rc == XL_STYLE:
  1094. self.handle_style(data)
  1095. elif rc & 0xff == 9 and self.verbosity:
  1096. fprintf(self.logfile, "*** Unexpected BOF at posn %d: 0x%04x len=%d data=%r\n",
  1097. self._position - length - 4, rc, length, data)
  1098. elif rc == XL_EOF:
  1099. self.xf_epilogue()
  1100. self.names_epilogue()
  1101. self.palette_epilogue()
  1102. if not self.encoding:
  1103. self.derive_encoding()
  1104. if self.biff_version == 45:
  1105. # DEBUG = 0
  1106. if DEBUG: print("global EOF: position", self._position, file=self.logfile)
  1107. # if DEBUG:
  1108. # pos = self._position - 4
  1109. # print repr(self.mem[pos:pos+40])
  1110. return
  1111. else:
  1112. # if DEBUG:
  1113. # print >> self.logfile, "parse_globals: ignoring record code 0x%04x" % rc
  1114. pass
  1115. def read(self, pos, length):
  1116. data = self.mem[pos:pos+length]
  1117. self._position = pos + len(data)
  1118. return data
  1119. def getbof(self, rqd_stream):
  1120. # DEBUG = 1
  1121. # if DEBUG: print >> self.logfile, "getbof(): position", self._position
  1122. if DEBUG: print("reqd: 0x%04x" % rqd_stream, file=self.logfile)
  1123. def bof_error(msg):
  1124. raise XLRDError('Unsupported format, or corrupt file: ' + msg)
  1125. savpos = self._position
  1126. opcode = self.get2bytes()
  1127. if opcode == MY_EOF:
  1128. bof_error('Expected BOF record; met end of file')
  1129. if opcode not in bofcodes:
  1130. bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
  1131. length = self.get2bytes()
  1132. if length == MY_EOF:
  1133. bof_error('Incomplete BOF record[1]; met end of file')
  1134. if not (4 <= length <= 20):
  1135. bof_error(
  1136. 'Invalid length (%d) for BOF record type 0x%04x'
  1137. % (length, opcode))
  1138. padding = b'\0' * max(0, boflen[opcode] - length)
  1139. data = self.read(self._position, length);
  1140. if DEBUG: fprintf(self.logfile, "\ngetbof(): data=%r\n", data)
  1141. if len(data) < length:
  1142. bof_error('Incomplete BOF record[2]; met end of file')
  1143. data += padding
  1144. version1 = opcode >> 8
  1145. version2, streamtype = unpack('<HH', data[0:4])
  1146. if DEBUG:
  1147. print("getbof(): op=0x%04x version2=0x%04x streamtype=0x%04x" \
  1148. % (opcode, version2, streamtype), file=self.logfile)
  1149. bof_offset = self._position - 4 - length
  1150. if DEBUG:
  1151. print("getbof(): BOF found at offset %d; savpos=%d" \
  1152. % (bof_offset, savpos), file=self.logfile)
  1153. version = build = year = 0
  1154. if version1 == 0x08:
  1155. build, year = unpack('<HH', data[4:8])
  1156. if version2 == 0x0600:
  1157. version = 80
  1158. elif version2 == 0x0500:
  1159. if year < 1994 or build in (2412, 3218, 3321):
  1160. version = 50
  1161. else:
  1162. version = 70
  1163. else:
  1164. # dodgy one, created by a 3rd-party tool
  1165. version = {
  1166. 0x0000: 21,
  1167. 0x0007: 21,
  1168. 0x0200: 21,
  1169. 0x0300: 30,
  1170. 0x0400: 40,
  1171. }.get(version2, 0)
  1172. elif version1 in (0x04, 0x02, 0x00):
  1173. version = {0x04: 40, 0x02: 30, 0x00: 21}[version1]
  1174. if version == 40 and streamtype == XL_WORKBOOK_GLOBALS_4W:
  1175. version = 45 # i.e. 4W
  1176. if DEBUG or self.verbosity >= 2:
  1177. print("BOF: op=0x%04x vers=0x%04x stream=0x%04x buildid=%d buildyr=%d -> BIFF%d" \
  1178. % (opcode, version2, streamtype, build, year, version), file=self.logfile)
  1179. got_globals = streamtype == XL_WORKBOOK_GLOBALS or (
  1180. version == 45 and streamtype == XL_WORKBOOK_GLOBALS_4W)
  1181. if (rqd_stream == XL_WORKBOOK_GLOBALS and got_globals) or streamtype == rqd_stream:
  1182. return version
  1183. if version < 50 and streamtype == XL_WORKSHEET:
  1184. return version
  1185. if version >= 50 and streamtype == 0x0100:
  1186. bof_error("Workspace file -- no spreadsheet data")
  1187. bof_error(
  1188. 'BOF not workbook/worksheet: op=0x%04x vers=0x%04x strm=0x%04x build=%d year=%d -> BIFF%d' \
  1189. % (opcode, version2, streamtype, build, year, version)
  1190. )
  1191. # === helper functions
  1192. def expand_cell_address(inrow, incol):
  1193. # Ref : OOo docs, "4.3.4 Cell Addresses in BIFF8"
  1194. outrow = inrow
  1195. if incol & 0x8000:
  1196. if outrow >= 32768:
  1197. outrow -= 65536
  1198. relrow = 1
  1199. else:
  1200. relrow = 0
  1201. outcol = incol & 0xFF
  1202. if incol & 0x4000:
  1203. if outcol >= 128:
  1204. outcol -= 256
  1205. relcol = 1
  1206. else:
  1207. relcol = 0
  1208. return outrow, outcol, relrow, relcol
  1209. def colname(colx, _A2Z="ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
  1210. assert colx >= 0
  1211. name = UNICODE_LITERAL('')
  1212. while 1:
  1213. quot, rem = divmod(colx, 26)
  1214. name = _A2Z[rem] + name
  1215. if not quot:
  1216. return name
  1217. colx = quot - 1
  1218. def display_cell_address(rowx, colx, relrow, relcol):
  1219. if relrow:
  1220. rowpart = "(*%s%d)" % ("+-"[rowx < 0], abs(rowx))
  1221. else:
  1222. rowpart = "$%d" % (rowx+1,)
  1223. if relcol:
  1224. colpart = "(*%s%d)" % ("+-"[colx < 0], abs(colx))
  1225. else:
  1226. colpart = "$" + colname(colx)
  1227. return colpart + rowpart
  1228. def unpack_SST_table(datatab, nstrings):
  1229. "Return list of strings"
  1230. datainx = 0
  1231. ndatas = len(datatab)
  1232. data = datatab[0]
  1233. datalen = len(data)
  1234. pos = 8
  1235. strings = []
  1236. strappend = strings.append
  1237. richtext_runs = {}
  1238. local_unpack = unpack
  1239. local_min = min
  1240. local_BYTES_ORD = BYTES_ORD
  1241. latin_1 = "latin_1"
  1242. for _unused_i in xrange(nstrings):
  1243. nchars = local_unpack('<H', data[pos:pos+2])[0]
  1244. pos += 2
  1245. options = local_BYTES_ORD(data[pos])
  1246. pos += 1
  1247. rtcount = 0
  1248. phosz = 0
  1249. if options & 0x08: # richtext
  1250. rtcount = local_unpack('<H', data[pos:pos+2])[0]
  1251. pos += 2
  1252. if options & 0x04: # phonetic
  1253. phosz = local_unpack('<i', data[pos:pos+4])[0]
  1254. pos += 4
  1255. accstrg = UNICODE_LITERAL('')
  1256. charsgot = 0
  1257. while 1:
  1258. charsneed = nchars - charsgot
  1259. if options & 0x01:
  1260. # Uncompressed UTF-16
  1261. charsavail = local_min((datalen - pos) >> 1, charsneed)
  1262. rawstrg = data[pos:pos+2*charsavail]
  1263. # if DEBUG: print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg)
  1264. try:
  1265. accstrg += unicode(rawstrg, "utf_16_le")
  1266. except:
  1267. # print "SST U16: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg)
  1268. # Probable cause: dodgy data e.g. unfinished surrogate pair.
  1269. # E.g. file unicode2.xls in pyExcelerator's examples has cells containing
  1270. # unichr(i) for i in range(0x100000)
  1271. # so this will include 0xD800 etc
  1272. raise
  1273. pos += 2*charsavail
  1274. else:
  1275. # Note: this is COMPRESSED (not ASCII!) encoding!!!
  1276. charsavail = local_min(datalen - pos, charsneed)
  1277. rawstrg = data[pos:pos+charsavail]
  1278. # if DEBUG: print "SST CMPRSD: nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg)
  1279. accstrg += unicode(rawstrg, latin_1)
  1280. pos += charsavail
  1281. charsgot += charsavail
  1282. if charsgot == nchars:
  1283. break
  1284. datainx += 1
  1285. data = datatab[datainx]
  1286. datalen = len(data)
  1287. options = local_BYTES_ORD(data[0])
  1288. pos = 1
  1289. if rtcount:
  1290. runs = []
  1291. for runindex in xrange(rtcount):
  1292. if pos == datalen:
  1293. pos = 0
  1294. datainx += 1
  1295. data = datatab[datainx]
  1296. datalen = len(data)
  1297. runs.append(local_unpack("<HH", data[pos:pos+4]))
  1298. pos += 4
  1299. richtext_runs[len(strings)] = runs
  1300. pos += phosz # size of the phonetic stuff to skip
  1301. if pos >= datalen:
  1302. # adjust to correct position in next record
  1303. pos = pos - datalen
  1304. datainx += 1
  1305. if datainx < ndatas:
  1306. data = datatab[datainx]
  1307. datalen = len(data)
  1308. else:
  1309. assert _unused_i == nstrings - 1
  1310. strappend(accstrg)
  1311. return strings, richtext_runs