__init__.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
  1. from os import path
  2. from .info import __VERSION__
  3. # <p>Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd</p>
  4. # <p>This module is part of the xlrd package, which is released under a
  5. # BSD-style licence.</p>
  6. from . import licences
  7. ##
  8. # <p><b>A Python module for extracting data from MS Excel (TM) spreadsheet files.
  9. # <br /><br />
  10. # Version 0.7.4 -- April 2012
  11. # </b></p>
  12. #
  13. # <h2>General information</h2>
  14. #
  15. # <h3>Acknowledgements</h3>
  16. #
  17. # <p>
  18. # Development of this module would not have been possible without the document
  19. # "OpenOffice.org's Documentation of the Microsoft Excel File Format"
  20. # ("OOo docs" for short).
  21. # The latest version is available from OpenOffice.org in
  22. # <a href=http://sc.openoffice.org/excelfileformat.pdf> PDF format</a>
  23. # and
  24. # <a href=http://sc.openoffice.org/excelfileformat.odt> ODT format.</a>
  25. # Small portions of the OOo docs are reproduced in this
  26. # document. A study of the OOo docs is recommended for those who wish a
  27. # deeper understanding of the Excel file layout than the xlrd docs can provide.
  28. # </p>
  29. #
  30. # <p>Backporting to Python 2.1 was partially funded by
  31. # <a href=http://journyx.com/>
  32. # Journyx - provider of timesheet and project accounting solutions.
  33. # </a>
  34. # </p>
  35. #
  36. # <p>Provision of formatting information in version 0.6.1 was funded by
  37. # <a href=http://www.simplistix.co.uk>
  38. # Simplistix Ltd.
  39. # </a>
  40. # </p>
  41. #
  42. # <h3>Unicode</h3>
  43. #
  44. # <p>This module presents all text strings as Python unicode objects.
  45. # From Excel 97 onwards, text in Excel spreadsheets has been stored as Unicode.
  46. # Older files (Excel 95 and earlier) don't keep strings in Unicode;
  47. # a CODEPAGE record provides a codepage number (for example, 1252) which is
  48. # used by xlrd to derive the encoding (for same example: "cp1252") which is
  49. # used to translate to Unicode.</p>
  50. # <small>
  51. # <p>If the CODEPAGE record is missing (possible if the file was created
  52. # by third-party software), xlrd will assume that the encoding is ascii, and keep going.
  53. # If the actual encoding is not ascii, a UnicodeDecodeError exception will be raised and
  54. # you will need to determine the encoding yourself, and tell xlrd:
  55. # <pre>
  56. # book = xlrd.open_workbook(..., encoding_override="cp1252")
  57. # </pre></p>
  58. # <p>If the CODEPAGE record exists but is wrong (for example, the codepage
  59. # number is 1251, but the strings are actually encoded in koi8_r),
  60. # it can be overridden using the same mechanism.
  61. # The supplied runxlrd.py has a corresponding command-line argument, which
  62. # may be used for experimentation:
  63. # <pre>
  64. # runxlrd.py -e koi8_r 3rows myfile.xls
  65. # </pre></p>
  66. # <p>The first place to look for an encoding ("codec name") is
  67. # <a href=http://docs.python.org/lib/standard-encodings.html>
  68. # the Python documentation</a>.
  69. # </p>
  70. # </small>
  71. #
  72. # <h3>Dates in Excel spreadsheets</h3>
  73. #
  74. # <p>In reality, there are no such things. What you have are floating point
  75. # numbers and pious hope.
  76. # There are several problems with Excel dates:</p>
  77. #
  78. # <p>(1) Dates are not stored as a separate data type; they are stored as
  79. # floating point numbers and you have to rely on
  80. # (a) the "number format" applied to them in Excel and/or
  81. # (b) knowing which cells are supposed to have dates in them.
  82. # This module helps with (a) by inspecting the
  83. # format that has been applied to each number cell;
  84. # if it appears to be a date format, the cell
  85. # is classified as a date rather than a number. Feedback on this feature,
  86. # especially from non-English-speaking locales, would be appreciated.</p>
  87. #
  88. # <p>(2) Excel for Windows stores dates by default as the number of
  89. # days (or fraction thereof) since 1899-12-31T00:00:00. Excel for
  90. # Macintosh uses a default start date of 1904-01-01T00:00:00. The date
  91. # system can be changed in Excel on a per-workbook basis (for example:
  92. # Tools -> Options -> Calculation, tick the "1904 date system" box).
  93. # This is of course a bad idea if there are already dates in the
  94. # workbook. There is no good reason to change it even if there are no
  95. # dates in the workbook. Which date system is in use is recorded in the
  96. # workbook. A workbook transported from Windows to Macintosh (or vice
  97. # versa) will work correctly with the host Excel. When using this
  98. # module's xldate_as_tuple function to convert numbers from a workbook,
  99. # you must use the datemode attribute of the Book object. If you guess,
  100. # or make a judgement depending on where you believe the workbook was
  101. # created, you run the risk of being 1462 days out of kilter.</p>
  102. #
  103. # <p>Reference:
  104. # http://support.microsoft.com/default.aspx?scid=KB;EN-US;q180162</p>
  105. #
  106. #
  107. # <p>(3) The Excel implementation of the Windows-default 1900-based date system works on the
  108. # incorrect premise that 1900 was a leap year. It interprets the number 60 as meaning 1900-02-29,
  109. # which is not a valid date. Consequently any number less than 61 is ambiguous. Example: is 59 the
  110. # result of 1900-02-28 entered directly, or is it 1900-03-01 minus 2 days? The OpenOffice.org Calc
  111. # program "corrects" the Microsoft problem; entering 1900-02-27 causes the number 59 to be stored.
  112. # Save as an XLS file, then open the file with Excel -- you'll see 1900-02-28 displayed.</p>
  113. #
  114. # <p>Reference: http://support.microsoft.com/default.aspx?scid=kb;en-us;214326</p>
  115. #
  116. # <p>(4) The Macintosh-default 1904-based date system counts 1904-01-02 as day 1 and 1904-01-01 as day zero.
  117. # Thus any number such that (0.0 <= number < 1.0) is ambiguous. Is 0.625 a time of day (15:00:00),
  118. # independent of the calendar,
  119. # or should it be interpreted as an instant on a particular day (1904-01-01T15:00:00)?
  120. # The xldate_* functions in this module
  121. # take the view that such a number is a calendar-independent time of day (like Python's datetime.time type) for both
  122. # date systems. This is consistent with more recent Microsoft documentation
  123. # (for example, the help file for Excel 2002 which says that the first day
  124. # in the 1904 date system is 1904-01-02).
  125. #
  126. # <p>(5) Usage of the Excel DATE() function may leave strange dates in a spreadsheet. Quoting the help file,
  127. # in respect of the 1900 date system: "If year is between 0 (zero) and 1899 (inclusive),
  128. # Excel adds that value to 1900 to calculate the year. For example, DATE(108,1,2) returns January 2, 2008 (1900+108)."
  129. # This gimmick, semi-defensible only for arguments up to 99 and only in the pre-Y2K-awareness era,
  130. # means that DATE(1899, 12, 31) is interpreted as 3799-12-31.</p>
  131. #
  132. # <p>For further information, please refer to the documentation for the xldate_* functions.</p>
  133. #
  134. # <h3> Named references, constants, formulas, and macros</h3>
  135. #
  136. # <p>
  137. # A name is used to refer to a cell, a group of cells, a constant
  138. # value, a formula, or a macro. Usually the scope of a name is global
  139. # across the whole workbook. However it can be local to a worksheet.
  140. # For example, if the sales figures are in different cells in
  141. # different sheets, the user may define the name "Sales" in each
  142. # sheet. There are built-in names, like "Print_Area" and
  143. # "Print_Titles"; these two are naturally local to a sheet.
  144. # </p><p>
  145. # To inspect the names with a user interface like MS Excel, OOo Calc,
  146. # or Gnumeric, click on Insert/Names/Define. This will show the global
  147. # names, plus those local to the currently selected sheet.
  148. # </p><p>
  149. # A Book object provides two dictionaries (name_map and
  150. # name_and_scope_map) and a list (name_obj_list) which allow various
  151. # ways of accessing the Name objects. There is one Name object for
  152. # each NAME record found in the workbook. Name objects have many
  153. # attributes, several of which are relevant only when obj.macro is 1.
  154. # </p><p>
  155. # In the examples directory you will find namesdemo.xls which
  156. # showcases the many different ways that names can be used, and
  157. # xlrdnamesAPIdemo.py which offers 3 different queries for inspecting
  158. # the names in your files, and shows how to extract whatever a name is
  159. # referring to. There is currently one "convenience method",
  160. # Name.cell(), which extracts the value in the case where the name
  161. # refers to a single cell. More convenience methods are planned. The
  162. # source code for Name.cell (in __init__.py) is an extra source of
  163. # information on how the Name attributes hang together.
  164. # </p>
  165. #
  166. # <p><i>Name information is <b>not</b> extracted from files older than
  167. # Excel 5.0 (Book.biff_version < 50)</i></p>
  168. #
  169. # <h3>Formatting</h3>
  170. #
  171. # <h4>Introduction</h4>
  172. #
  173. # <p>This collection of features, new in xlrd version 0.6.1, is intended
  174. # to provide the information needed to (1) display/render spreadsheet contents
  175. # (say) on a screen or in a PDF file, and (2) copy spreadsheet data to another
  176. # file without losing the ability to display/render it.</p>
  177. #
  178. # <h4>The Palette; Colour Indexes</h4>
  179. #
  180. # <p>A colour is represented in Excel as a (red, green, blue) ("RGB") tuple
  181. # with each component in range(256). However it is not possible to access an
  182. # unlimited number of colours; each spreadsheet is limited to a palette of 64 different
  183. # colours (24 in Excel 3.0 and 4.0, 8 in Excel 2.0). Colours are referenced by an index
  184. # ("colour index") into this palette.
  185. #
  186. # Colour indexes 0 to 7 represent 8 fixed built-in colours: black, white, red, green, blue,
  187. # yellow, magenta, and cyan.<p>
  188. #
  189. # The remaining colours in the palette (8 to 63 in Excel 5.0 and later)
  190. # can be changed by the user. In the Excel 2003 UI, Tools/Options/Color presents a palette
  191. # of 7 rows of 8 colours. The last two rows are reserved for use in charts.<br />
  192. # The correspondence between this grid and the assigned
  193. # colour indexes is NOT left-to-right top-to-bottom.<br />
  194. # Indexes 8 to 15 correspond to changeable
  195. # parallels of the 8 fixed colours -- for example, index 7 is forever cyan;
  196. # index 15 starts off being cyan but can be changed by the user.<br />
  197. #
  198. # The default colour for each index depends on the file version; tables of the defaults
  199. # are available in the source code. If the user changes one or more colours,
  200. # a PALETTE record appears in the XLS file -- it gives the RGB values for *all* changeable
  201. # indexes.<br />
  202. # Note that colours can be used in "number formats": "[CYAN]...." and "[COLOR8]...." refer
  203. # to colour index 7; "[COLOR16]...." will produce cyan
  204. # unless the user changes colour index 15 to something else.<br />
  205. #
  206. # <p>In addition, there are several "magic" colour indexes used by Excel:<br />
  207. # 0x18 (BIFF3-BIFF4), 0x40 (BIFF5-BIFF8): System window text colour for border lines
  208. # (used in XF, CF, and WINDOW2 records)<br />
  209. # 0x19 (BIFF3-BIFF4), 0x41 (BIFF5-BIFF8): System window background colour for pattern background
  210. # (used in XF and CF records )<br />
  211. # 0x43: System face colour (dialogue background colour)<br />
  212. # 0x4D: System window text colour for chart border lines<br />
  213. # 0x4E: System window background colour for chart areas<br />
  214. # 0x4F: Automatic colour for chart border lines (seems to be always Black)<br />
  215. # 0x50: System ToolTip background colour (used in note objects)<br />
  216. # 0x51: System ToolTip text colour (used in note objects)<br />
  217. # 0x7FFF: System window text colour for fonts (used in FONT and CF records)<br />
  218. # Note 0x7FFF appears to be the *default* colour index. It appears quite often in FONT
  219. # records.<br />
  220. #
  221. # <h4>Default Formatting</h4>
  222. #
  223. # Default formatting is applied to all empty cells (those not described by a cell record).
  224. # Firstly row default information (ROW record, Rowinfo class) is used if available.
  225. # Failing that, column default information (COLINFO record, Colinfo class) is used if available.
  226. # As a last resort the worksheet/workbook default cell format will be used; this
  227. # should always be present in an Excel file,
  228. # described by the XF record with the fixed index 15 (0-based). By default, it uses the
  229. # worksheet/workbook default cell style, described by the very first XF record (index 0).
  230. #
  231. # <h4> Formatting features not included in xlrd version 0.6.1</h4>
  232. # <ul>
  233. # <li>Rich text i.e. strings containing partial <b>bold</b> <i>italic</i>
  234. # and <u>underlined</u> text, change of font inside a string, etc.
  235. # See OOo docs s3.4 and s3.2.
  236. # <i> Rich text is included in version 0.7.2</i></li>
  237. # <li>Asian phonetic text (known as "ruby"), used for Japanese furigana. See OOo docs
  238. # s3.4.2 (p15)</li>
  239. # <li>Conditional formatting. See OOo docs
  240. # s5.12, s6.21 (CONDFMT record), s6.16 (CF record)</li>
  241. # <li>Miscellaneous sheet-level and book-level items e.g. printing layout, screen panes. </li>
  242. # <li>Modern Excel file versions don't keep most of the built-in
  243. # "number formats" in the file; Excel loads formats according to the
  244. # user's locale. Currently xlrd's emulation of this is limited to
  245. # a hard-wired table that applies to the US English locale. This may mean
  246. # that currency symbols, date order, thousands separator, decimals separator, etc
  247. # are inappropriate. Note that this does not affect users who are copying XLS
  248. # files, only those who are visually rendering cells.</li>
  249. # </ul>
  250. #
  251. # <h3>Loading worksheets on demand</h3>
  252. #
  253. # <p>This feature, new in version 0.7.1, is governed by the on_demand argument
  254. # to the open_workbook() function and allows saving memory and time by loading
  255. # only those sheets that the caller is interested in, and releasing sheets
  256. # when no longer required.</p>
  257. #
  258. # <p>on_demand=False (default): No change. open_workbook() loads global data
  259. # and all sheets, releases resources no longer required (principally the
  260. # str or mmap object containing the Workbook stream), and returns.</p>
  261. #
  262. # <p>on_demand=True and BIFF version < 5.0: A warning message is emitted,
  263. # on_demand is recorded as False, and the old process is followed.</p>
  264. #
  265. # <p>on_demand=True and BIFF version >= 5.0: open_workbook() loads global
  266. # data and returns without releasing resources. At this stage, the only
  267. # information available about sheets is Book.nsheets and Book.sheet_names().</p>
  268. #
  269. # <p>Book.sheet_by_name() and Book.sheet_by_index() will load the requested
  270. # sheet if it is not already loaded.</p>
  271. #
  272. # <p>Book.sheets() will load all/any unloaded sheets.</p>
  273. #
  274. # <p>The caller may save memory by calling
  275. # Book.unload_sheet(sheet_name_or_index) when finished with the sheet.
  276. # This applies irrespective of the state of on_demand.</p>
  277. #
  278. # <p>The caller may re-load an unloaded sheet by calling Book.sheet_by_xxxx()
  279. # -- except if those required resources have been released (which will
  280. # have happened automatically when on_demand is false). This is the only
  281. # case where an exception will be raised.</p>
  282. #
  283. # <p>The caller may query the state of a sheet:
  284. # Book.sheet_loaded(sheet_name_or_index) -> a bool</p>
  285. #
  286. # <p> Book.release_resources() may used to save memory and close
  287. # any memory-mapped file before proceding to examine already-loaded
  288. # sheets. Once resources are released, no further sheets can be loaded.</p>
  289. #
  290. # <p> When using on-demand, it is advisable to ensure that
  291. # Book.release_resources() is always called even if an exception
  292. # is raised in your own code; otherwise if the input file has been
  293. # memory-mapped, the mmap.mmap object will not be closed and you will
  294. # not be able to access the physical file until your Python process
  295. # terminates. This can be done by calling Book.release_resources()
  296. # explicitly in the finally suite of a try/finally block.
  297. # New in xlrd 0.7.2: the Book object is a "context manager", so if
  298. # using Python 2.5 or later, you can wrap your code in a "with"
  299. # statement.</p>
  300. ##
  301. import sys, zipfile, pprint
  302. from . import timemachine
  303. from .biffh import (
  304. XLRDError,
  305. biff_text_from_num,
  306. error_text_from_code,
  307. XL_CELL_BLANK,
  308. XL_CELL_TEXT,
  309. XL_CELL_BOOLEAN,
  310. XL_CELL_ERROR,
  311. XL_CELL_EMPTY,
  312. XL_CELL_DATE,
  313. XL_CELL_NUMBER
  314. )
  315. from .formula import * # is constrained by __all__
  316. from .book import Book, colname #### TODO #### formula also has `colname` (restricted to 256 cols)
  317. from .sheet import empty_cell
  318. from .xldate import XLDateError, xldate_as_tuple
  319. from .xlsx import X12Book
  320. if sys.version.startswith("IronPython"):
  321. # print >> sys.stderr, "...importing encodings"
  322. import encodings
  323. try:
  324. import mmap
  325. MMAP_AVAILABLE = 1
  326. except ImportError:
  327. MMAP_AVAILABLE = 0
  328. USE_MMAP = MMAP_AVAILABLE
  329. ##
  330. #
  331. # Open a spreadsheet file for data extraction.
  332. #
  333. # @param filename The path to the spreadsheet file to be opened.
  334. #
  335. # @param logfile An open file to which messages and diagnostics are written.
  336. #
  337. # @param verbosity Increases the volume of trace material written to the logfile.
  338. #
  339. # @param use_mmap Whether to use the mmap module is determined heuristically.
  340. # Use this arg to override the result. Current heuristic: mmap is used if it exists.
  341. #
  342. # @param file_contents ... as a string or an mmap.mmap object or some other behave-alike object.
  343. # If file_contents is supplied, filename will not be used, except (possibly) in messages.
  344. #
  345. # @param encoding_override Used to overcome missing or bad codepage information
  346. # in older-version files. Refer to discussion in the <b>Unicode</b> section above.
  347. # <br /> -- New in version 0.6.0
  348. #
  349. # @param formatting_info Governs provision of a reference to an XF (eXtended Format) object
  350. # for each cell in the worksheet.
  351. # <br /> Default is <i>False</i>. This is backwards compatible and saves memory.
  352. # "Blank" cells (those with their own formatting information but no data) are treated as empty
  353. # (by ignoring the file's BLANK and MULBLANK records).
  354. # It cuts off any bottom "margin" of rows of empty (and blank) cells and
  355. # any right "margin" of columns of empty (and blank) cells.
  356. # Only cell_value and cell_type are available.
  357. # <br /> <i>True</i> provides all cells, including empty and blank cells.
  358. # XF information is available for each cell.
  359. # <br /> -- New in version 0.6.1
  360. #
  361. # @param on_demand Governs whether sheets are all loaded initially or when demanded
  362. # by the caller. Please refer back to the section "Loading worksheets on demand" for details.
  363. # <br /> -- New in version 0.7.1
  364. #
  365. # @param ragged_rows False (the default) means all rows are padded out with empty cells so that all
  366. # rows have the same size (Sheet.ncols). True means that there are no empty cells at the ends of rows.
  367. # This can result in substantial memory savings if rows are of widely varying sizes. See also the
  368. # Sheet.row_len() method.
  369. # <br /> -- New in version 0.7.2
  370. #
  371. # @return An instance of the Book class.
  372. def open_workbook(filename=None,
  373. logfile=sys.stdout,
  374. verbosity=0,
  375. use_mmap=USE_MMAP,
  376. file_contents=None,
  377. encoding_override=None,
  378. formatting_info=False,
  379. on_demand=False,
  380. ragged_rows=False,
  381. ):
  382. peeksz = 4
  383. if file_contents:
  384. peek = file_contents[:peeksz]
  385. else:
  386. with open(filename, "rb") as f:
  387. peek = f.read(peeksz)
  388. if peek == b"PK\x03\x04": # a ZIP file
  389. if file_contents:
  390. zf = zipfile.ZipFile(timemachine.BYTES_IO(file_contents))
  391. else:
  392. zf = zipfile.ZipFile(filename)
  393. # Workaround for some third party files that use forward slashes and
  394. # lower case names. We map the expected name in lowercase to the
  395. # actual filename in the zip container.
  396. component_names = dict([(X12Book.convert_filename(name), name)
  397. for name in zf.namelist()])
  398. if verbosity:
  399. logfile.write('ZIP component_names:\n')
  400. pprint.pprint(component_names, logfile)
  401. if 'xl/workbook.xml' in component_names:
  402. from . import xlsx
  403. bk = xlsx.open_workbook_2007_xml(
  404. zf,
  405. component_names,
  406. logfile=logfile,
  407. verbosity=verbosity,
  408. use_mmap=use_mmap,
  409. formatting_info=formatting_info,
  410. on_demand=on_demand,
  411. ragged_rows=ragged_rows,
  412. )
  413. return bk
  414. if 'xl/workbook.bin' in component_names:
  415. raise XLRDError('Excel 2007 xlsb file; not supported')
  416. if 'content.xml' in component_names:
  417. raise XLRDError('Openoffice.org ODS file; not supported')
  418. raise XLRDError('ZIP file contents not a known type of workbook')
  419. from . import book
  420. bk = book.open_workbook_xls(
  421. filename=filename,
  422. logfile=logfile,
  423. verbosity=verbosity,
  424. use_mmap=use_mmap,
  425. file_contents=file_contents,
  426. encoding_override=encoding_override,
  427. formatting_info=formatting_info,
  428. on_demand=on_demand,
  429. ragged_rows=ragged_rows,
  430. )
  431. return bk
  432. ##
  433. # For debugging: dump an XLS file's BIFF records in char & hex.
  434. # @param filename The path to the file to be dumped.
  435. # @param outfile An open file, to which the dump is written.
  436. # @param unnumbered If true, omit offsets (for meaningful diffs).
  437. def dump(filename, outfile=sys.stdout, unnumbered=False):
  438. from .biffh import biff_dump
  439. bk = Book()
  440. bk.biff2_8_load(filename=filename, logfile=outfile, )
  441. biff_dump(bk.mem, bk.base, bk.stream_len, 0, outfile, unnumbered)
  442. ##
  443. # For debugging and analysis: summarise the file's BIFF records.
  444. # I.e. produce a sorted file of (record_name, count).
  445. # @param filename The path to the file to be summarised.
  446. # @param outfile An open file, to which the summary is written.
  447. def count_records(filename, outfile=sys.stdout):
  448. from .biffh import biff_count_records
  449. bk = Book()
  450. bk.biff2_8_load(filename=filename, logfile=outfile, )
  451. biff_count_records(bk.mem, bk.base, bk.stream_len, outfile)