xlsx.py 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852
  1. ##
  2. # Portions copyright (c) 2008-2012 Stephen John Machin, Lingfo Pty Ltd
  3. # This module is part of the xlrd package, which is released under a BSD-style licence.
  4. ##
  5. from __future__ import print_function, unicode_literals
  6. DEBUG = 0
  7. from os.path import normpath, join
  8. import sys
  9. import re
  10. from .timemachine import *
  11. from .book import Book, Name
  12. from .biffh import error_text_from_code, XLRDError, XL_CELL_BLANK, XL_CELL_TEXT, XL_CELL_BOOLEAN, XL_CELL_ERROR
  13. from .formatting import is_date_format_string, Format, XF
  14. from .sheet import Sheet
  15. DLF = sys.stdout # Default Log File
  16. ET = None
  17. ET_has_iterparse = False
  18. Element_has_iter = False
  19. def ensure_elementtree_imported(verbosity, logfile):
  20. global ET, ET_has_iterparse, Element_has_iter
  21. if ET is not None:
  22. return
  23. if "IronPython" in sys.version:
  24. import xml.etree.ElementTree as ET
  25. #### 2.7.2.1: fails later with
  26. #### NotImplementedError: iterparse is not supported on IronPython. (CP #31923)
  27. else:
  28. try: import xml.etree.cElementTree as ET
  29. except ImportError:
  30. try: import cElementTree as ET
  31. except ImportError:
  32. try: import lxml.etree as ET
  33. except ImportError:
  34. try: import xml.etree.ElementTree as ET
  35. except ImportError:
  36. try: import elementtree.ElementTree as ET
  37. except ImportError:
  38. raise Exception("Failed to import an ElementTree implementation")
  39. if hasattr(ET, 'iterparse'):
  40. _dummy_stream = BYTES_IO(b'')
  41. try:
  42. ET.iterparse(_dummy_stream)
  43. ET_has_iterparse = True
  44. except NotImplementedError:
  45. pass
  46. Element_has_iter = hasattr(ET.ElementTree, 'iter')
  47. if verbosity:
  48. etree_version = repr([
  49. (item, getattr(ET, item))
  50. for item in ET.__dict__.keys()
  51. if item.lower().replace('_', '') == 'version'
  52. ])
  53. print(ET.__file__, ET.__name__, etree_version, ET_has_iterparse, file=logfile)
  54. def split_tag(tag):
  55. pos = tag.rfind('}') + 1
  56. if pos >= 2:
  57. return tag[:pos], tag[pos:]
  58. return '', tag
  59. def augment_keys(adict, uri):
  60. # uri must already be enclosed in {}
  61. for x in list(adict.keys()):
  62. adict[uri + x] = adict[x]
  63. _UPPERCASE_1_REL_INDEX = {} # Used in fast conversion of column names (e.g. "XFD") to indices (16383)
  64. for _x in xrange(26):
  65. _UPPERCASE_1_REL_INDEX["ABCDEFGHIJKLMNOPQRSTUVWXYZ"[_x]] = _x + 1
  66. for _x in "123456789":
  67. _UPPERCASE_1_REL_INDEX[_x] = 0
  68. del _x
  69. def cell_name_to_rowx_colx(cell_name, letter_value=_UPPERCASE_1_REL_INDEX,
  70. allow_no_col=False):
  71. # Extract column index from cell name
  72. # A<row number> => 0, Z =>25, AA => 26, XFD => 16383
  73. colx = 0
  74. charx = -1
  75. try:
  76. for c in cell_name:
  77. charx += 1
  78. lv = letter_value[c]
  79. if lv:
  80. colx = colx * 26 + lv
  81. else: # start of row number; can't be '0'
  82. if charx == 0:
  83. # there was no col marker
  84. if allow_no_col:
  85. colx = None
  86. break
  87. else:
  88. raise Exception(
  89. 'Missing col in cell name %r', cell_name)
  90. else:
  91. colx = colx - 1
  92. assert 0 <= colx < X12_MAX_COLS
  93. break
  94. except KeyError:
  95. raise Exception('Unexpected character %r in cell name %r' % (c, cell_name))
  96. rowx = int(cell_name[charx:]) - 1
  97. return rowx, colx
  98. error_code_from_text = {}
  99. for _code, _text in error_text_from_code.items():
  100. error_code_from_text[_text] = _code
  101. # === X12 === Excel 2007 .xlsx ===============================================
  102. U_SSML12 = "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}"
  103. U_ODREL = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}"
  104. U_PKGREL = "{http://schemas.openxmlformats.org/package/2006/relationships}"
  105. U_CP = "{http://schemas.openxmlformats.org/package/2006/metadata/core-properties}"
  106. U_DC = "{http://purl.org/dc/elements/1.1/}"
  107. U_DCTERMS = "{http://purl.org/dc/terms/}"
  108. XML_SPACE_ATTR = "{http://www.w3.org/XML/1998/namespace}space"
  109. XML_WHITESPACE = "\t\n \r"
  110. X12_MAX_ROWS = 2 ** 20
  111. X12_MAX_COLS = 2 ** 14
  112. V_TAG = U_SSML12 + 'v' # cell child: value
  113. F_TAG = U_SSML12 + 'f' # cell child: formula
  114. IS_TAG = U_SSML12 + 'is' # cell child: inline string
  115. def unescape(s,
  116. subber=re.compile(r'_x[0-9A-Fa-f]{4,4}_', re.UNICODE).sub,
  117. repl=lambda mobj: unichr(int(mobj.group(0)[2:6], 16)),
  118. ):
  119. if "_" in s:
  120. return subber(repl, s)
  121. return s
  122. def cooked_text(self, elem):
  123. t = elem.text
  124. if t is None:
  125. return ''
  126. if elem.get(XML_SPACE_ATTR) != 'preserve':
  127. t = t.strip(XML_WHITESPACE)
  128. return ensure_unicode(unescape(t))
  129. def get_text_from_si_or_is(self, elem, r_tag=U_SSML12+'r', t_tag=U_SSML12 +'t'):
  130. "Returns unescaped unicode"
  131. accum = []
  132. for child in elem:
  133. # self.dump_elem(child)
  134. tag = child.tag
  135. if tag == t_tag:
  136. t = cooked_text(self, child)
  137. if t: # note: .text attribute can be None
  138. accum.append(t)
  139. elif tag == r_tag:
  140. for tnode in child:
  141. if tnode.tag == t_tag:
  142. t = cooked_text(self, tnode)
  143. if t:
  144. accum.append(t)
  145. return ''.join(accum)
  146. def map_attributes(amap, elem, obj):
  147. for xml_attr, obj_attr, cnv_func_or_const in amap:
  148. if not xml_attr:
  149. setattr(obj, obj_attr, cnv_func_or_const)
  150. continue
  151. if not obj_attr: continue #### FIX ME ####
  152. raw_value = elem.get(xml_attr)
  153. cooked_value = cnv_func_or_const(raw_value)
  154. setattr(obj, obj_attr, cooked_value)
  155. def cnv_ST_Xstring(s):
  156. if s is None: return ""
  157. return ensure_unicode(s)
  158. def cnv_xsd_unsignedInt(s):
  159. if not s:
  160. return None
  161. value = int(s)
  162. assert value >= 0
  163. return value
  164. def cnv_xsd_boolean(s):
  165. if not s:
  166. return 0
  167. if s in ("1", "true", "on"):
  168. return 1
  169. if s in ("0", "false", "off"):
  170. return 0
  171. raise ValueError("unexpected xsd:boolean value: %r" % s)
  172. _defined_name_attribute_map = (
  173. ("name", "name", cnv_ST_Xstring, ),
  174. ("comment", "", cnv_ST_Xstring, ),
  175. ("customMenu", "", cnv_ST_Xstring, ),
  176. ("description", "", cnv_ST_Xstring, ),
  177. ("help", "", cnv_ST_Xstring, ),
  178. ("statusBar", "", cnv_ST_Xstring, ),
  179. ("localSheetId", "scope", cnv_xsd_unsignedInt, ),
  180. ("hidden", "hidden", cnv_xsd_boolean, ),
  181. ("function", "func", cnv_xsd_boolean, ),
  182. ("vbProcedure", "vbasic", cnv_xsd_boolean, ),
  183. ("xlm", "macro", cnv_xsd_boolean, ),
  184. ("functionGroupId", "funcgroup", cnv_xsd_unsignedInt, ),
  185. ("shortcutKey", "", cnv_ST_Xstring, ),
  186. ("publishToServer", "", cnv_xsd_boolean, ),
  187. ("workbookParameter", "", cnv_xsd_boolean, ),
  188. ("", "any_err", 0, ),
  189. ("", "any_external", 0, ),
  190. ("", "any_rel", 0, ),
  191. ("", "basic_formula_len", 0, ),
  192. ("", "binary", 0, ),
  193. ("", "builtin", 0, ),
  194. ("", "complex", 0, ),
  195. ("", "evaluated", 0, ),
  196. ("", "excel_sheet_index", 0, ),
  197. ("", "excel_sheet_num", 0, ),
  198. ("", "option_flags", 0, ),
  199. ("", "result", None, ),
  200. ("", "stack", None, ),
  201. )
  202. def make_name_access_maps(bk):
  203. name_and_scope_map = {} # (name.lower(), scope): Name_object
  204. name_map = {} # name.lower() : list of Name_objects (sorted in scope order)
  205. num_names = len(bk.name_obj_list)
  206. for namex in xrange(num_names):
  207. nobj = bk.name_obj_list[namex]
  208. name_lcase = nobj.name.lower()
  209. key = (name_lcase, nobj.scope)
  210. if key in name_and_scope_map:
  211. msg = 'Duplicate entry %r in name_and_scope_map' % (key, )
  212. if 0:
  213. raise XLRDError(msg)
  214. else:
  215. if bk.verbosity:
  216. print(msg, file=bk.logfile)
  217. name_and_scope_map[key] = nobj
  218. sort_data = (nobj.scope, namex, nobj)
  219. if name_lcase in name_map:
  220. name_map[name_lcase].append(sort_data)
  221. else:
  222. name_map[name_lcase] = [sort_data]
  223. for key in name_map.keys():
  224. alist = name_map[key]
  225. alist.sort()
  226. name_map[key] = [x[2] for x in alist]
  227. bk.name_and_scope_map = name_and_scope_map
  228. bk.name_map = name_map
  229. class X12General(object):
  230. def process_stream(self, stream, heading=None):
  231. if self.verbosity >= 2 and heading is not None:
  232. fprintf(self.logfile, "\n=== %s ===\n", heading)
  233. self.tree = ET.parse(stream)
  234. getmethod = self.tag2meth.get
  235. for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator():
  236. if self.verbosity >= 3:
  237. self.dump_elem(elem)
  238. meth = getmethod(elem.tag)
  239. if meth:
  240. meth(self, elem)
  241. self.finish_off()
  242. def finish_off(self):
  243. pass
  244. def dump_elem(self, elem):
  245. fprintf(self.logfile,
  246. "===\ntag=%r len=%d attrib=%r text=%r tail=%r\n",
  247. split_tag(elem.tag)[1], len(elem), elem.attrib, elem.text, elem.tail)
  248. def dumpout(self, fmt, *vargs):
  249. text = (12 * ' ' + fmt + '\n') % vargs
  250. self.logfile.write(text)
  251. class X12Book(X12General):
  252. def __init__(self, bk, logfile=DLF, verbosity=False):
  253. self.bk = bk
  254. self.logfile = logfile
  255. self.verbosity = verbosity
  256. self.bk.nsheets = 0
  257. self.bk.props = {}
  258. self.relid2path = {}
  259. self.relid2reltype = {}
  260. self.sheet_targets = [] # indexed by sheetx
  261. self.sheetIds = [] # indexed by sheetx
  262. core_props_menu = {
  263. U_CP+"lastModifiedBy": ("last_modified_by", cnv_ST_Xstring),
  264. U_DC+"creator": ("creator", cnv_ST_Xstring),
  265. U_DCTERMS+"modified": ("modified", cnv_ST_Xstring),
  266. U_DCTERMS+"created": ("created", cnv_ST_Xstring),
  267. }
  268. def process_coreprops(self, stream):
  269. if self.verbosity >= 2:
  270. fprintf(self.logfile, "\n=== coreProps ===\n")
  271. self.tree = ET.parse(stream)
  272. getmenu = self.core_props_menu.get
  273. props = {}
  274. for elem in self.tree.iter() if Element_has_iter else self.tree.getiterator():
  275. if self.verbosity >= 3:
  276. self.dump_elem(elem)
  277. menu = getmenu(elem.tag)
  278. if menu:
  279. attr, func = menu
  280. value = func(elem.text)
  281. props[attr] = value
  282. self.bk.user_name = props.get('last_modified_by') or props.get('creator')
  283. self.bk.props = props
  284. if self.verbosity >= 2:
  285. fprintf(self.logfile, "props: %r\n", props)
  286. self.finish_off()
  287. @staticmethod
  288. def convert_filename(name):
  289. return name.replace('\\', '/').lower()
  290. def process_rels(self, stream):
  291. if self.verbosity >= 2:
  292. fprintf(self.logfile, "\n=== Relationships ===\n")
  293. tree = ET.parse(stream)
  294. r_tag = U_PKGREL + 'Relationship'
  295. for elem in tree.findall(r_tag):
  296. rid = elem.get('Id')
  297. target = X12Book.convert_filename(elem.get('Target'))
  298. reltype = elem.get('Type').split('/')[-1]
  299. if self.verbosity >= 2:
  300. self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target)
  301. self.relid2reltype[rid] = reltype
  302. # self.relid2path[rid] = 'xl/' + target
  303. if target.startswith('/'):
  304. self.relid2path[rid] = target[1:] # drop the /
  305. else:
  306. self.relid2path[rid] = 'xl/' + target
  307. def do_defined_name(self, elem):
  308. #### UNDER CONSTRUCTION ####
  309. if 0 and self.verbosity >= 3:
  310. self.dump_elem(elem)
  311. nobj = Name()
  312. bk = self.bk
  313. nobj.bk = bk
  314. nobj.name_index = len(bk.name_obj_list)
  315. bk.name_obj_list.append(nobj)
  316. nobj.name = elem.get('name')
  317. nobj.raw_formula = None # compiled bytecode formula -- not in XLSX
  318. nobj.formula_text = cooked_text(self, elem)
  319. map_attributes(_defined_name_attribute_map, elem, nobj)
  320. if nobj.scope is None:
  321. nobj.scope = -1 # global
  322. if nobj.name.startswith("_xlnm."):
  323. nobj.builtin = 1
  324. if self.verbosity >= 2:
  325. nobj.dump(header='=== Name object ===')
  326. def do_defined_names(self, elem):
  327. for child in elem:
  328. self.do_defined_name(child)
  329. make_name_access_maps(self.bk)
  330. def do_sheet(self, elem):
  331. bk = self.bk
  332. sheetx = bk.nsheets
  333. # print elem.attrib
  334. rid = elem.get(U_ODREL + 'id')
  335. sheetId = int(elem.get('sheetId'))
  336. name = unescape(ensure_unicode(elem.get('name')))
  337. reltype = self.relid2reltype[rid]
  338. target = self.relid2path[rid]
  339. if self.verbosity >= 2:
  340. self.dumpout(
  341. 'sheetx=%d sheetId=%r rid=%r type=%r name=%r',
  342. sheetx, sheetId, rid, reltype, name)
  343. if reltype != 'worksheet':
  344. if self.verbosity >= 2:
  345. self.dumpout('Ignoring sheet of type %r (name=%r)', reltype, name)
  346. return
  347. state = elem.get('state')
  348. visibility_map = {
  349. None: 0,
  350. 'visible': 0,
  351. 'hidden': 1,
  352. 'veryHidden': 2
  353. }
  354. bk._sheet_visibility.append(visibility_map[state])
  355. sheet = Sheet(bk, position=None, name=name, number=sheetx)
  356. sheet.utter_max_rows = X12_MAX_ROWS
  357. sheet.utter_max_cols = X12_MAX_COLS
  358. bk._sheet_list.append(sheet)
  359. bk._sheet_names.append(name)
  360. bk.nsheets += 1
  361. self.sheet_targets.append(target)
  362. self.sheetIds.append(sheetId)
  363. def do_workbookpr(self, elem):
  364. datemode = cnv_xsd_boolean(elem.get('date1904'))
  365. if self.verbosity >= 2:
  366. self.dumpout('datemode=%r', datemode)
  367. self.bk.datemode = datemode
  368. tag2meth = {
  369. 'definedNames': do_defined_names,
  370. 'workbookPr': do_workbookpr,
  371. 'sheet': do_sheet,
  372. }
  373. augment_keys(tag2meth, U_SSML12)
  374. class X12SST(X12General):
  375. def __init__(self, bk, logfile=DLF, verbosity=0):
  376. self.bk = bk
  377. self.logfile = logfile
  378. self.verbosity = verbosity
  379. if ET_has_iterparse:
  380. self.process_stream = self.process_stream_iterparse
  381. else:
  382. self.process_stream = self.process_stream_findall
  383. def process_stream_iterparse(self, stream, heading=None):
  384. if self.verbosity >= 2 and heading is not None:
  385. fprintf(self.logfile, "\n=== %s ===\n", heading)
  386. si_tag = U_SSML12 + 'si'
  387. elemno = -1
  388. sst = self.bk._sharedstrings
  389. for event, elem in ET.iterparse(stream):
  390. if elem.tag != si_tag: continue
  391. elemno = elemno + 1
  392. if self.verbosity >= 3:
  393. fprintf(self.logfile, "element #%d\n", elemno)
  394. self.dump_elem(elem)
  395. result = get_text_from_si_or_is(self, elem)
  396. sst.append(result)
  397. elem.clear() # destroy all child elements
  398. if self.verbosity >= 2:
  399. self.dumpout('Entries in SST: %d', len(sst))
  400. if self.verbosity >= 3:
  401. for x, s in enumerate(sst):
  402. fprintf(self.logfile, "SST x=%d s=%r\n", x, s)
  403. def process_stream_findall(self, stream, heading=None):
  404. if self.verbosity >= 2 and heading is not None:
  405. fprintf(self.logfile, "\n=== %s ===\n", heading)
  406. self.tree = ET.parse(stream)
  407. si_tag = U_SSML12 + 'si'
  408. elemno = -1
  409. sst = self.bk._sharedstrings
  410. for elem in self.tree.findall(si_tag):
  411. elemno = elemno + 1
  412. if self.verbosity >= 3:
  413. fprintf(self.logfile, "element #%d\n", elemno)
  414. self.dump_elem(elem)
  415. result = get_text_from_si_or_is(self, elem)
  416. sst.append(result)
  417. if self.verbosity >= 2:
  418. self.dumpout('Entries in SST: %d', len(sst))
  419. class X12Styles(X12General):
  420. def __init__(self, bk, logfile=DLF, verbosity=0):
  421. self.bk = bk
  422. self.logfile = logfile
  423. self.verbosity = verbosity
  424. self.xf_counts = [0, 0]
  425. self.xf_type = None
  426. self.fmt_is_date = {}
  427. for x in list(range(14, 23)) + list(range(45, 48)): #### hard-coding FIX ME ####
  428. self.fmt_is_date[x] = 1
  429. # dummy entry for XF 0 in case no Styles section
  430. self.bk._xf_index_to_xl_type_map[0] = 2
  431. # fill_in_standard_formats(bk) #### pre-integration kludge
  432. def do_cellstylexfs(self, elem):
  433. self.xf_type = 0
  434. def do_cellxfs(self, elem):
  435. self.xf_type = 1
  436. def do_numfmt(self, elem):
  437. formatCode = ensure_unicode(elem.get('formatCode'))
  438. numFmtId = int(elem.get('numFmtId'))
  439. is_date = is_date_format_string(self.bk, formatCode)
  440. self.fmt_is_date[numFmtId] = is_date
  441. fmt_obj = Format(numFmtId, is_date + 2, formatCode)
  442. self.bk.format_map[numFmtId] = fmt_obj
  443. if self.verbosity >= 3:
  444. self.dumpout('numFmtId=%d formatCode=%r is_date=%d', numFmtId, formatCode, is_date)
  445. def do_xf(self, elem):
  446. if self.xf_type != 1:
  447. #### ignoring style XFs for the moment
  448. return
  449. xfx = self.xf_counts[self.xf_type]
  450. self.xf_counts[self.xf_type] = xfx + 1
  451. xf = XF()
  452. self.bk.xf_list.append(xf)
  453. self.bk.xfcount += 1
  454. numFmtId = int(elem.get('numFmtId', '0'))
  455. xf.format_key = numFmtId
  456. is_date = self.fmt_is_date.get(numFmtId, 0)
  457. self.bk._xf_index_to_xl_type_map[xfx] = is_date + 2
  458. if self.verbosity >= 3:
  459. self.dumpout(
  460. 'xfx=%d numFmtId=%d',
  461. xfx, numFmtId,
  462. )
  463. self.dumpout(repr(self.bk._xf_index_to_xl_type_map))
  464. tag2meth = {
  465. 'cellStyleXfs': do_cellstylexfs,
  466. 'cellXfs': do_cellxfs,
  467. 'numFmt': do_numfmt,
  468. 'xf': do_xf,
  469. }
  470. augment_keys(tag2meth, U_SSML12)
  471. class X12Sheet(X12General):
  472. def __init__(self, sheet, logfile=DLF, verbosity=0):
  473. self.sheet = sheet
  474. self.logfile = logfile
  475. self.verbosity = verbosity
  476. self.rowx = -1 # We may need to count them.
  477. self.bk = sheet.book
  478. self.sst = self.bk._sharedstrings
  479. self.relid2path = {}
  480. self.relid2reltype = {}
  481. self.merged_cells = sheet.merged_cells
  482. self.warned_no_cell_name = 0
  483. self.warned_no_row_num = 0
  484. if ET_has_iterparse:
  485. self.process_stream = self.own_process_stream
  486. def own_process_stream(self, stream, heading=None):
  487. if self.verbosity >= 2 and heading is not None:
  488. fprintf(self.logfile, "\n=== %s ===\n", heading)
  489. getmethod = self.tag2meth.get
  490. row_tag = U_SSML12 + "row"
  491. self_do_row = self.do_row
  492. for event, elem in ET.iterparse(stream):
  493. if elem.tag == row_tag:
  494. self_do_row(elem)
  495. elem.clear() # destroy all child elements (cells)
  496. elif elem.tag == U_SSML12 + "dimension":
  497. self.do_dimension(elem)
  498. elif elem.tag == U_SSML12 + "mergeCell":
  499. self.do_merge_cell(elem)
  500. self.finish_off()
  501. def process_rels(self, stream):
  502. if self.verbosity >= 2:
  503. fprintf(self.logfile, "\n=== Sheet Relationships ===\n")
  504. tree = ET.parse(stream)
  505. r_tag = U_PKGREL + 'Relationship'
  506. for elem in tree.findall(r_tag):
  507. rid = elem.get('Id')
  508. target = elem.get('Target')
  509. reltype = elem.get('Type').split('/')[-1]
  510. if self.verbosity >= 2:
  511. self.dumpout('Id=%r Type=%r Target=%r', rid, reltype, target)
  512. self.relid2reltype[rid] = reltype
  513. self.relid2path[rid] = normpath(join('xl/worksheets', target))
  514. def process_comments_stream(self, stream):
  515. root = ET.parse(stream).getroot()
  516. author_list = root[0]
  517. assert author_list.tag == U_SSML12 + 'authors'
  518. authors = [elem.text for elem in author_list]
  519. comment_list = root[1]
  520. assert comment_list.tag == U_SSML12 + 'commentList'
  521. cell_note_map = self.sheet.cell_note_map
  522. from .sheet import Note
  523. text_tag = U_SSML12 + 'text'
  524. r_tag = U_SSML12 + 'r'
  525. t_tag = U_SSML12 + 't'
  526. for elem in comment_list.findall(U_SSML12 + 'comment'):
  527. ts = elem.findall('./' + text_tag + '/' + t_tag)
  528. ts += elem.findall('./' + text_tag + '/' + r_tag + '/' + t_tag)
  529. ref = elem.get('ref')
  530. note = Note()
  531. note.author = authors[int(elem.get('authorId'))]
  532. note.rowx, note.colx = coords = cell_name_to_rowx_colx(ref)
  533. note.text = ''
  534. for t in ts:
  535. note.text += cooked_text(self, t)
  536. cell_note_map[coords] = note
  537. def do_dimension(self, elem):
  538. ref = elem.get('ref') # example: "A1:Z99" or just "A1"
  539. if ref:
  540. # print >> self.logfile, "dimension: ref=%r" % ref
  541. last_cell_ref = ref.split(':')[-1] # example: "Z99"
  542. rowx, colx = cell_name_to_rowx_colx(
  543. last_cell_ref, allow_no_col=True)
  544. self.sheet._dimnrows = rowx + 1
  545. if colx is not None:
  546. self.sheet._dimncols = colx + 1
  547. def do_merge_cell(self, elem):
  548. # The ref attribute should be a cell range like "B1:D5".
  549. ref = elem.get('ref')
  550. if ref:
  551. first_cell_ref, last_cell_ref = ref.split(':')
  552. first_rowx, first_colx = cell_name_to_rowx_colx(first_cell_ref)
  553. last_rowx, last_colx = cell_name_to_rowx_colx(last_cell_ref)
  554. self.merged_cells.append((first_rowx, last_rowx + 1,
  555. first_colx, last_colx + 1))
  556. def do_row(self, row_elem):
  557. def bad_child_tag(child_tag):
  558. raise Exception('cell type %s has unexpected child <%s> at rowx=%r colx=%r' % (cell_type, child_tag, rowx, colx))
  559. row_number = row_elem.get('r')
  560. if row_number is None: # Yes, it's optional.
  561. self.rowx += 1
  562. explicit_row_number = 0
  563. if self.verbosity and not self.warned_no_row_num:
  564. self.dumpout("no row number; assuming rowx=%d", self.rowx)
  565. self.warned_no_row_num = 1
  566. else:
  567. self.rowx = int(row_number) - 1
  568. explicit_row_number = 1
  569. assert 0 <= self.rowx < X12_MAX_ROWS
  570. rowx = self.rowx
  571. colx = -1
  572. if self.verbosity >= 3:
  573. self.dumpout("<row> row_number=%r rowx=%d explicit=%d",
  574. row_number, self.rowx, explicit_row_number)
  575. letter_value = _UPPERCASE_1_REL_INDEX
  576. for cell_elem in row_elem:
  577. cell_name = cell_elem.get('r')
  578. if cell_name is None: # Yes, it's optional.
  579. colx += 1
  580. if self.verbosity and not self.warned_no_cell_name:
  581. self.dumpout("no cellname; assuming rowx=%d colx=%d", rowx, colx)
  582. self.warned_no_cell_name = 1
  583. else:
  584. # Extract column index from cell name
  585. # A<row number> => 0, Z =>25, AA => 26, XFD => 16383
  586. colx = 0
  587. charx = -1
  588. try:
  589. for c in cell_name:
  590. charx += 1
  591. if c == '$':
  592. continue
  593. lv = letter_value[c]
  594. if lv:
  595. colx = colx * 26 + lv
  596. else: # start of row number; can't be '0'
  597. colx = colx - 1
  598. assert 0 <= colx < X12_MAX_COLS
  599. break
  600. except KeyError:
  601. raise Exception('Unexpected character %r in cell name %r' % (c, cell_name))
  602. if explicit_row_number and cell_name[charx:] != row_number:
  603. raise Exception('cell name %r but row number is %r' % (cell_name, row_number))
  604. xf_index = int(cell_elem.get('s', '0'))
  605. cell_type = cell_elem.get('t', 'n')
  606. tvalue = None
  607. formula = None
  608. if cell_type == 'n':
  609. # n = number. Most frequent type.
  610. # <v> child contains plain text which can go straight into float()
  611. # OR there's no text in which case it's a BLANK cell
  612. for child in cell_elem:
  613. child_tag = child.tag
  614. if child_tag == V_TAG:
  615. tvalue = child.text
  616. elif child_tag == F_TAG:
  617. formula = cooked_text(self, child)
  618. else:
  619. raise Exception('unexpected tag %r' % child_tag)
  620. if not tvalue:
  621. if self.bk.formatting_info:
  622. self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index)
  623. else:
  624. self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index)
  625. elif cell_type == "s":
  626. # s = index into shared string table. 2nd most frequent type
  627. # <v> child contains plain text which can go straight into int()
  628. for child in cell_elem:
  629. child_tag = child.tag
  630. if child_tag == V_TAG:
  631. tvalue = child.text
  632. elif child_tag == F_TAG:
  633. # formula not expected here, but gnumeric does it.
  634. formula = child.text
  635. else:
  636. bad_child_tag(child_tag)
  637. if not tvalue:
  638. # <c r="A1" t="s"/>
  639. if self.bk.formatting_info:
  640. self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index)
  641. else:
  642. value = self.sst[int(tvalue)]
  643. self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, value, xf_index)
  644. elif cell_type == "str":
  645. # str = string result from formula.
  646. # Should have <f> (formula) child; however in one file, all text cells are str with no formula.
  647. # <v> child can contain escapes
  648. for child in cell_elem:
  649. child_tag = child.tag
  650. if child_tag == V_TAG:
  651. tvalue = cooked_text(self, child)
  652. elif child_tag == F_TAG:
  653. formula = cooked_text(self, child)
  654. else:
  655. bad_child_tag(child_tag)
  656. # assert tvalue is not None and formula is not None
  657. # Yuk. Fails with file created by gnumeric -- no tvalue!
  658. self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index)
  659. elif cell_type == "b":
  660. # b = boolean
  661. # <v> child contains "0" or "1"
  662. # Maybe the data should be converted with cnv_xsd_boolean;
  663. # ECMA standard is silent; Excel 2007 writes 0 or 1
  664. for child in cell_elem:
  665. child_tag = child.tag
  666. if child_tag == V_TAG:
  667. tvalue = child.text
  668. elif child_tag == F_TAG:
  669. formula = cooked_text(self, child)
  670. else:
  671. bad_child_tag(child_tag)
  672. self.sheet.put_cell(rowx, colx, XL_CELL_BOOLEAN, int(tvalue), xf_index)
  673. elif cell_type == "e":
  674. # e = error
  675. # <v> child contains e.g. "#REF!"
  676. for child in cell_elem:
  677. child_tag = child.tag
  678. if child_tag == V_TAG:
  679. tvalue = child.text
  680. elif child_tag == F_TAG:
  681. formula = cooked_text(self, child)
  682. else:
  683. bad_child_tag(child_tag)
  684. value = error_code_from_text[tvalue]
  685. self.sheet.put_cell(rowx, colx, XL_CELL_ERROR, value, xf_index)
  686. elif cell_type == "inlineStr":
  687. # Not expected in files produced by Excel.
  688. # It's a way of allowing 3rd party s/w to write text (including rich text) cells
  689. # without having to build a shared string table
  690. for child in cell_elem:
  691. child_tag = child.tag
  692. if child_tag == IS_TAG:
  693. tvalue = get_text_from_si_or_is(self, child)
  694. elif child_tag == V_TAG:
  695. tvalue = child.text
  696. elif child_tag == F_TAG:
  697. formula = child.text
  698. else:
  699. bad_child_tag(child_tag)
  700. if not tvalue:
  701. if self.bk.formatting_info:
  702. self.sheet.put_cell(rowx, colx, XL_CELL_BLANK, '', xf_index)
  703. else:
  704. self.sheet.put_cell(rowx, colx, XL_CELL_TEXT, tvalue, xf_index)
  705. else:
  706. raise Exception("Unknown cell type %r in rowx=%d colx=%d" % (cell_type, rowx, colx))
  707. tag2meth = {
  708. 'row': do_row,
  709. }
  710. augment_keys(tag2meth, U_SSML12)
  711. def open_workbook_2007_xml(
  712. zf,
  713. component_names,
  714. logfile=sys.stdout,
  715. verbosity=0,
  716. use_mmap=0,
  717. formatting_info=0,
  718. on_demand=0,
  719. ragged_rows=0,
  720. ):
  721. ensure_elementtree_imported(verbosity, logfile)
  722. bk = Book()
  723. bk.logfile = logfile
  724. bk.verbosity = verbosity
  725. bk.formatting_info = formatting_info
  726. if formatting_info:
  727. raise NotImplementedError("formatting_info=True not yet implemented")
  728. bk.use_mmap = False #### Not supported initially
  729. bk.on_demand = on_demand
  730. if on_demand:
  731. if verbosity:
  732. print("WARNING *** on_demand=True not yet implemented; falling back to False", file=bk.logfile)
  733. bk.on_demand = False
  734. bk.ragged_rows = ragged_rows
  735. x12book = X12Book(bk, logfile, verbosity)
  736. zflo = zf.open(component_names['xl/_rels/workbook.xml.rels'])
  737. x12book.process_rels(zflo)
  738. del zflo
  739. zflo = zf.open(component_names['xl/workbook.xml'])
  740. x12book.process_stream(zflo, 'Workbook')
  741. del zflo
  742. props_name = 'docprops/core.xml'
  743. if props_name in component_names:
  744. zflo = zf.open(component_names[props_name])
  745. x12book.process_coreprops(zflo)
  746. x12sty = X12Styles(bk, logfile, verbosity)
  747. if 'xl/styles.xml' in component_names:
  748. zflo = zf.open(component_names['xl/styles.xml'])
  749. x12sty.process_stream(zflo, 'styles')
  750. del zflo
  751. else:
  752. # seen in MS sample file MergedCells.xlsx
  753. pass
  754. sst_fname = 'xl/sharedstrings.xml'
  755. x12sst = X12SST(bk, logfile, verbosity)
  756. if sst_fname in component_names:
  757. zflo = zf.open(component_names[sst_fname])
  758. x12sst.process_stream(zflo, 'SST')
  759. del zflo
  760. for sheetx in range(bk.nsheets):
  761. fname = x12book.sheet_targets[sheetx]
  762. zflo = zf.open(component_names[fname])
  763. sheet = bk._sheet_list[sheetx]
  764. x12sheet = X12Sheet(sheet, logfile, verbosity)
  765. heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname)
  766. x12sheet.process_stream(zflo, heading)
  767. del zflo
  768. rels_fname = 'xl/worksheets/_rels/%s.rels' % fname.rsplit('/', 1)[-1]
  769. if rels_fname in component_names:
  770. zfrels = zf.open(rels_fname)
  771. x12sheet.process_rels(zfrels)
  772. del zfrels
  773. for relid, reltype in x12sheet.relid2reltype.items():
  774. if reltype == 'comments':
  775. comments_fname = x12sheet.relid2path.get(relid)
  776. if comments_fname and comments_fname in component_names:
  777. comments_stream = zf.open(comments_fname)
  778. x12sheet.process_comments_stream(comments_stream)
  779. del comments_stream
  780. sheet.tidy_dimensions()
  781. return bk