compdoc.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. # -*- coding: cp1252 -*-
  2. ##
  3. # Implements the minimal functionality required
  4. # to extract a "Workbook" or "Book" stream (as one big string)
  5. # from an OLE2 Compound Document file.
# <p>Copyright (c) 2005-2012 Stephen John Machin, Lingfo Pty Ltd</p>
  7. # <p>This module is part of the xlrd package, which is released under a BSD-style licence.</p>
  8. ##
  9. # No part of the content of this file was derived from the works of David Giffin.
  10. # 2008-11-04 SJM Avoid assertion error when -1 used instead of -2 for first_SID of empty SCSS [Frank Hoffsuemmer]
  11. # 2007-09-08 SJM Warning message if sector sizes are extremely large.
  12. # 2007-05-07 SJM Meaningful exception instead of IndexError if a SAT (sector allocation table) is corrupted.
  13. # 2007-04-22 SJM Missing "<" in a struct.unpack call => can't open files on bigendian platforms.
  14. from __future__ import print_function
  15. import sys
  16. from struct import unpack
  17. from .timemachine import *
  18. import array
  19. ##
  20. # Magic cookie that should appear in the first 8 bytes of the file.
  21. SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
  22. EOCSID = -2
  23. FREESID = -1
  24. SATSID = -3
  25. MSATSID = -4
  26. EVILSID = -5
  27. class CompDocError(Exception):
  28. pass
  29. class DirNode(object):
  30. def __init__(self, DID, dent, DEBUG=0, logfile=sys.stdout):
  31. # dent is the 128-byte directory entry
  32. self.DID = DID
  33. self.logfile = logfile
  34. (cbufsize, self.etype, self.colour, self.left_DID, self.right_DID,
  35. self.root_DID) = \
  36. unpack('<HBBiii', dent[64:80])
  37. (self.first_SID, self.tot_size) = \
  38. unpack('<ii', dent[116:124])
  39. if cbufsize == 0:
  40. self.name = UNICODE_LITERAL('')
  41. else:
  42. self.name = unicode(dent[0:cbufsize-2], 'utf_16_le') # omit the trailing U+0000
  43. self.children = [] # filled in later
  44. self.parent = -1 # indicates orphan; fixed up later
  45. self.tsinfo = unpack('<IIII', dent[100:116])
  46. if DEBUG:
  47. self.dump(DEBUG)
  48. def dump(self, DEBUG=1):
  49. fprintf(
  50. self.logfile,
  51. "DID=%d name=%r etype=%d DIDs(left=%d right=%d root=%d parent=%d kids=%r) first_SID=%d tot_size=%d\n",
  52. self.DID, self.name, self.etype, self.left_DID,
  53. self.right_DID, self.root_DID, self.parent, self.children, self.first_SID, self.tot_size
  54. )
  55. if DEBUG == 2:
  56. # cre_lo, cre_hi, mod_lo, mod_hi = tsinfo
  57. print("timestamp info", self.tsinfo, file=self.logfile)
  58. def _build_family_tree(dirlist, parent_DID, child_DID):
  59. if child_DID < 0: return
  60. _build_family_tree(dirlist, parent_DID, dirlist[child_DID].left_DID)
  61. dirlist[parent_DID].children.append(child_DID)
  62. dirlist[child_DID].parent = parent_DID
  63. _build_family_tree(dirlist, parent_DID, dirlist[child_DID].right_DID)
  64. if dirlist[child_DID].etype == 1: # storage
  65. _build_family_tree(dirlist, child_DID, dirlist[child_DID].root_DID)
  66. ##
  67. # Compound document handler.
  68. # @param mem The raw contents of the file, as a string, or as an mmap.mmap() object. The
  69. # only operation it needs to support is slicing.
  70. class CompDoc(object):
  71. def __init__(self, mem, logfile=sys.stdout, DEBUG=0):
  72. self.logfile = logfile
  73. self.DEBUG = DEBUG
  74. if mem[0:8] != SIGNATURE:
  75. raise CompDocError('Not an OLE2 compound document')
  76. if mem[28:30] != b'\xFE\xFF':
  77. raise CompDocError('Expected "little-endian" marker, found %r' % mem[28:30])
  78. revision, version = unpack('<HH', mem[24:28])
  79. if DEBUG:
  80. print("\nCompDoc format: version=0x%04x revision=0x%04x" % (version, revision), file=logfile)
  81. self.mem = mem
  82. ssz, sssz = unpack('<HH', mem[30:34])
  83. if ssz > 20: # allows for 2**20 bytes i.e. 1MB
  84. print("WARNING: sector size (2**%d) is preposterous; assuming 512 and continuing ..." \
  85. % ssz, file=logfile)
  86. ssz = 9
  87. if sssz > ssz:
  88. print("WARNING: short stream sector size (2**%d) is preposterous; assuming 64 and continuing ..." \
  89. % sssz, file=logfile)
  90. sssz = 6
  91. self.sec_size = sec_size = 1 << ssz
  92. self.short_sec_size = 1 << sssz
  93. if self.sec_size != 512 or self.short_sec_size != 64:
  94. print("@@@@ sec_size=%d short_sec_size=%d" % (self.sec_size, self.short_sec_size), file=logfile)
  95. (
  96. SAT_tot_secs, self.dir_first_sec_sid, _unused, self.min_size_std_stream,
  97. SSAT_first_sec_sid, SSAT_tot_secs,
  98. MSATX_first_sec_sid, MSATX_tot_secs,
  99. # ) = unpack('<ii4xiiiii', mem[44:76])
  100. ) = unpack('<iiiiiiii', mem[44:76])
  101. mem_data_len = len(mem) - 512
  102. mem_data_secs, left_over = divmod(mem_data_len, sec_size)
  103. if left_over:
  104. #### raise CompDocError("Not a whole number of sectors")
  105. mem_data_secs += 1
  106. print("WARNING *** file size (%d) not 512 + multiple of sector size (%d)" \
  107. % (len(mem), sec_size), file=logfile)
  108. self.mem_data_secs = mem_data_secs # use for checking later
  109. self.mem_data_len = mem_data_len
  110. seen = self.seen = array.array('B', [0]) * mem_data_secs
  111. if DEBUG:
  112. print('sec sizes', ssz, sssz, sec_size, self.short_sec_size, file=logfile)
  113. print("mem data: %d bytes == %d sectors" % (mem_data_len, mem_data_secs), file=logfile)
  114. print("SAT_tot_secs=%d, dir_first_sec_sid=%d, min_size_std_stream=%d" \
  115. % (SAT_tot_secs, self.dir_first_sec_sid, self.min_size_std_stream,), file=logfile)
  116. print("SSAT_first_sec_sid=%d, SSAT_tot_secs=%d" % (SSAT_first_sec_sid, SSAT_tot_secs,), file=logfile)
  117. print("MSATX_first_sec_sid=%d, MSATX_tot_secs=%d" % (MSATX_first_sec_sid, MSATX_tot_secs,), file=logfile)
  118. nent = sec_size // 4 # number of SID entries in a sector
  119. fmt = "<%di" % nent
  120. trunc_warned = 0
  121. #
  122. # === build the MSAT ===
  123. #
  124. MSAT = list(unpack('<109i', mem[76:512]))
  125. SAT_sectors_reqd = (mem_data_secs + nent - 1) // nent
  126. expected_MSATX_sectors = max(0, (SAT_sectors_reqd - 109 + nent - 2) // (nent - 1))
  127. actual_MSATX_sectors = 0
  128. if MSATX_tot_secs == 0 and MSATX_first_sec_sid in (EOCSID, FREESID, 0):
  129. # Strictly, if there is no MSAT extension, then MSATX_first_sec_sid
  130. # should be set to EOCSID ... FREESID and 0 have been met in the wild.
  131. pass # Presuming no extension
  132. else:
  133. sid = MSATX_first_sec_sid
  134. while sid not in (EOCSID, FREESID, MSATSID):
  135. # Above should be only EOCSID according to MS & OOo docs
  136. # but Excel doesn't complain about FREESID. Zero is a valid
  137. # sector number, not a sentinel.
  138. if DEBUG > 1:
  139. print('MSATX: sid=%d (0x%08X)' % (sid, sid), file=logfile)
  140. if sid >= mem_data_secs:
  141. msg = "MSAT extension: accessing sector %d but only %d in file" % (sid, mem_data_secs)
  142. if DEBUG > 1:
  143. print(msg, file=logfile)
  144. break
  145. raise CompDocError(msg)
  146. elif sid < 0:
  147. raise CompDocError("MSAT extension: invalid sector id: %d" % sid)
  148. if seen[sid]:
  149. raise CompDocError("MSAT corruption: seen[%d] == %d" % (sid, seen[sid]))
  150. seen[sid] = 1
  151. actual_MSATX_sectors += 1
  152. if DEBUG and actual_MSATX_sectors > expected_MSATX_sectors:
  153. print("[1]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile)
  154. offset = 512 + sec_size * sid
  155. MSAT.extend(unpack(fmt, mem[offset:offset+sec_size]))
  156. sid = MSAT.pop() # last sector id is sid of next sector in the chain
  157. if DEBUG and actual_MSATX_sectors != expected_MSATX_sectors:
  158. print("[2]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, file=logfile)
  159. if DEBUG:
  160. print("MSAT: len =", len(MSAT), file=logfile)
  161. dump_list(MSAT, 10, logfile)
  162. #
  163. # === build the SAT ===
  164. #
  165. self.SAT = []
  166. actual_SAT_sectors = 0
  167. dump_again = 0
  168. for msidx in xrange(len(MSAT)):
  169. msid = MSAT[msidx]
  170. if msid in (FREESID, EOCSID):
  171. # Specification: the MSAT array may be padded with trailing FREESID entries.
  172. # Toleration: a FREESID or EOCSID entry anywhere in the MSAT array will be ignored.
  173. continue
  174. if msid >= mem_data_secs:
  175. if not trunc_warned:
  176. print("WARNING *** File is truncated, or OLE2 MSAT is corrupt!!", file=logfile)
  177. print("INFO: Trying to access sector %d but only %d available" \
  178. % (msid, mem_data_secs), file=logfile)
  179. trunc_warned = 1
  180. MSAT[msidx] = EVILSID
  181. dump_again = 1
  182. continue
  183. elif msid < -2:
  184. raise CompDocError("MSAT: invalid sector id: %d" % msid)
  185. if seen[msid]:
  186. raise CompDocError("MSAT extension corruption: seen[%d] == %d" % (msid, seen[msid]))
  187. seen[msid] = 2
  188. actual_SAT_sectors += 1
  189. if DEBUG and actual_SAT_sectors > SAT_sectors_reqd:
  190. print("[3]===>>>", mem_data_secs, nent, SAT_sectors_reqd, expected_MSATX_sectors, actual_MSATX_sectors, actual_SAT_sectors, msid, file=logfile)
  191. offset = 512 + sec_size * msid
  192. self.SAT.extend(unpack(fmt, mem[offset:offset+sec_size]))
  193. if DEBUG:
  194. print("SAT: len =", len(self.SAT), file=logfile)
  195. dump_list(self.SAT, 10, logfile)
  196. # print >> logfile, "SAT ",
  197. # for i, s in enumerate(self.SAT):
  198. # print >> logfile, "entry: %4d offset: %6d, next entry: %4d" % (i, 512 + sec_size * i, s)
  199. # print >> logfile, "%d:%d " % (i, s),
  200. print(file=logfile)
  201. if DEBUG and dump_again:
  202. print("MSAT: len =", len(MSAT), file=logfile)
  203. dump_list(MSAT, 10, logfile)
  204. for satx in xrange(mem_data_secs, len(self.SAT)):
  205. self.SAT[satx] = EVILSID
  206. print("SAT: len =", len(self.SAT), file=logfile)
  207. dump_list(self.SAT, 10, logfile)
  208. #
  209. # === build the directory ===
  210. #
  211. dbytes = self._get_stream(
  212. self.mem, 512, self.SAT, self.sec_size, self.dir_first_sec_sid,
  213. name="directory", seen_id=3)
  214. dirlist = []
  215. did = -1
  216. for pos in xrange(0, len(dbytes), 128):
  217. did += 1
  218. dirlist.append(DirNode(did, dbytes[pos:pos+128], 0, logfile))
  219. self.dirlist = dirlist
  220. _build_family_tree(dirlist, 0, dirlist[0].root_DID) # and stand well back ...
  221. if DEBUG:
  222. for d in dirlist:
  223. d.dump(DEBUG)
  224. #
  225. # === get the SSCS ===
  226. #
  227. sscs_dir = self.dirlist[0]
  228. assert sscs_dir.etype == 5 # root entry
  229. if sscs_dir.first_SID < 0 or sscs_dir.tot_size == 0:
  230. # Problem reported by Frank Hoffsuemmer: some software was
  231. # writing -1 instead of -2 (EOCSID) for the first_SID
  232. # when the SCCS was empty. Not having EOCSID caused assertion
  233. # failure in _get_stream.
  234. # Solution: avoid calling _get_stream in any case when the
  235. # SCSS appears to be empty.
  236. self.SSCS = ""
  237. else:
  238. self.SSCS = self._get_stream(
  239. self.mem, 512, self.SAT, sec_size, sscs_dir.first_SID,
  240. sscs_dir.tot_size, name="SSCS", seen_id=4)
  241. # if DEBUG: print >> logfile, "SSCS", repr(self.SSCS)
  242. #
  243. # === build the SSAT ===
  244. #
  245. self.SSAT = []
  246. if SSAT_tot_secs > 0 and sscs_dir.tot_size == 0:
  247. print("WARNING *** OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero", file=logfile)
  248. if sscs_dir.tot_size > 0:
  249. sid = SSAT_first_sec_sid
  250. nsecs = SSAT_tot_secs
  251. while sid >= 0 and nsecs > 0:
  252. if seen[sid]:
  253. raise CompDocError("SSAT corruption: seen[%d] == %d" % (sid, seen[sid]))
  254. seen[sid] = 5
  255. nsecs -= 1
  256. start_pos = 512 + sid * sec_size
  257. news = list(unpack(fmt, mem[start_pos:start_pos+sec_size]))
  258. self.SSAT.extend(news)
  259. sid = self.SAT[sid]
  260. if DEBUG: print("SSAT last sid %d; remaining sectors %d" % (sid, nsecs), file=logfile)
  261. assert nsecs == 0 and sid == EOCSID
  262. if DEBUG:
  263. print("SSAT", file=logfile)
  264. dump_list(self.SSAT, 10, logfile)
  265. if DEBUG:
  266. print("seen", file=logfile)
  267. dump_list(seen, 20, logfile)
  268. def _get_stream(self, mem, base, sat, sec_size, start_sid, size=None, name='', seen_id=None):
  269. # print >> self.logfile, "_get_stream", base, sec_size, start_sid, size
  270. sectors = []
  271. s = start_sid
  272. if size is None:
  273. # nothing to check against
  274. while s >= 0:
  275. if seen_id is not None:
  276. if self.seen[s]:
  277. raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s]))
  278. self.seen[s] = seen_id
  279. start_pos = base + s * sec_size
  280. sectors.append(mem[start_pos:start_pos+sec_size])
  281. try:
  282. s = sat[s]
  283. except IndexError:
  284. raise CompDocError(
  285. "OLE2 stream %r: sector allocation table invalid entry (%d)" %
  286. (name, s)
  287. )
  288. assert s == EOCSID
  289. else:
  290. todo = size
  291. while s >= 0:
  292. if seen_id is not None:
  293. if self.seen[s]:
  294. raise CompDocError("%s corruption: seen[%d] == %d" % (name, s, self.seen[s]))
  295. self.seen[s] = seen_id
  296. start_pos = base + s * sec_size
  297. grab = sec_size
  298. if grab > todo:
  299. grab = todo
  300. todo -= grab
  301. sectors.append(mem[start_pos:start_pos+grab])
  302. try:
  303. s = sat[s]
  304. except IndexError:
  305. raise CompDocError(
  306. "OLE2 stream %r: sector allocation table invalid entry (%d)" %
  307. (name, s)
  308. )
  309. assert s == EOCSID
  310. if todo != 0:
  311. fprintf(self.logfile,
  312. "WARNING *** OLE2 stream %r: expected size %d, actual size %d\n",
  313. name, size, size - todo)
  314. return b''.join(sectors)
  315. def _dir_search(self, path, storage_DID=0):
  316. # Return matching DirNode instance, or None
  317. head = path[0]
  318. tail = path[1:]
  319. dl = self.dirlist
  320. for child in dl[storage_DID].children:
  321. if dl[child].name.lower() == head.lower():
  322. et = dl[child].etype
  323. if et == 2:
  324. return dl[child]
  325. if et == 1:
  326. if not tail:
  327. raise CompDocError("Requested component is a 'storage'")
  328. return self._dir_search(tail, child)
  329. dl[child].dump(1)
  330. raise CompDocError("Requested stream is not a 'user stream'")
  331. return None
  332. ##
  333. # Interrogate the compound document's directory; return the stream as a string if found, otherwise
  334. # return None.
  335. # @param qname Name of the desired stream e.g. u'Workbook'. Should be in Unicode or convertible thereto.
  336. def get_named_stream(self, qname):
  337. d = self._dir_search(qname.split("/"))
  338. if d is None:
  339. return None
  340. if d.tot_size >= self.min_size_std_stream:
  341. return self._get_stream(
  342. self.mem, 512, self.SAT, self.sec_size, d.first_SID,
  343. d.tot_size, name=qname, seen_id=d.DID+6)
  344. else:
  345. return self._get_stream(
  346. self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID,
  347. d.tot_size, name=qname + " (from SSCS)", seen_id=None)
  348. ##
  349. # Interrogate the compound document's directory.
  350. # If the named stream is not found, (None, 0, 0) will be returned.
  351. # If the named stream is found and is contiguous within the original byte sequence ("mem")
  352. # used when the document was opened,
  353. # then (mem, offset_to_start_of_stream, length_of_stream) is returned.
  354. # Otherwise a new string is built from the fragments and (new_string, 0, length_of_stream) is returned.
  355. # @param qname Name of the desired stream e.g. u'Workbook'. Should be in Unicode or convertible thereto.
  356. def locate_named_stream(self, qname):
  357. d = self._dir_search(qname.split("/"))
  358. if d is None:
  359. return (None, 0, 0)
  360. if d.tot_size > self.mem_data_len:
  361. raise CompDocError("%r stream length (%d bytes) > file data size (%d bytes)"
  362. % (qname, d.tot_size, self.mem_data_len))
  363. if d.tot_size >= self.min_size_std_stream:
  364. result = self._locate_stream(
  365. self.mem, 512, self.SAT, self.sec_size, d.first_SID,
  366. d.tot_size, qname, d.DID+6)
  367. if self.DEBUG:
  368. print("\nseen", file=self.logfile)
  369. dump_list(self.seen, 20, self.logfile)
  370. return result
  371. else:
  372. return (
  373. self._get_stream(
  374. self.SSCS, 0, self.SSAT, self.short_sec_size, d.first_SID,
  375. d.tot_size, qname + " (from SSCS)", None),
  376. 0,
  377. d.tot_size
  378. )
  379. def _locate_stream(self, mem, base, sat, sec_size, start_sid, expected_stream_size, qname, seen_id):
  380. # print >> self.logfile, "_locate_stream", base, sec_size, start_sid, expected_stream_size
  381. s = start_sid
  382. if s < 0:
  383. raise CompDocError("_locate_stream: start_sid (%d) is -ve" % start_sid)
  384. p = -99 # dummy previous SID
  385. start_pos = -9999
  386. end_pos = -8888
  387. slices = []
  388. tot_found = 0
  389. found_limit = (expected_stream_size + sec_size - 1) // sec_size
  390. while s >= 0:
  391. if self.seen[s]:
  392. print("_locate_stream(%s): seen" % qname, file=self.logfile); dump_list(self.seen, 20, self.logfile)
  393. raise CompDocError("%s corruption: seen[%d] == %d" % (qname, s, self.seen[s]))
  394. self.seen[s] = seen_id
  395. tot_found += 1
  396. if tot_found > found_limit:
  397. raise CompDocError(
  398. "%s: size exceeds expected %d bytes; corrupt?"
  399. % (qname, found_limit * sec_size)
  400. ) # Note: expected size rounded up to higher sector
  401. if s == p+1:
  402. # contiguous sectors
  403. end_pos += sec_size
  404. else:
  405. # start new slice
  406. if p >= 0:
  407. # not first time
  408. slices.append((start_pos, end_pos))
  409. start_pos = base + s * sec_size
  410. end_pos = start_pos + sec_size
  411. p = s
  412. s = sat[s]
  413. assert s == EOCSID
  414. assert tot_found == found_limit
  415. # print >> self.logfile, "_locate_stream(%s): seen" % qname; dump_list(self.seen, 20, self.logfile)
  416. if not slices:
  417. # The stream is contiguous ... just what we like!
  418. return (mem, start_pos, expected_stream_size)
  419. slices.append((start_pos, end_pos))
  420. # print >> self.logfile, "+++>>> %d fragments" % len(slices)
  421. return (b''.join([mem[start_pos:end_pos] for start_pos, end_pos in slices]), 0, expected_stream_size)
  422. # ==========================================================================================
  423. def x_dump_line(alist, stride, f, dpos, equal=0):
  424. print("%5d%s" % (dpos, " ="[equal]), end=' ', file=f)
  425. for value in alist[dpos:dpos + stride]:
  426. print(str(value), end=' ', file=f)
  427. print(file=f)
  428. def dump_list(alist, stride, f=sys.stdout):
  429. def _dump_line(dpos, equal=0):
  430. print("%5d%s" % (dpos, " ="[equal]), end=' ', file=f)
  431. for value in alist[dpos:dpos + stride]:
  432. print(str(value), end=' ', file=f)
  433. print(file=f)
  434. pos = None
  435. oldpos = None
  436. for pos in xrange(0, len(alist), stride):
  437. if oldpos is None:
  438. _dump_line(pos)
  439. oldpos = pos
  440. elif alist[pos:pos+stride] != alist[oldpos:oldpos+stride]:
  441. if pos - oldpos > stride:
  442. _dump_line(pos - stride, equal=1)
  443. _dump_line(pos)
  444. oldpos = pos
  445. if oldpos is not None and pos is not None and pos != oldpos:
  446. _dump_line(pos, equal=1)