zipstream.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. # -*- test-case-name: twisted.python.test.test_zipstream -*-
  2. # Copyright (c) Twisted Matrix Laboratories.
  3. # See LICENSE for details.
  4. """
  5. An incremental approach to unzipping files. This allows you to unzip a little
  6. bit of a file at a time, which means you can report progress as a file unzips.
  7. """
  8. import zipfile
  9. import os.path
  10. import zlib
  11. import struct
  12. _fileHeaderSize = struct.calcsize(zipfile.structFileHeader)
  13. class ChunkingZipFile(zipfile.ZipFile):
  14. """
  15. A L{zipfile.ZipFile} object which, with L{readfile}, also gives you access
  16. to a file-like object for each entry.
  17. """
  18. def readfile(self, name):
  19. """
  20. Return file-like object for name.
  21. """
  22. if self.mode not in ("r", "a"):
  23. raise RuntimeError('read() requires mode "r" or "a"')
  24. if not self.fp:
  25. raise RuntimeError(
  26. "Attempt to read ZIP archive that was already closed")
  27. zinfo = self.getinfo(name)
  28. self.fp.seek(zinfo.header_offset, 0)
  29. fheader = self.fp.read(_fileHeaderSize)
  30. if fheader[0:4] != zipfile.stringFileHeader:
  31. raise zipfile.BadZipfile("Bad magic number for file header")
  32. fheader = struct.unpack(zipfile.structFileHeader, fheader)
  33. fname = self.fp.read(fheader[zipfile._FH_FILENAME_LENGTH])
  34. if fheader[zipfile._FH_EXTRA_FIELD_LENGTH]:
  35. self.fp.read(fheader[zipfile._FH_EXTRA_FIELD_LENGTH])
  36. if zinfo.flag_bits & 0x800:
  37. # UTF-8 filename
  38. fname_str = fname.decode("utf-8")
  39. else:
  40. fname_str = fname.decode("cp437")
  41. if fname_str != zinfo.orig_filename:
  42. raise zipfile.BadZipfile(
  43. 'File name in directory "%s" and header "%s" differ.' % (
  44. zinfo.orig_filename, fname_str))
  45. if zinfo.compress_type == zipfile.ZIP_STORED:
  46. return ZipFileEntry(self, zinfo.compress_size)
  47. elif zinfo.compress_type == zipfile.ZIP_DEFLATED:
  48. return DeflatedZipFileEntry(self, zinfo.compress_size)
  49. else:
  50. raise zipfile.BadZipfile(
  51. "Unsupported compression method %d for file %s" %
  52. (zinfo.compress_type, name))
  53. class _FileEntry(object):
  54. """
  55. Abstract superclass of both compressed and uncompressed variants of
  56. file-like objects within a zip archive.
  57. @ivar chunkingZipFile: a chunking zip file.
  58. @type chunkingZipFile: L{ChunkingZipFile}
  59. @ivar length: The number of bytes within the zip file that represent this
  60. file. (This is the size on disk, not the number of decompressed bytes
  61. which will result from reading it.)
  62. @ivar fp: the underlying file object (that contains pkzip data). Do not
  63. touch this, please. It will quite likely move or go away.
  64. @ivar closed: File-like 'closed' attribute; True before this file has been
  65. closed, False after.
  66. @type closed: L{bool}
  67. @ivar finished: An older, broken synonym for 'closed'. Do not touch this,
  68. please.
  69. @type finished: L{int}
  70. """
  71. def __init__(self, chunkingZipFile, length):
  72. """
  73. Create a L{_FileEntry} from a L{ChunkingZipFile}.
  74. """
  75. self.chunkingZipFile = chunkingZipFile
  76. self.fp = self.chunkingZipFile.fp
  77. self.length = length
  78. self.finished = 0
  79. self.closed = False
  80. def isatty(self):
  81. """
  82. Returns false because zip files should not be ttys
  83. """
  84. return False
  85. def close(self):
  86. """
  87. Close self (file-like object)
  88. """
  89. self.closed = True
  90. self.finished = 1
  91. del self.fp
  92. def readline(self):
  93. """
  94. Read a line.
  95. """
  96. line = b""
  97. for byte in iter(lambda : self.read(1), b""):
  98. line += byte
  99. if byte == b"\n":
  100. break
  101. return line
  102. def __next__(self):
  103. """
  104. Implement next as file does (like readline, except raises StopIteration
  105. at EOF)
  106. """
  107. nextline = self.readline()
  108. if nextline:
  109. return nextline
  110. raise StopIteration()
  111. # Iterators on Python 2 use next(), not __next__()
  112. next = __next__
  113. def readlines(self):
  114. """
  115. Returns a list of all the lines
  116. """
  117. return list(self)
  118. def xreadlines(self):
  119. """
  120. Returns an iterator (so self)
  121. """
  122. return self
  123. def __iter__(self):
  124. """
  125. Returns an iterator (so self)
  126. """
  127. return self
  128. def __enter__(self):
  129. return self
  130. def __exit__(self, exc_type, exc_value, traceback):
  131. self.close()
  132. class ZipFileEntry(_FileEntry):
  133. """
  134. File-like object used to read an uncompressed entry in a ZipFile
  135. """
  136. def __init__(self, chunkingZipFile, length):
  137. _FileEntry.__init__(self, chunkingZipFile, length)
  138. self.readBytes = 0
  139. def tell(self):
  140. return self.readBytes
  141. def read(self, n=None):
  142. if n is None:
  143. n = self.length - self.readBytes
  144. if n == 0 or self.finished:
  145. return b''
  146. data = self.chunkingZipFile.fp.read(
  147. min(n, self.length - self.readBytes))
  148. self.readBytes += len(data)
  149. if self.readBytes == self.length or len(data) < n:
  150. self.finished = 1
  151. return data
  152. class DeflatedZipFileEntry(_FileEntry):
  153. """
  154. File-like object used to read a deflated entry in a ZipFile
  155. """
  156. def __init__(self, chunkingZipFile, length):
  157. _FileEntry.__init__(self, chunkingZipFile, length)
  158. self.returnedBytes = 0
  159. self.readBytes = 0
  160. self.decomp = zlib.decompressobj(-15)
  161. self.buffer = b""
  162. def tell(self):
  163. return self.returnedBytes
  164. def read(self, n=None):
  165. if self.finished:
  166. return b""
  167. if n is None:
  168. result = [self.buffer,]
  169. result.append(
  170. self.decomp.decompress(
  171. self.chunkingZipFile.fp.read(
  172. self.length - self.readBytes)))
  173. result.append(self.decomp.decompress(b"Z"))
  174. result.append(self.decomp.flush())
  175. self.buffer = b""
  176. self.finished = 1
  177. result = b"".join(result)
  178. self.returnedBytes += len(result)
  179. return result
  180. else:
  181. while len(self.buffer) < n:
  182. data = self.chunkingZipFile.fp.read(
  183. min(n, 1024, self.length - self.readBytes))
  184. self.readBytes += len(data)
  185. if not data:
  186. result = (self.buffer
  187. + self.decomp.decompress(b"Z")
  188. + self.decomp.flush())
  189. self.finished = 1
  190. self.buffer = b""
  191. self.returnedBytes += len(result)
  192. return result
  193. else:
  194. self.buffer += self.decomp.decompress(data)
  195. result = self.buffer[:n]
  196. self.buffer = self.buffer[n:]
  197. self.returnedBytes += len(result)
  198. return result
  199. DIR_BIT = 16
  200. def countZipFileChunks(filename, chunksize):
  201. """
  202. Predict the number of chunks that will be extracted from the entire
  203. zipfile, given chunksize blocks.
  204. """
  205. totalchunks = 0
  206. zf = ChunkingZipFile(filename)
  207. for info in zf.infolist():
  208. totalchunks += countFileChunks(info, chunksize)
  209. return totalchunks
  210. def countFileChunks(zipinfo, chunksize):
  211. """
  212. Count the number of chunks that will result from the given C{ZipInfo}.
  213. @param zipinfo: a C{zipfile.ZipInfo} instance describing an entry in a zip
  214. archive to be counted.
  215. @return: the number of chunks present in the zip file. (Even an empty file
  216. counts as one chunk.)
  217. @rtype: L{int}
  218. """
  219. count, extra = divmod(zipinfo.file_size, chunksize)
  220. if extra > 0:
  221. count += 1
  222. return count or 1
  223. def unzipIterChunky(filename, directory='.', overwrite=0,
  224. chunksize=4096):
  225. """
  226. Return a generator for the zipfile. This implementation will yield after
  227. every chunksize uncompressed bytes, or at the end of a file, whichever
  228. comes first.
  229. The value it yields is the number of chunks left to unzip.
  230. """
  231. czf = ChunkingZipFile(filename, 'r')
  232. if not os.path.exists(directory):
  233. os.makedirs(directory)
  234. remaining = countZipFileChunks(filename, chunksize)
  235. names = czf.namelist()
  236. infos = czf.infolist()
  237. for entry, info in zip(names, infos):
  238. isdir = info.external_attr & DIR_BIT
  239. f = os.path.join(directory, entry)
  240. if isdir:
  241. # overwrite flag only applies to files
  242. if not os.path.exists(f):
  243. os.makedirs(f)
  244. remaining -= 1
  245. yield remaining
  246. else:
  247. # create the directory the file will be in first,
  248. # since we can't guarantee it exists
  249. fdir = os.path.split(f)[0]
  250. if not os.path.exists(fdir):
  251. os.makedirs(fdir)
  252. if overwrite or not os.path.exists(f):
  253. fp = czf.readfile(entry)
  254. if info.file_size == 0:
  255. remaining -= 1
  256. yield remaining
  257. with open(f, 'wb') as outfile:
  258. while fp.tell() < info.file_size:
  259. hunk = fp.read(chunksize)
  260. outfile.write(hunk)
  261. remaining -= 1
  262. yield remaining
  263. else:
  264. remaining -= countFileChunks(info, chunksize)
  265. yield remaining