pdfdocument.py 20 KB


  1. #!/usr/bin/env python
  2. import sys
  3. import re
  4. import struct
  5. try:
  6. import hashlib as md5
  7. except ImportError:
  8. import md5
  9. from psparser import PSEOF
  10. from psparser import literal_name
  11. from psparser import LIT, KWD, STRICT
  12. from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError
  13. from pdftypes import PDFObjectNotFound, PDFStream
  14. from pdftypes import decipher_all
  15. from pdftypes import int_value
  16. from pdftypes import str_value, list_value, dict_value, stream_value
  17. from pdfparser import PDFSyntaxError
  18. from pdfparser import PDFStreamParser
  19. from arcfour import Arcfour
  20. from utils import choplist, nunpack
  21. from utils import decode_text
  22. ## Exceptions
  23. ##
  class PDFNoValidXRef(PDFSyntaxError):
      """Raised when a cross-reference section cannot be located or parsed."""
  class PDFNoOutlines(PDFException):
      """Raised by get_outlines() when the catalog has no /Outlines entry."""
  class PDFDestinationNotFound(PDFException):
      """Raised by get_dest() when a named destination cannot be resolved."""
  class PDFEncryptionError(PDFException):
      """Raised when the document's encryption parameters are not supported."""
  class PDFPasswordIncorrect(PDFEncryptionError):
      """Raised when the supplied password fails authentication."""
  class PDFTextExtractionNotAllowed(PDFEncryptionError):
      """Encryption-related error about text extraction permission.

      NOTE(review): never raised inside this module; presumably raised by
      callers that honour PDFDocument.is_extractable -- confirm.
      """
  36. # some predefined literals and keywords.
  # /Type name literals; this module compares stream and catalog /Type
  # values against them with ``is``.
  LITERAL_OBJSTM = LIT('ObjStm')
  LITERAL_XREF = LIT('XRef')
  LITERAL_CATALOG = LIT('Catalog')
  40. ## XRefs
  41. ##
  class PDFBaseXRef(object):
      """Interface shared by the cross-reference implementations in this module."""

      def get_trailer(self):
          """Return the trailer dictionary; concrete subclasses must override."""
          raise NotImplementedError

      def get_objids(self):
          """Return an iterable of the object ids known to this table."""
          return []

      def get_pos(self, objid):
          """Locate the object `objid`.

          Implementations must return
              (strmid, index, genno)
              or (None, pos, genno)
          where a non-None `strmid` names the object stream that holds the
          object at `index`, and `None` means the object is parsed from the
          file at position `pos`. Raises KeyError for an unknown id.
          """
          raise KeyError(objid)
  52. ## PDFXRef
  53. ##
  class PDFXRef(PDFBaseXRef):
      """Cross-reference table read from a textual xref section and its trailer."""

      KEYWORD_TRAILER = KWD('trailer')

      def __init__(self):
          # objid -> (None, position, generation number)
          self.offsets = {}
          self.trailer = {}
          return

      def __repr__(self):
          return '<PDFXRef: offsets=%r>' % (self.offsets.keys())

      def load(self, parser, debug=0):
          """Read xref subsections from `parser` until the trailer, then load it.

          Each subsection starts with a "start count" line followed by
          `count` entries of the form "pos genno use"; only entries whose
          `use` field is 'n' are recorded in `self.offsets`.

          Raises PDFNoValidXRef on EOF or on any malformed line.
          """
          while 1:
              try:
                  (pos, line) = parser.nextline()
              except PSEOF:
                  raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
              if not line.strip():
                  continue
              if line.startswith('trailer'):
                  # Rewind so that load_trailer() sees the keyword itself.
                  parser.seek(pos)
                  break
              f = line.strip().split(' ')
              if len(f) != 2:
                  raise PDFNoValidXRef('Trailer not found: %r: line=%r' % (parser, line))
              try:
                  (start, nobjs) = map(long, f)
              except ValueError:
                  raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
              for objid in xrange(start, start+nobjs):
                  try:
                      (_, line) = parser.nextline()
                  except PSEOF:
                      raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
                  f = line.strip().split(' ')
                  if len(f) != 3:
                      raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                  (pos, genno, use) = f
                  if use != 'n':
                      continue
                  try:
                      entry = (None, long(pos), int(genno))
                  except ValueError:
                      # Report non-numeric fields like every other malformed
                      # line, so that callers can fall back to scanning.
                      raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
                  self.offsets[objid] = entry
          if 1 <= debug:
              print >>sys.stderr, 'xref objects:', self.offsets
          self.load_trailer(parser)
          return

      def load_trailer(self, parser):
          """Read the 'trailer' keyword and its dictionary into `self.trailer`.

          Raises PDFNoValidXRef when the keyword is missing, or when EOF is
          reached before any object could be read.
          """
          try:
              (_, kwd) = parser.nexttoken()
              if kwd is not self.KEYWORD_TRAILER:
                  # Explicit check instead of `assert`, which python -O removes.
                  raise PDFNoValidXRef('Trailer keyword not found: %r' % (kwd,))
              (_, dic) = parser.nextobject()
          except PSEOF:
              # EOF hit while reading: use whatever the parser has collected.
              x = parser.pop(1)
              if not x:
                  raise PDFNoValidXRef('Unexpected EOF - file corrupted')
              (_, dic) = x[0]
          self.trailer.update(dict_value(dic))
          return

      def get_trailer(self):
          """Return the trailer dictionary accumulated by load_trailer()."""
          return self.trailer

      def get_objids(self):
          """Return an iterator over the object ids recorded in this table."""
          return self.offsets.iterkeys()

      def get_pos(self, objid):
          """Return (None, pos, genno) for `objid`; raises KeyError if unknown."""
          return self.offsets[objid]
  119. ## PDFXRefFallback
  120. ##
  class PDFXRefFallback(PDFXRef):
      """Cross-reference table rebuilt by scanning the file for "N G obj" lines."""

      PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')

      def __repr__(self):
          return '<PDFXRefFallback: offsets=%r>' % (self.offsets.keys())

      def load(self, parser, debug=0):
          """Scan from the start of the file and record every object found.

          Scanning stops at EOF or at the first line starting with
          'trailer', whose dictionary is then loaded. Objects whose /Type is
          ObjStm are parsed as well so that the objects they contain are
          registered as (stream objid, index, 0).
          """
          parser.seek(0)
          while 1:
              try:
                  (pos, line) = parser.nextline()
              except PSEOF:
                  break
              if line.startswith('trailer'):
                  parser.seek(pos)
                  self.load_trailer(parser)
                  if 1 <= debug:
                      print >>sys.stderr, 'trailer: %r' % self.get_trailer()
                  break
              m = self.PDFOBJ_CUE.match(line)
              if not m:
                  continue
              (objid, genno) = m.groups()
              objid = int(objid)
              genno = int(genno)
              self.offsets[objid] = (None, pos, genno)
              # expand ObjStm.
              parser.seek(pos)
              (_, obj) = parser.nextobject()
              if isinstance(obj, PDFStream) and obj.get('Type') is LITERAL_OBJSTM:
                  stream = stream_value(obj)
                  try:
                      n = stream['N']
                  except KeyError:
                      if STRICT:
                          raise PDFSyntaxError('N is not defined: %r' % stream)
                      n = 0
                  parser1 = PDFStreamParser(stream.get_data())
                  objs = []
                  try:
                      while 1:
                          (_, obj) = parser1.nextobject()
                          objs.append(obj)
                  except PSEOF:
                      pass
                  # Every even-indexed item among the first 2*n is taken as
                  # an object id; never trust N beyond what was parsed.
                  n = min(n, len(objs)//2)
                  for index in xrange(n):
                      objid1 = objs[index*2]
                      self.offsets[objid1] = (objid, index, 0)
          return
  169. ## PDFXRefStream
  170. ##
  class PDFXRefStream(PDFBaseXRef):
      """Cross-reference information stored in an XRef stream (PDF-1.5)."""

      def __init__(self):
          self.data = None
          self.entlen = None
          self.fl1 = self.fl2 = self.fl3 = None
          self.ranges = []
          # Defined up front so that get_trailer() works before load().
          self.trailer = {}
          return

      def __repr__(self):
          return '<PDFXRefStream: ranges=%r>' % (self.ranges)

      def load(self, parser, debug=0):
          """Read an "objid genno obj <stream>" XRef stream from `parser`.

          Stores the stream data, the three field widths from /W, the
          (start, count) pairs from /Index (default: (0, /Size)) and the
          stream attributes, which serve as the trailer.

          Raises PDFNoValidXRef if the object is not an XRef stream and
          PDFSyntaxError if /Index has an odd number of items.
          """
          (_, objid) = parser.nexttoken() # ignored
          (_, genno) = parser.nexttoken() # ignored
          (_, kwd) = parser.nexttoken()
          (_, stream) = parser.nextobject()
          if not isinstance(stream, PDFStream) or stream['Type'] is not LITERAL_XREF:
              raise PDFNoValidXRef('Invalid PDF stream spec.')
          size = stream['Size']
          index_array = stream.get('Index', (0, size))
          if len(index_array) % 2 != 0:
              raise PDFSyntaxError('Invalid index number')
          self.ranges.extend(choplist(2, index_array))
          (self.fl1, self.fl2, self.fl3) = stream['W']
          self.data = stream.get_data()
          self.entlen = self.fl1+self.fl2+self.fl3
          self.trailer = stream.attrs
          if 1 <= debug:
              print >>sys.stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
                                   (', '.join(map(repr, self.ranges)),
                                    self.fl1, self.fl2, self.fl3))
          return

      def get_trailer(self):
          """Return the stream attributes, which act as the trailer dictionary."""
          return self.trailer

      def get_objids(self):
          """Yield the ids of all entries whose type field is 1 or 2."""
          # Entries of all subsections are stored back to back, so the entry
          # index must keep counting across ranges (as get_pos() does).
          index = 0
          for (start, nobjs) in self.ranges:
              for i in xrange(nobjs):
                  offset = self.entlen * (index+i)
                  ent = self.data[offset:offset+self.entlen]
                  f1 = nunpack(ent[:self.fl1], 1)
                  if f1 == 1 or f1 == 2:
                      yield start+i
              index += nobjs
          return

      def get_pos(self, objid):
          """Return the location of `objid`.

          Returns (None, f2, f3) for a type-1 entry and (f2, f3, 0) for a
          type-2 entry, where f2 and f3 are the second and third fields.
          Raises KeyError if the id is outside every range or the entry has
          any other type.
          """
          index = 0
          for (start, nobjs) in self.ranges:
              if start <= objid and objid < start+nobjs:
                  index += objid - start
                  break
              else:
                  index += nobjs
          else:
              raise KeyError(objid)
          offset = self.entlen * index
          ent = self.data[offset:offset+self.entlen]
          f1 = nunpack(ent[:self.fl1], 1)
          f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
          f3 = nunpack(ent[self.fl1+self.fl2:])
          if f1 == 1:
              return (None, f2, f3)
          elif f1 == 2:
              return (f2, f3, 0)
          else:
              # this is a free object
              raise KeyError(objid)
  234. ## PDFDocument
  235. ##
  class PDFDocument(object):

      """PDFDocument object represents a PDF document.

      Since a PDF file can be very big, normally it is not loaded at
      once. So PDF document has to cooperate with a PDF parser in order to
      dynamically import the data as processing goes.

      Typical usage:
        doc = PDFDocument(parser, password)
        obj = doc.getobj(objid)

      """

      debug = 0
      PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
      KEYWORD_OBJ = KWD('obj')

      def __init__(self, parser, password='', caching=True, fallback=True):
          """Set the document to use a given PDFParser object.

          Loads every cross-reference section reachable from the end of the
          file, optionally adds a table rebuilt by scanning the whole file
          (`fallback`, forced when no valid xref is found), then takes the
          encryption info, /Info and /Root from the trailers.

          Raises PDFSyntaxError if no trailer provides /Root, and, when
          STRICT is set, if the catalog's /Type is not Catalog.
          """
          self.caching = caching
          self.xrefs = []
          self.info = []
          self.catalog = None
          self.encryption = None
          self.decipher = None
          self._cached_objs = {}
          self._parsed_objs = {}
          self._parser = parser
          self._parser.set_document(self)
          self.is_printable = self.is_modifiable = self.is_extractable = True
          # Retrieve the information of each header that was appended
          # (maybe multiple times) at the end of the document.
          try:
              pos = self.find_xref(parser)
              self.read_xref_from(parser, pos, self.xrefs)
          except PDFNoValidXRef:
              fallback = True
          if fallback:
              parser.fallback = True
              xref = PDFXRefFallback()
              xref.load(parser)
              self.xrefs.append(xref)
          for xref in self.xrefs:
              trailer = xref.get_trailer()
              if not trailer:
                  continue
              # If there's an encryption info, remember it.
              if 'Encrypt' in trailer:
                  #assert not self.encryption
                  if 'ID' in trailer:
                      docid = list_value(trailer['ID'])
                  else:
                      # Tolerate a missing /ID instead of failing with KeyError.
                      docid = ('', '')
                  self.encryption = (docid, dict_value(trailer['Encrypt']))
                  self._initialize_password(password)
              if 'Info' in trailer:
                  self.info.append(dict_value(trailer['Info']))
              if 'Root' in trailer:
                  # Every PDF file must have exactly one /Root dictionary.
                  self.catalog = dict_value(trailer['Root'])
                  break
          else:
              raise PDFSyntaxError('No /Root object! - Is this really a PDF?')
          if self.catalog.get('Type') is not LITERAL_CATALOG:
              if STRICT:
                  raise PDFSyntaxError('Catalog not found!')
          return

      # _initialize_password(password='')
      #   Perform the initialization with a given password.
      def _initialize_password(self, password=''):
          """Authenticate `password` and set up RC4 decryption.

          Reads the parameters from `self.encryption`, sets the
          is_printable / is_modifiable / is_extractable flags from /P, and
          on success stores the key and installs `decrypt_rc4` as the
          decipher function.

          Raises PDFEncryptionError for an unknown filter, algorithm or
          revision, PDFNotImplementedError for revision 4, and
          PDFPasswordIncorrect when authentication fails.
          """
          (docid, param) = self.encryption
          if literal_name(param.get('Filter')) != 'Standard':
              raise PDFEncryptionError('Unknown filter: param=%r' % param)
          V = int_value(param.get('V', 0))
          if not (V == 1 or V == 2):
              raise PDFEncryptionError('Unknown algorithm: param=%r' % param)
          length = int_value(param.get('Length', 40)) # Key length (bits)
          O = str_value(param['O'])
          R = int_value(param['R']) # Revision
          # Only revisions 2 and 3 are handled below; anything under 2 used
          # to fall through and fail on an unbound local.
          if R < 2 or 5 <= R:
              raise PDFEncryptionError('Unknown revision: %r' % R)
          U = str_value(param['U'])
          P = int_value(param['P'])
          self.is_printable = bool(P & 4)
          self.is_modifiable = bool(P & 8)
          self.is_extractable = bool(P & 16)
          # Algorithm 3.2
          password = (password+self.PASSWORD_PADDING)[:32] # 1
          hash = md5.md5(password) # 2
          hash.update(O) # 3
          # Pack /P as an unsigned 32-bit value: same bytes as '<l' for a
          # signed value, and also accepts a /P written as unsigned.
          hash.update(struct.pack('<L', P & 0xffffffff)) # 4
          hash.update(docid[0]) # 5
          if 4 <= R:
              # 6
              raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
          if 3 <= R:
              # 8
              for _ in xrange(50):
                  hash = md5.md5(hash.digest()[:length//8])
          key = hash.digest()[:length//8]
          if R == 2:
              # Algorithm 3.4
              u1 = Arcfour(key).process(self.PASSWORD_PADDING)
          elif R == 3:
              # Algorithm 3.5
              hash = md5.md5(self.PASSWORD_PADDING) # 2
              hash.update(docid[0]) # 3
              x = Arcfour(key).process(hash.digest()[:16]) # 4
              for i in xrange(1, 19+1):
                  k = ''.join(chr(ord(c) ^ i) for c in key)
                  x = Arcfour(k).process(x)
              u1 = x+x # 32bytes total
          if R == 2:
              is_authenticated = (u1 == U)
          else:
              is_authenticated = (u1[:16] == U[:16])
          if not is_authenticated:
              raise PDFPasswordIncorrect
          self.decrypt_key = key
          self.decipher = self.decrypt_rc4  # XXX may be AES
          return

      def decrypt_rc4(self, objid, genno, data):
          """Decrypt `data` with an RC4 key derived for (objid, genno).

          The key is the MD5 of the document key followed by the low 3
          bytes of `objid` and the low 2 bytes of `genno`, cut to at most
          16 bytes.
          """
          key = self.decrypt_key + struct.pack('<L', objid)[:3]+struct.pack('<L', genno)[:2]
          hash = md5.md5(key)
          key = hash.digest()[:min(len(key), 16)]
          return Arcfour(key).process(data)

      def _getobj_objstm(self, stream, index, objid):
          """Return the `index`-th object stored in the object stream `stream`.

          Parsed streams are kept in `self._parsed_objs` when caching is on.
          Raises PDFSyntaxError if `index` is beyond the parsed objects.
          """
          if stream.objid in self._parsed_objs:
              (objs, n) = self._parsed_objs[stream.objid]
          else:
              (objs, n) = self._get_objects(stream)
              if self.caching:
                  self._parsed_objs[stream.objid] = (objs, n)
          # The first 2*n parsed items are skipped; objects follow them.
          i = n*2+index
          try:
              obj = objs[i]
          except IndexError:
              raise PDFSyntaxError('index too big: %r' % index)
          return obj

      def _get_objects(self, stream):
          """Parse everything in `stream` and return (objs, n), n being its /N.

          A wrong /Type or a missing /N raises PDFSyntaxError only when
          STRICT is set; otherwise a missing /N is treated as 0.
          """
          if stream.get('Type') is not LITERAL_OBJSTM:
              if STRICT:
                  raise PDFSyntaxError('Not a stream object: %r' % stream)
          try:
              n = stream['N']
          except KeyError:
              if STRICT:
                  raise PDFSyntaxError('N is not defined: %r' % stream)
              n = 0
          parser = PDFStreamParser(stream.get_data())
          parser.set_document(self)
          objs = []
          try:
              while 1:
                  (_, obj) = parser.nextobject()
                  objs.append(obj)
          except PSEOF:
              pass
          return (objs, n)

      def _getobj_parse(self, pos, objid):
          """Parse the "objid genno obj ..." definition at file position `pos`.

          Raises PDFSyntaxError if the id found there differs from `objid`
          or the 'obj' keyword is missing.
          """
          self._parser.seek(pos)
          (_, objid1) = self._parser.nexttoken()  # objid
          if objid1 != objid:
              raise PDFSyntaxError('objid mismatch: %r=%r' % (objid1, objid))
          (_, genno) = self._parser.nexttoken()  # genno
          (_, kwd) = self._parser.nexttoken()
          if kwd is not self.KEYWORD_OBJ:
              raise PDFSyntaxError('Invalid object spec: offset=%r' % pos)
          (_, obj) = self._parser.nextobject()
          return obj

      # can raise PDFObjectNotFound
      def getobj(self, objid):
          """Return the object with id `objid`, deciphered if encryption is set.

          Each xref table is tried in order; a table that does not know the
          id, or whose location fails to parse, is skipped. Raises
          PDFObjectNotFound if no table yields the object, and PDFException
          if no xref table has been loaded.
          """
          assert objid != 0
          if not self.xrefs:
              raise PDFException('PDFDocument is not initialized')
          if 2 <= self.debug:
              print >>sys.stderr, 'getobj: objid=%r' % (objid)
          if objid in self._cached_objs:
              (obj, genno) = self._cached_objs[objid]
          else:
              for xref in self.xrefs:
                  try:
                      (strmid, index, genno) = xref.get_pos(objid)
                  except KeyError:
                      continue
                  try:
                      if strmid is not None:
                          stream = stream_value(self.getobj(strmid))
                          obj = self._getobj_objstm(stream, index, objid)
                      else:
                          obj = self._getobj_parse(index, objid)
                      if isinstance(obj, PDFStream):
                          obj.set_objid(objid, genno)
                      break
                  except (PSEOF, PDFSyntaxError):
                      continue
              else:
                  raise PDFObjectNotFound(objid)
              if 2 <= self.debug:
                  print >>sys.stderr, 'register: objid=%r: %r' % (objid, obj)
              if self.caching:
                  self._cached_objs[objid] = (obj, genno)
          # The cache keeps genno alongside the object because it is needed
          # here on every call, cached or not.
          if self.decipher:
              obj = decipher_all(self.decipher, objid, genno, obj)
          return obj

      def get_outlines(self):
          """Return a generator of (level, title, dest, action, se) tuples.

          Walks the outline tree from the catalog's /Outlines through /First
          and /Next links; only entries with a /Title and either /A or /Dest
          are yielded. Raises PDFNoOutlines if the catalog has no /Outlines.
          """
          if 'Outlines' not in self.catalog:
              raise PDFNoOutlines

          def search(entry, level):
              entry = dict_value(entry)
              if 'Title' in entry:
                  if 'A' in entry or 'Dest' in entry:
                      title = decode_text(str_value(entry['Title']))
                      dest = entry.get('Dest')
                      action = entry.get('A')
                      se = entry.get('SE')
                      yield (level, title, dest, action, se)
              if 'First' in entry and 'Last' in entry:
                  for x in search(entry['First'], level+1):
                      yield x
              if 'Next' in entry:
                  for x in search(entry['Next'], level):
                      yield x
              return
          return search(self.catalog['Outlines'], 0)

      def lookup_name(self, cat, key):
          """Look up `key` in the name tree `cat` of the catalog's /Names.

          Returns the associated value, or None when `key` lies outside the
          /Limits of the tree's root node. Raises KeyError if the catalog
          has no usable /Names, `cat` is missing, or the key is not found.
          """
          try:
              names = dict_value(self.catalog['Names'])
          except (PDFTypeError, KeyError):
              raise KeyError((cat, key))
          # may raise KeyError
          d0 = dict_value(names[cat])

          def lookup(d):
              if 'Limits' in d:
                  (k1, k2) = list_value(d['Limits'])
                  if key < k1 or k2 < key:
                      return None
              if 'Names' in d:
                  objs = list_value(d['Names'])
                  names = dict(choplist(2, objs))
                  return names[key]
              if 'Kids' in d:
                  for c in list_value(d['Kids']):
                      v = lookup(dict_value(c))
                      if v:
                          return v
              raise KeyError((cat, key))
          return lookup(d0)

      def get_dest(self, name):
          """Return the destination registered under `name`.

          Tries the /Names tree first and then the catalog's /Dests
          dictionary. Raises PDFDestinationNotFound if neither has it.
          """
          try:
              # PDF-1.2 or later
              obj = self.lookup_name('Dests', name)
          except KeyError:
              # PDF-1.1 or prior
              if 'Dests' not in self.catalog:
                  raise PDFDestinationNotFound(name)
              d0 = dict_value(self.catalog['Dests'])
              if name not in d0:
                  raise PDFDestinationNotFound(name)
              obj = d0[name]
          return obj

      # find_xref
      def find_xref(self, parser):
          """Internal function used to locate the first XRef.

          Reads lines backwards until 'startxref' and returns, as a number,
          the last non-empty line seen before it. Raises PDFNoValidXRef if
          'startxref' is never found or that line is not a number.
          """
          # search the last xref table by scanning the file backwards.
          prev = None
          for line in parser.revreadlines():
              line = line.strip()
              if 2 <= self.debug:
                  print >>sys.stderr, 'find_xref: %r' % line
              if line == 'startxref':
                  break
              if line:
                  prev = line
          else:
              raise PDFNoValidXRef('Unexpected EOF')
          if 1 <= self.debug:
              print >>sys.stderr, 'xref found: pos=%r' % prev
          try:
              return long(prev)
          except (TypeError, ValueError):
              # A missing or non-numeric position must lead to the fallback
              # scan in __init__, not to an unrelated exception.
              raise PDFNoValidXRef('Invalid xref position: %r' % (prev,))

      # read xref table
      def read_xref_from(self, parser, start, xrefs):
          """Reads XRefs from the given location.

          Loads the section at `start` (an XRef stream if the first token is
          an int, a classic table otherwise), appends it to `xrefs`, then
          follows the trailer's /XRefStm and /Prev entries recursively.
          Raises PDFNoValidXRef on EOF at `start`.
          """
          parser.seek(start)
          parser.reset()
          try:
              (pos, token) = parser.nexttoken()
          except PSEOF:
              raise PDFNoValidXRef('Unexpected EOF')
          if 2 <= self.debug:
              print >>sys.stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
          if isinstance(token, int):
              # XRefStream: PDF-1.5
              parser.seek(pos)
              parser.reset()
              xref = PDFXRefStream()
              xref.load(parser, debug=self.debug)
          else:
              if token is parser.KEYWORD_XREF:
                  parser.nextline()
              xref = PDFXRef()
              xref.load(parser, debug=self.debug)
          xrefs.append(xref)
          trailer = xref.get_trailer()
          if 1 <= self.debug:
              print >>sys.stderr, 'trailer: %r' % trailer
          if 'XRefStm' in trailer:
              pos = int_value(trailer['XRefStm'])
              self.read_xref_from(parser, pos, xrefs)
          if 'Prev' in trailer:
              # find previous xref
              pos = int_value(trailer['Prev'])
              self.read_xref_from(parser, pos, xrefs)
          return