pdfparser.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. #!/usr/bin/env python
  2. import sys
  3. try:
  4. from cStringIO import StringIO
  5. except ImportError:
  6. from StringIO import StringIO
  7. from psparser import PSStackParser
  8. from psparser import PSSyntaxError, PSEOF
  9. from psparser import KWD, STRICT
  10. from pdftypes import PDFException
  11. from pdftypes import PDFStream, PDFObjRef
  12. from pdftypes import int_value
  13. from pdftypes import dict_value
  14. ## Exceptions
  15. ##
  16. class PDFSyntaxError(PDFException):
  17. pass
  18. ## PDFParser
  19. ##
  20. class PDFParser(PSStackParser):
  21. """
  22. PDFParser fetch PDF objects from a file stream.
  23. It can handle indirect references by referring to
  24. a PDF document set by set_document method.
  25. It also reads XRefs at the end of every PDF file.
  26. Typical usage:
  27. parser = PDFParser(fp)
  28. parser.read_xref()
  29. parser.read_xref(fallback=True) # optional
  30. parser.set_document(doc)
  31. parser.seek(offset)
  32. parser.nextobject()
  33. """
  34. def __init__(self, fp):
  35. PSStackParser.__init__(self, fp)
  36. self.doc = None
  37. self.fallback = False
  38. return
  39. def set_document(self, doc):
  40. """Associates the parser with a PDFDocument object."""
  41. self.doc = doc
  42. return
  43. KEYWORD_R = KWD('R')
  44. KEYWORD_NULL = KWD('null')
  45. KEYWORD_ENDOBJ = KWD('endobj')
  46. KEYWORD_STREAM = KWD('stream')
  47. KEYWORD_XREF = KWD('xref')
  48. KEYWORD_STARTXREF = KWD('startxref')
  49. def do_keyword(self, pos, token):
  50. """Handles PDF-related keywords."""
  51. if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
  52. self.add_results(*self.pop(1))
  53. elif token is self.KEYWORD_ENDOBJ:
  54. self.add_results(*self.pop(4))
  55. elif token is self.KEYWORD_NULL:
  56. # null object
  57. self.push((pos, None))
  58. elif token is self.KEYWORD_R:
  59. # reference to indirect object
  60. try:
  61. ((_, objid), (_, genno)) = self.pop(2)
  62. (objid, genno) = (int(objid), int(genno))
  63. obj = PDFObjRef(self.doc, objid, genno)
  64. self.push((pos, obj))
  65. except PSSyntaxError:
  66. pass
  67. elif token is self.KEYWORD_STREAM:
  68. # stream object
  69. ((_, dic),) = self.pop(1)
  70. dic = dict_value(dic)
  71. objlen = 0
  72. if not self.fallback:
  73. try:
  74. objlen = int_value(dic['Length'])
  75. except KeyError:
  76. if STRICT:
  77. raise PDFSyntaxError('/Length is undefined: %r' % dic)
  78. self.seek(pos)
  79. try:
  80. (_, line) = self.nextline() # 'stream'
  81. except PSEOF:
  82. if STRICT:
  83. raise PDFSyntaxError('Unexpected EOF')
  84. return
  85. pos += len(line)
  86. self.fp.seek(pos)
  87. data = self.fp.read(objlen)
  88. self.seek(pos+objlen)
  89. while 1:
  90. try:
  91. (linepos, line) = self.nextline()
  92. except PSEOF:
  93. if STRICT:
  94. raise PDFSyntaxError('Unexpected EOF')
  95. break
  96. if 'endstream' in line:
  97. i = line.index('endstream')
  98. objlen += i
  99. data += line[:i]
  100. break
  101. objlen += len(line)
  102. data += line
  103. self.seek(pos+objlen)
  104. # XXX limit objlen not to exceed object boundary
  105. if 2 <= self.debug:
  106. print >>sys.stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
  107. (pos, objlen, dic, data[:10])
  108. obj = PDFStream(dic, data, self.doc.decipher)
  109. self.push((pos, obj))
  110. else:
  111. # others
  112. self.push((pos, token))
  113. return
  114. ## PDFStreamParser
  115. ##
  116. class PDFStreamParser(PDFParser):
  117. """
  118. PDFStreamParser is used to parse PDF content streams
  119. that is contained in each page and has instructions
  120. for rendering the page. A reference to a PDF document is
  121. needed because a PDF content stream can also have
  122. indirect references to other objects in the same document.
  123. """
  124. def __init__(self, data):
  125. PDFParser.__init__(self, StringIO(data))
  126. return
  127. def flush(self):
  128. self.add_results(*self.popall())
  129. return
  130. def do_keyword(self, pos, token):
  131. if token is self.KEYWORD_R:
  132. # reference to indirect object
  133. try:
  134. ((_, objid), (_, genno)) = self.pop(2)
  135. (objid, genno) = (int(objid), int(genno))
  136. obj = PDFObjRef(self.doc, objid, genno)
  137. self.push((pos, obj))
  138. except PSSyntaxError:
  139. pass
  140. return
  141. # others
  142. self.push((pos, token))
  143. return