dumppdf.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. #!c:\tools\python27\python.exe
  2. #
  3. # dumppdf.py - dump pdf contents in XML format.
  4. #
  5. # usage: dumppdf.py [options] [files ...]
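# options:
#   -d           : increase the debug level (may be repeated)
#   -a           : dump all objects
#   -p pageno    : page number(s), comma-separated, counted from 1
#   -P password  : password for the document
#   -r | -b | -t : dump stream contents as raw / binary / text
#   -T           : dump the document outline
#   -E directory : extract embedded files into directory
#   -i objid     : object id(s), comma-separated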
  8. #
  9. import sys, os.path, re
  10. from pdfminer.psparser import PSKeyword, PSLiteral, LIT
  11. from pdfminer.pdfparser import PDFParser
  12. from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
  13. from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
  14. from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
  15. from pdfminer.pdfpage import PDFPage
  16. from pdfminer.utils import isnumber
  17. ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
  18. def e(s):
  19. return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
  20. # dumpxml
  21. def dumpxml(out, obj, codec=None):
  22. if obj is None:
  23. out.write('<null />')
  24. return
  25. if isinstance(obj, dict):
  26. out.write('<dict size="%d">\n' % len(obj))
  27. for (k,v) in obj.iteritems():
  28. out.write('<key>%s</key>\n' % k)
  29. out.write('<value>')
  30. dumpxml(out, v)
  31. out.write('</value>\n')
  32. out.write('</dict>')
  33. return
  34. if isinstance(obj, list):
  35. out.write('<list size="%d">\n' % len(obj))
  36. for v in obj:
  37. dumpxml(out, v)
  38. out.write('\n')
  39. out.write('</list>')
  40. return
  41. if isinstance(obj, str):
  42. out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
  43. return
  44. if isinstance(obj, PDFStream):
  45. if codec == 'raw':
  46. out.write(obj.get_rawdata())
  47. elif codec == 'binary':
  48. out.write(obj.get_data())
  49. else:
  50. out.write('<stream>\n<props>\n')
  51. dumpxml(out, obj.attrs)
  52. out.write('\n</props>\n')
  53. if codec == 'text':
  54. data = obj.get_data()
  55. out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
  56. out.write('</stream>')
  57. return
  58. if isinstance(obj, PDFObjRef):
  59. out.write('<ref id="%d" />' % obj.objid)
  60. return
  61. if isinstance(obj, PSKeyword):
  62. out.write('<keyword>%s</keyword>' % obj.name)
  63. return
  64. if isinstance(obj, PSLiteral):
  65. out.write('<literal>%s</literal>' % obj.name)
  66. return
  67. if isnumber(obj):
  68. out.write('<number>%s</number>' % obj)
  69. return
  70. raise TypeError(obj)
  71. # dumptrailers
  72. def dumptrailers(out, doc):
  73. for xref in doc.xrefs:
  74. out.write('<trailer>\n')
  75. dumpxml(out, xref.trailer)
  76. out.write('\n</trailer>\n\n')
  77. return
  78. # dumpallobjs
  79. def dumpallobjs(out, doc, codec=None):
  80. visited = set()
  81. out.write('<pdf>')
  82. for xref in doc.xrefs:
  83. for objid in xref.get_objids():
  84. if objid in visited: continue
  85. visited.add(objid)
  86. try:
  87. obj = doc.getobj(objid)
  88. if obj is None: continue
  89. out.write('<object id="%d">\n' % objid)
  90. dumpxml(out, obj, codec=codec)
  91. out.write('\n</object>\n\n')
  92. except PDFObjectNotFound, e:
  93. print >>sys.stderr, 'not found: %r' % e
  94. dumptrailers(out, doc)
  95. out.write('</pdf>')
  96. return
  97. # dumpoutline
  98. def dumpoutline(outfp, fname, objids, pagenos, password='',
  99. dumpall=False, codec=None, extractdir=None):
  100. fp = file(fname, 'rb')
  101. parser = PDFParser(fp)
  102. doc = PDFDocument(parser, password)
  103. pages = dict( (page.pageid, pageno) for (pageno,page)
  104. in enumerate(PDFPage.create_pages(doc)) )
  105. def resolve_dest(dest):
  106. if isinstance(dest, str):
  107. dest = resolve1(doc.get_dest(dest))
  108. elif isinstance(dest, PSLiteral):
  109. dest = resolve1(doc.get_dest(dest.name))
  110. if isinstance(dest, dict):
  111. dest = dest['D']
  112. return dest
  113. try:
  114. outlines = doc.get_outlines()
  115. outfp.write('<outlines>\n')
  116. for (level,title,dest,a,se) in outlines:
  117. pageno = None
  118. if dest:
  119. dest = resolve_dest(dest)
  120. pageno = pages[dest[0].objid]
  121. elif a:
  122. action = a.resolve()
  123. if isinstance(action, dict):
  124. subtype = action.get('S')
  125. if subtype and repr(subtype) == '/GoTo' and action.get('D'):
  126. dest = resolve_dest(action['D'])
  127. pageno = pages[dest[0].objid]
  128. s = e(title).encode('utf-8', 'xmlcharrefreplace')
  129. outfp.write('<outline level="%r" title="%s">\n' % (level, s))
  130. if dest is not None:
  131. outfp.write('<dest>')
  132. dumpxml(outfp, dest)
  133. outfp.write('</dest>\n')
  134. if pageno is not None:
  135. outfp.write('<pageno>%r</pageno>\n' % pageno)
  136. outfp.write('</outline>\n')
  137. outfp.write('</outlines>\n')
  138. except PDFNoOutlines:
  139. pass
  140. parser.close()
  141. fp.close()
  142. return
  143. # extractembedded
  144. LITERAL_FILESPEC = LIT('Filespec')
  145. LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
  146. def extractembedded(outfp, fname, objids, pagenos, password='',
  147. dumpall=False, codec=None, extractdir=None):
  148. def extract1(obj):
  149. filename = os.path.basename(obj['UF'] or obj['F'])
  150. fileref = obj['EF']['F']
  151. fileobj = doc.getobj(fileref.objid)
  152. if not isinstance(fileobj, PDFStream):
  153. raise PDFValueError(
  154. 'unable to process PDF: reference for %r is not a PDFStream' %
  155. (filename))
  156. if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
  157. raise PDFValueError(
  158. 'unable to process PDF: reference for %r is not an EmbeddedFile' %
  159. (filename))
  160. path = os.path.join(extractdir, filename)
  161. if os.path.exists(path):
  162. raise IOError('file exists: %r' % path)
  163. print >>sys.stderr, 'extracting: %r' % path
  164. out = file(path, 'wb')
  165. out.write(fileobj.get_data())
  166. out.close()
  167. return
  168. fp = file(fname, 'rb')
  169. parser = PDFParser(fp)
  170. doc = PDFDocument(parser, password)
  171. for xref in doc.xrefs:
  172. for objid in xref.get_objids():
  173. obj = doc.getobj(objid)
  174. if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
  175. extract1(obj)
  176. return
  177. # dumppdf
  178. def dumppdf(outfp, fname, objids, pagenos, password='',
  179. dumpall=False, codec=None, extractdir=None):
  180. fp = file(fname, 'rb')
  181. parser = PDFParser(fp)
  182. doc = PDFDocument(parser, password)
  183. if objids:
  184. for objid in objids:
  185. obj = doc.getobj(objid)
  186. dumpxml(outfp, obj, codec=codec)
  187. if pagenos:
  188. for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
  189. if pageno in pagenos:
  190. if codec:
  191. for obj in page.contents:
  192. obj = stream_value(obj)
  193. dumpxml(outfp, obj, codec=codec)
  194. else:
  195. dumpxml(outfp, page.attrs)
  196. if dumpall:
  197. dumpallobjs(outfp, doc, codec=codec)
  198. if (not objids) and (not pagenos) and (not dumpall):
  199. dumptrailers(outfp, doc)
  200. fp.close()
  201. if codec not in ('raw','binary'):
  202. outfp.write('\n')
  203. return
  204. # main
  205. def main(argv):
  206. import getopt
  207. def usage():
  208. print 'usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0]
  209. return 100
  210. try:
  211. (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:')
  212. except getopt.GetoptError:
  213. return usage()
  214. if not args: return usage()
  215. debug = 0
  216. objids = []
  217. pagenos = set()
  218. codec = None
  219. password = ''
  220. dumpall = False
  221. proc = dumppdf
  222. outfp = sys.stdout
  223. extractdir = None
  224. for (k, v) in opts:
  225. if k == '-d': debug += 1
  226. elif k == '-o': outfp = file(v, 'wb')
  227. elif k == '-i': objids.extend( int(x) for x in v.split(',') )
  228. elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
  229. elif k == '-P': password = v
  230. elif k == '-a': dumpall = True
  231. elif k == '-r': codec = 'raw'
  232. elif k == '-b': codec = 'binary'
  233. elif k == '-t': codec = 'text'
  234. elif k == '-T': proc = dumpoutline
  235. elif k == '-E':
  236. extractdir = v
  237. proc = extractembedded
  238. #
  239. PDFDocument.debug = debug
  240. PDFParser.debug = debug
  241. #
  242. for fname in args:
  243. proc(outfp, fname, objids, pagenos, password=password,
  244. dumpall=dumpall, codec=codec, extractdir=extractdir)
  245. return
  246. if __name__ == '__main__': sys.exit(main(sys.argv))