pdf2txt.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. #!c:\tools\python27\python.exe
  2. import sys
  3. from pdfminer.pdfdocument import PDFDocument
  4. from pdfminer.pdfparser import PDFParser
  5. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  6. from pdfminer.pdfdevice import PDFDevice, TagExtractor
  7. from pdfminer.pdfpage import PDFPage
  8. from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
  9. from pdfminer.cmapdb import CMapDB
  10. from pdfminer.layout import LAParams
  11. from pdfminer.image import ImageWriter
  12. # main
  13. def main(argv):
  14. import getopt
  15. def usage():
  16. print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
  17. ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
  18. ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
  19. ' [-t text|html|xml|tag] [-c codec] [-s scale]'
  20. ' file ...' % argv[0])
  21. return 100
  22. try:
  23. (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
  24. except getopt.GetoptError:
  25. return usage()
  26. if not args: return usage()
  27. # debug option
  28. debug = 0
  29. # input option
  30. password = ''
  31. pagenos = set()
  32. maxpages = 0
  33. # output option
  34. outfile = None
  35. outtype = None
  36. imagewriter = None
  37. rotation = 0
  38. layoutmode = 'normal'
  39. codec = 'utf-8'
  40. pageno = 1
  41. scale = 1
  42. caching = True
  43. showpageno = True
  44. laparams = LAParams()
  45. for (k, v) in opts:
  46. if k == '-d': debug += 1
  47. elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
  48. elif k == '-m': maxpages = int(v)
  49. elif k == '-P': password = v
  50. elif k == '-o': outfile = v
  51. elif k == '-C': caching = False
  52. elif k == '-n': laparams = None
  53. elif k == '-A': laparams.all_texts = True
  54. elif k == '-V': laparams.detect_vertical = True
  55. elif k == '-M': laparams.char_margin = float(v)
  56. elif k == '-L': laparams.line_margin = float(v)
  57. elif k == '-W': laparams.word_margin = float(v)
  58. elif k == '-F': laparams.boxes_flow = float(v)
  59. elif k == '-Y': layoutmode = v
  60. elif k == '-O': imagewriter = ImageWriter(v)
  61. elif k == '-R': rotation = int(v)
  62. elif k == '-t': outtype = v
  63. elif k == '-c': codec = v
  64. elif k == '-s': scale = float(v)
  65. #
  66. PDFDocument.debug = debug
  67. PDFParser.debug = debug
  68. CMapDB.debug = debug
  69. PDFResourceManager.debug = debug
  70. PDFPageInterpreter.debug = debug
  71. PDFDevice.debug = debug
  72. #
  73. rsrcmgr = PDFResourceManager(caching=caching)
  74. if not outtype:
  75. outtype = 'text'
  76. if outfile:
  77. if outfile.endswith('.htm') or outfile.endswith('.html'):
  78. outtype = 'html'
  79. elif outfile.endswith('.xml'):
  80. outtype = 'xml'
  81. elif outfile.endswith('.tag'):
  82. outtype = 'tag'
  83. if outfile:
  84. outfp = file(outfile, 'w')
  85. else:
  86. outfp = sys.stdout
  87. if outtype == 'text':
  88. device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
  89. imagewriter=imagewriter)
  90. elif outtype == 'xml':
  91. device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
  92. imagewriter=imagewriter)
  93. elif outtype == 'html':
  94. device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
  95. layoutmode=layoutmode, laparams=laparams,
  96. imagewriter=imagewriter)
  97. elif outtype == 'tag':
  98. device = TagExtractor(rsrcmgr, outfp, codec=codec)
  99. else:
  100. return usage()
  101. for fname in args:
  102. fp = file(fname, 'rb')
  103. interpreter = PDFPageInterpreter(rsrcmgr, device)
  104. for page in PDFPage.get_pages(fp, pagenos,
  105. maxpages=maxpages, password=password,
  106. caching=caching, check_extractable=True):
  107. page.rotate = (page.rotate+rotation) % 360
  108. interpreter.process_page(page)
  109. fp.close()
  110. device.close()
  111. outfp.close()
  112. return
  113. if __name__ == '__main__': sys.exit(main(sys.argv))