converter.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. #!/usr/bin/env python
  2. import sys
  3. from pdfdevice import PDFTextDevice
  4. from pdffont import PDFUnicodeNotDefined
  5. from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
  6. from layout import LTFigure, LTImage, LTChar, LTTextLine
  7. from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
  8. from utils import apply_matrix_pt, mult_matrix
  9. from utils import enc, bbox2str
  10. ## PDFLayoutAnalyzer
  11. ##
  class PDFLayoutAnalyzer(PDFTextDevice):
      """Device that assembles rendered page content into layout objects.

      While a page is interpreted, characters, images and paths are added to
      ``self.cur_item`` (an ``LTPage``, or an ``LTFigure`` while inside a
      figure).  When the page ends, the finished ``LTPage`` is optionally
      analyzed with ``laparams`` and passed to ``receive_layout``, which
      subclasses override to consume it.
      """

      def __init__(self, rsrcmgr, pageno=1, laparams=None):
          """Set up the analyzer.

          rsrcmgr  -- forwarded to ``PDFTextDevice.__init__``.
          pageno   -- id given to the first page; incremented after each page.
          laparams -- passed to ``LTPage.analyze``; analysis is skipped
                      when this is None.
          """
          PDFTextDevice.__init__(self, rsrcmgr)
          self.pageno = pageno
          self.laparams = laparams
          # Containers put aside while a nested figure is being filled.
          self._stack = []

      def begin_page(self, page, ctm):
          """Start a new ``LTPage`` sized from the transformed media box."""
          (left, bottom, right, top) = page.mediabox
          (left, bottom) = apply_matrix_pt(ctm, (left, bottom))
          (right, top) = apply_matrix_pt(ctm, (right, top))
          # The page box is anchored at the origin; only its extent is kept.
          width = abs(left - right)
          height = abs(bottom - top)
          self.cur_item = LTPage(self.pageno, (0, 0, width, height))

      def end_page(self, page):
          """Finish the current page and hand it to ``receive_layout``."""
          # Every begin_figure must have been matched by an end_figure.
          assert not self._stack
          finished = self.cur_item
          assert isinstance(finished, LTPage)
          if self.laparams is not None:
              finished.analyze(self.laparams)
          self.pageno += 1
          self.receive_layout(finished)

      def begin_figure(self, name, bbox, matrix):
          """Open a nested ``LTFigure``; the current container is stacked."""
          self._stack.append(self.cur_item)
          figure_matrix = mult_matrix(matrix, self.ctm)
          self.cur_item = LTFigure(name, bbox, figure_matrix)

      def end_figure(self, _):
          """Close the current figure and add it to its parent container."""
          figure = self.cur_item
          assert isinstance(figure, LTFigure)
          parent = self._stack.pop()
          self.cur_item = parent
          parent.add(figure)

      def render_image(self, name, stream):
          """Add an ``LTImage`` that spans the enclosing figure's box."""
          figure = self.cur_item
          assert isinstance(figure, LTFigure)
          bbox = (figure.x0, figure.y0, figure.x1, figure.y1)
          figure.add(LTImage(name, stream, bbox))

      def paint_path(self, gstate, stroke, fill, evenodd, path):
          """Record a painted path as an ``LTLine``, ``LTRect`` or ``LTCurve``.

          The path is classified by the sequence of its operator letters
          (the first element of every path segment).
          """
          ctm = self.ctm
          shape = ''.join(segment[0] for segment in path)
          if shape == 'ml':
              # Two-point path: a line if it is horizontal or vertical.
              ((x0, y0), (x1, y1)) = [apply_matrix_pt(ctm, (px, py))
                                      for (_, px, py) in path[:2]]
              if x0 == x1 or y0 == y1:
                  self.cur_item.add(LTLine(gstate.linewidth, (x0, y0), (x1, y1)))
                  return
          if shape == 'mlllh':
              # Closed four-point path: a rectangle if its edges are
              # axis-aligned (in either winding order).
              ((x0, y0), (x1, y1), (x2, y2), (x3, y3)) = [
                  apply_matrix_pt(ctm, (px, py)) for (_, px, py) in path[:4]]
              if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
                      (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
                  self.cur_item.add(LTRect(gstate.linewidth, (x0, y0, x2, y2)))
                  return
          # Anything else: keep every coordinate pair of every segment.
          points = [apply_matrix_pt(ctm, (segment[i], segment[i+1]))
                    for segment in path
                    for i in xrange(1, len(segment), 2)]
          self.cur_item.add(LTCurve(gstate.linewidth, points))

      def render_char(self, matrix, font, fontsize, scaling, rise, cid):
          """Add an ``LTChar`` for ``cid`` and return its ``adv`` attribute."""
          try:
              text = font.to_unichr(cid)
              assert isinstance(text, unicode), text
          except PDFUnicodeNotDefined:
              # No Unicode mapping for this cid: use a placeholder string.
              text = self.handle_undefined_char(font, cid)
          char = LTChar(matrix, font, fontsize, scaling, rise, text,
                        font.char_width(cid), font.char_disp(cid))
          self.cur_item.add(char)
          return char.adv

      def handle_undefined_char(self, font, cid):
          """Return the placeholder text used for an unmapped ``cid``.

          When ``self.debug`` is set (not defined in this class; presumably
          inherited from the device base class) the event is also reported
          on stderr.
          """
          if self.debug:
              print >>sys.stderr, 'undefined: %r, %r' % (font, cid)
          return '(cid:%d)' % cid

      def receive_layout(self, ltpage):
          """Hook called with each finished page; does nothing here."""
          return
  100. ## PDFPageAggregator
  101. ##
  class PDFPageAggregator(PDFLayoutAnalyzer):
      """Layout analyzer that remembers the most recently finished page."""

      def __init__(self, rsrcmgr, pageno=1, laparams=None):
          """Initialise the analyzer with no page stored yet."""
          PDFLayoutAnalyzer.__init__(
              self, rsrcmgr,
              pageno=pageno,
              laparams=laparams)
          # Overwritten by receive_layout every time a page completes.
          self.result = None

      def receive_layout(self, ltpage):
          """Store ``ltpage``, replacing any previously stored page."""
          self.result = ltpage

      def get_result(self):
          """Return the last page received, or None if there was none."""
          return self.result
  112. ## PDFConverter
  113. ##
  class PDFConverter(PDFLayoutAnalyzer):
      """Base class for analyzers that write their output to a file object."""

      def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
          """Remember the output file object and the codec name.

          outfp -- object whose ``write`` method subclasses use for output.
          codec -- encoding name subclasses use when emitting text.
          The remaining arguments go to ``PDFLayoutAnalyzer.__init__``.
          """
          PDFLayoutAnalyzer.__init__(
              self, rsrcmgr,
              pageno=pageno,
              laparams=laparams)
          self.outfp = outfp
          self.codec = codec
  120. ## TextConverter
  121. ##
  class TextConverter(PDFConverter):
      """Converter that writes the text of each page to ``outfp``."""

      def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                   showpageno=False, imagewriter=None):
          """Set up the converter.

          showpageno  -- when true, each page is preceded by a 'Page N' line.
          imagewriter -- optional object whose ``export_image`` is called for
                         every image; images are ignored when it is None.
          The remaining arguments go to ``PDFConverter.__init__``.
          """
          PDFConverter.__init__(
              self, rsrcmgr, outfp,
              codec=codec,
              pageno=pageno,
              laparams=laparams)
          self.showpageno = showpageno
          self.imagewriter = imagewriter

      def write_text(self, text):
          """Encode ``text`` with the configured codec and write it.

          Characters the codec cannot represent are dropped ('ignore').
          """
          encoded = text.encode(self.codec, 'ignore')
          self.outfp.write(encoded)

      def receive_layout(self, ltpage):
          """Write the text of ``ltpage`` followed by a form feed."""
          def emit(node):
              # Containers are walked recursively; text leaves are written.
              if isinstance(node, LTContainer):
                  for member in node:
                      emit(member)
              elif isinstance(node, LTText):
                  self.write_text(node.get_text())
              # Checked separately from the test above: a text box gets a
              # trailing newline once its children have been written.
              if isinstance(node, LTTextBox):
                  self.write_text('\n')
              elif isinstance(node, LTImage) and self.imagewriter is not None:
                  self.imagewriter.export_image(node)

          if self.showpageno:
              self.write_text('Page %s\n' % ltpage.pageid)
          emit(ltpage)
          self.write_text('\f')

      # Some dummy functions to save memory/CPU when all that is wanted
      # is text.  This stops all the image and drawing output from being
      # recorded and taking up RAM.
      def render_image(self, name, stream):
          """Record the image only when an image writer is configured."""
          if self.imagewriter is not None:
              PDFConverter.render_image(self, name, stream)

      def paint_path(self, gstate, stroke, fill, evenodd, path):
          """Ignore drawing operations; nothing is recorded."""
          return
  159. ## HTMLConverter
  160. ##
  class HTMLConverter(PDFConverter):
      """Converter that writes each page as absolutely positioned HTML.

      The HTML header is written when the converter is constructed and the
      footer (an index of page anchors) when ``close`` is called.  Borders
      and positioned text are only emitted for element kinds that have an
      entry in ``rect_colors`` / ``text_colors``.
      """

      # Colour tables merged into the instance tables when ``self.debug``
      # is set, so that every layout element becomes visible.
      RECT_COLORS = {
          #'char': 'green',
          'figure': 'yellow',
          'textline': 'magenta',
          'textbox': 'cyan',
          'textgroup': 'red',
          'curve': 'black',
          'page': 'gray',
      }

      TEXT_COLORS = {
          'textbox': 'blue',
          'char': 'black',
      }

      def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                   scale=1, fontscale=1.0, layoutmode='normal', showpageno=True,
                   pagemargin=50, imagewriter=None,
                   rect_colors={'curve': 'black', 'page': 'gray'},
                   text_colors={'char': 'black'}):
          """Set up the converter and write the HTML header.

          scale       -- factor applied to all positions and sizes.
          fontscale   -- additional factor applied to font sizes.
          layoutmode  -- 'exact' positions every character individually;
                         'loose' suppresses the line break after each text
                         line; any other value flows text inside text boxes.
          showpageno  -- when true, a 'Page N' anchor is written per page.
          pagemargin  -- vertical gap added before the first and after
                         each page.
          imagewriter -- optional object whose ``export_image`` is called
                         for images; images are skipped when it is None.
          rect_colors -- mapping of element kind to border colour.
          text_colors -- mapping of element kind to text colour.
          The remaining arguments go to ``PDFConverter.__init__``.
          """
          PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
          self.scale = scale
          self.fontscale = fontscale
          self.layoutmode = layoutmode
          self.showpageno = showpageno
          self.pagemargin = pagemargin
          self.imagewriter = imagewriter
          # Work on private copies: the debug branch below mutates these
          # tables, and the default arguments are dict objects shared by
          # every call.  Without the copy, one debug instance would change
          # the defaults (or the caller's dicts) for all later instances.
          self.rect_colors = dict(rect_colors)
          self.text_colors = dict(text_colors)
          if self.debug:
              self.rect_colors.update(self.RECT_COLORS)
              self.text_colors.update(self.TEXT_COLORS)
          # Vertical position of the top of the page being written.
          self._yoffset = self.pagemargin
          # (fontname, fontsize) of the currently open <span>, or None.
          self._font = None
          # Saved _font values for the enclosing <div> elements.
          self._fontstack = []
          self.write_header()
          return

      def write(self, text):
          """Write ``text`` to the output file object unchanged."""
          self.outfp.write(text)
          return

      def write_header(self):
          """Write the opening HTML, declaring ``self.codec`` as charset."""
          self.write('<html><head>\n')
          self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
          self.write('</head><body>\n')
          return

      def write_footer(self):
          """Write links to the anchors 1 .. pageno-1 and close the HTML."""
          self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
                     ', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno)))
          self.write('</body></html>\n')
          return

      def write_text(self, text):
          """Write ``text`` after passing it through ``enc`` with the codec."""
          self.write(enc(text, self.codec))
          return

      def place_rect(self, color, borderwidth, x, y, w, h):
          """Write a bordered box if ``color`` has an entry in rect_colors.

          ``color`` is an element kind (a key of ``rect_colors``), not a
          CSS colour; nothing is written when the key is absent.
          """
          color = self.rect_colors.get(color)
          if color is not None:
              self.write('<span style="position:absolute; border: %s %dpx solid; '
                         'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
                         (color, borderwidth,
                          x*self.scale, (self._yoffset-y)*self.scale,
                          w*self.scale, h*self.scale))
          return

      def place_border(self, color, borderwidth, item):
          """Write a bordered box covering ``item``'s bounding box."""
          self.place_rect(color, borderwidth, item.x0, item.y1, item.width, item.height)
          return

      def place_image(self, item, borderwidth, x, y, w, h):
          """Export ``item`` and write an <img> for it.

          Does nothing when no image writer is configured.
          """
          if self.imagewriter is not None:
              name = self.imagewriter.export_image(item)
              self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
                         'width="%d" height="%d" />\n' %
                         (enc(name), borderwidth,
                          x*self.scale, (self._yoffset-y)*self.scale,
                          w*self.scale, h*self.scale))
          return

      def place_text(self, color, text, x, y, size):
          """Write positioned text if ``color`` has an entry in text_colors."""
          color = self.text_colors.get(color)
          if color is not None:
              self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
                         (color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale))
              self.write_text(text)
              self.write('</span>\n')
          return

      def begin_div(self, color, borderwidth, x, y, w, h, writing_mode=False):
          """Open a positioned <div> and start a fresh font context.

          NOTE(review): unlike place_rect, ``color`` is written into the
          CSS as given; receive_layout passes the element kind ('figure',
          'textbox'), which is not a CSS colour.  ``writing_mode`` is also
          written as given, so the default produces 'writing-mode:False'.
          Left unchanged here because fixing it alters the output.
          """
          self._fontstack.append(self._font)
          self._font = None
          self.write('<div style="position:absolute; border: %s %dpx solid; writing-mode:%s; '
                     'left:%dpx; top:%dpx; width:%dpx; height:%dpx;">' %
                     (color, borderwidth, writing_mode,
                      x*self.scale, (self._yoffset-y)*self.scale,
                      w*self.scale, h*self.scale))
          return

      def end_div(self, color):
          """Close any open font <span>, then the <div>; restore the font."""
          if self._font is not None:
              self.write('</span>')
          self._font = self._fontstack.pop()
          self.write('</div>')
          return

      def put_text(self, text, fontname, fontsize):
          """Write flowing text, opening a new <span> when the font changes."""
          font = (fontname, fontsize)
          if font != self._font:
              if self._font is not None:
                  self.write('</span>')
              # The font name comes from the PDF and lands inside an HTML
              # attribute, so it goes through enc() exactly as the image
              # name does in place_image.
              self.write('<span style="font-family: %s; font-size:%dpx">' %
                         (enc(fontname), fontsize * self.scale * self.fontscale))
              self._font = font
          self.write_text(text)
          return

      def put_newline(self):
          """Write a line break."""
          self.write('<br>')
          return

      def receive_layout(self, ltpage):
          """Write ``ltpage`` as HTML and advance the vertical offset."""
          def show_group(item):
              # Outline text groups recursively; other items are skipped.
              if isinstance(item, LTTextGroup):
                  self.place_border('textgroup', 1, item)
                  for child in item:
                      show_group(child)
              return

          def render(item):
              if isinstance(item, LTPage):
                  # Move down by the page height so that y coordinates,
                  # which are subtracted from _yoffset, land on this page.
                  self._yoffset += item.y1
                  self.place_border('page', 1, item)
                  if self.showpageno:
                      self.write('<div style="position:absolute; top:%dpx;">' %
                                 ((self._yoffset-item.y1)*self.scale))
                      self.write('<a name="%s">Page %s</a></div>\n' % (item.pageid, item.pageid))
                  for child in item:
                      render(child)
                  if item.groups is not None:
                      for group in item.groups:
                          show_group(group)
              elif isinstance(item, LTCurve):
                  self.place_border('curve', 1, item)
              elif isinstance(item, LTFigure):
                  self.begin_div('figure', 1, item.x0, item.y1, item.width, item.height)
                  for child in item:
                      render(child)
                  self.end_div('figure')
              elif isinstance(item, LTImage):
                  self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
              else:
                  if self.layoutmode == 'exact':
                      # Every character is positioned on its own.
                      if isinstance(item, LTTextLine):
                          self.place_border('textline', 1, item)
                          for child in item:
                              render(child)
                      elif isinstance(item, LTTextBox):
                          self.place_border('textbox', 1, item)
                          self.place_text('textbox', str(item.index+1), item.x0, item.y1, 20)
                          for child in item:
                              render(child)
                      elif isinstance(item, LTChar):
                          self.place_border('char', 1, item)
                          self.place_text('char', item.get_text(), item.x0, item.y1, item.size)
                  else:
                      # Text flows inside one positioned <div> per text box.
                      if isinstance(item, LTTextLine):
                          for child in item:
                              render(child)
                          if self.layoutmode != 'loose':
                              self.put_newline()
                      elif isinstance(item, LTTextBox):
                          self.begin_div('textbox', 1, item.x0, item.y1, item.width, item.height,
                                         item.get_writing_mode())
                          for child in item:
                              render(child)
                          self.end_div('textbox')
                      elif isinstance(item, LTChar):
                          self.put_text(item.get_text(), item.fontname, item.size)
                      elif isinstance(item, LTText):
                          self.write_text(item.get_text())
              return

          render(ltpage)
          self._yoffset += self.pagemargin
          return

      def close(self):
          """Finish the document by writing the footer."""
          self.write_footer()
          return
  336. ## XMLConverter
  337. ##
  class XMLConverter(PDFConverter):
      """Converter that writes each page's layout tree as XML.

      The XML declaration and the opening <pages> tag are written when the
      converter is constructed; ``close`` writes the closing tag.
      """

      def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
                   laparams=None, imagewriter=None):
          """Set up the converter and write the XML header.

          imagewriter -- optional object whose ``export_image`` is called
                         for images; when None, <image> elements are
                         written without a ``src`` attribute.
          The remaining arguments go to ``PDFConverter.__init__``.
          """
          PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
          self.imagewriter = imagewriter
          self.write_header()
          return

      def write_header(self):
          """Write the XML declaration and open the <pages> element."""
          self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
          self.outfp.write('<pages>\n')
          return

      def write_footer(self):
          """Close the <pages> element."""
          self.outfp.write('</pages>\n')
          return

      def write_text(self, text):
          """Write ``text`` after passing it through ``enc`` with the codec."""
          self.outfp.write(enc(text, self.codec))
          return

      def receive_layout(self, ltpage):
          """Write ``ltpage`` and everything it contains as XML.

          Raises AssertionError for an item of a type not handled below.
          """
          def show_group(item):
              # Layout summary: text boxes by reference, groups recursively.
              if isinstance(item, LTTextBox):
                  self.outfp.write('<textbox id="%d" bbox="%s" />\n' %
                                   (item.index, bbox2str(item.bbox)))
              elif isinstance(item, LTTextGroup):
                  self.outfp.write('<textgroup bbox="%s">\n' % bbox2str(item.bbox))
                  for child in item:
                      show_group(child)
                  self.outfp.write('</textgroup>\n')
              return

          def render(item):
              if isinstance(item, LTPage):
                  self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
                                   (item.pageid, bbox2str(item.bbox), item.rotate))
                  for child in item:
                      render(child)
                  if item.groups is not None:
                      self.outfp.write('<layout>\n')
                      for group in item.groups:
                          show_group(group)
                      self.outfp.write('</layout>\n')
                  self.outfp.write('</page>\n')
              # LTLine and LTRect are tested before the general LTCurve.
              elif isinstance(item, LTLine):
                  self.outfp.write('<line linewidth="%d" bbox="%s" />\n' %
                                   (item.linewidth, bbox2str(item.bbox)))
              elif isinstance(item, LTRect):
                  self.outfp.write('<rect linewidth="%d" bbox="%s" />\n' %
                                   (item.linewidth, bbox2str(item.bbox)))
              elif isinstance(item, LTCurve):
                  self.outfp.write('<curve linewidth="%d" bbox="%s" pts="%s"/>\n' %
                                   (item.linewidth, bbox2str(item.bbox), item.get_pts()))
              elif isinstance(item, LTFigure):
                  self.outfp.write('<figure name="%s" bbox="%s">\n' %
                                   (item.name, bbox2str(item.bbox)))
                  for child in item:
                      render(child)
                  self.outfp.write('</figure>\n')
              elif isinstance(item, LTTextLine):
                  self.outfp.write('<textline bbox="%s">\n' % bbox2str(item.bbox))
                  for child in item:
                      render(child)
                  self.outfp.write('</textline>\n')
              elif isinstance(item, LTTextBox):
                  wmode = ''
                  if isinstance(item, LTTextBoxVertical):
                      wmode = ' wmode="vertical"'
                  self.outfp.write('<textbox id="%d" bbox="%s"%s>\n' %
                                   (item.index, bbox2str(item.bbox), wmode))
                  for child in item:
                      render(child)
                  self.outfp.write('</textbox>\n')
              elif isinstance(item, LTChar):
                  self.outfp.write('<text font="%s" bbox="%s" size="%.3f">' %
                                   (enc(item.fontname), bbox2str(item.bbox), item.size))
                  self.write_text(item.get_text())
                  self.outfp.write('</text>\n')
              elif isinstance(item, LTText):
                  # Other text items go through write_text as well, so
                  # their content is passed through enc() just like LTChar
                  # text instead of being interpolated into the markup raw.
                  self.outfp.write('<text>')
                  self.write_text(item.get_text())
                  self.outfp.write('</text>\n')
              elif isinstance(item, LTImage):
                  if self.imagewriter is not None:
                      name = self.imagewriter.export_image(item)
                      self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
                                       (enc(name), item.width, item.height))
                  else:
                      self.outfp.write('<image width="%d" height="%d" />\n' %
                                       (item.width, item.height))
              else:
                  # Unknown layout item type.
                  assert 0, item
              return

          render(ltpage)
          return

      def close(self):
          """Finish the document by writing the footer."""
          self.write_footer()
          return