#!/usr/bin/env python
from utils import mult_matrix, translate_matrix
from utils import enc, bbox2str, isnumber
from pdffont import PDFUnicodeNotDefined

##  PDFDevice
##
  7. class PDFDevice(object):
  8. debug = 0
  9. def __init__(self, rsrcmgr):
  10. self.rsrcmgr = rsrcmgr
  11. self.ctm = None
  12. return
  13. def __repr__(self):
  14. return '<PDFDevice>'
  15. def close(self):
  16. return
  17. def set_ctm(self, ctm):
  18. self.ctm = ctm
  19. return
  20. def begin_tag(self, tag, props=None):
  21. return
  22. def end_tag(self):
  23. return
  24. def do_tag(self, tag, props=None):
  25. return
  26. def begin_page(self, page, ctm):
  27. return
  28. def end_page(self, page):
  29. return
  30. def begin_figure(self, name, bbox, matrix):
  31. return
  32. def end_figure(self, name):
  33. return
  34. def paint_path(self, graphicstate, stroke, fill, evenodd, path):
  35. return
  36. def render_image(self, name, stream):
  37. return
  38. def render_string(self, textstate, seq):
  39. return
##  PDFTextDevice
##
  42. class PDFTextDevice(PDFDevice):
  43. def render_string(self, textstate, seq):
  44. matrix = mult_matrix(textstate.matrix, self.ctm)
  45. font = textstate.font
  46. fontsize = textstate.fontsize
  47. scaling = textstate.scaling * .01
  48. charspace = textstate.charspace * scaling
  49. wordspace = textstate.wordspace * scaling
  50. rise = textstate.rise
  51. if font.is_multibyte():
  52. wordspace = 0
  53. dxscale = .001 * fontsize * scaling
  54. if font.is_vertical():
  55. textstate.linematrix = self.render_string_vertical(
  56. seq, matrix, textstate.linematrix, font, fontsize,
  57. scaling, charspace, wordspace, rise, dxscale)
  58. else:
  59. textstate.linematrix = self.render_string_horizontal(
  60. seq, matrix, textstate.linematrix, font, fontsize,
  61. scaling, charspace, wordspace, rise, dxscale)
  62. return
  63. def render_string_horizontal(self, seq, matrix, (x, y),
  64. font, fontsize, scaling, charspace, wordspace, rise, dxscale):
  65. needcharspace = False
  66. for obj in seq:
  67. if isnumber(obj):
  68. x -= obj*dxscale
  69. needcharspace = True
  70. else:
  71. for cid in font.decode(obj):
  72. if needcharspace:
  73. x += charspace
  74. x += self.render_char(translate_matrix(matrix, (x, y)),
  75. font, fontsize, scaling, rise, cid)
  76. if cid == 32 and wordspace:
  77. x += wordspace
  78. needcharspace = True
  79. return (x, y)
  80. def render_string_vertical(self, seq, matrix, (x, y),
  81. font, fontsize, scaling, charspace, wordspace, rise, dxscale):
  82. needcharspace = False
  83. for obj in seq:
  84. if isnumber(obj):
  85. y -= obj*dxscale
  86. needcharspace = True
  87. else:
  88. for cid in font.decode(obj):
  89. if needcharspace:
  90. y += charspace
  91. y += self.render_char(translate_matrix(matrix, (x, y)),
  92. font, fontsize, scaling, rise, cid)
  93. if cid == 32 and wordspace:
  94. y += wordspace
  95. needcharspace = True
  96. return (x, y)
  97. def render_char(self, matrix, font, fontsize, scaling, rise, cid):
  98. return 0
##  TagExtractor
##
  101. class TagExtractor(PDFDevice):
  102. def __init__(self, rsrcmgr, outfp, codec='utf-8', debug=0):
  103. PDFDevice.__init__(self, rsrcmgr)
  104. self.outfp = outfp
  105. self.codec = codec
  106. self.debug = debug
  107. self.pageno = 0
  108. self._stack = []
  109. return
  110. def render_string(self, textstate, seq):
  111. font = textstate.font
  112. text = ''
  113. for obj in seq:
  114. if not isinstance(obj, str):
  115. continue
  116. chars = font.decode(obj)
  117. for cid in chars:
  118. try:
  119. char = font.to_unichr(cid)
  120. text += char
  121. except PDFUnicodeNotDefined:
  122. pass
  123. self.outfp.write(enc(text, self.codec))
  124. return
  125. def begin_page(self, page, ctm):
  126. self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
  127. (self.pageno, bbox2str(page.mediabox), page.rotate))
  128. return
  129. def end_page(self, page):
  130. self.outfp.write('</page>\n')
  131. self.pageno += 1
  132. return
  133. def begin_tag(self, tag, props=None):
  134. s = ''
  135. if isinstance(props, dict):
  136. s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
  137. in sorted(props.iteritems()))
  138. self.outfp.write('<%s%s>' % (enc(tag.name), s))
  139. self._stack.append(tag)
  140. return
  141. def end_tag(self):
  142. assert self._stack
  143. tag = self._stack.pop(-1)
  144. self.outfp.write('</%s>' % enc(tag.name))
  145. return
  146. def do_tag(self, tag, props=None):
  147. self.begin_tag(tag, props)
  148. self._stack.pop(-1)
  149. return