pdffont.py 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726
  1. #!/usr/bin/env python
  2. import sys
  3. import struct
  4. try:
  5. from cStringIO import StringIO
  6. except ImportError:
  7. from StringIO import StringIO
  8. from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
  9. from encodingdb import EncodingDB, name2unicode
  10. from psparser import PSStackParser
  11. from psparser import PSEOF
  12. from psparser import LIT, KWD, STRICT
  13. from psparser import PSLiteral, literal_name
  14. from pdftypes import PDFException, resolve1
  15. from pdftypes import int_value, num_value
  16. from pdftypes import list_value, dict_value, stream_value
  17. from fontmetrics import FONT_METRICS
  18. from utils import apply_matrix_norm, nunpack, choplist, isnumber
  def get_widths(seq):
      """Expand a compact width array into a {code: width} dictionary.

      Two entry shapes are recognized while scanning ``seq``:

      * a number followed by a list -- the listed widths are assigned to
        consecutive codes starting at that number;
      * three consecutive numbers ``c1 c2 w`` -- ``w`` is assigned to every
        code from ``c1`` to ``c2`` inclusive.

      Items that are neither lists nor accepted by isnumber() are ignored.
      """
      table = {}
      pending = []
      for item in seq:
          if isinstance(item, list):
              if pending:
                  # The most recently seen number is the first code of the run.
                  start = pending[-1]
                  table.update((start + offset, width)
                               for (offset, width) in enumerate(item))
              pending = []
          elif isnumber(item):
              pending.append(item)
              if len(pending) == 3:
                  (first, last, width) = pending
                  for code in xrange(first, last + 1):
                      table[code] = width
                  pending = []
      return table
  37. #assert get_widths([1]) == {}
  38. #assert get_widths([1,2,3]) == {1:3, 2:3}
  39. #assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
  def get_widths2(seq):
      """Expand a compact vertical-metrics array into a dictionary.

      The result maps each code to ``(w, (vx, vy))``.  Two entry shapes are
      recognized while scanning ``seq``:

      * a number followed by a list -- the list is consumed three values at
        a time (via choplist) and each triple is assigned to consecutive
        codes starting at that number;
      * five consecutive numbers ``c1 c2 w vx vy`` -- the same triple is
        assigned to every code from ``c1`` to ``c2`` inclusive.

      Items that are neither lists nor accepted by isnumber() are ignored.
      """
      table = {}
      pending = []
      for item in seq:
          if isinstance(item, list):
              if pending:
                  # The most recently seen number is the first code of the run.
                  start = pending[-1]
                  table.update(
                      (start + offset, (w, (vx, vy)))
                      for (offset, (w, vx, vy)) in enumerate(choplist(3, item)))
              pending = []
          elif isnumber(item):
              pending.append(item)
              if len(pending) == 5:
                  (first, last, w, vx, vy) = pending
                  for code in xrange(first, last + 1):
                      table[code] = (w, (vx, vy))
                  pending = []
      return table
  58. #assert get_widths2([1]) == {}
  59. #assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
  60. #assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
  61. ## FontMetricsDB
  62. ##
  class FontMetricsDB(object):
      """Thin lookup facade over the FONT_METRICS table."""

      @classmethod
      def get_metrics(cls, fontname):
          """Return the FONT_METRICS entry stored under ``fontname``.

          The lookup is a plain subscript, so whatever exception the table
          raises for an unknown name propagates to the caller
          (PDFType1Font catches KeyError around this call).
          """
          metrics = FONT_METRICS[fontname]
          return metrics
  67. ## Type1FontHeaderParser
  68. ##
  class Type1FontHeaderParser(PSStackParser):
      """Recover a code -> unicode table from the header of a Type 1 font.

      The parser watches for ``<int> <literal> put`` sequences.  Each one is
      reported as a ``(code, glyph_name)`` result, and get_encoding()
      converts the glyph names with name2unicode().
      """

      KEYWORD_BEGIN = KWD('begin')
      KEYWORD_END = KWD('end')
      KEYWORD_DEF = KWD('def')
      KEYWORD_PUT = KWD('put')
      KEYWORD_DICT = KWD('dict')
      KEYWORD_ARRAY = KWD('array')
      KEYWORD_READONLY = KWD('readonly')
      # (the original defined KEYWORD_FOR twice; one definition is enough)
      KEYWORD_FOR = KWD('for')

      def __init__(self, data):
          PSStackParser.__init__(self, data)
          self._cid2unicode = {}
          return

      def get_encoding(self):
          """Consume the whole input and return the accumulated mapping.

          Every ``(code, name)`` pair produced by do_keyword() is stored as
          ``code -> name2unicode(name)``.  Pairs for which that raises
          KeyError are skipped.  Parsing stops at PSEOF.
          """
          while 1:
              try:
                  (cid, name) = self.nextobject()
              except PSEOF:
                  break
              try:
                  self._cid2unicode[cid] = name2unicode(name)
              except KeyError:
                  # No Unicode value for this glyph name: leave it unmapped.
                  pass
          return self._cid2unicode

      def do_keyword(self, pos, token):
          """Handle a keyword; only ``put`` is acted upon.

          For ``put`` the two topmost stack entries are popped, and when
          they are an int key and a PSLiteral value the pair
          ``(key, literal_name(value))`` is added to the results.
          """
          if token is self.KEYWORD_PUT:
              ((_, key), (_, value)) = self.pop(2)
              if (isinstance(key, int) and
                  isinstance(value, PSLiteral)):
                  self.add_results((key, literal_name(value)))
          return
  # Text for each nibble of a CFF real-number operand (operator byte 30).
  # Nibble 0xd is reserved (None) and nibble 0xf terminates the number.
  NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')


  ## CFFFont
  ## (Format specified in Adobe Technical Note: #5176
  ##  "The Compact Font Format Specification")
  ##
  def getdict(data):
      """Parse the binary contents of a CFF DICT.

      ``data`` is the raw byte string of one DICT.  The result maps every
      operator to the list of operands that preceded it.  One-byte
      operators (0..21 except 12) are keyed by their integer value.
      Two-byte operators, introduced by the escape byte 12, are keyed by
      the tuple ``(12, second_byte)`` so that the second byte is never
      mistaken for an operand or for another operator.

      Operand encodings handled:

      * 32..246          one-byte integer
      * 247..254         two-byte integer
      * 28               16-bit signed integer
      * 30               real number in packed nibbles
      * any other byte   treated as a 32-bit signed integer (this is the
                         encoding of byte 29; reserved bytes are parsed
                         the same way, as before)

      Raises ValueError when the data ends in the middle of an item or a
      real number contains the reserved nibble.
      """
      buf = bytearray(data)  # yields ints on both Python 2 and 3
      size = len(buf)
      d = {}
      stack = []
      pos = 0
      try:
          while pos < size:
              b0 = buf[pos]
              pos += 1
              if b0 <= 21:
                  # Operator: it owns every operand collected so far.
                  if b0 == 12:
                      key = (b0, buf[pos])
                      pos += 1
                  else:
                      key = b0
                  d[key] = stack
                  stack = []
                  continue
              if b0 == 30:
                  digits = []
                  done = False
                  while not done:
                      b = buf[pos]
                      pos += 1
                      for n in (b >> 4, b & 15):
                          if n == 15:
                              # Terminator: ignore the padding nibble, if any.
                              done = True
                              break
                          if NIBBLES[n] is None:
                              raise ValueError('reserved nibble in CFF real number')
                          digits.append(NIBBLES[n])
                  value = float(''.join(digits))
              elif 32 <= b0 <= 246:
                  value = b0-139
              elif 247 <= b0 <= 250:
                  value = ((b0-247) << 8)+buf[pos]+108
                  pos += 1
              elif 251 <= b0 <= 254:
                  value = -((b0-251) << 8)-buf[pos]-108
                  pos += 1
              else:
                  b1 = buf[pos]
                  b2 = buf[pos+1]
                  if 128 <= b1:
                      # the leading byte carries the sign
                      b1 -= 256
                  if b0 == 28:
                      value = b1 << 8 | b2
                      pos += 2
                  else:
                      value = b1 << 24 | b2 << 16 | buf[pos+2] << 8 | buf[pos+3]
                      pos += 4
              stack.append(value)
      except IndexError:
          raise ValueError('truncated CFF DICT data')
      return d
  class CFFFont(object):
      """Reader for a bare CFF (Compact Font Format) font program.

      After construction the following attributes are available:

      * ``name_index``, ``dict_index``, ``string_index``, ``subr_index``,
        ``charstring`` -- INDEX objects for the corresponding structures;
      * ``top_dict`` -- the first Top DICT as returned by getdict();
      * ``nglyphs`` -- number of entries in the CharStrings INDEX;
      * ``name2gid`` / ``gid2name`` -- glyph name <-> glyph index;
      * ``code2gid`` / ``gid2code`` -- character code <-> glyph index.

      Glyph 0 (.notdef) is implicit in both the charset and the encoding
      and therefore never appears in the four mappings.  The predefined
      Standard/Expert encodings and Expert/ExpertSubset charsets are not
      bundled with this class; fonts that select them get empty mappings.
      """

      STANDARD_STRINGS = (
        '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
        'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
        'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
        'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
        'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
        'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
        'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
        'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
        'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
        'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
        'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
        'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
        'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
        'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
        'quotesinglbase', 'quotedblbase', 'quotedblright',
        'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
        'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
        'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
        'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
        'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
        'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
        'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
        'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
        'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
        'multiply', 'threesuperior', 'copyright', 'Aacute',
        'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
        'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
        'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
        'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
        'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
        'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
        'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
        'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
        'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
        'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
        'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
        'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
        'dollarsuperior', 'ampersandsmall', 'Acutesmall',
        'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
        'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
        'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
        'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
        'commasuperior', 'threequartersemdash', 'periodsuperior',
        'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
        'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
        'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
        'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
        'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
        'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
        'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
        'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
        'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
        'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
        'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
        'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
        'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
        'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
        'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
        'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
        'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
        'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
        'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
        'seveninferior', 'eightinferior', 'nineinferior',
        'centinferior', 'dollarinferior', 'periodinferior',
        'commainferior', 'Agravesmall', 'Aacutesmall',
        'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
        'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
        'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
        'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
        'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
        'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
        'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
        'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
        'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
        '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
        'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
        )

      class INDEX(object):
          """Random access to the objects of one CFF INDEX structure.

          Reading starts at the current position of ``fp``; on return the
          file is positioned just past the INDEX.
          """

          def __init__(self, fp):
              self.fp = fp
              self.offsets = []
              (count,) = struct.unpack('>H', self.fp.read(2))
              if count == 0:
                  # An empty INDEX is the count field alone: there is no
                  # offSize byte and no offset array to read.
                  self.base = self.fp.tell()
                  return
              (offsize,) = struct.unpack('B', self.fp.read(1))
              for i in xrange(count+1):
                  self.offsets.append(nunpack(self.fp.read(offsize)))
              # Offsets are 1-based relative to the byte before the data.
              self.base = self.fp.tell()-1
              self.fp.seek(self.base+self.offsets[-1])
              return

          def __repr__(self):
              return '<INDEX: size=%d>' % len(self)

          def __len__(self):
              return max(0, len(self.offsets)-1)

          def __getitem__(self, i):
              self.fp.seek(self.base+self.offsets[i])
              return self.fp.read(self.offsets[i+1]-self.offsets[i])

          def __iter__(self):
              return iter(self[i] for i in xrange(len(self)))

      def __init__(self, name, fp):
          """Parse the CFF data readable from ``fp``.

          ``name`` is stored as-is.  ``fp`` must be a seekable binary
          file-like object positioned at the start of the CFF header; it
          is kept, because INDEX objects read from it lazily.

          Raises ValueError for an unknown charset or encoding format.
          """
          self.name = name
          self.fp = fp
          # Header
          (_major, _minor, hdrsize, _offsize) = struct.unpack('BBBB', self.fp.read(4))
          self.fp.read(hdrsize-4)
          # Name INDEX
          self.name_index = self.INDEX(self.fp)
          # Top DICT INDEX
          self.dict_index = self.INDEX(self.fp)
          # String INDEX
          self.string_index = self.INDEX(self.fp)
          # Global Subr INDEX
          self.subr_index = self.INDEX(self.fp)
          # Top DICT DATA
          self.top_dict = getdict(self.dict_index[0])
          (charset_pos,) = self.top_dict.get(15, [0])
          (encoding_pos,) = self.top_dict.get(16, [0])
          (charstring_pos,) = self.top_dict.get(17, [0])
          # CharStrings
          self.fp.seek(charstring_pos)
          self.charstring = self.INDEX(self.fp)
          self.nglyphs = len(self.charstring)
          # Charsets (read first: encoding supplements refer to glyph names)
          self.name2gid = {}
          self.gid2name = {}
          self._read_charset(charset_pos)
          # Encodings
          self.code2gid = {}
          self.gid2code = {}
          self._read_encoding(encoding_pos)
          return

      def _read_charset(self, pos):
          """Fill name2gid/gid2name from the charset selected by ``pos``."""
          # Glyph 0 is always .notdef and is not listed in the charset.
          nnames = max(0, self.nglyphs-1)
          if pos == 0:
              # Predefined ISOAdobe charset: SIDs 1..228 in glyph order.
              sids = range(1, min(nnames, 228)+1)
          elif pos in (1, 2):
              # Predefined Expert / ExpertSubset charsets: these are ids,
              # not file offsets, and their tables are not bundled here.
              return
          else:
              self.fp.seek(pos)
              raw = self.fp.read(1)
              if not raw:
                  raise ValueError('unsupported charset format: %r' % raw)
              (fmt,) = struct.unpack('B', raw)
              if fmt == 0:
                  # Format 0: one SID per glyph.
                  sids = struct.unpack('>%dH' % nnames, self.fp.read(2*nnames))
              elif fmt in (1, 2):
                  # Formats 1 and 2: (first SID, nLeft) ranges, repeated
                  # until every glyph is covered.  nLeft is one byte wide
                  # in format 1 and two bytes wide in format 2.
                  if fmt == 1:
                      rangefmt = '>HB'
                  else:
                      rangefmt = '>HH'
                  rangesize = struct.calcsize(rangefmt)
                  sids = []
                  while len(sids) < nnames:
                      (first, nleft) = struct.unpack(rangefmt, self.fp.read(rangesize))
                      sids.extend(xrange(first, first+nleft+1))
                  del sids[nnames:]
              else:
                  raise ValueError('unsupported charset format: %r' % fmt)
          for (gid, sid) in enumerate(sids):
              gid += 1
              name = self.getstr(sid)
              self.name2gid[name] = gid
              self.gid2name[gid] = name
          return

      def _read_encoding(self, pos):
          """Fill code2gid/gid2code from the encoding selected by ``pos``."""
          if pos in (0, 1):
              # Predefined Standard / Expert encodings: these are ids, not
              # file offsets, and their tables are not bundled here.
              return
          self.fp.seek(pos)
          raw = self.fp.read(1)
          if not raw:
              raise ValueError('unsupported encoding format: %r' % raw)
          (fmt,) = struct.unpack('B', raw)
          # The high bit only flags a trailing supplement.
          basefmt = fmt & 0x7f
          codes = []
          if basefmt == 0:
              # Format 0: one code per glyph, in glyph order.
              (n,) = struct.unpack('B', self.fp.read(1))
              codes.extend(struct.unpack('B'*n, self.fp.read(n)))
          elif basefmt == 1:
              # Format 1: (first code, nLeft) ranges, in glyph order.
              (n,) = struct.unpack('B', self.fp.read(1))
              for i in xrange(n):
                  (first, nleft) = struct.unpack('BB', self.fp.read(2))
                  codes.extend(xrange(first, first+nleft+1))
          else:
              raise ValueError('unsupported encoding format: %r' % fmt)
          for (gid, code) in enumerate(codes):
              gid += 1
              self.code2gid[code] = gid
              self.gid2code[gid] = code
          if fmt & 0x80:
              # Supplement: extra codes for glyphs identified by SID.
              (n,) = struct.unpack('B', self.fp.read(1))
              for i in xrange(n):
                  (code, sid) = struct.unpack('>BH', self.fp.read(3))
                  gid = self.name2gid.get(self.getstr(sid))
                  if gid is None:
                      continue
                  self.code2gid[code] = gid
                  # keep the primary code when the glyph already has one
                  self.gid2code.setdefault(gid, code)
          return

      def getstr(self, sid):
          """Return the string for a String ID.

          SIDs below len(STANDARD_STRINGS) name a standard string; higher
          ones index the font's own String INDEX.
          """
          if sid < len(self.STANDARD_STRINGS):
              return self.STANDARD_STRINGS[sid]
          return self.string_index[sid-len(self.STANDARD_STRINGS)]
  331. ## TrueTypeFont
  332. ##
  class TrueTypeFont(object):
      """Minimal TrueType reader used to derive a glyph -> unicode map.

      The constructor reads the table directory into ``tables`` (tag ->
      ``(offset, length)``).  create_unicode_map() then decodes the 'cmap'
      table.
      """

      class CMapNotFound(Exception):
          """Raised when the font has no 'cmap' table."""
          pass

      def __init__(self, name, fp):
          """Read the table directory from ``fp``.

          ``fp`` must be a seekable binary file-like object positioned at
          the start of the font; it is kept for later reads.
          """
          self.name = name
          self.fp = fp
          self.tables = {}
          self.fonttype = fp.read(4)
          (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
          for _ in range(ntables):
              # 'tag' rather than 'name', so the argument is not shadowed
              (tag, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16))
              self.tables[tag] = (offset, length)
          return

      def create_unicode_map(self):
          """Build a FileUnicodeMap from the font's 'cmap' table.

          Subtable formats 0, 2 and 4 are decoded and merged in file
          order; subtables in any other format are skipped.

          Raises TrueTypeFont.CMapNotFound when there is no 'cmap' table.
          """
          if 'cmap' not in self.tables:
              raise TrueTypeFont.CMapNotFound
          (base_offset, length) = self.tables['cmap']
          fp = self.fp
          fp.seek(base_offset)
          (version, nsubtables) = struct.unpack('>HH', fp.read(4))
          subtables = []
          for i in range(nsubtables):
              subtables.append(struct.unpack('>HHL', fp.read(8)))
          char2gid = {}
          # Only supports subtable type 0, 2 and 4.
          for (_1, _2, st_offset) in subtables:
              fp.seek(base_offset+st_offset)
              (fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
              if fmttype == 0:
                  char2gid.update(enumerate(struct.unpack('>256B', fp.read(256))))
              elif fmttype == 2:
                  subheaderkeys = struct.unpack('>256H', fp.read(512))
                  firstbytes = [0]*8192
                  for (i, k) in enumerate(subheaderkeys):
                      firstbytes[k//8] = i
                  nhdrs = max(subheaderkeys)//8 + 1
                  hdrs = []
                  for i in range(nhdrs):
                      (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
                      # idRangeOffset counts from its own position, i.e.
                      # the last two bytes just read.
                      hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
                  for (i, firstcode, entcount, delta, pos) in hdrs:
                      if not entcount:
                          continue
                      first = firstcode + (firstbytes[i] << 8)
                      fp.seek(pos)
                      for c in range(entcount):
                          # unpack() returns a tuple; take the value itself
                          gid = struct.unpack('>H', fp.read(2))[0]
                          if gid:
                              gid = (gid + delta) & 0xffff
                          char2gid[first+c] = gid
              elif fmttype == 4:
                  (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
                  segcount //= 2
                  ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                  fp.read(2)
                  scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                  idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
                  pos = fp.tell()
                  idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
                  for (i, (ec, sc, idd, idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                      if idr:
                          # idRangeOffset[i] is relative to the location of
                          # idRangeOffset[i] itself, not to the array start.
                          fp.seek(pos+2*i+idr)
                          for c in range(sc, ec+1):
                              gid = struct.unpack('>H', fp.read(2))[0]
                              if gid:
                                  # glyph 0 means "missing": no delta added
                                  gid = (gid + idd) & 0xffff
                              char2gid[c] = gid
                      else:
                          for c in range(sc, ec+1):
                              char2gid[c] = (c + idd) & 0xffff
              else:
                  # Unsupported subtable format: ignore it instead of
                  # aborting, so other subtables can still be used.
                  continue
          # create unicode map
          unicode_map = FileUnicodeMap()
          for (char, gid) in char2gid.items():
              unicode_map.add_cid2unichr(gid, char)
          return unicode_map
  407. ## Fonts
  408. ##
  class PDFFontError(PDFException):
      """Error raised for a malformed font dictionary (e.g. no BaseFont)."""
  class PDFUnicodeNotDefined(PDFFontError):
      """Raised by to_unichr() when a character id has no Unicode mapping."""
  # Literal objects built once at import time.
  # LITERAL_STANDARD_ENCODING is the fallback PDFSimpleFont uses when a font
  # has no /Encoding entry or an encoding dictionary has no /BaseEncoding.
  LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
  # NOTE(review): LITERAL_TYPE1C is not referenced anywhere in this file;
  # presumably it is imported by other modules -- confirm before removing.
  LITERAL_TYPE1C = LIT('Type1C')
  415. # PDFFont
  class PDFFont(object):
      """Base class holding the metrics common to all PDF font objects.

      ``descriptor`` is a dictionary-like font descriptor and ``widths``
      maps character ids to glyph widths.  Stored widths and heights are
      converted with ``hscale`` / ``vscale`` (both .001 by default).
      Subclasses supply to_unichr().
      """

      def __init__(self, descriptor, widths, default_width=None):
          self.descriptor = descriptor
          self.widths = widths
          fontname = resolve1(descriptor.get('FontName', 'unknown'))
          if isinstance(fontname, PSLiteral):
              fontname = literal_name(fontname)
          self.fontname = fontname
          self.flags = int_value(descriptor.get('Flags', 0))
          self.ascent = num_value(descriptor.get('Ascent', 0))
          self.descent = num_value(descriptor.get('Descent', 0))
          self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
          # A missing (or zero) default falls back to the descriptor's value.
          if not default_width:
              default_width = num_value(descriptor.get('MissingWidth', 0))
          self.default_width = default_width
          self.leading = num_value(descriptor.get('Leading', 0))
          self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
          self.hscale = .001
          self.vscale = .001

      def __repr__(self):
          return '<PDFFont>'

      def is_vertical(self):
          """Tell whether the font is written vertically (never, here)."""
          return False

      def is_multibyte(self):
          """Tell whether character codes span several bytes (never, here)."""
          return False

      def decode(self, bytes):
          """Turn a string into the sequence of its character codes."""
          return [ord(ch) for ch in bytes]

      def get_ascent(self):
          """Return the ascent multiplied by vscale."""
          return self.ascent * self.vscale

      def get_descent(self):
          """Return the descent multiplied by vscale."""
          return self.descent * self.vscale

      def get_width(self):
          """Return the bounding-box width multiplied by hscale.

          When the box has zero width, the negated default width is used
          in its place.
          """
          span = self.bbox[2]-self.bbox[0]
          if span == 0:
              return -self.default_width * self.hscale
          return span * self.hscale

      def get_height(self):
          """Return the bounding-box height multiplied by vscale.

          When the box has zero height, ascent minus descent is used in
          its place.
          """
          span = self.bbox[3]-self.bbox[1]
          if span == 0:
              return (self.ascent - self.descent) * self.vscale
          return span * self.vscale

      def char_width(self, cid):
          """Return the width of ``cid`` multiplied by hscale.

          The width table is consulted under ``cid`` first, then under
          ``to_unichr(cid)``; if neither is present the default width is
          used.
          """
          try:
              width = self.widths[cid]
          except KeyError:
              try:
                  width = self.widths[self.to_unichr(cid)]
              except (KeyError, PDFUnicodeNotDefined):
                  width = self.default_width
          return width * self.hscale

      def char_disp(self, cid):
          """Return the displacement of ``cid`` (always 0 in this class)."""
          return 0

      def string_width(self, s):
          """Return the summed char_width() of every code decoded from ``s``."""
          total = 0
          for cid in self.decode(s):
              total += self.char_width(cid)
          return total
  466. # PDFSimpleFont
  class PDFSimpleFont(PDFFont):
      """Font whose character codes map to Unicode through an encoding.

      The encoding comes from the /Encoding entry of ``spec``: either the
      name of a built-in encoding or a dictionary describing differences
      from a base encoding.  A /ToUnicode stream, when present, takes
      precedence in to_unichr().
      """

      def __init__(self, descriptor, widths, spec):
          encoding = LITERAL_STANDARD_ENCODING
          if 'Encoding' in spec:
              encoding = resolve1(spec['Encoding'])
          if not isinstance(encoding, dict):
              # Plain name of a built-in encoding.
              self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
          else:
              # Base encoding plus a list of differences.
              base = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
              differences = list_value(encoding.get('Differences', None))
              self.cid2unicode = EncodingDB.get_encoding(base, differences)
          self.unicode_map = None
          if 'ToUnicode' in spec:
              tounicode = stream_value(spec['ToUnicode'])
              self.unicode_map = FileUnicodeMap()
              cmap_data = StringIO(tounicode.get_data())
              CMapParser(self.unicode_map, cmap_data).run()
          PDFFont.__init__(self, descriptor, widths)

      def to_unichr(self, cid):
          """Return the Unicode value for ``cid``.

          The /ToUnicode map is tried first, then the encoding table.
          Raises PDFUnicodeNotDefined when neither knows ``cid``.
          """
          umap = self.unicode_map
          if umap:
              try:
                  return umap.get_unichr(cid)
              except KeyError:
                  # fall through to the encoding table
                  pass
          try:
              return self.cid2unicode[cid]
          except KeyError:
              raise PDFUnicodeNotDefined(None, cid)
  499. # PDFType1Font
  class PDFType1Font(PDFSimpleFont):
      """Type 1 font built from a font dictionary ``spec``.

      Metrics come from FontMetricsDB when the base font name is known
      there; otherwise from the /FontDescriptor, /FirstChar and /Widths
      entries of ``spec``.
      """

      def __init__(self, rsrcmgr, spec):
          try:
              self.basefont = literal_name(spec['BaseFont'])
          except KeyError:
              if STRICT:
                  raise PDFFontError('BaseFont is missing')
              self.basefont = 'unknown'
          try:
              (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
          except KeyError:
              # No built-in metrics: use what the font dictionary provides.
              descriptor = dict_value(spec.get('FontDescriptor', {}))
              firstchar = int_value(spec.get('FirstChar', 0))
              lastchar = int_value(spec.get('LastChar', 255))
              width_list = list_value(spec.get('Widths', [0]*256))
              widths = dict((firstchar+index, width)
                            for (index, width) in enumerate(width_list))
          PDFSimpleFont.__init__(self, descriptor, widths, spec)
          if 'Encoding' not in spec and 'FontFile' in descriptor:
              # try to recover the missing encoding info from the font file.
              self.fontfile = stream_value(descriptor.get('FontFile'))
              header_length = int_value(self.fontfile['Length1'])
              header = self.fontfile.get_data()[:header_length]
              self.cid2unicode = Type1FontHeaderParser(StringIO(header)).get_encoding()

      def __repr__(self):
          basefont = self.basefont
          return '<PDFType1Font: basefont=%r>' % basefont
  527. # PDFTrueTypeFont
  class PDFTrueTypeFont(PDFType1Font):
      """TrueType font; handled exactly like PDFType1Font apart from repr()."""

      def __repr__(self):
          basefont = self.basefont
          return '<PDFTrueTypeFont: basefont=%r>' % basefont
  531. # PDFType3Font
  class PDFType3Font(PDFSimpleFont):
      """Type 3 font built from a font dictionary ``spec``.

      Ascent and descent are taken from the bounding box, and the scale
      factors are derived from the /FontMatrix entry.
      """

      def __init__(self, rsrcmgr, spec):
          firstchar = int_value(spec.get('FirstChar', 0))
          lastchar = int_value(spec.get('LastChar', 0))
          width_list = list_value(spec.get('Widths', [0]*256))
          widths = dict((firstchar+index, width)
                        for (index, width) in enumerate(width_list))
          if 'FontDescriptor' not in spec:
              # Synthesize a descriptor from the font's own bounding box.
              descriptor = {'Ascent': 0, 'Descent': 0,
                            'FontBBox': spec['FontBBox']}
          else:
              descriptor = dict_value(spec['FontDescriptor'])
          PDFSimpleFont.__init__(self, descriptor, widths, spec)
          self.matrix = tuple(list_value(spec.get('FontMatrix')))
          (_x0, y0, _x1, y1) = self.bbox
          self.descent = y0
          self.ascent = y1
          (hscale, vscale) = apply_matrix_norm(self.matrix, (1, 1))
          self.hscale = hscale
          self.vscale = vscale

      def __repr__(self):
          return '<PDFType3Font>'
  550. # PDFCIDFont
  class PDFCIDFont(PDFFont):
      """CID-keyed font built from a font dictionary ``spec``.

      The constructor resolves the CMap named by /Encoding, picks a
      Unicode map (a /ToUnicode stream, the embedded TrueType 'cmap', or
      one looked up by the font's registry-ordering string) and expands
      the /W or /W2 width arrays, depending on the writing mode.
      """

      def __init__(self, rsrcmgr, spec):
          try:
              self.basefont = literal_name(spec['BaseFont'])
          except KeyError:
              if STRICT:
                  raise PDFFontError('BaseFont is missing')
              self.basefont = 'unknown'
          self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
          registry = self.cidsysteminfo.get('Registry', 'unknown')
          ordering = self.cidsysteminfo.get('Ordering', 'unknown')
          self.cidcoding = '%s-%s' % (registry, ordering)
          try:
              name = literal_name(spec['Encoding'])
          except KeyError:
              if STRICT:
                  raise PDFFontError('Encoding is unspecified')
              name = 'unknown'
          try:
              self.cmap = CMapDB.get_cmap(name)
          except CMapDB.CMapNotFound as e:
              if STRICT:
                  raise PDFFontError(e)
              self.cmap = CMap()
          try:
              descriptor = dict_value(spec['FontDescriptor'])
          except KeyError:
              if STRICT:
                  raise PDFFontError('FontDescriptor is missing')
              descriptor = {}
          ttf = None
          if 'FontFile2' in descriptor:
              self.fontfile = stream_value(descriptor.get('FontFile2'))
              fontdata = StringIO(self.fontfile.get_data())
              ttf = TrueTypeFont(self.basefont, fontdata)
          self.unicode_map = None
          if 'ToUnicode' in spec:
              tounicode = stream_value(spec['ToUnicode'])
              self.unicode_map = FileUnicodeMap()
              CMapParser(self.unicode_map, StringIO(tounicode.get_data())).run()
          elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
              if ttf:
                  try:
                      self.unicode_map = ttf.create_unicode_map()
                  except TrueTypeFont.CMapNotFound:
                      pass
          else:
              try:
                  self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
              except CMapDB.CMapNotFound:
                  pass

          self.vertical = self.cmap.is_vertical()
          if not self.vertical:
              # writing mode: horizontal
              self.disps = {}
              self.default_disp = 0
              widths = get_widths(list_value(spec.get('W', [])))
              default_width = spec.get('DW', 1000)
          else:
              # writing mode: vertical
              metrics = get_widths2(list_value(spec.get('W2', [])))
              self.disps = {}
              widths = {}
              for (cid, (advance, (vx, vy))) in metrics.items():
                  self.disps[cid] = (vx, vy)
                  widths[cid] = advance
              (default_vy, default_width) = spec.get('DW2', [880, -1000])
              self.default_disp = (None, default_vy)
          PDFFont.__init__(self, descriptor, widths, default_width=default_width)

      def __repr__(self):
          details = (self.basefont, self.cidcoding)
          return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % details

      def is_vertical(self):
          """Tell whether the font's CMap selects vertical writing."""
          return self.vertical

      def is_multibyte(self):
          """CID fonts always use multi-byte character codes."""
          return True

      def decode(self, bytes):
          """Decode a string into character ids using the font's CMap."""
          return self.cmap.decode(bytes)

      def char_disp(self, cid):
          """Return the displacement stored for ``cid`` or the default one.

          The value is an integer for horizontal fonts and a tuple for
          vertical fonts.
          """
          return self.disps.get(cid, self.default_disp)

      def to_unichr(self, cid):
          """Return the Unicode value for ``cid``.

          Raises PDFUnicodeNotDefined when the font has no Unicode map or
          the map does not know ``cid``.
          """
          if self.unicode_map:
              try:
                  return self.unicode_map.get_unichr(cid)
              except KeyError:
                  pass
          raise PDFUnicodeNotDefined(self.cidcoding, cid)
  636. # main
  def main(argv):
      """Parse each file named in ``argv[1:]`` as a CFF font and print it.

      ``argv[0]`` (the program name) is ignored.  Returns None, which
      sys.exit() turns into exit status 0.
      """
      for fname in argv[1:]:
          # The context manager closes the file even when parsing fails.
          with open(fname, 'rb') as fp:
              #font = TrueTypeFont(fname, fp)
              font = CFFFont(fname, fp)
              print(font)
      return


  if __name__ == '__main__':
      sys.exit(main(sys.argv))