| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726 |
- #!/usr/bin/env python
- import sys
- import struct
- try:
- from cStringIO import StringIO
- except ImportError:
- from StringIO import StringIO
- from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
- from encodingdb import EncodingDB, name2unicode
- from psparser import PSStackParser
- from psparser import PSEOF
- from psparser import LIT, KWD, STRICT
- from psparser import PSLiteral, literal_name
- from pdftypes import PDFException, resolve1
- from pdftypes import int_value, num_value
- from pdftypes import list_value, dict_value, stream_value
- from fontmetrics import FONT_METRICS
- from utils import apply_matrix_norm, nunpack, choplist, isnumber
- def get_widths(seq):
- widths = {}
- r = []
- for v in seq:
- if isinstance(v, list):
- if r:
- char1 = r[-1]
- for (i, w) in enumerate(v):
- widths[char1+i] = w
- r = []
- elif isnumber(v):
- r.append(v)
- if len(r) == 3:
- (char1, char2, w) = r
- for i in xrange(char1, char2+1):
- widths[i] = w
- r = []
- return widths
- #assert get_widths([1]) == {}
- #assert get_widths([1,2,3]) == {1:3, 2:3}
- #assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
- def get_widths2(seq):
- widths = {}
- r = []
- for v in seq:
- if isinstance(v, list):
- if r:
- char1 = r[-1]
- for (i, (w, vx, vy)) in enumerate(choplist(3, v)):
- widths[char1+i] = (w, (vx, vy))
- r = []
- elif isnumber(v):
- r.append(v)
- if len(r) == 5:
- (char1, char2, w, vx, vy) = r
- for i in xrange(char1, char2+1):
- widths[i] = (w, (vx, vy))
- r = []
- return widths
- #assert get_widths2([1]) == {}
- #assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
- #assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
- ## FontMetricsDB
- ##
- class FontMetricsDB(object):
- @classmethod
- def get_metrics(klass, fontname):
- return FONT_METRICS[fontname]
- ## Type1FontHeaderParser
- ##
- class Type1FontHeaderParser(PSStackParser):
- KEYWORD_BEGIN = KWD('begin')
- KEYWORD_END = KWD('end')
- KEYWORD_DEF = KWD('def')
- KEYWORD_PUT = KWD('put')
- KEYWORD_DICT = KWD('dict')
- KEYWORD_ARRAY = KWD('array')
- KEYWORD_READONLY = KWD('readonly')
- KEYWORD_FOR = KWD('for')
- KEYWORD_FOR = KWD('for')
- def __init__(self, data):
- PSStackParser.__init__(self, data)
- self._cid2unicode = {}
- return
- def get_encoding(self):
- while 1:
- try:
- (cid, name) = self.nextobject()
- except PSEOF:
- break
- try:
- self._cid2unicode[cid] = name2unicode(name)
- except KeyError:
- pass
- return self._cid2unicode
- def do_keyword(self, pos, token):
- if token is self.KEYWORD_PUT:
- ((_, key), (_, value)) = self.pop(2)
- if (isinstance(key, int) and
- isinstance(value, PSLiteral)):
- self.add_results((key, literal_name(value)))
- return
- NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
- ## CFFFont
- ## (Format specified in Adobe Technical Note: #5176
- ## "The Compact Font Format Specification")
- ##
- def getdict(data):
- d = {}
- fp = StringIO(data)
- stack = []
- while 1:
- c = fp.read(1)
- if not c:
- break
- b0 = ord(c)
- if b0 <= 21:
- d[b0] = stack
- stack = []
- continue
- if b0 == 30:
- s = ''
- loop = True
- while loop:
- b = ord(fp.read(1))
- for n in (b >> 4, b & 15):
- if n == 15:
- loop = False
- else:
- s += NIBBLES[n]
- value = float(s)
- elif 32 <= b0 and b0 <= 246:
- value = b0-139
- else:
- b1 = ord(fp.read(1))
- if 247 <= b0 and b0 <= 250:
- value = ((b0-247) << 8)+b1+108
- elif 251 <= b0 and b0 <= 254:
- value = -((b0-251) << 8)-b1-108
- else:
- b2 = ord(fp.read(1))
- if 128 <= b1:
- b1 -= 256
- if b0 == 28:
- value = b1 << 8 | b2
- else:
- value = b1 << 24 | b2 << 16 | struct.unpack('>H', fp.read(2))[0]
- stack.append(value)
- return d
- class CFFFont(object):
- STANDARD_STRINGS = (
- '.notdef', 'space', 'exclam', 'quotedbl', 'numbersign',
- 'dollar', 'percent', 'ampersand', 'quoteright', 'parenleft',
- 'parenright', 'asterisk', 'plus', 'comma', 'hyphen', 'period',
- 'slash', 'zero', 'one', 'two', 'three', 'four', 'five', 'six',
- 'seven', 'eight', 'nine', 'colon', 'semicolon', 'less', 'equal',
- 'greater', 'question', 'at', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
- 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
- 'U', 'V', 'W', 'X', 'Y', 'Z', 'bracketleft', 'backslash',
- 'bracketright', 'asciicircum', 'underscore', 'quoteleft', 'a',
- 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
- 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
- 'braceleft', 'bar', 'braceright', 'asciitilde', 'exclamdown',
- 'cent', 'sterling', 'fraction', 'yen', 'florin', 'section',
- 'currency', 'quotesingle', 'quotedblleft', 'guillemotleft',
- 'guilsinglleft', 'guilsinglright', 'fi', 'fl', 'endash',
- 'dagger', 'daggerdbl', 'periodcentered', 'paragraph', 'bullet',
- 'quotesinglbase', 'quotedblbase', 'quotedblright',
- 'guillemotright', 'ellipsis', 'perthousand', 'questiondown',
- 'grave', 'acute', 'circumflex', 'tilde', 'macron', 'breve',
- 'dotaccent', 'dieresis', 'ring', 'cedilla', 'hungarumlaut',
- 'ogonek', 'caron', 'emdash', 'AE', 'ordfeminine', 'Lslash',
- 'Oslash', 'OE', 'ordmasculine', 'ae', 'dotlessi', 'lslash',
- 'oslash', 'oe', 'germandbls', 'onesuperior', 'logicalnot', 'mu',
- 'trademark', 'Eth', 'onehalf', 'plusminus', 'Thorn',
- 'onequarter', 'divide', 'brokenbar', 'degree', 'thorn',
- 'threequarters', 'twosuperior', 'registered', 'minus', 'eth',
- 'multiply', 'threesuperior', 'copyright', 'Aacute',
- 'Acircumflex', 'Adieresis', 'Agrave', 'Aring', 'Atilde',
- 'Ccedilla', 'Eacute', 'Ecircumflex', 'Edieresis', 'Egrave',
- 'Iacute', 'Icircumflex', 'Idieresis', 'Igrave', 'Ntilde',
- 'Oacute', 'Ocircumflex', 'Odieresis', 'Ograve', 'Otilde',
- 'Scaron', 'Uacute', 'Ucircumflex', 'Udieresis', 'Ugrave',
- 'Yacute', 'Ydieresis', 'Zcaron', 'aacute', 'acircumflex',
- 'adieresis', 'agrave', 'aring', 'atilde', 'ccedilla', 'eacute',
- 'ecircumflex', 'edieresis', 'egrave', 'iacute', 'icircumflex',
- 'idieresis', 'igrave', 'ntilde', 'oacute', 'ocircumflex',
- 'odieresis', 'ograve', 'otilde', 'scaron', 'uacute',
- 'ucircumflex', 'udieresis', 'ugrave', 'yacute', 'ydieresis',
- 'zcaron', 'exclamsmall', 'Hungarumlautsmall', 'dollaroldstyle',
- 'dollarsuperior', 'ampersandsmall', 'Acutesmall',
- 'parenleftsuperior', 'parenrightsuperior', 'twodotenleader',
- 'onedotenleader', 'zerooldstyle', 'oneoldstyle', 'twooldstyle',
- 'threeoldstyle', 'fouroldstyle', 'fiveoldstyle', 'sixoldstyle',
- 'sevenoldstyle', 'eightoldstyle', 'nineoldstyle',
- 'commasuperior', 'threequartersemdash', 'periodsuperior',
- 'questionsmall', 'asuperior', 'bsuperior', 'centsuperior',
- 'dsuperior', 'esuperior', 'isuperior', 'lsuperior', 'msuperior',
- 'nsuperior', 'osuperior', 'rsuperior', 'ssuperior', 'tsuperior',
- 'ff', 'ffi', 'ffl', 'parenleftinferior', 'parenrightinferior',
- 'Circumflexsmall', 'hyphensuperior', 'Gravesmall', 'Asmall',
- 'Bsmall', 'Csmall', 'Dsmall', 'Esmall', 'Fsmall', 'Gsmall',
- 'Hsmall', 'Ismall', 'Jsmall', 'Ksmall', 'Lsmall', 'Msmall',
- 'Nsmall', 'Osmall', 'Psmall', 'Qsmall', 'Rsmall', 'Ssmall',
- 'Tsmall', 'Usmall', 'Vsmall', 'Wsmall', 'Xsmall', 'Ysmall',
- 'Zsmall', 'colonmonetary', 'onefitted', 'rupiah', 'Tildesmall',
- 'exclamdownsmall', 'centoldstyle', 'Lslashsmall', 'Scaronsmall',
- 'Zcaronsmall', 'Dieresissmall', 'Brevesmall', 'Caronsmall',
- 'Dotaccentsmall', 'Macronsmall', 'figuredash', 'hypheninferior',
- 'Ogoneksmall', 'Ringsmall', 'Cedillasmall', 'questiondownsmall',
- 'oneeighth', 'threeeighths', 'fiveeighths', 'seveneighths',
- 'onethird', 'twothirds', 'zerosuperior', 'foursuperior',
- 'fivesuperior', 'sixsuperior', 'sevensuperior', 'eightsuperior',
- 'ninesuperior', 'zeroinferior', 'oneinferior', 'twoinferior',
- 'threeinferior', 'fourinferior', 'fiveinferior', 'sixinferior',
- 'seveninferior', 'eightinferior', 'nineinferior',
- 'centinferior', 'dollarinferior', 'periodinferior',
- 'commainferior', 'Agravesmall', 'Aacutesmall',
- 'Acircumflexsmall', 'Atildesmall', 'Adieresissmall',
- 'Aringsmall', 'AEsmall', 'Ccedillasmall', 'Egravesmall',
- 'Eacutesmall', 'Ecircumflexsmall', 'Edieresissmall',
- 'Igravesmall', 'Iacutesmall', 'Icircumflexsmall',
- 'Idieresissmall', 'Ethsmall', 'Ntildesmall', 'Ogravesmall',
- 'Oacutesmall', 'Ocircumflexsmall', 'Otildesmall',
- 'Odieresissmall', 'OEsmall', 'Oslashsmall', 'Ugravesmall',
- 'Uacutesmall', 'Ucircumflexsmall', 'Udieresissmall',
- 'Yacutesmall', 'Thornsmall', 'Ydieresissmall', '001.000',
- '001.001', '001.002', '001.003', 'Black', 'Bold', 'Book',
- 'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
- )
- class INDEX(object):
- def __init__(self, fp):
- self.fp = fp
- self.offsets = []
- (count, offsize) = struct.unpack('>HB', self.fp.read(3))
- for i in xrange(count+1):
- self.offsets.append(nunpack(self.fp.read(offsize)))
- self.base = self.fp.tell()-1
- self.fp.seek(self.base+self.offsets[-1])
- return
- def __repr__(self):
- return '<INDEX: size=%d>' % len(self)
- def __len__(self):
- return len(self.offsets)-1
- def __getitem__(self, i):
- self.fp.seek(self.base+self.offsets[i])
- return self.fp.read(self.offsets[i+1]-self.offsets[i])
- def __iter__(self):
- return iter(self[i] for i in xrange(len(self)))
- def __init__(self, name, fp):
- self.name = name
- self.fp = fp
- # Header
- (_major, _minor, hdrsize, offsize) = struct.unpack('BBBB', self.fp.read(4))
- self.fp.read(hdrsize-4)
- # Name INDEX
- self.name_index = self.INDEX(self.fp)
- # Top DICT INDEX
- self.dict_index = self.INDEX(self.fp)
- # String INDEX
- self.string_index = self.INDEX(self.fp)
- # Global Subr INDEX
- self.subr_index = self.INDEX(self.fp)
- # Top DICT DATA
- self.top_dict = getdict(self.dict_index[0])
- (charset_pos,) = self.top_dict.get(15, [0])
- (encoding_pos,) = self.top_dict.get(16, [0])
- (charstring_pos,) = self.top_dict.get(17, [0])
- # CharStrings
- self.fp.seek(charstring_pos)
- self.charstring = self.INDEX(self.fp)
- self.nglyphs = len(self.charstring)
- # Encodings
- self.code2gid = {}
- self.gid2code = {}
- self.fp.seek(encoding_pos)
- format = self.fp.read(1)
- if format == '\x00':
- # Format 0
- (n,) = struct.unpack('B', self.fp.read(1))
- for (code, gid) in enumerate(struct.unpack('B'*n, self.fp.read(n))):
- self.code2gid[code] = gid
- self.gid2code[gid] = code
- elif format == '\x01':
- # Format 1
- (n,) = struct.unpack('B', self.fp.read(1))
- code = 0
- for i in xrange(n):
- (first, nleft) = struct.unpack('BB', self.fp.read(2))
- for gid in xrange(first, first+nleft+1):
- self.code2gid[code] = gid
- self.gid2code[gid] = code
- code += 1
- else:
- raise ValueError('unsupported encoding format: %r' % format)
- # Charsets
- self.name2gid = {}
- self.gid2name = {}
- self.fp.seek(charset_pos)
- format = self.fp.read(1)
- if format == '\x00':
- # Format 0
- n = self.nglyphs-1
- for (gid, sid) in enumerate(struct.unpack('>'+'H'*n, self.fp.read(2*n))):
- gid += 1
- name = self.getstr(sid)
- self.name2gid[name] = gid
- self.gid2name[gid] = name
- elif format == '\x01':
- # Format 1
- (n,) = struct.unpack('B', self.fp.read(1))
- sid = 0
- for i in xrange(n):
- (first, nleft) = struct.unpack('BB', self.fp.read(2))
- for gid in xrange(first, first+nleft+1):
- name = self.getstr(sid)
- self.name2gid[name] = gid
- self.gid2name[gid] = name
- sid += 1
- elif format == '\x02':
- # Format 2
- assert 0
- else:
- raise ValueError('unsupported charset format: %r' % format)
- #print self.code2gid
- #print self.name2gid
- #assert 0
- return
- def getstr(self, sid):
- if sid < len(self.STANDARD_STRINGS):
- return self.STANDARD_STRINGS[sid]
- return self.string_index[sid-len(self.STANDARD_STRINGS)]
- ## TrueTypeFont
- ##
- class TrueTypeFont(object):
- class CMapNotFound(Exception):
- pass
- def __init__(self, name, fp):
- self.name = name
- self.fp = fp
- self.tables = {}
- self.fonttype = fp.read(4)
- (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
- for _ in xrange(ntables):
- (name, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16))
- self.tables[name] = (offset, length)
- return
- def create_unicode_map(self):
- if 'cmap' not in self.tables:
- raise TrueTypeFont.CMapNotFound
- (base_offset, length) = self.tables['cmap']
- fp = self.fp
- fp.seek(base_offset)
- (version, nsubtables) = struct.unpack('>HH', fp.read(4))
- subtables = []
- for i in xrange(nsubtables):
- subtables.append(struct.unpack('>HHL', fp.read(8)))
- char2gid = {}
- # Only supports subtable type 0, 2 and 4.
- for (_1, _2, st_offset) in subtables:
- fp.seek(base_offset+st_offset)
- (fmttype, fmtlen, fmtlang) = struct.unpack('>HHH', fp.read(6))
- if fmttype == 0:
- char2gid.update(enumerate(struct.unpack('>256B', fp.read(256))))
- elif fmttype == 2:
- subheaderkeys = struct.unpack('>256H', fp.read(512))
- firstbytes = [0]*8192
- for (i, k) in enumerate(subheaderkeys):
- firstbytes[k//8] = i
- nhdrs = max(subheaderkeys)//8 + 1
- hdrs = []
- for i in xrange(nhdrs):
- (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
- hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
- for (i, firstcode, entcount, delta, pos) in hdrs:
- if not entcount:
- continue
- first = firstcode + (firstbytes[i] << 8)
- fp.seek(pos)
- for c in xrange(entcount):
- gid = struct.unpack('>H', fp.read(2))
- if gid:
- gid += delta
- char2gid[first+c] = gid
- elif fmttype == 4:
- (segcount, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
- segcount //= 2
- ecs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
- fp.read(2)
- scs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
- idds = struct.unpack('>%dh' % segcount, fp.read(2*segcount))
- pos = fp.tell()
- idrs = struct.unpack('>%dH' % segcount, fp.read(2*segcount))
- for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
- if idr:
- fp.seek(pos+idr)
- for c in xrange(sc, ec+1):
- char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff
- else:
- for c in xrange(sc, ec+1):
- char2gid[c] = (c + idd) & 0xffff
- else:
- assert 0
- # create unicode map
- unicode_map = FileUnicodeMap()
- for (char, gid) in char2gid.iteritems():
- unicode_map.add_cid2unichr(gid, char)
- return unicode_map
- ## Fonts
- ##
- class PDFFontError(PDFException):
- pass
- class PDFUnicodeNotDefined(PDFFontError):
- pass
- LITERAL_STANDARD_ENCODING = LIT('StandardEncoding')
- LITERAL_TYPE1C = LIT('Type1C')
- # PDFFont
- class PDFFont(object):
- def __init__(self, descriptor, widths, default_width=None):
- self.descriptor = descriptor
- self.widths = widths
- self.fontname = resolve1(descriptor.get('FontName', 'unknown'))
- if isinstance(self.fontname, PSLiteral):
- self.fontname = literal_name(self.fontname)
- self.flags = int_value(descriptor.get('Flags', 0))
- self.ascent = num_value(descriptor.get('Ascent', 0))
- self.descent = num_value(descriptor.get('Descent', 0))
- self.italic_angle = num_value(descriptor.get('ItalicAngle', 0))
- self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
- self.leading = num_value(descriptor.get('Leading', 0))
- self.bbox = list_value(descriptor.get('FontBBox', (0, 0, 0, 0)))
- self.hscale = self.vscale = .001
- return
- def __repr__(self):
- return '<PDFFont>'
- def is_vertical(self):
- return False
- def is_multibyte(self):
- return False
- def decode(self, bytes):
- return map(ord, bytes)
- def get_ascent(self):
- return self.ascent * self.vscale
- def get_descent(self):
- return self.descent * self.vscale
- def get_width(self):
- w = self.bbox[2]-self.bbox[0]
- if w == 0:
- w = -self.default_width
- return w * self.hscale
- def get_height(self):
- h = self.bbox[3]-self.bbox[1]
- if h == 0:
- h = self.ascent - self.descent
- return h * self.vscale
- def char_width(self, cid):
- try:
- return self.widths[cid] * self.hscale
- except KeyError:
- try:
- return self.widths[self.to_unichr(cid)] * self.hscale
- except (KeyError, PDFUnicodeNotDefined):
- return self.default_width * self.hscale
- def char_disp(self, cid):
- return 0
- def string_width(self, s):
- return sum(self.char_width(cid) for cid in self.decode(s))
- # PDFSimpleFont
- class PDFSimpleFont(PDFFont):
- def __init__(self, descriptor, widths, spec):
- # Font encoding is specified either by a name of
- # built-in encoding or a dictionary that describes
- # the differences.
- if 'Encoding' in spec:
- encoding = resolve1(spec['Encoding'])
- else:
- encoding = LITERAL_STANDARD_ENCODING
- if isinstance(encoding, dict):
- name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
- diff = list_value(encoding.get('Differences', None))
- self.cid2unicode = EncodingDB.get_encoding(name, diff)
- else:
- self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
- self.unicode_map = None
- if 'ToUnicode' in spec:
- strm = stream_value(spec['ToUnicode'])
- self.unicode_map = FileUnicodeMap()
- CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
- PDFFont.__init__(self, descriptor, widths)
- return
- def to_unichr(self, cid):
- if self.unicode_map:
- try:
- return self.unicode_map.get_unichr(cid)
- except KeyError:
- pass
- try:
- return self.cid2unicode[cid]
- except KeyError:
- raise PDFUnicodeNotDefined(None, cid)
- # PDFType1Font
- class PDFType1Font(PDFSimpleFont):
- def __init__(self, rsrcmgr, spec):
- try:
- self.basefont = literal_name(spec['BaseFont'])
- except KeyError:
- if STRICT:
- raise PDFFontError('BaseFont is missing')
- self.basefont = 'unknown'
- try:
- (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
- except KeyError:
- descriptor = dict_value(spec.get('FontDescriptor', {}))
- firstchar = int_value(spec.get('FirstChar', 0))
- lastchar = int_value(spec.get('LastChar', 255))
- widths = list_value(spec.get('Widths', [0]*256))
- widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
- PDFSimpleFont.__init__(self, descriptor, widths, spec)
- if 'Encoding' not in spec and 'FontFile' in descriptor:
- # try to recover the missing encoding info from the font file.
- self.fontfile = stream_value(descriptor.get('FontFile'))
- length1 = int_value(self.fontfile['Length1'])
- data = self.fontfile.get_data()[:length1]
- parser = Type1FontHeaderParser(StringIO(data))
- self.cid2unicode = parser.get_encoding()
- return
- def __repr__(self):
- return '<PDFType1Font: basefont=%r>' % self.basefont
- # PDFTrueTypeFont
- class PDFTrueTypeFont(PDFType1Font):
- def __repr__(self):
- return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
- # PDFType3Font
- class PDFType3Font(PDFSimpleFont):
- def __init__(self, rsrcmgr, spec):
- firstchar = int_value(spec.get('FirstChar', 0))
- lastchar = int_value(spec.get('LastChar', 0))
- widths = list_value(spec.get('Widths', [0]*256))
- widths = dict((i+firstchar, w) for (i, w) in enumerate(widths))
- if 'FontDescriptor' in spec:
- descriptor = dict_value(spec['FontDescriptor'])
- else:
- descriptor = {'Ascent': 0, 'Descent': 0,
- 'FontBBox': spec['FontBBox']}
- PDFSimpleFont.__init__(self, descriptor, widths, spec)
- self.matrix = tuple(list_value(spec.get('FontMatrix')))
- (_, self.descent, _, self.ascent) = self.bbox
- (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
- return
- def __repr__(self):
- return '<PDFType3Font>'
- # PDFCIDFont
- class PDFCIDFont(PDFFont):
- def __init__(self, rsrcmgr, spec):
- try:
- self.basefont = literal_name(spec['BaseFont'])
- except KeyError:
- if STRICT:
- raise PDFFontError('BaseFont is missing')
- self.basefont = 'unknown'
- self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
- self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
- self.cidsysteminfo.get('Ordering', 'unknown'))
- try:
- name = literal_name(spec['Encoding'])
- except KeyError:
- if STRICT:
- raise PDFFontError('Encoding is unspecified')
- name = 'unknown'
- try:
- self.cmap = CMapDB.get_cmap(name)
- except CMapDB.CMapNotFound, e:
- if STRICT:
- raise PDFFontError(e)
- self.cmap = CMap()
- try:
- descriptor = dict_value(spec['FontDescriptor'])
- except KeyError:
- if STRICT:
- raise PDFFontError('FontDescriptor is missing')
- descriptor = {}
- ttf = None
- if 'FontFile2' in descriptor:
- self.fontfile = stream_value(descriptor.get('FontFile2'))
- ttf = TrueTypeFont(self.basefont,
- StringIO(self.fontfile.get_data()))
- self.unicode_map = None
- if 'ToUnicode' in spec:
- strm = stream_value(spec['ToUnicode'])
- self.unicode_map = FileUnicodeMap()
- CMapParser(self.unicode_map, StringIO(strm.get_data())).run()
- elif self.cidcoding in ('Adobe-Identity', 'Adobe-UCS'):
- if ttf:
- try:
- self.unicode_map = ttf.create_unicode_map()
- except TrueTypeFont.CMapNotFound:
- pass
- else:
- try:
- self.unicode_map = CMapDB.get_unicode_map(self.cidcoding, self.cmap.is_vertical())
- except CMapDB.CMapNotFound, e:
- pass
- self.vertical = self.cmap.is_vertical()
- if self.vertical:
- # writing mode: vertical
- widths = get_widths2(list_value(spec.get('W2', [])))
- self.disps = dict((cid, (vx, vy)) for (cid, (_, (vx, vy))) in widths.iteritems())
- (vy, w) = spec.get('DW2', [880, -1000])
- self.default_disp = (None, vy)
- widths = dict((cid, w) for (cid, (w, _)) in widths.iteritems())
- default_width = w
- else:
- # writing mode: horizontal
- self.disps = {}
- self.default_disp = 0
- widths = get_widths(list_value(spec.get('W', [])))
- default_width = spec.get('DW', 1000)
- PDFFont.__init__(self, descriptor, widths, default_width=default_width)
- return
- def __repr__(self):
- return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
- def is_vertical(self):
- return self.vertical
- def is_multibyte(self):
- return True
- def decode(self, bytes):
- return self.cmap.decode(bytes)
- def char_disp(self, cid):
- "Returns an integer for horizontal fonts, a tuple for vertical fonts."
- return self.disps.get(cid, self.default_disp)
- def to_unichr(self, cid):
- try:
- if not self.unicode_map:
- raise KeyError(cid)
- return self.unicode_map.get_unichr(cid)
- except KeyError:
- raise PDFUnicodeNotDefined(self.cidcoding, cid)
- # main
- def main(argv):
- for fname in argv[1:]:
- fp = file(fname, 'rb')
- #font = TrueTypeFont(fname, fp)
- font = CFFFont(fname, fp)
- print font
- fp.close()
- return
- if __name__ == '__main__':
- sys.exit(main(sys.argv))
|