| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429 |
- #!/usr/bin/env python
- """ Adobe character mapping (CMap) support.
- CMaps provide the mappings between character codes, character ids
- (CIDs), and Unicode code points.
- More information is available on the Adobe website:
- http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
- """
- import sys
- import os
- import os.path
- import gzip
- import cPickle as pickle
- import struct
- from psparser import PSStackParser
- from psparser import PSSyntaxError, PSEOF
- from psparser import PSLiteral
- from psparser import literal_name
- from encodingdb import name2unicode
- from utils import choplist, nunpack
class CMapError(Exception):
    """Base class for CMap-related errors defined in this module."""
- ## CMap
- ##
class CMap(object):
    """Mapping from character codes to character ids (CIDs).

    ``code2cid`` is a trie made of nested dicts.  Each level is keyed by
    the ordinal of one element of the code; an ``int`` value is the CID
    for the sequence leading to it, a ``dict`` value holds the
    continuations of a longer code.
    """

    debug = 0

    def __init__(self, code2cid=None):
        """Create a map backed by *code2cid* (a fresh dict when falsy)."""
        self.code2cid = code2cid or {}
        return

    def is_vertical(self):
        """Return False; subclasses override this to report vertical mode."""
        return False

    def use_cmap(self, cmap):
        """Copy every entry of *cmap* into this map.

        Nested tables are duplicated rather than shared, so later
        changes to this map do not affect *cmap*.  An entry already
        present here under the same key is replaced, not merged.
        """
        assert isinstance(cmap, CMap)

        def copy(dst, src):
            for (k, v) in src.items():
                if isinstance(v, dict):
                    d = {}
                    dst[k] = d
                    copy(d, v)
                else:
                    dst[k] = v
        copy(self.code2cid, cmap.code2cid)
        return

    def decode(self, code):
        """Yield the CID of each complete code found in *code*.

        *code* may be a sequence of 1-character strings or of integers
        (as produced by iterating ``bytes`` on Python 3 or a
        ``bytearray``).  An element with no entry at the current trie
        level resets the walk to the root and is otherwise ignored.
        """
        if self.debug:
            sys.stderr.write('decode: %r, %r\n' % (self, code))
        d = self.code2cid
        for c in code:
            if not isinstance(c, int):
                c = ord(c)
            if c in d:
                d = d[c]
                if isinstance(d, int):
                    yield d
                    d = self.code2cid
            else:
                d = self.code2cid
        return

    def dump(self, out=None, code2cid=None, code=None):
        """Write one ``code (...) = cid N`` line per entry to *out*.

        *out* defaults to the current ``sys.stdout``, looked up at call
        time so a redirected stdout is honoured.  *code2cid* and *code*
        select a sub-table and the prefix leading to it; they are used
        by the recursion and default to the whole map with an empty
        prefix.
        """
        if out is None:
            out = sys.stdout
        if code2cid is None:
            code2cid = self.code2cid
            code = ()
        elif code is None:
            # A sub-table passed without a prefix starts from an empty one.
            code = ()
        for (k, v) in sorted(code2cid.items()):
            c = code+(k,)
            if isinstance(v, int):
                out.write('code %r = cid %d\n' % (c, v))
            else:
                self.dump(out=out, code2cid=v, code=c)
        return
- ## IdentityCMap
- ##
class IdentityCMap(object):
    """CMap whose CIDs are the 16-bit big-endian values of the input itself."""

    def __init__(self, vertical):
        """Remember whether this identity map is for vertical writing."""
        self.vertical = vertical
        return

    def is_vertical(self):
        """Return the *vertical* flag given at construction."""
        return self.vertical

    def decode(self, code):
        """Return *code* unpacked as a tuple of big-endian unsigned shorts.

        Only whole 2-byte units are decoded.  A trailing odd byte is
        ignored, and input shorter than two bytes yields ``()``.
        """
        n = len(code)//2
        if n:
            # Slice to exactly 2*n bytes: struct.unpack requires the buffer
            # size to match the format, so an odd trailing byte would raise.
            return struct.unpack('>%dH' % n, code[:n*2])
        else:
            return ()
- ## UnicodeMap
- ##
class UnicodeMap(object):
    """Mapping from character ids (CIDs) to unicode strings."""

    debug = 0

    def __init__(self, cid2unichr=None):
        """Create a map backed by *cid2unichr* (a fresh dict when falsy)."""
        self.cid2unichr = cid2unichr or {}
        return

    def get_unichr(self, cid):
        """Return the string stored for *cid*; raise KeyError if unmapped."""
        if self.debug:
            sys.stderr.write('get_unichr: %r, %r\n' % (self, cid))
        return self.cid2unichr[cid]

    def dump(self, out=None):
        """Write one ``cid N = unicode ...`` line per entry, sorted by CID.

        *out* defaults to the current ``sys.stdout``, looked up at call
        time so a redirected stdout is honoured.
        """
        if out is None:
            out = sys.stdout
        for (k, v) in sorted(self.cid2unichr.items()):
            out.write('cid %d = unicode %r\n' % (k, v))
        return
- ## FileCMap
- ##
class FileCMap(CMap):
    """CMap that is filled in entry by entry and carries named attributes."""

    def __init__(self):
        """Start with an empty code table and no attributes."""
        CMap.__init__(self)
        self.attrs = {}
        return

    def __repr__(self):
        cmap_name = self.attrs.get('CMapName')
        return '<CMap: %s>' % cmap_name

    def is_vertical(self):
        """Return True when the 'WMode' attribute is set and non-zero."""
        wmode = self.attrs.get('WMode', 0)
        return wmode != 0

    def set_attr(self, k, v):
        """Record attribute *k* with value *v*."""
        self.attrs[k] = v
        return

    def add_code2cid(self, code, cid):
        """Register *cid* for the character sequence *code*.

        Every character but the last selects (creating it when missing)
        a nested table; the last character keys the CID itself.
        """
        assert isinstance(code, str) and isinstance(cid, int)
        node = self.code2cid
        for ch in code[:-1]:
            # Descend into the sub-table for this character, adding an
            # empty one when the prefix has not been seen before.
            node = node.setdefault(ord(ch), {})
        node[ord(code[-1])] = cid
        return
- ## FileUnicodeMap
- ##
class FileUnicodeMap(UnicodeMap):
    """UnicodeMap that is filled in entry by entry and carries attributes."""

    def __init__(self):
        """Start with an empty CID table and no attributes."""
        UnicodeMap.__init__(self)
        self.attrs = {}
        return

    def __repr__(self):
        cmap_name = self.attrs.get('CMapName')
        return '<UnicodeMap: %s>' % cmap_name

    def set_attr(self, k, v):
        """Record attribute *k* with value *v*."""
        self.attrs[k] = v
        return

    def add_cid2unichr(self, cid, code):
        """Store the unicode text for *cid*, derived from *code*.

        *code* may be a PSLiteral (looked up as an Adobe glyph name), a
        byte string (decoded as UTF-16BE, ignoring undecodable bytes) or
        an int (taken as a code point).  Any other type raises TypeError.
        """
        assert isinstance(cid, int)
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
            text = name2unicode(code.name)
        elif isinstance(code, str):
            # Interpret as UTF-16BE.
            text = unicode(code, 'UTF-16BE', 'ignore')
        elif isinstance(code, int):
            text = unichr(code)
        else:
            raise TypeError(code)
        self.cid2unichr[cid] = text
        return
- ## PyCMap
- ##
class PyCMap(CMap):
    """CMap backed by the CODE2CID table of a preloaded data object."""

    def __init__(self, name, module):
        """Wrap *module*, which must expose CODE2CID and IS_VERTICAL."""
        CMap.__init__(self, module.CODE2CID)
        self._is_vertical = module.IS_VERTICAL
        self.name = name
        return

    def __repr__(self):
        return '<PyCMap: %s>' % self.name

    def is_vertical(self):
        """Return the IS_VERTICAL flag read from the data object."""
        return self._is_vertical
- ## PyUnicodeMap
- ##
class PyUnicodeMap(UnicodeMap):
    """UnicodeMap backed by a CID2UNICHR table of a preloaded data object."""

    def __init__(self, name, module, vertical):
        """Use CID2UNICHR_V of *module* when *vertical*, else CID2UNICHR_H."""
        table = module.CID2UNICHR_V if vertical else module.CID2UNICHR_H
        UnicodeMap.__init__(self, table)
        self.name = name
        return

    def __repr__(self):
        return '<PyUnicodeMap: %s>' % self.name
- ## CMapDB
- ##
class CMapDB(object):
    """Loader and process-wide cache for pickled CMap / unicode-map data."""

    debug = 0
    # Both caches live on the class and are keyed by CMap name.
    _cmap_cache = {}
    _umap_cache = {}

    class CMapNotFound(CMapError):
        """Raised when no data file for the requested name can be found."""

    @classmethod
    def _load_data(cls, name):
        """Load ``<name>.pickle.gz`` and return it as an ad-hoc class.

        The directory from the CMAP_PATH environment variable (default
        '/usr/share/pdfminer/') is tried first, then the 'cmap'
        directory next to this module.  The unpickled dict becomes the
        namespace of a new type called *name*.  Raises CMapNotFound when
        neither location has the file.
        """
        filename = '%s.pickle.gz' % name
        if cls.debug:
            print >>sys.stderr, 'loading:', name
        search_dirs = (
            os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
            os.path.join(os.path.dirname(__file__), 'cmap'),
        )
        for directory in search_dirs:
            candidate = os.path.join(directory, filename)
            if not os.path.exists(candidate):
                continue
            gzfile = gzip.open(candidate)
            try:
                # NOTE: unpickling executes whatever the file encodes, so
                # the search directories must only hold trusted data.
                payload = gzfile.read()
                return type(name, (), pickle.loads(payload))
            finally:
                gzfile.close()
        raise CMapDB.CMapNotFound(name)

    @classmethod
    def get_cmap(cls, name):
        """Return the CMap called *name*, loading and caching it if needed.

        'Identity-H' and 'Identity-V' return a new IdentityCMap on every
        call and are never cached.
        """
        if name == 'Identity-H':
            return IdentityCMap(False)
        if name == 'Identity-V':
            return IdentityCMap(True)
        if name in cls._cmap_cache:
            return cls._cmap_cache[name]
        cmap = PyCMap(name, cls._load_data(name))
        cls._cmap_cache[name] = cmap
        return cmap

    @classmethod
    def get_unicode_map(cls, name, vertical=False):
        """Return the unicode map for *name* in the requested direction.

        The horizontal and vertical maps are built together from the
        'to-unicode-<name>' data and cached as a pair indexed by
        *vertical*.
        """
        if name in cls._umap_cache:
            return cls._umap_cache[name][vertical]
        data = cls._load_data('to-unicode-%s' % name)
        umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
        cls._umap_cache[name] = umaps
        return umaps[vertical]
- ## CMapParser
- ##
class CMapParser(PSStackParser):
    """PostScript stack parser that feeds CMap operators into a map object.

    *cmap* must provide ``set_attr`` and ``use_cmap``, plus
    ``add_code2cid`` (used by the cid* operators) and/or
    ``add_cid2unichr`` (used by the bf* operators), depending on which
    operators the input contains.
    """

    def __init__(self, cmap, fp):
        """Parse the stream *fp*, recording results in *cmap*."""
        PSStackParser.__init__(self, fp)
        self.cmap = cmap
        # some ToUnicode maps don't have "begincmap" keyword.
        self._in_cmap = True
        return

    def run(self):
        """Drive the parser; PSEOF marking the end of input is suppressed."""
        try:
            self.nextobject()
        except PSEOF:
            pass
        return

    def do_keyword(self, pos, token):
        """Handle one keyword token, consuming its operands from the stack.

        Malformed entries inside range/char sections are skipped instead
        of aborting the parse.  Keywords not recognised here are pushed
        back on the stack as ordinary objects.
        """
        name = token.name
        if name == 'begincmap':
            self._in_cmap = True
            self.popall()
            return
        elif name == 'endcmap':
            self._in_cmap = False
            return
        if not self._in_cmap:
            return

        if name == 'def':
            try:
                ((_, k), (_, v)) = self.pop(2)
                self.cmap.set_attr(literal_name(k), v)
            except PSSyntaxError:
                pass
            return

        if name == 'usecmap':
            try:
                ((_, cmapname),) = self.pop(1)
                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            except CMapDB.CMapNotFound:
                pass
            return

        # Section openers only need the stack cleared so that the matching
        # "end..." keyword sees just its own operands; codespace and notdef
        # ranges are discarded entirely.
        if name in ('begincodespacerange', 'endcodespacerange',
                    'begincidrange', 'begincidchar',
                    'beginbfrange', 'beginbfchar',
                    'beginnotdefrange', 'endnotdefrange'):
            self.popall()
            return

        if name == 'endcidrange':
            objs = [obj for (__, obj) in self.popall()]
            for (s, e, cid) in choplist(3, objs):
                if (not isinstance(s, str) or not isinstance(e, str) or
                        not isinstance(cid, int) or len(s) != len(e)):
                    continue
                # Only the last (up to) four bytes may vary across a range;
                # whatever precedes them must be identical at both ends.
                sprefix = s[:-4]
                eprefix = e[:-4]
                if sprefix != eprefix:
                    continue
                svar = s[-4:]
                evar = e[-4:]
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
                for i in xrange(e1-s1+1):
                    x = sprefix+struct.pack('>L', s1+i)[-vlen:]
                    self.cmap.add_code2cid(x, cid+i)
            return

        if name == 'endcidchar':
            objs = [obj for (__, obj) in self.popall()]
            # NOTE(review): pairs are read as (cid, code) and both must be
            # strings; confirm this matches the operand order and types of
            # real cidchar sections.
            for (cid, code) in choplist(2, objs):
                if isinstance(code, str) and isinstance(cid, str):
                    self.cmap.add_code2cid(code, nunpack(cid))
            return

        if name == 'endbfrange':
            objs = [obj for (__, obj) in self.popall()]
            for (s, e, code) in choplist(3, objs):
                if (not isinstance(s, str) or not isinstance(e, str) or
                        len(s) != len(e)):
                    continue
                s1 = nunpack(s)
                e1 = nunpack(e)
                count = e1-s1+1
                if isinstance(code, list):
                    # One explicit destination per source code.  Stop at the
                    # end of the list so a short array cannot raise
                    # IndexError and abort the whole parse.
                    for i in xrange(min(count, len(code))):
                        self.cmap.add_cid2unichr(s1+i, code[i])
                elif isinstance(code, str):
                    # Single start value: increment its last (up to) four
                    # bytes once per source code in the range.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in xrange(count):
                        x = prefix+struct.pack('>L', base+i)[-vlen:]
                        self.cmap.add_cid2unichr(s1+i, x)
                # Any other destination type is malformed; skip the entry.
            return

        if name == 'endbfchar':
            objs = [obj for (__, obj) in self.popall()]
            for (cid, code) in choplist(2, objs):
                if isinstance(cid, str) and isinstance(code, str):
                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        self.push((pos, token))
        return
- # test
def main(argv):
    """Parse each file named in ``argv[1:]`` as a unicode map and dump it.

    Every file is opened in binary mode, parsed into a fresh
    FileUnicodeMap and dumped through its ``dump()`` method.  Returns
    None, so the script exits with status 0 when nothing raises.
    """
    for fname in argv[1:]:
        cmap = FileUnicodeMap()
        fp = open(fname, 'rb')
        try:
            CMapParser(cmap, fp).run()
        finally:
            # Close the file even when parsing raises.
            fp.close()
        cmap.dump()
    return


if __name__ == '__main__':
    sys.exit(main(sys.argv))
|