cmapdb.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. #!/usr/bin/env python
  2. """ Adobe character mapping (CMap) support.
  3. CMaps provide the mapping between character codes and Unicode
  4. code-points to character ids (CIDs).
  5. More information is available on the Adobe website:
  6. http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
  7. """
  8. import sys
  9. import os
  10. import os.path
  11. import gzip
  12. import cPickle as pickle
  13. import struct
  14. from psparser import PSStackParser
  15. from psparser import PSSyntaxError, PSEOF
  16. from psparser import PSLiteral
  17. from psparser import literal_name
  18. from encodingdb import name2unicode
  19. from utils import choplist, nunpack
  20. class CMapError(Exception):
  21. pass
  22. ## CMap
  23. ##
  24. class CMap(object):
  25. debug = 0
  26. def __init__(self, code2cid=None):
  27. self.code2cid = code2cid or {}
  28. return
  29. def is_vertical(self):
  30. return False
  31. def use_cmap(self, cmap):
  32. assert isinstance(cmap, CMap)
  33. def copy(dst, src):
  34. for (k, v) in src.iteritems():
  35. if isinstance(v, dict):
  36. d = {}
  37. dst[k] = d
  38. copy(d, v)
  39. else:
  40. dst[k] = v
  41. copy(self.code2cid, cmap.code2cid)
  42. return
  43. def decode(self, code):
  44. if self.debug:
  45. print >>sys.stderr, 'decode: %r, %r' % (self, code)
  46. d = self.code2cid
  47. for c in code:
  48. c = ord(c)
  49. if c in d:
  50. d = d[c]
  51. if isinstance(d, int):
  52. yield d
  53. d = self.code2cid
  54. else:
  55. d = self.code2cid
  56. return
  57. def dump(self, out=sys.stdout, code2cid=None, code=None):
  58. if code2cid is None:
  59. code2cid = self.code2cid
  60. code = ()
  61. for (k, v) in sorted(code2cid.iteritems()):
  62. c = code+(k,)
  63. if isinstance(v, int):
  64. out.write('code %r = cid %d\n' % (c, v))
  65. else:
  66. self.dump(out=out, code2cid=v, code=c)
  67. return
  68. ## IdentityCMap
  69. ##
  70. class IdentityCMap(object):
  71. def __init__(self, vertical):
  72. self.vertical = vertical
  73. return
  74. def is_vertical(self):
  75. return self.vertical
  76. def decode(self, code):
  77. n = len(code)//2
  78. if n:
  79. return struct.unpack('>%dH' % n, code)
  80. else:
  81. return ()
  82. ## UnicodeMap
  83. ##
  84. class UnicodeMap(object):
  85. debug = 0
  86. def __init__(self, cid2unichr=None):
  87. self.cid2unichr = cid2unichr or {}
  88. return
  89. def get_unichr(self, cid):
  90. if self.debug:
  91. print >>sys.stderr, 'get_unichr: %r, %r' % (self, cid)
  92. return self.cid2unichr[cid]
  93. def dump(self, out=sys.stdout):
  94. for (k, v) in sorted(self.cid2unichr.iteritems()):
  95. out.write('cid %d = unicode %r\n' % (k, v))
  96. return
  97. ## FileCMap
  98. ##
  99. class FileCMap(CMap):
  100. def __init__(self):
  101. CMap.__init__(self)
  102. self.attrs = {}
  103. return
  104. def __repr__(self):
  105. return '<CMap: %s>' % self.attrs.get('CMapName')
  106. def is_vertical(self):
  107. return self.attrs.get('WMode', 0) != 0
  108. def set_attr(self, k, v):
  109. self.attrs[k] = v
  110. return
  111. def add_code2cid(self, code, cid):
  112. assert isinstance(code, str) and isinstance(cid, int)
  113. d = self.code2cid
  114. for c in code[:-1]:
  115. c = ord(c)
  116. if c in d:
  117. d = d[c]
  118. else:
  119. t = {}
  120. d[c] = t
  121. d = t
  122. c = ord(code[-1])
  123. d[c] = cid
  124. return
  125. ## FileUnicodeMap
  126. ##
  127. class FileUnicodeMap(UnicodeMap):
  128. def __init__(self):
  129. UnicodeMap.__init__(self)
  130. self.attrs = {}
  131. return
  132. def __repr__(self):
  133. return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
  134. def set_attr(self, k, v):
  135. self.attrs[k] = v
  136. return
  137. def add_cid2unichr(self, cid, code):
  138. assert isinstance(cid, int)
  139. if isinstance(code, PSLiteral):
  140. # Interpret as an Adobe glyph name.
  141. self.cid2unichr[cid] = name2unicode(code.name)
  142. elif isinstance(code, str):
  143. # Interpret as UTF-16BE.
  144. self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
  145. elif isinstance(code, int):
  146. self.cid2unichr[cid] = unichr(code)
  147. else:
  148. raise TypeError(code)
  149. return
  150. ## PyCMap
  151. ##
  152. class PyCMap(CMap):
  153. def __init__(self, name, module):
  154. CMap.__init__(self, module.CODE2CID)
  155. self.name = name
  156. self._is_vertical = module.IS_VERTICAL
  157. return
  158. def __repr__(self):
  159. return '<PyCMap: %s>' % (self.name)
  160. def is_vertical(self):
  161. return self._is_vertical
  162. ## PyUnicodeMap
  163. ##
  164. class PyUnicodeMap(UnicodeMap):
  165. def __init__(self, name, module, vertical):
  166. if vertical:
  167. cid2unichr = module.CID2UNICHR_V
  168. else:
  169. cid2unichr = module.CID2UNICHR_H
  170. UnicodeMap.__init__(self, cid2unichr)
  171. self.name = name
  172. return
  173. def __repr__(self):
  174. return '<PyUnicodeMap: %s>' % (self.name)
  175. ## CMapDB
  176. ##
  177. class CMapDB(object):
  178. debug = 0
  179. _cmap_cache = {}
  180. _umap_cache = {}
  181. class CMapNotFound(CMapError):
  182. pass
  183. @classmethod
  184. def _load_data(klass, name):
  185. filename = '%s.pickle.gz' % name
  186. if klass.debug:
  187. print >>sys.stderr, 'loading:', name
  188. cmap_paths = (os.environ.get('CMAP_PATH', '/usr/share/pdfminer/'),
  189. os.path.join(os.path.dirname(__file__), 'cmap'),)
  190. for directory in cmap_paths:
  191. path = os.path.join(directory, filename)
  192. if os.path.exists(path):
  193. gzfile = gzip.open(path)
  194. try:
  195. return type(name, (), pickle.loads(gzfile.read()))
  196. finally:
  197. gzfile.close()
  198. else:
  199. raise CMapDB.CMapNotFound(name)
  200. @classmethod
  201. def get_cmap(klass, name):
  202. if name == 'Identity-H':
  203. return IdentityCMap(False)
  204. elif name == 'Identity-V':
  205. return IdentityCMap(True)
  206. try:
  207. return klass._cmap_cache[name]
  208. except KeyError:
  209. pass
  210. data = klass._load_data(name)
  211. klass._cmap_cache[name] = cmap = PyCMap(name, data)
  212. return cmap
  213. @classmethod
  214. def get_unicode_map(klass, name, vertical=False):
  215. try:
  216. return klass._umap_cache[name][vertical]
  217. except KeyError:
  218. pass
  219. data = klass._load_data('to-unicode-%s' % name)
  220. klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
  221. return umaps[vertical]
  222. ## CMapParser
  223. ##
  224. class CMapParser(PSStackParser):
  225. def __init__(self, cmap, fp):
  226. PSStackParser.__init__(self, fp)
  227. self.cmap = cmap
  228. # some ToUnicode maps don't have "begincmap" keyword.
  229. self._in_cmap = True
  230. return
  231. def run(self):
  232. try:
  233. self.nextobject()
  234. except PSEOF:
  235. pass
  236. return
  237. def do_keyword(self, pos, token):
  238. name = token.name
  239. if name == 'begincmap':
  240. self._in_cmap = True
  241. self.popall()
  242. return
  243. elif name == 'endcmap':
  244. self._in_cmap = False
  245. return
  246. if not self._in_cmap:
  247. return
  248. #
  249. if name == 'def':
  250. try:
  251. ((_, k), (_, v)) = self.pop(2)
  252. self.cmap.set_attr(literal_name(k), v)
  253. except PSSyntaxError:
  254. pass
  255. return
  256. if name == 'usecmap':
  257. try:
  258. ((_, cmapname),) = self.pop(1)
  259. self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
  260. except PSSyntaxError:
  261. pass
  262. except CMapDB.CMapNotFound:
  263. pass
  264. return
  265. if name == 'begincodespacerange':
  266. self.popall()
  267. return
  268. if name == 'endcodespacerange':
  269. self.popall()
  270. return
  271. if name == 'begincidrange':
  272. self.popall()
  273. return
  274. if name == 'endcidrange':
  275. objs = [obj for (__, obj) in self.popall()]
  276. for (s, e, cid) in choplist(3, objs):
  277. if (not isinstance(s, str) or not isinstance(e, str) or
  278. not isinstance(cid, int) or len(s) != len(e)):
  279. continue
  280. sprefix = s[:-4]
  281. eprefix = e[:-4]
  282. if sprefix != eprefix:
  283. continue
  284. svar = s[-4:]
  285. evar = e[-4:]
  286. s1 = nunpack(svar)
  287. e1 = nunpack(evar)
  288. vlen = len(svar)
  289. #assert s1 <= e1
  290. for i in xrange(e1-s1+1):
  291. x = sprefix+struct.pack('>L', s1+i)[-vlen:]
  292. self.cmap.add_code2cid(x, cid+i)
  293. return
  294. if name == 'begincidchar':
  295. self.popall()
  296. return
  297. if name == 'endcidchar':
  298. objs = [obj for (__, obj) in self.popall()]
  299. for (cid, code) in choplist(2, objs):
  300. if isinstance(code, str) and isinstance(cid, str):
  301. self.cmap.add_code2cid(code, nunpack(cid))
  302. return
  303. if name == 'beginbfrange':
  304. self.popall()
  305. return
  306. if name == 'endbfrange':
  307. objs = [obj for (__, obj) in self.popall()]
  308. for (s, e, code) in choplist(3, objs):
  309. if (not isinstance(s, str) or not isinstance(e, str) or
  310. len(s) != len(e)):
  311. continue
  312. s1 = nunpack(s)
  313. e1 = nunpack(e)
  314. #assert s1 <= e1
  315. if isinstance(code, list):
  316. for i in xrange(e1-s1+1):
  317. self.cmap.add_cid2unichr(s1+i, code[i])
  318. else:
  319. var = code[-4:]
  320. base = nunpack(var)
  321. prefix = code[:-4]
  322. vlen = len(var)
  323. for i in xrange(e1-s1+1):
  324. x = prefix+struct.pack('>L', base+i)[-vlen:]
  325. self.cmap.add_cid2unichr(s1+i, x)
  326. return
  327. if name == 'beginbfchar':
  328. self.popall()
  329. return
  330. if name == 'endbfchar':
  331. objs = [obj for (__, obj) in self.popall()]
  332. for (cid, code) in choplist(2, objs):
  333. if isinstance(cid, str) and isinstance(code, str):
  334. self.cmap.add_cid2unichr(nunpack(cid), code)
  335. return
  336. if name == 'beginnotdefrange':
  337. self.popall()
  338. return
  339. if name == 'endnotdefrange':
  340. self.popall()
  341. return
  342. self.push((pos, token))
  343. return
  344. # test
  345. def main(argv):
  346. args = argv[1:]
  347. for fname in args:
  348. fp = file(fname, 'rb')
  349. cmap = FileUnicodeMap()
  350. #cmap = FileCMap()
  351. CMapParser(cmap, fp).run()
  352. fp.close()
  353. cmap.dump()
  354. return
# Script entry point: run main() on the command-line arguments. main()
# returns None, so a run that finishes normally exits with status 0.
if __name__ == '__main__':
    sys.exit(main(sys.argv))