StringEncoding.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. #
  2. # Cython -- encoding related tools
  3. #
  4. from __future__ import absolute_import
  5. import re
  6. import sys
  7. if sys.version_info[0] >= 3:
  8. _unicode, _str, _bytes, _unichr = str, str, bytes, chr
  9. IS_PYTHON3 = True
  10. else:
  11. _unicode, _str, _bytes, _unichr = unicode, str, str, unichr
  12. IS_PYTHON3 = False
  13. empty_bytes = _bytes()
  14. empty_unicode = _unicode()
  15. join_bytes = empty_bytes.join
  16. class UnicodeLiteralBuilder(object):
  17. """Assemble a unicode string.
  18. """
  19. def __init__(self):
  20. self.chars = []
  21. def append(self, characters):
  22. if isinstance(characters, _bytes):
  23. # this came from a Py2 string literal in the parser code
  24. characters = characters.decode("ASCII")
  25. assert isinstance(characters, _unicode), str(type(characters))
  26. self.chars.append(characters)
  27. if sys.maxunicode == 65535:
  28. def append_charval(self, char_number):
  29. if char_number > 65535:
  30. # wide Unicode character on narrow platform => replace
  31. # by surrogate pair
  32. char_number -= 0x10000
  33. self.chars.append( _unichr((char_number // 1024) + 0xD800) )
  34. self.chars.append( _unichr((char_number % 1024) + 0xDC00) )
  35. else:
  36. self.chars.append( _unichr(char_number) )
  37. else:
  38. def append_charval(self, char_number):
  39. self.chars.append( _unichr(char_number) )
  40. def append_uescape(self, char_number, escape_string):
  41. self.append_charval(char_number)
  42. def getstring(self):
  43. return EncodedString(u''.join(self.chars))
  44. def getstrings(self):
  45. return (None, self.getstring())
  46. class BytesLiteralBuilder(object):
  47. """Assemble a byte string or char value.
  48. """
  49. def __init__(self, target_encoding):
  50. self.chars = []
  51. self.target_encoding = target_encoding
  52. def append(self, characters):
  53. if isinstance(characters, _unicode):
  54. characters = characters.encode(self.target_encoding)
  55. assert isinstance(characters, _bytes), str(type(characters))
  56. self.chars.append(characters)
  57. def append_charval(self, char_number):
  58. self.chars.append( _unichr(char_number).encode('ISO-8859-1') )
  59. def append_uescape(self, char_number, escape_string):
  60. self.append(escape_string)
  61. def getstring(self):
  62. # this *must* return a byte string!
  63. return bytes_literal(join_bytes(self.chars), self.target_encoding)
  64. def getchar(self):
  65. # this *must* return a byte string!
  66. return self.getstring()
  67. def getstrings(self):
  68. return (self.getstring(), None)
  69. class StrLiteralBuilder(object):
  70. """Assemble both a bytes and a unicode representation of a string.
  71. """
  72. def __init__(self, target_encoding):
  73. self._bytes = BytesLiteralBuilder(target_encoding)
  74. self._unicode = UnicodeLiteralBuilder()
  75. def append(self, characters):
  76. self._bytes.append(characters)
  77. self._unicode.append(characters)
  78. def append_charval(self, char_number):
  79. self._bytes.append_charval(char_number)
  80. self._unicode.append_charval(char_number)
  81. def append_uescape(self, char_number, escape_string):
  82. self._bytes.append(escape_string)
  83. self._unicode.append_charval(char_number)
  84. def getstrings(self):
  85. return (self._bytes.getstring(), self._unicode.getstring())
  86. class EncodedString(_unicode):
  87. # unicode string subclass to keep track of the original encoding.
  88. # 'encoding' is None for unicode strings and the source encoding
  89. # otherwise
  90. encoding = None
  91. def __deepcopy__(self, memo):
  92. return self
  93. def byteencode(self):
  94. assert self.encoding is not None
  95. return self.encode(self.encoding)
  96. def utf8encode(self):
  97. assert self.encoding is None
  98. return self.encode("UTF-8")
  99. @property
  100. def is_unicode(self):
  101. return self.encoding is None
  102. def contains_surrogates(self):
  103. return string_contains_surrogates(self)
  104. def as_utf8_string(self):
  105. return bytes_literal(self.utf8encode(), 'utf8')
  106. def string_contains_surrogates(ustring):
  107. """
  108. Check if the unicode string contains surrogate code points
  109. on a CPython platform with wide (UCS-4) or narrow (UTF-16)
  110. Unicode, i.e. characters that would be spelled as two
  111. separate code units on a narrow platform.
  112. """
  113. for c in map(ord, ustring):
  114. if c > 65535: # can only happen on wide platforms
  115. return True
  116. if 0xD800 <= c <= 0xDFFF:
  117. return True
  118. return False
  119. class BytesLiteral(_bytes):
  120. # bytes subclass that is compatible with EncodedString
  121. encoding = None
  122. def __deepcopy__(self, memo):
  123. return self
  124. def byteencode(self):
  125. if IS_PYTHON3:
  126. return _bytes(self)
  127. else:
  128. # fake-recode the string to make it a plain bytes object
  129. return self.decode('ISO-8859-1').encode('ISO-8859-1')
  130. def utf8encode(self):
  131. assert False, "this is not a unicode string: %r" % self
  132. def __str__(self):
  133. """Fake-decode the byte string to unicode to support %
  134. formatting of unicode strings.
  135. """
  136. return self.decode('ISO-8859-1')
  137. is_unicode = False
  138. def as_c_string_literal(self):
  139. value = split_string_literal(escape_byte_string(self))
  140. return '"%s"' % value
  141. def bytes_literal(s, encoding):
  142. assert isinstance(s, bytes)
  143. s = BytesLiteral(s)
  144. s.encoding = encoding
  145. return s
  146. def encoded_string(s, encoding):
  147. assert isinstance(s, (_unicode, bytes))
  148. s = EncodedString(s)
  149. if encoding is not None:
  150. s.encoding = encoding
  151. return s
  152. char_from_escape_sequence = {
  153. r'\a' : u'\a',
  154. r'\b' : u'\b',
  155. r'\f' : u'\f',
  156. r'\n' : u'\n',
  157. r'\r' : u'\r',
  158. r'\t' : u'\t',
  159. r'\v' : u'\v',
  160. }.get
  161. _c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
  162. def _to_escape_sequence(s):
  163. if s in '\n\r\t':
  164. return repr(s)[1:-1]
  165. elif s == '"':
  166. return r'\"'
  167. elif s == '\\':
  168. return r'\\'
  169. else:
  170. # within a character sequence, oct passes much better than hex
  171. return ''.join(['\\%03o' % ord(c) for c in s])
  172. def _build_specials_replacer():
  173. subexps = []
  174. replacements = {}
  175. for special in _c_special:
  176. regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
  177. subexps.append(regexp)
  178. replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
  179. sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
  180. def replace_specials(m):
  181. return replacements[m.group(1)]
  182. def replace(s):
  183. return sub(replace_specials, s)
  184. return replace
  185. _replace_specials = _build_specials_replacer()
  186. def escape_char(c):
  187. if IS_PYTHON3:
  188. c = c.decode('ISO-8859-1')
  189. if c in '\n\r\t\\':
  190. return repr(c)[1:-1]
  191. elif c == "'":
  192. return "\\'"
  193. n = ord(c)
  194. if n < 32 or n > 127:
  195. # hex works well for characters
  196. return "\\x%02X" % n
  197. else:
  198. return c
  199. def escape_byte_string(s):
  200. """Escape a byte string so that it can be written into C code.
  201. Note that this returns a Unicode string instead which, when
  202. encoded as ISO-8859-1, will result in the correct byte sequence
  203. being written.
  204. """
  205. s = _replace_specials(s)
  206. try:
  207. return s.decode("ASCII") # trial decoding: plain ASCII => done
  208. except UnicodeDecodeError:
  209. pass
  210. if IS_PYTHON3:
  211. s_new = bytearray()
  212. append, extend = s_new.append, s_new.extend
  213. for b in s:
  214. if b >= 128:
  215. extend(('\\%3o' % b).encode('ASCII'))
  216. else:
  217. append(b)
  218. return s_new.decode('ISO-8859-1')
  219. else:
  220. l = []
  221. append = l.append
  222. for c in s:
  223. o = ord(c)
  224. if o >= 128:
  225. append('\\%3o' % o)
  226. else:
  227. append(c)
  228. return join_bytes(l).decode('ISO-8859-1')
  229. def split_string_literal(s, limit=2000):
  230. # MSVC can't handle long string literals.
  231. if len(s) < limit:
  232. return s
  233. else:
  234. start = 0
  235. chunks = []
  236. while start < len(s):
  237. end = start + limit
  238. if len(s) > end-4 and '\\' in s[end-4:end]:
  239. end -= 4 - s[end-4:end].find('\\') # just before the backslash
  240. while s[end-1] == '\\':
  241. end -= 1
  242. if end == start:
  243. # must have been a long line of backslashes
  244. end = start + limit - (limit % 2) - 4
  245. break
  246. chunks.append(s[start:end])
  247. start = end
  248. return '""'.join(chunks)
  249. def encode_pyunicode_string(s):
  250. """Create Py_UNICODE[] representation of a given unicode string.
  251. """
  252. s = list(map(ord, s)) + [0]
  253. if sys.maxunicode >= 0x10000: # Wide build or Py3.3
  254. utf16, utf32 = [], s
  255. for code_point in s:
  256. if code_point >= 0x10000: # outside of BMP
  257. high, low = divmod(code_point - 0x10000, 1024)
  258. utf16.append(high + 0xD800)
  259. utf16.append(low + 0xDC00)
  260. else:
  261. utf16.append(code_point)
  262. else:
  263. utf16, utf32 = s, []
  264. for code_unit in s:
  265. if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
  266. high, low = utf32[-1], code_unit
  267. utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
  268. else:
  269. utf32.append(code_unit)
  270. if utf16 == utf32:
  271. utf16 = []
  272. return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))