test_unicode.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. import sys
  2. import codecs
  3. from unittest import TestCase
  4. import simplejson as json
  5. from simplejson.compat import unichr, text_type, b, u, BytesIO
  6. class TestUnicode(TestCase):
  def test_encoding1(self):
      # A text string and its UTF-8 encoded form must produce the same
      # JSON document when the encoder is configured with encoding='utf-8'.
      text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
      raw = text.encode('utf-8')
      enc = json.JSONEncoder(encoding='utf-8')
      self.assertEqual(enc.encode(text), enc.encode(raw))
  def test_encoding2(self):
      # Same check as test_encoding1, but through the module-level
      # dumps() helper instead of an explicit JSONEncoder instance.
      text = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
      raw = text.encode('utf-8')
      from_text = json.dumps(text, encoding='utf-8')
      from_bytes = json.dumps(raw, encoding='utf-8')
      self.assertEqual(from_text, from_bytes)
  def test_encoding3(self):
      # With default settings, dumps() is expected to emit the two Greek
      # letters as four-digit hexadecimal escapes.
      greek = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
      self.assertEqual(json.dumps(greek), '"\\u03b1\\u03a9"')
  def test_encoding4(self):
      # Same expectation as test_encoding3, with the string nested
      # inside a one-element list.
      greek = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
      encoded = json.dumps([greek])
      self.assertEqual(encoded, '["\\u03b1\\u03a9"]')
  def test_encoding5(self):
      # With ensure_ascii=False the characters are expected verbatim,
      # wrapped only in the surrounding JSON quotes.
      greek = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
      expected = u'"' + greek + u'"'
      self.assertEqual(json.dumps(greek, ensure_ascii=False), expected)
  def test_encoding6(self):
      # List variant of test_encoding5: the characters stay verbatim
      # inside the serialized array when ensure_ascii=False.
      greek = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}'
      expected = u'["' + greek + u'"]'
      self.assertEqual(json.dumps([greek], ensure_ascii=False), expected)
  def test_big_unicode_encode(self):
      # A code point above U+FFFF is expected as a surrogate-pair escape
      # by default, and as the character itself with ensure_ascii=False.
      astral = u'\U0001d120'
      escaped = json.dumps(astral)
      self.assertEqual(escaped, '"\\ud834\\udd20"')
      verbatim = json.dumps(astral, ensure_ascii=False)
      self.assertEqual(verbatim, u'"\U0001d120"')
  def test_big_unicode_decode(self):
      # The same astral character must decode identically whether it is
      # present literally in the document or as a surrogate-pair escape.
      expected = u'z\U0001d120x'
      literal_doc = '"' + expected + '"'
      self.assertEqual(json.loads(literal_doc), expected)
      escaped_doc = '"z\\ud834\\udd20x"'
      self.assertEqual(json.loads(escaped_doc), expected)
  def test_unicode_decode(self):
      # Every code point below the surrogate block (U+0000 through U+D7FF)
      # must decode from its four-digit hexadecimal escape to the
      # matching single character.
      # The stop value is 0xd800 because range() excludes its upper
      # bound; stopping at 0xd7ff would leave U+D7FF itself untested.
      for code_point in range(0xd800):
          doc = '"\\u%04x"' % (code_point,)
          self.assertEqual(json.loads(doc), unichr(code_point))
  def test_object_pairs_hook_with_unicode(self):
      # Checks object_pairs_hook on a text (unicode) document: the hook
      # receives the key/value pairs in document order.
      doc = u'{"xkd":1, "kcw":2, "art":3, "hxm":4, "qrt":5, "pad":6, "hoy":7}'
      pairs = [
          (u"xkd", 1), (u"kcw", 2), (u"art", 3), (u"hxm", 4),
          (u"qrt", 5), (u"pad", 6), (u"hoy", 7),
      ]

      # The document is a fixed literal defined above that is also a
      # valid Python dict display, so eval() yields the expected mapping.
      self.assertEqual(json.loads(doc), eval(doc))

      # An identity hook hands back the raw list of pairs.
      as_pairs = json.loads(doc, object_pairs_hook=lambda x: x)
      self.assertEqual(as_pairs, pairs)

      od = json.loads(doc, object_pairs_hook=json.OrderedDict)
      self.assertEqual(od, json.OrderedDict(pairs))
      self.assertEqual(type(od), json.OrderedDict)

      # the object_pairs_hook takes priority over the object_hook
      with_both_hooks = json.loads(
          doc,
          object_pairs_hook=json.OrderedDict,
          object_hook=lambda x: None)
      self.assertEqual(with_both_hooks, json.OrderedDict(pairs))
  def test_default_encoding(self):
      # UTF-8 encoded bytes passed to loads() without an explicit
      # encoding are expected to decode to the matching text value.
      raw_doc = u'{"a": "\xe9"}'.encode('utf-8')
      self.assertEqual(json.loads(raw_doc), {'a': u'\xe9'})
  def test_unicode_preservation(self):
      # Strings decoded from text documents must come back as text_type,
      # whether empty, non-empty, or nested in a list.
      empty = json.loads(u'""')
      self.assertEqual(type(empty), text_type)
      single = json.loads(u'"a"')
      self.assertEqual(type(single), text_type)
      nested = json.loads(u'["a"]')[0]
      self.assertEqual(type(nested), text_type)
  def test_ensure_ascii_false_returns_unicode(self):
      # http://code.google.com/p/simplejson/issues/detail?id=48
      # dumps(..., ensure_ascii=False) must return text_type for each of
      # these values.
      for value in ([], 0, {}, ""):
          encoded = json.dumps(value, ensure_ascii=False)
          self.assertEqual(type(encoded), text_type)
  def test_ensure_ascii_false_bytestring_encoding(self):
      # http://code.google.com/p/simplejson/issues/detail?id=48
      # The same value given as UTF-8 bytes and as text must serialize
      # identically, both escaped and with ensure_ascii=False.
      bytes_doc = {u'quux': b('Arr\xc3\xaat sur images')}
      text_doc = {u'quux': u('Arr\xeat sur images')}
      doc_ascii = '{"quux": "Arr\\u00eat sur images"}'
      doc_unicode = u'{"quux": "Arr\xeat sur images"}'

      for doc in (bytes_doc, text_doc):
          self.assertEqual(json.dumps(doc), doc_ascii)
      for doc in (bytes_doc, text_doc):
          self.assertEqual(json.dumps(doc, ensure_ascii=False), doc_unicode)
  def test_ensure_ascii_linebreak_encoding(self):
      # http://timelessrepo.com/json-isnt-a-javascript-subset
      # U+2029 and U+2028 are expected in escaped form regardless of the
      # ensure_ascii setting and of whether the input is text or bytes.
      separators = u'\u2029\u2028'
      encoded = separators.encode('utf8')
      expect = '"\\u2029\\u2028"'

      for value in (separators, encoded):
          self.assertEqual(json.dumps(value), expect)
      for value in (separators, encoded):
          self.assertEqual(json.dumps(value, ensure_ascii=False), expect)
  def test_invalid_escape_sequences(self):
      # Each malformed document below must make loads() raise
      # JSONDecodeError. The documents are checked in the listed order.
      bad_docs = [
          # incomplete escape sequence
          '"\\u',
          '"\\u1',
          '"\\u12',
          '"\\u123',
          '"\\u1234',
          # invalid escape sequence
          '"\\u123x"',
          '"\\u12x4"',
          '"\\u1x34"',
          '"\\ux234"',
      ]
      if sys.maxunicode > 65535:
          # invalid escape sequence for low surrogate
          bad_docs.extend([
              '"\\ud800\\u"',
              '"\\ud800\\u0"',
              '"\\ud800\\u00"',
              '"\\ud800\\u000"',
              '"\\ud800\\u000x"',
              '"\\ud800\\u00x0"',
              '"\\ud800\\u0x00"',
              '"\\ud800\\ux000"',
          ])

      for doc in bad_docs:
          self.assertRaises(json.JSONDecodeError, json.loads, doc)
  def test_ensure_ascii_still_works(self):
      # in the ascii range, ensure that everything is the same
      # (code points 0 through 126 are compared with and without
      # ensure_ascii=False)
      for c in map(unichr, range(0, 127)):
          self.assertEqual(
              json.dumps(c, ensure_ascii=False),
              json.dumps(c))

      # Outside the ASCII range, ensure_ascii=False must emit the
      # character itself between the JSON quotes. The assertion uses
      # `snowman`, not the leftover loop variable `c`, so the non-ASCII
      # case is actually exercised.
      snowman = u'\N{SNOWMAN}'
      self.assertEqual(
          json.dumps(snowman, ensure_ascii=False),
          '"' + snowman + '"')
  def test_strip_bom(self):
      # A document prefixed with a UTF-8 byte order mark must still load
      # via load() on a byte stream and via loads() on both the raw bytes
      # and their decoded text form.
      content = u"\u3053\u3093\u306b\u3061\u308f"
      payload = b(json.dumps(content))
      with_bom = codecs.BOM_UTF8 + payload

      stream = BytesIO(with_bom)
      self.assertEqual(json.load(stream), content)

      for doc in (with_bom, with_bom.decode('utf8')):
          self.assertEqual(json.loads(doc), content)