test_scanstring.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. import sys
  2. from unittest import TestCase
  3. import simplejson as json
  4. import simplejson.decoder
  5. from simplejson.compat import b, PY3
  6. class TestScanString(TestCase):
  7. # The bytes type is intentionally not used in most of these tests
  8. # under Python 3 because the decoder immediately coerces to str before
  9. # calling scanstring. In Python 2 we are testing the code paths
  10. # for both unicode and str.
  11. #
  12. # The reason this is done is because Python 3 would require
  13. # entirely different code paths for parsing bytes and str.
  14. #
  15. def test_py_scanstring(self):
  16. self._test_scanstring(simplejson.decoder.py_scanstring)
  17. def test_c_scanstring(self):
  18. if not simplejson.decoder.c_scanstring:
  19. return
  20. self._test_scanstring(simplejson.decoder.c_scanstring)
  21. def _test_scanstring(self, scanstring):
  22. if sys.maxunicode == 65535:
  23. self.assertEqual(
  24. scanstring(u'"z\U0001d120x"', 1, None, True),
  25. (u'z\U0001d120x', 6))
  26. else:
  27. self.assertEqual(
  28. scanstring(u'"z\U0001d120x"', 1, None, True),
  29. (u'z\U0001d120x', 5))
  30. self.assertEqual(
  31. scanstring('"\\u007b"', 1, None, True),
  32. (u'{', 8))
  33. self.assertEqual(
  34. scanstring('"A JSON payload should be an object or array, not a string."', 1, None, True),
  35. (u'A JSON payload should be an object or array, not a string.', 60))
  36. self.assertEqual(
  37. scanstring('["Unclosed array"', 2, None, True),
  38. (u'Unclosed array', 17))
  39. self.assertEqual(
  40. scanstring('["extra comma",]', 2, None, True),
  41. (u'extra comma', 14))
  42. self.assertEqual(
  43. scanstring('["double extra comma",,]', 2, None, True),
  44. (u'double extra comma', 21))
  45. self.assertEqual(
  46. scanstring('["Comma after the close"],', 2, None, True),
  47. (u'Comma after the close', 24))
  48. self.assertEqual(
  49. scanstring('["Extra close"]]', 2, None, True),
  50. (u'Extra close', 14))
  51. self.assertEqual(
  52. scanstring('{"Extra comma": true,}', 2, None, True),
  53. (u'Extra comma', 14))
  54. self.assertEqual(
  55. scanstring('{"Extra value after close": true} "misplaced quoted value"', 2, None, True),
  56. (u'Extra value after close', 26))
  57. self.assertEqual(
  58. scanstring('{"Illegal expression": 1 + 2}', 2, None, True),
  59. (u'Illegal expression', 21))
  60. self.assertEqual(
  61. scanstring('{"Illegal invocation": alert()}', 2, None, True),
  62. (u'Illegal invocation', 21))
  63. self.assertEqual(
  64. scanstring('{"Numbers cannot have leading zeroes": 013}', 2, None, True),
  65. (u'Numbers cannot have leading zeroes', 37))
  66. self.assertEqual(
  67. scanstring('{"Numbers cannot be hex": 0x14}', 2, None, True),
  68. (u'Numbers cannot be hex', 24))
  69. self.assertEqual(
  70. scanstring('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21, None, True),
  71. (u'Too deep', 30))
  72. self.assertEqual(
  73. scanstring('{"Missing colon" null}', 2, None, True),
  74. (u'Missing colon', 16))
  75. self.assertEqual(
  76. scanstring('{"Double colon":: null}', 2, None, True),
  77. (u'Double colon', 15))
  78. self.assertEqual(
  79. scanstring('{"Comma instead of colon", null}', 2, None, True),
  80. (u'Comma instead of colon', 25))
  81. self.assertEqual(
  82. scanstring('["Colon instead of comma": false]', 2, None, True),
  83. (u'Colon instead of comma', 25))
  84. self.assertEqual(
  85. scanstring('["Bad value", truth]', 2, None, True),
  86. (u'Bad value', 12))
  87. for c in map(chr, range(0x00, 0x1f)):
  88. self.assertEqual(
  89. scanstring(c + '"', 0, None, False),
  90. (c, 2))
  91. self.assertRaises(
  92. ValueError,
  93. scanstring, c + '"', 0, None, True)
  94. self.assertRaises(ValueError, scanstring, '', 0, None, True)
  95. self.assertRaises(ValueError, scanstring, 'a', 0, None, True)
  96. self.assertRaises(ValueError, scanstring, '\\', 0, None, True)
  97. self.assertRaises(ValueError, scanstring, '\\u', 0, None, True)
  98. self.assertRaises(ValueError, scanstring, '\\u0', 0, None, True)
  99. self.assertRaises(ValueError, scanstring, '\\u01', 0, None, True)
  100. self.assertRaises(ValueError, scanstring, '\\u012', 0, None, True)
  101. self.assertRaises(ValueError, scanstring, '\\u0123', 0, None, True)
  102. if sys.maxunicode > 65535:
  103. self.assertRaises(ValueError,
  104. scanstring, '\\ud834\\u"', 0, None, True)
  105. self.assertRaises(ValueError,
  106. scanstring, '\\ud834\\x0123"', 0, None, True)
  107. def test_issue3623(self):
  108. self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
  109. "xxx")
  110. self.assertRaises(UnicodeDecodeError,
  111. json.encoder.encode_basestring_ascii, b("xx\xff"))
  112. def test_overflow(self):
  113. # Python 2.5 does not have maxsize, Python 3 does not have maxint
  114. maxsize = getattr(sys, 'maxsize', getattr(sys, 'maxint', None))
  115. assert maxsize is not None
  116. self.assertRaises(OverflowError, json.decoder.scanstring, "xxx",
  117. maxsize + 1)
  118. def test_surrogates(self):
  119. scanstring = json.decoder.scanstring
  120. def assertScan(given, expect, test_utf8=True):
  121. givens = [given]
  122. if not PY3 and test_utf8:
  123. givens.append(given.encode('utf8'))
  124. for given in givens:
  125. (res, count) = scanstring(given, 1, None, True)
  126. self.assertEqual(len(given), count)
  127. self.assertEqual(res, expect)
  128. assertScan(
  129. u'"z\\ud834\\u0079x"',
  130. u'z\ud834yx')
  131. assertScan(
  132. u'"z\\ud834\\udd20x"',
  133. u'z\U0001d120x')
  134. assertScan(
  135. u'"z\\ud834\\ud834\\udd20x"',
  136. u'z\ud834\U0001d120x')
  137. assertScan(
  138. u'"z\\ud834x"',
  139. u'z\ud834x')
  140. assertScan(
  141. u'"z\\udd20x"',
  142. u'z\udd20x')
  143. assertScan(
  144. u'"z\ud834x"',
  145. u'z\ud834x')
  146. # It may look strange to join strings together, but Python is drunk.
  147. # https://gist.github.com/etrepum/5538443
  148. assertScan(
  149. u'"z\\ud834\udd20x12345"',
  150. u''.join([u'z\ud834', u'\udd20x12345']))
  151. assertScan(
  152. u'"z\ud834\\udd20x"',
  153. u''.join([u'z\ud834', u'\udd20x']))
  154. # these have different behavior given UTF8 input, because the surrogate
  155. # pair may be joined (in maxunicode > 65535 builds)
  156. assertScan(
  157. u''.join([u'"z\ud834', u'\udd20x"']),
  158. u''.join([u'z\ud834', u'\udd20x']),
  159. test_utf8=False)
  160. self.assertRaises(ValueError,
  161. scanstring, u'"z\\ud83x"', 1, None, True)
  162. self.assertRaises(ValueError,
  163. scanstring, u'"z\\ud834\\udd2x"', 1, None, True)