# test_readlines.py (5.6 KB)
  1. # -*- coding: utf-8 -*-
  2. import pytest
  3. from pandas.compat import StringIO
  4. import pandas as pd
  5. from pandas import DataFrame, read_json
  6. import pandas.util.testing as tm
  7. from pandas.util.testing import (
  8. assert_frame_equal, assert_series_equal, ensure_clean)
  9. from pandas.io.json.json import JsonReader
  10. @pytest.fixture
  11. def lines_json_df():
  12. df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
  13. return df.to_json(lines=True, orient="records")
  14. def test_read_jsonl():
  15. # GH9180
  16. result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
  17. expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
  18. assert_frame_equal(result, expected)
  19. def test_read_jsonl_unicode_chars():
  20. # GH15132: non-ascii unicode characters
  21. # \u201d == RIGHT DOUBLE QUOTATION MARK
  22. # simulate file handle
  23. json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
  24. json = StringIO(json)
  25. result = read_json(json, lines=True)
  26. expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
  27. columns=['a', 'b'])
  28. assert_frame_equal(result, expected)
  29. # simulate string
  30. json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
  31. result = read_json(json, lines=True)
  32. expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
  33. columns=['a', 'b'])
  34. assert_frame_equal(result, expected)
  35. def test_to_jsonl():
  36. # GH9180
  37. df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
  38. result = df.to_json(orient="records", lines=True)
  39. expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
  40. assert result == expected
  41. df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
  42. result = df.to_json(orient="records", lines=True)
  43. expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
  44. assert result == expected
  45. assert_frame_equal(read_json(result, lines=True), df)
  46. # GH15096: escaped characters in columns and data
  47. df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
  48. columns=["a\\", 'b'])
  49. result = df.to_json(orient="records", lines=True)
  50. expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
  51. '{"a\\\\":"foo\\"","b":"bar"}')
  52. assert result == expected
  53. assert_frame_equal(read_json(result, lines=True), df)
  54. @pytest.mark.parametrize("chunksize", [1, 1.0])
  55. def test_readjson_chunks(lines_json_df, chunksize):
  56. # Basic test that read_json(chunks=True) gives the same result as
  57. # read_json(chunks=False)
  58. # GH17048: memory usage when lines=True
  59. unchunked = read_json(StringIO(lines_json_df), lines=True)
  60. reader = read_json(StringIO(lines_json_df), lines=True,
  61. chunksize=chunksize)
  62. chunked = pd.concat(reader)
  63. assert_frame_equal(chunked, unchunked)
  64. def test_readjson_chunksize_requires_lines(lines_json_df):
  65. msg = "chunksize can only be passed if lines=True"
  66. with pytest.raises(ValueError, match=msg):
  67. pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
  68. def test_readjson_chunks_series():
  69. # Test reading line-format JSON to Series with chunksize param
  70. s = pd.Series({'A': 1, 'B': 2})
  71. strio = StringIO(s.to_json(lines=True, orient="records"))
  72. unchunked = pd.read_json(strio, lines=True, typ='Series')
  73. strio = StringIO(s.to_json(lines=True, orient="records"))
  74. chunked = pd.concat(pd.read_json(
  75. strio, lines=True, typ='Series', chunksize=1
  76. ))
  77. assert_series_equal(chunked, unchunked)
  78. def test_readjson_each_chunk(lines_json_df):
  79. # Other tests check that the final result of read_json(chunksize=True)
  80. # is correct. This checks the intermediate chunks.
  81. chunks = list(
  82. pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
  83. )
  84. assert chunks[0].shape == (2, 2)
  85. assert chunks[1].shape == (1, 2)
  86. def test_readjson_chunks_from_file():
  87. with ensure_clean('test.json') as path:
  88. df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
  89. df.to_json(path, lines=True, orient="records")
  90. chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
  91. unchunked = pd.read_json(path, lines=True)
  92. assert_frame_equal(unchunked, chunked)
  93. @pytest.mark.parametrize("chunksize", [None, 1])
  94. def test_readjson_chunks_closes(chunksize):
  95. with ensure_clean('test.json') as path:
  96. df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
  97. df.to_json(path, lines=True, orient="records")
  98. reader = JsonReader(
  99. path, orient=None, typ="frame", dtype=True, convert_axes=True,
  100. convert_dates=True, keep_default_dates=True, numpy=False,
  101. precise_float=False, date_unit=None, encoding=None,
  102. lines=True, chunksize=chunksize, compression=None)
  103. reader.read()
  104. assert reader.open_stream.closed, "didn't close stream with \
  105. chunksize = {chunksize}".format(chunksize=chunksize)
  106. @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
  107. def test_readjson_invalid_chunksize(lines_json_df, chunksize):
  108. msg = r"'chunksize' must be an integer >=1"
  109. with pytest.raises(ValueError, match=msg):
  110. pd.read_json(StringIO(lines_json_df), lines=True,
  111. chunksize=chunksize)
  112. @pytest.mark.parametrize("chunksize", [None, 1, 2])
  113. def test_readjson_chunks_multiple_empty_lines(chunksize):
  114. j = """
  115. {"A":1,"B":4}
  116. {"A":2,"B":5}
  117. {"A":3,"B":6}
  118. """
  119. orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
  120. test = pd.read_json(j, lines=True, chunksize=chunksize)
  121. if chunksize is not None:
  122. test = pd.concat(test)
  123. tm.assert_frame_equal(
  124. orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))