123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- # -*- coding: utf-8 -*-
- import pytest
- from pandas.compat import StringIO
- import pandas as pd
- from pandas import DataFrame, read_json
- import pandas.util.testing as tm
- from pandas.util.testing import (
- assert_frame_equal, assert_series_equal, ensure_clean)
- from pandas.io.json.json import JsonReader
- @pytest.fixture
- def lines_json_df():
- df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
- return df.to_json(lines=True, orient="records")
- def test_read_jsonl():
- # GH9180
- result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
- expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
- assert_frame_equal(result, expected)
- def test_read_jsonl_unicode_chars():
- # GH15132: non-ascii unicode characters
- # \u201d == RIGHT DOUBLE QUOTATION MARK
- # simulate file handle
- json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
- json = StringIO(json)
- result = read_json(json, lines=True)
- expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
- columns=['a', 'b'])
- assert_frame_equal(result, expected)
- # simulate string
- json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
- result = read_json(json, lines=True)
- expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
- columns=['a', 'b'])
- assert_frame_equal(result, expected)
- def test_to_jsonl():
- # GH9180
- df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
- result = df.to_json(orient="records", lines=True)
- expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
- assert result == expected
- df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
- result = df.to_json(orient="records", lines=True)
- expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
- assert result == expected
- assert_frame_equal(read_json(result, lines=True), df)
- # GH15096: escaped characters in columns and data
- df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
- columns=["a\\", 'b'])
- result = df.to_json(orient="records", lines=True)
- expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
- '{"a\\\\":"foo\\"","b":"bar"}')
- assert result == expected
- assert_frame_equal(read_json(result, lines=True), df)
- @pytest.mark.parametrize("chunksize", [1, 1.0])
- def test_readjson_chunks(lines_json_df, chunksize):
- # Basic test that read_json(chunks=True) gives the same result as
- # read_json(chunks=False)
- # GH17048: memory usage when lines=True
- unchunked = read_json(StringIO(lines_json_df), lines=True)
- reader = read_json(StringIO(lines_json_df), lines=True,
- chunksize=chunksize)
- chunked = pd.concat(reader)
- assert_frame_equal(chunked, unchunked)
- def test_readjson_chunksize_requires_lines(lines_json_df):
- msg = "chunksize can only be passed if lines=True"
- with pytest.raises(ValueError, match=msg):
- pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
- def test_readjson_chunks_series():
- # Test reading line-format JSON to Series with chunksize param
- s = pd.Series({'A': 1, 'B': 2})
- strio = StringIO(s.to_json(lines=True, orient="records"))
- unchunked = pd.read_json(strio, lines=True, typ='Series')
- strio = StringIO(s.to_json(lines=True, orient="records"))
- chunked = pd.concat(pd.read_json(
- strio, lines=True, typ='Series', chunksize=1
- ))
- assert_series_equal(chunked, unchunked)
- def test_readjson_each_chunk(lines_json_df):
- # Other tests check that the final result of read_json(chunksize=True)
- # is correct. This checks the intermediate chunks.
- chunks = list(
- pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
- )
- assert chunks[0].shape == (2, 2)
- assert chunks[1].shape == (1, 2)
- def test_readjson_chunks_from_file():
- with ensure_clean('test.json') as path:
- df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
- df.to_json(path, lines=True, orient="records")
- chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
- unchunked = pd.read_json(path, lines=True)
- assert_frame_equal(unchunked, chunked)
- @pytest.mark.parametrize("chunksize", [None, 1])
- def test_readjson_chunks_closes(chunksize):
- with ensure_clean('test.json') as path:
- df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
- df.to_json(path, lines=True, orient="records")
- reader = JsonReader(
- path, orient=None, typ="frame", dtype=True, convert_axes=True,
- convert_dates=True, keep_default_dates=True, numpy=False,
- precise_float=False, date_unit=None, encoding=None,
- lines=True, chunksize=chunksize, compression=None)
- reader.read()
- assert reader.open_stream.closed, "didn't close stream with \
- chunksize = {chunksize}".format(chunksize=chunksize)
- @pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
- def test_readjson_invalid_chunksize(lines_json_df, chunksize):
- msg = r"'chunksize' must be an integer >=1"
- with pytest.raises(ValueError, match=msg):
- pd.read_json(StringIO(lines_json_df), lines=True,
- chunksize=chunksize)
- @pytest.mark.parametrize("chunksize", [None, 1, 2])
- def test_readjson_chunks_multiple_empty_lines(chunksize):
- j = """
- {"A":1,"B":4}
- {"A":2,"B":5}
- {"A":3,"B":6}
- """
- orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
- test = pd.read_json(j, lines=True, chunksize=chunksize)
- if chunksize is not None:
- test = pd.concat(test)
- tm.assert_frame_equal(
- orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
|