123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580 |
- # -*- coding: utf-8 -*-
- """
- Tests the 'read_fwf' function in parsers.py. This
- test suite is independent of the others because the
- engine is set to 'python-fwf' internally.
- """
- from datetime import datetime
- import numpy as np
- import pytest
- import pandas.compat as compat
- from pandas.compat import BytesIO, StringIO
- import pandas as pd
- from pandas import DataFrame, DatetimeIndex
- import pandas.util.testing as tm
- from pandas.io.parsers import EmptyDataError, read_csv, read_fwf
def test_basic():
    """Column boundaries are inferred from whitespace when none are given."""
    # Every column is separated by a run of spaces that is blank in *all*
    # rows (header included); that is what the inference relies on.
    data = """\
A         B            C            D
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(data))
    expected = DataFrame([[201158, 360.242940, 149.910199, 11950.7],
                          [201159, 444.953632, 166.985655, 11788.4],
                          [201160, 364.136849, 183.628767, 11806.2],
                          [201161, 413.836124, 184.375703, 11916.8],
                          [201162, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D"])
    tm.assert_frame_equal(result, expected)
def test_colspecs():
    """Explicit ``colspecs`` split adjacent fields that share no separator."""
    # The first six characters hold two fields: (0, 4) -> 2011, (4, 8) -> 58.
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(data), colspecs=colspecs)

    expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
                          [2011, 59, 444.953632, 166.985655, 11788.4],
                          [2011, 60, 364.136849, 183.628767, 11806.2],
                          [2011, 61, 413.836124, 184.375703, 11916.8],
                          [2011, 62, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)
def test_widths():
    """``widths`` describes contiguous fields by length instead of offsets."""
    # Field lengths in every line are 5, 5, 13, 13 and 7 characters.
    data = """\
A    B    C            D            E
2011 58   360.242940   149.910199   11950.7
2011 59   444.953632   166.985655   11788.4
2011 60   364.136849   183.628767   11806.2
2011 61   413.836124   184.375703   11916.8
2011 62   502.953953   173.237159   12468.3
"""
    result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7])

    expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
                          [2011, 59, 444.953632, 166.985655, 11788.4],
                          [2011, 60, 364.136849, 183.628767, 11806.2],
                          [2011, 61, 413.836124, 184.375703, 11916.8],
                          [2011, 62, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)
def test_non_space_filler():
    """A non-space filler character is stripped when passed as ``delimiter``."""
    # From Thomas Kluyver:
    #
    # Apparently, some non-space filler characters can be seen, this is
    # supported by specifying the 'delimiter' character:
    #
    # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html
    data = """\
A~~~~B~~~~C~~~~~~~~~~~~D~~~~~~~~~~~~E
201158~~~~360.242940~~~149.910199~~~11950.7
201159~~~~444.953632~~~166.985655~~~11788.4
201160~~~~364.136849~~~183.628767~~~11806.2
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~")

    expected = DataFrame([[2011, 58, 360.242940, 149.910199, 11950.7],
                          [2011, 59, 444.953632, 166.985655, 11788.4],
                          [2011, 60, 364.136849, 183.628767, 11806.2],
                          [2011, 61, 413.836124, 184.375703, 11916.8],
                          [2011, 62, 502.953953, 173.237159, 12468.3]],
                         columns=["A", "B", "C", "D", "E"])
    tm.assert_frame_equal(result, expected)
def test_over_specified():
    """Passing both ``colspecs`` and ``widths`` raises ``ValueError``."""
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]

    with pytest.raises(ValueError, match="must specify only one of"):
        read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7])
def test_under_specified():
    """Passing neither ``colspecs`` nor ``widths`` raises ``ValueError``."""
    data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    # Explicit ``None`` for both disables the default "infer" behaviour.
    with pytest.raises(ValueError, match="Must specify either"):
        read_fwf(StringIO(data), colspecs=None, widths=None)
def test_read_csv_compat():
    """``read_fwf`` yields the same frame as ``read_csv`` on equivalent data."""
    csv_data = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_data), engine="python")

    # Same values laid out in fixed-width form matching ``colspecs`` below.
    fwf_data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
    tm.assert_frame_equal(result, expected)
def test_bytes_io_input():
    """UTF-8 bytes input is decoded before being sliced into fields."""
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    # Widths count decoded characters, not encoded bytes.
    result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')),
                      widths=[2, 2], encoding="utf8")
    expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
    tm.assert_frame_equal(result, expected)
def test_fwf_colspecs_is_list_or_tuple():
    """A ``colspecs`` value that is not a list or tuple raises ``TypeError``."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    msg = "column specifications must be a list or tuple.+"

    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",")
def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
    """A malformed entry inside ``colspecs`` raises ``TypeError``."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    msg = "Each column specification must be.+"

    # ``colspecs`` is passed positionally here.
    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), [("a", 1)])
@pytest.mark.parametrize("colspecs,exp_data", [
    ([(0, 3), (3, None)], [[123, 456], [456, 789]]),
    ([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
    ([(0, None), (3, None)], [[123456, 456], [456789, 789]]),
    ([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
])
def test_fwf_colspecs_none(colspecs, exp_data):
    """``None`` as a colspec bound leaves that side of the slice open."""
    # see gh-7079
    data = """\
123456
456789
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("infer_nrows,exp_data", [
    # infer_nrows --> colspec == [(2, 3), (5, 6)]
    (1, [[1, 2], [3, 8]]),

    # infer_nrows > number of rows
    (10, [[1, 2], [123, 98]]),
])
def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
    """``infer_nrows`` limits how many rows are used to infer the colspecs."""
    # see gh-15138
    #
    # The first row is right-aligned: "1" sits at offset 2 and "2" at
    # offset 5, which is what produces colspec [(2, 3), (5, 6)] above.
    data = """\
  1  2
123 98
"""
    expected = DataFrame(exp_data)

    result = read_fwf(StringIO(data), infer_nrows=infer_nrows, header=None)
    tm.assert_frame_equal(result, expected)
def test_fwf_regression():
    """Column labels such as "T060" are kept as plain labels (gh-3594)."""
    # see gh-3594
    #
    # Turns out "T060" is parsable as a datetime slice!
    tz_list = [1, 10, 20, 30, 60, 80, 100]
    widths = [16] + [8] * len(tz_list)
    names = ["SST"] + ["T%03d" % z for z in tz_list[1:]]

    # One 16-character timestamp field followed by seven 8-character fields.
    data = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
  2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
  2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
  2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
  2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

    result = read_fwf(StringIO(data), index_col=0, header=None, names=names,
                      widths=widths, parse_dates=True,
                      date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"))
    expected = DataFrame([
        [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192],
        [9.5435, 9.2010, 8.6167, 7.8176, 6.0804, 5.8728, 5.4869],
        [9.5873, 9.1326, 8.4694, 7.5889, 6.0422, 5.8526, 5.4657],
        [9.5810, 9.0896, 8.4009, 7.4652, 6.0322, 5.8189, 5.4379],
        [9.6034, 9.0897, 8.3822, 7.4905, 6.0908, 5.7904, 5.4039],
    ], index=DatetimeIndex(["2009-06-13 20:20:00", "2009-06-13 20:30:00",
                            "2009-06-13 20:40:00", "2009-06-13 20:50:00",
                            "2009-06-13 21:00:00"]),
        columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"])
    tm.assert_frame_equal(result, expected)
def test_fwf_for_uint8():
    """Per-column converters are applied and a blank field stays missing."""
    # The second record has no DST field; its slot is blank padding so the
    # SRC and data fields stay at the offsets given in ``colspecs``.
    data = """1421302965.213420    PRI=3 PGN=0xef00      DST=0x17 SRC=0x28    04 154 00 00 00 00 00 127
1421302964.226776    PRI=6 PGN=0xf002               SRC=0x47    243 00 00 255 247 00 00 71"""  # noqa

    df = read_fwf(StringIO(data),
                  colspecs=[(0, 17), (25, 26), (33, 37),
                            (49, 51), (58, 62), (63, 1000)],
                  names=["time", "pri", "pgn", "dst", "src", "data"],
                  converters={
                      "pgn": lambda x: int(x, 16),
                      "src": lambda x: int(x, 16),
                      "dst": lambda x: int(x, 16),
                      "data": lambda x: len(x.split(" "))})

    expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8],
                          [1421302964.226776, 6, 61442, None, 71, 8]],
                         columns=["time", "pri", "pgn",
                                  "dst", "src", "data"])
    expected["dst"] = expected["dst"].astype(object)
    tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("comment", ["#", "~", "!"])
def test_fwf_comment(comment):
    """Text after the ``comment`` character is dropped from the row."""
    data = """\
  1   2.   4  #hello world
  5  NaN  10.0
"""
    # Swap in the comment character under test.
    data = data.replace("#", comment)

    colspecs = [(0, 3), (4, 9), (9, 25)]
    expected = DataFrame([[1, 2., 4], [5, np.nan, 10.]])

    result = read_fwf(StringIO(data), colspecs=colspecs,
                      header=None, comment=comment)
    tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("thousands", [",", "#", "~"])
def test_fwf_thousands(thousands):
    """The ``thousands`` separator is removed before numeric conversion."""
    data = """\
 1 2,334.0    5
10   13     10.
"""
    # Swap in the thousands separator under test.
    data = data.replace(",", thousands)

    colspecs = [(0, 3), (3, 11), (12, 16)]
    expected = DataFrame([[1, 2334., 5], [10, 13, 10.]])

    result = read_fwf(StringIO(data), header=None,
                      colspecs=colspecs, thousands=thousands)
    tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(header):
    """A boolean ``header`` argument is rejected with ``TypeError``."""
    # see gh-6114
    data = """\
MyColumn
   a
   b
   a
   b"""

    msg = "Passing a bool to header is invalid"
    with pytest.raises(TypeError, match=msg):
        read_fwf(StringIO(data), header=header)
def test_full_file():
    """Inferred colspecs match hand-written ones when no value is missing."""
    # File with all values.
    test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00  1.04791624281    -4  bar
2000-01-05T00:00:00  0.498580885705   73  baz
2000-01-06T00:00:00  1.12020151869     1  foo
2000-01-07T00:00:00  0.487094399463    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
2000-01-11T00:00:00  0.157160753327   34  foo"""
    colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)
def test_full_file_with_missing():
    """Inferred colspecs match hand-written ones when fields are blank."""
    # File with missing values.
    test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00  1.04791624281    -4  bar
                     0.498580885705   73  baz
2000-01-06T00:00:00  1.12020151869     1  foo
2000-01-07T00:00:00                    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
                                      34"""
    colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)
def test_full_file_with_spaces():
    """Values containing single spaces stay in one inferred column."""
    # File with spaces in columns.
    test = """
Account                 Name  Balance     CreditLimit   AccountCreated
101     Keanu Reeves          9315.45     10000.00           1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868     Jennifer Love Hewitt  0           17000.00           5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65      5000.00             2/5/2007
""".strip("\r\n")
    colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)
def test_full_file_with_spaces_and_missing():
    """Inference copes with embedded spaces and blank fields together."""
    # File with spaces and missing values in columns.
    test = """
Account                 Name  Balance     CreditLimit   AccountCreated
101                                       10000.00           1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868                                                          5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65
""".strip("\r\n")
    colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)
def test_messed_up_data():
    """Inference matches explicit colspecs on ragged, misplaced values."""
    # Completely messed up file.
    #
    # Only newlines are stripped below, so the leading spaces of the first
    # line are part of the fixture.
    test = """
   Account          Name             Balance     Credit Limit   Account Created
       101                           10000.00       1/17/1998
       312     Gerard Butler         90.00       1000.00
       761     Jada Pinkett-Smith    49654.87    100000.00            12/5/2006
  317          Bill Murray           789.65
""".strip("\r\n")
    colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
    expected = read_fwf(StringIO(test), colspecs=colspecs)

    result = read_fwf(StringIO(test))
    tm.assert_frame_equal(result, expected)
def test_multiple_delimiters():
    """Several filler characters can be given together in ``delimiter``."""
    # Raw string: the three backslashes in the second data row are filler.
    test = r"""
col1~~~~~col2  col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
  33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01   baz~~Jennifer Love Hewitt
~~55       11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar           Bill Murray
""".strip("\r\n")
    delimiter = " +~.\\"
    colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
    expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter)

    result = read_fwf(StringIO(test), delimiter=delimiter)
    tm.assert_frame_equal(result, expected)
def test_variable_width_unicode():
    """Colspec inference counts decoded characters for non-ASCII text."""
    if not compat.PY3:
        pytest.skip("Bytes-related test - only needs to work on Python 3")

    # Two columns at character offsets (0, 4) and (5, 9); shorter values
    # are padded with spaces up to the start of the second column.
    data = """
שלום שלום
ום   שלל
של   ום
""".strip("\r\n")
    encoding = "utf8"
    kwargs = dict(header=None, encoding=encoding)

    expected = read_fwf(BytesIO(data.encode(encoding)),
                        colspecs=[(0, 4), (5, 9)], **kwargs)
    result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [
    dict(), {"a": "float64", "b": str, "c": "int32"}
])
def test_dtype(dtype):
    """The ``dtype`` mapping is applied to each named column."""
    data = """ a    b    c
1    2    3.2
3    4    5.2
"""
    colspecs = [(0, 5), (5, 10), (10, None)]
    result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)

    expected = pd.DataFrame({
        "a": [1, 3], "b": [2, 4],
        "c": [3.2, 5.2]}, columns=["a", "b", "c"])

    # Cast the baseline frame the same way the reader was asked to.
    for col, dt in dtype.items():
        expected[col] = expected[col].astype(dt)

    tm.assert_frame_equal(result, expected)
def test_skiprows_inference():
    """Rows removed by an integer ``skiprows`` are ignored by inference."""
    # see gh-11256
    #
    # The free-text line and the blank line after it are the two skipped
    # rows; the column header is the third line.
    data = """
Text contained in the file header

DataCol1   DataCol2
     0.0        1.0
   101.6      956.1
""".strip()
    skiprows = 2

    # sep=r"\s+" is the documented equivalent of delim_whitespace=True.
    expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+")

    result = read_fwf(StringIO(data), skiprows=skiprows)
    tm.assert_frame_equal(result, expected)
def test_skiprows_by_index_inference():
    """Rows removed by a list-valued ``skiprows`` are ignored by inference."""
    # Rows 0 and 2 are free text; row 1 becomes the header.
    data = """
To be skipped
Not  To  Be  Skipped
Once more to be skipped
123  34   8      123
456  78   9      456
""".strip()
    skiprows = [0, 2]

    # sep=r"\s+" is the documented equivalent of delim_whitespace=True.
    expected = read_csv(StringIO(data), skiprows=skiprows, sep=r"\s+")

    result = read_fwf(StringIO(data), skiprows=skiprows)
    tm.assert_frame_equal(result, expected)
def test_skiprows_inference_empty():
    """Skipping every row leaves nothing to infer from and raises."""
    data = """
AA   BBB  C
12   345  6
78   901  2
""".strip()

    # After ``strip`` the data has exactly three lines, all of them skipped.
    msg = "No rows from which to infer column width"
    with pytest.raises(EmptyDataError, match=msg):
        read_fwf(StringIO(data), skiprows=3)
def test_whitespace_preservation():
    """Spaces inside fields survive when the delimiter excludes the space."""
    # see gh-16772
    header = None
    csv_data = """
 a ,bbb
 cc,dd """

    # Each row is two 3-character fields; the spaces are part of the values.
    fwf_data = """
 a bbb
 ccdd """
    result = read_fwf(StringIO(fwf_data), widths=[3, 3],
                      header=header, skiprows=[0], delimiter="\n\t")
    expected = read_csv(StringIO(csv_data), header=header)
    tm.assert_frame_equal(result, expected)
def test_default_delimiter():
    """With no ``delimiter`` given, spaces and tabs are stripped from fields."""
    header = None
    csv_data = """
a,bbb
cc,dd"""

    # "\t" below is a real tab character inside the 3-character first field.
    fwf_data = """
a \tbbb
cc\tdd """
    result = read_fwf(StringIO(fwf_data), widths=[3, 3],
                      header=header, skiprows=[0])
    expected = read_csv(StringIO(csv_data), header=header)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("infer", [True, False, None])
def test_fwf_compression(compression_only, infer):
    """A compressed file parses to the same frame as the plain text."""
    data = """1111111111
    2222222222
    3333333333""".strip()

    compression = compression_only
    # Only gzip needs its name mapped to a file extension here.
    extension = "gz" if compression == "gzip" else compression

    kwargs = dict(widths=[5, 5], names=["one", "two"])
    expected = read_fwf(StringIO(data), **kwargs)

    if compat.PY3:
        data = bytes(data, encoding="utf-8")

    with tm.ensure_clean(filename="tmp." + extension) as path:
        tm.write_to_compressed(compression, path, data)

        # infer=None leaves ``compression`` at the reader's default.
        if infer is not None:
            kwargs["compression"] = "infer" if infer else compression

        result = read_fwf(path, **kwargs)
        tm.assert_frame_equal(result, expected)
|