dayuan
/
manyi


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
							# -*- coding: utf-8 -*-

from __future__ import print_function

from datetime import datetime, timedelta
import re
import sys
import textwrap

import numpy as np
import pytest

from pandas.compat import PYPY, StringIO, lrange, u

import pandas as pd
from pandas import (
    Categorical, DataFrame, Series, compat, date_range, option_context,
    period_range)
from pandas.tests.frame.common import TestData
import pandas.util.testing as tm

import pandas.io.formats.format as fmt

# Segregated collection of methods that require the BlockManager internal data
# structure


class TestDataFrameReprInfoEtc(TestData):

    def test_repr_empty(self):
        # empty
        foo = repr(self.empty)  # noqa

        # empty with index
        frame = DataFrame(index=np.arange(1000))
        foo = repr(frame)  # noqa

    def test_repr_mixed(self):
        buf = StringIO()

        # mixed
        foo = repr(self.mixed_frame)  # noqa
        self.mixed_frame.info(verbose=False, buf=buf)

    @pytest.mark.slow
    def test_repr_mixed_big(self):
        # big mixed
        biggie = DataFrame({'A': np.random.randn(200),
                            'B': tm.makeStringIndex(200)},
                           index=lrange(200))
        biggie.loc[:20, 'A'] = np.nan
        biggie.loc[:20, 'B'] = np.nan

        foo = repr(biggie)  # noqa

    def test_repr(self):
        buf = StringIO()

        # small one
        foo = repr(self.frame)
        self.frame.info(verbose=False, buf=buf)

        # even smaller
        self.frame.reindex(columns=['A']).info(verbose=False, buf=buf)
        self.frame.reindex(columns=['A', 'B']).info(verbose=False, buf=buf)

        # exhausting cases in DataFrame.info

        # columns but no index
        no_index = DataFrame(columns=[0, 1, 3])
        foo = repr(no_index)  # noqa

        # no columns or index
        self.empty.info(buf=buf)

        df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
        assert "\t" not in repr(df)
        assert "\r" not in repr(df)
        assert "a\n" not in repr(df)

    def test_repr_dimensions(self):
        df = DataFrame([[1, 2, ], [3, 4]])
        with option_context('display.show_dimensions', True):
            assert "2 rows x 2 columns" in repr(df)

        with option_context('display.show_dimensions', False):
            assert "2 rows x 2 columns" not in repr(df)

        with option_context('display.show_dimensions', 'truncate'):
            assert "2 rows x 2 columns" not in repr(df)

    @pytest.mark.slow
    def test_repr_big(self):
        # big one
        biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4),
                           index=lrange(200))
        repr(biggie)

    def test_repr_unsortable(self):
        # columns are not sortable
        import warnings
        warn_filters = warnings.filters
        warnings.filterwarnings('ignore',
                                category=FutureWarning,
                                module=".*format")

        unsortable = DataFrame({'foo': [1] * 50,
                                datetime.today(): [1] * 50,
                                'bar': ['bar'] * 50,
                                datetime.today() + timedelta(1): ['bar'] * 50},
                               index=np.arange(50))
        repr(unsortable)

        fmt.set_option('display.precision', 3, 'display.column_space', 10)
        repr(self.frame)

        fmt.set_option('display.max_rows', 10, 'display.max_columns', 2)
        repr(self.frame)

        fmt.set_option('display.max_rows', 1000, 'display.max_columns', 1000)
        repr(self.frame)

        tm.reset_display_options()

        warnings.filters = warn_filters

    def test_repr_unicode(self):
        uval = u('\u03c3\u03c3\u03c3\u03c3')

        # TODO(wesm): is this supposed to be used?
        bval = uval.encode('utf-8')  # noqa

        df = DataFrame({'A': [uval, uval]})

        result = repr(df)
        ex_top = '      A'
        assert result.split('\n')[0].rstrip() == ex_top

        df = DataFrame({'A': [uval, uval]})
        result = repr(df)
        assert result.split('\n')[0].rstrip() == ex_top

    def test_unicode_string_with_unicode(self):
        df = DataFrame({'A': [u("\u05d0")]})

        if compat.PY3:
            str(df)
        else:
            compat.text_type(df)

    def test_bytestring_with_unicode(self):
        df = DataFrame({'A': [u("\u05d0")]})
        if compat.PY3:
            bytes(df)
        else:
            str(df)

    def test_very_wide_info_repr(self):
        df = DataFrame(np.random.randn(10, 20),
                       columns=tm.rands_array(10, 20))
        repr(df)

    def test_repr_column_name_unicode_truncation_bug(self):
        # #1906
        df = DataFrame({'Id': [7117434],
                        'StringCol': ('Is it possible to modify drop plot code'
                                      ' so that the output graph is displayed '
                                      'in iphone simulator, Is it possible to '
                                      'modify drop plot code so that the '
                                      'output graph is \xe2\x80\xa8displayed '
                                      'in iphone simulator.Now we are adding '
                                      'the CSV file externally. I want to Call'
                                      ' the File through the code..')})

        with option_context('display.max_columns', 20):
            assert 'StringCol' in repr(df)

    def test_latex_repr(self):
        result = r"""\begin{tabular}{llll}
\toprule
{} &         0 &  1 &  2 \\
\midrule
0 &  $\alpha$ &  b &  c \\
1 &         1 &  2 &  3 \\
\bottomrule
\end{tabular}
"""
        with option_context("display.latex.escape", False,
                            'display.latex.repr', True):
            df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]])
            assert result == df._repr_latex_()

        # GH 12182
        assert df._repr_latex_() is None

    def test_info(self):
        io = StringIO()
        self.frame.info(buf=io)
        self.tsframe.info(buf=io)

        frame = DataFrame(np.random.randn(5, 3))

        frame.info()
        frame.info(verbose=False)

    def test_info_memory(self):
        # https://github.com/pandas-dev/pandas/issues/21056
        df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
        buf = StringIO()
        df.info(buf=buf)
        result = buf.getvalue()
        bytes = float(df.memory_usage().sum())

        expected = textwrap.dedent("""\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
        a    2 non-null int64
        dtypes: int64(1)
        memory usage: {} bytes
        """.format(bytes))

        assert result == expected

    def test_info_wide(self):
        from pandas import set_option, reset_option
        io = StringIO()
        df = DataFrame(np.random.randn(5, 101))
        df.info(buf=io)

        io = StringIO()
        df.info(buf=io, max_cols=101)
        rs = io.getvalue()
        assert len(rs.splitlines()) > 100
        xp = rs

        set_option('display.max_info_columns', 101)
        io = StringIO()
        df.info(buf=io)
        assert rs == xp
        reset_option('display.max_info_columns')

    def test_info_duplicate_columns(self):
        io = StringIO()

        # it works!
        frame = DataFrame(np.random.randn(1500, 4),
                          columns=['a', 'a', 'b', 'b'])
        frame.info(buf=io)

    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()

        frame = DataFrame([[1, 2.0]],
                          columns=['a', 'a'])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        assert 'a    1 non-null int64\n' == lines[3]
        assert 'a    1 non-null float64\n' == lines[4]

    def test_info_shows_column_dtypes(self):
        dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                  'complex128', 'object', 'bool']
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()
        df.info(buf=buf)
        res = buf.getvalue()
        for i, dtype in enumerate(dtypes):
            name = '%d    %d non-null %s' % (i, n, dtype)
            assert name in res

    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (10, True)]:
            # For verbose always      ^ setting  ^ summarize ^ full output
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_

        for len_, verbose in [(10, None), (5, False), (10, True)]:

            # max_cols no exceeded
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_

        for len_, max_cols in [(10, 5), (5, 4)]:
            # setting truncates
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_

            # setting wouldn't truncate
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                assert len(res.strip().split('\n')) == len_

    def test_info_memory_usage(self):
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                  'complex128', 'object', 'bool']
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        buf = StringIO()

        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert "memory usage: " in res[-1]

        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        assert "memory usage: " not in res[-1]

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # memory usage is a lower bound, so print it as XYZ+ MB
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()

        # excluded column with object dtype, so estimate is accurate
        assert not re.match(r"memory usage: [^+]+\+", res[-1])

        # Test a DataFrame with duplicate columns
        dtypes = ['int64', 'int64', 'int64', 'float64']
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        df.columns = dtypes

        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+\+", res[-1])

        df_with_object_index.info(buf=buf, memory_usage='deep')
        res = buf.getvalue().splitlines()
        assert re.match(r"memory usage: [^+]+$", res[-1])

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        assert df_size == exp_size

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        assert size_df == np.size(df.memory_usage())

        # assert deep works only on object
        assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

        # test for validity
        DataFrame(1, index=['a'], columns=['A']
                  ).memory_usage(index=True)
        DataFrame(1, index=['a'], columns=['A']
                  ).index.nbytes
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product(
                [['a'], range(1000)]),
            columns=['A']
        )
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        mem = df.memory_usage(deep=True).sum()
        assert mem > 0

    @pytest.mark.skipif(PYPY,
                        reason="on PyPy deep=True doesn't change result")
    def test_info_memory_usage_deep_not_pypy(self):
        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        assert (df_with_object_index.memory_usage(
                index=True, deep=True).sum() >
                df_with_object_index.memory_usage(
                    index=True).sum())

        df_object = pd.DataFrame({'a': ['a']})
        assert (df_object.memory_usage(deep=True).sum() >
                df_object.memory_usage().sum())

    @pytest.mark.skipif(not PYPY,
                        reason="on PyPy deep=True does not change result")
    def test_info_memory_usage_deep_pypy(self):
        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        assert (df_with_object_index.memory_usage(
                index=True, deep=True).sum() ==
                df_with_object_index.memory_usage(
                    index=True).sum())

        df_object = pd.DataFrame({'a': ['a']})
        assert (df_object.memory_usage(deep=True).sum() ==
                df_object.memory_usage().sum())

    @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
    def test_usage_via_getsizeof(self):
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product(
                [['a'], range(1000)]),
            columns=['A']
        )
        mem = df.memory_usage(deep=True).sum()
        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = mem - sys.getsizeof(df)
        assert abs(diff) < 100

    def test_info_memory_usage_qualified(self):

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=[1, 2, 3])
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=list('ABC'))
        df.info(buf=buf)
        assert '+' in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), range(3)]))
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), ['foo', 'bar']]))
        df.info(buf=buf)
        assert '+' in buf.getvalue()

    def test_info_memory_usage_bug_on_multiindex(self):
        # GH 14308
        # memory usage introspection should not materialize .values

        from string import ascii_uppercase as uppercase

        def memory_usage(f):
            return f.memory_usage(deep=True).sum()

        N = 100
        M = len(uppercase)
        index = pd.MultiIndex.from_product([list(uppercase),
                                            pd.date_range('20160101',
                                                          periods=N)],
                                           names=['id', 'date'])
        df = DataFrame({'value': np.random.randn(N * M)}, index=index)

        unstacked = df.unstack('id')
        assert df.values.nbytes == unstacked.values.nbytes
        assert memory_usage(df) > memory_usage(unstacked)

        # high upper bound
        assert memory_usage(unstacked) - memory_usage(df) < 2000

    def test_info_categorical(self):
        # GH14298
        idx = pd.CategoricalIndex(['a', 'b'])
        df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)

        buf = StringIO()
        df.info(buf=buf)

    def test_info_categorical_column(self):

        # make sure it works
        n = 2500
        df = DataFrame({'int64': np.random.randint(100, size=n)})
        df['category'] = Series(np.array(list('abcdefghij')).take(
            np.random.randint(0, 10, size=n))).astype('category')
        df.isna()
        buf = StringIO()
        df.info(buf=buf)

        df2 = df[df['category'] == 'd']
        buf = compat.StringIO()
        df2.info(buf=buf)

    def test_repr_categorical_dates_periods(self):
        # normal DataFrame
        dt = date_range('2011-01-01 09:00', freq='H', periods=5,
                        tz='US/Eastern')
        p = period_range('2011-01', freq='M', periods=5)
        df = DataFrame({'dt': dt, 'p': p})
        exp = """                         dt        p
0 2011-01-01 09:00:00-05:00  2011-01
1 2011-01-01 10:00:00-05:00  2011-02
2 2011-01-01 11:00:00-05:00  2011-03
3 2011-01-01 12:00:00-05:00  2011-04
4 2011-01-01 13:00:00-05:00  2011-05"""

        df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
        assert repr(df) == exp