test_repr_info.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. from datetime import datetime, timedelta
  4. import re
  5. import sys
  6. import textwrap
  7. import numpy as np
  8. import pytest
  9. from pandas.compat import PYPY, StringIO, lrange, u
  10. import pandas as pd
  11. from pandas import (
  12. Categorical, DataFrame, Series, compat, date_range, option_context,
  13. period_range)
  14. from pandas.tests.frame.common import TestData
  15. import pandas.util.testing as tm
  16. import pandas.io.formats.format as fmt
  17. # Segregated collection of methods that require the BlockManager internal data
  18. # structure
  19. class TestDataFrameReprInfoEtc(TestData):
  def test_repr_empty(self):
      """Smoke-test ``repr`` on frames that carry no column data."""
      # the fixture's empty frame
      repr(self.empty)

      # a frame that has an index of 1000 entries but no columns
      index_only = DataFrame(index=np.arange(1000))
      repr(index_only)
  def test_repr_mixed(self):
      """Smoke-test ``repr`` and non-verbose ``info`` on the mixed fixture."""
      sink = StringIO()
      mixed = self.mixed_frame

      repr(mixed)
      mixed.info(verbose=False, buf=sink)
  @pytest.mark.slow
  def test_repr_mixed_big(self):
      """Smoke-test ``repr`` on a 200-row float/string frame with NaNs."""
      n_rows = 200
      big = DataFrame({'A': np.random.randn(n_rows),
                       'B': tm.makeStringIndex(n_rows)},
                      index=lrange(n_rows))

      # blank out the leading rows (labels up to 20) in both columns
      for column in ('A', 'B'):
          big.loc[:20, column] = np.nan

      repr(big)
  def test_repr(self):
      """Exercise ``repr``/``info`` on small frames and check escaping.

      Most calls are smoke checks; the only assertions verify that raw
      tab, carriage-return and newline characters from labels and values
      do not appear unescaped in the rendered frame.
      """
      sink = StringIO()

      # the fixture frame itself
      repr(self.frame)
      self.frame.info(verbose=False, buf=sink)

      # progressively narrower selections of it
      for subset in (['A'], ['A', 'B']):
          self.frame.reindex(columns=subset).info(verbose=False, buf=sink)

      # exhausting cases in DataFrame.info
      # columns but no index
      repr(DataFrame(columns=[0, 1, 3]))

      # no columns or index
      self.empty.info(buf=sink)

      df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
      rendered = repr(df)
      for raw in ("\t", "\r", "a\n"):
          assert raw not in rendered
  58. def test_repr_dimensions(self):
  59. df = DataFrame([[1, 2, ], [3, 4]])
  60. with option_context('display.show_dimensions', True):
  61. assert "2 rows x 2 columns" in repr(df)
  62. with option_context('display.show_dimensions', False):
  63. assert "2 rows x 2 columns" not in repr(df)
  64. with option_context('display.show_dimensions', 'truncate'):
  65. assert "2 rows x 2 columns" not in repr(df)
  @pytest.mark.slow
  def test_repr_big(self):
      """Smoke-test ``repr`` on a 200 x 4 frame of zeros."""
      n_rows, n_cols = 200, 4
      zeros = np.zeros((n_rows, n_cols))
      big = DataFrame(zeros, index=lrange(n_rows), columns=lrange(n_cols))
      repr(big)
  def test_repr_unsortable(self):
      """Smoke-test ``repr`` with unsortable columns and varied options.

      The first frame mixes ``str`` and ``datetime`` column labels, which
      cannot be sorted against each other. The fixture frame is then
      rendered under several display-option combinations.
      """
      import warnings

      # catch_warnings() snapshots the filter list and restores it on exit.
      # Keeping a reference to ``warnings.filters`` and assigning it back
      # restores nothing, because filterwarnings() mutates that same list.
      with warnings.catch_warnings():
          warnings.filterwarnings('ignore',
                                  category=FutureWarning,
                                  module=".*format")

          # columns are not sortable
          unsortable = DataFrame({'foo': [1] * 50,
                                  datetime.today(): [1] * 50,
                                  'bar': ['bar'] * 50,
                                  datetime.today() + timedelta(1): ['bar'] * 50},
                                 index=np.arange(50))
          repr(unsortable)

          try:
              fmt.set_option('display.precision', 3,
                             'display.column_space', 10)
              repr(self.frame)

              fmt.set_option('display.max_rows', 10,
                             'display.max_columns', 2)
              repr(self.frame)

              fmt.set_option('display.max_rows', 1000,
                             'display.max_columns', 1000)
              repr(self.frame)
          finally:
              # undo the option changes even when one of the reprs raises
              tm.reset_display_options()
  def test_repr_unicode(self):
      """Check the header line of a frame holding non-ASCII strings."""
      uval = u('\u03c3\u03c3\u03c3\u03c3')

      # TODO(wesm): is this supposed to be used?
      bval = uval.encode('utf-8')  # noqa

      df = DataFrame({'A': [uval, uval]})
      header = repr(df).split('\n')[0].rstrip()

      # The header 'A' is right-aligned over the 4-character values, which
      # sit after the 1-character index and a 2-space gap, so six spaces
      # precede it. Spelled with an explicit width so the padding cannot be
      # lost to whitespace normalisation.
      ex_top = ' ' * 6 + 'A'
      assert header == ex_top
  def test_unicode_string_with_unicode(self):
      """Smoke-test text conversion of a frame holding a non-ASCII value."""
      df = DataFrame({'A': [u("\u05d0")]})
      # ``str`` on Python 3, the compat text type otherwise
      to_text = str if compat.PY3 else compat.text_type
      to_text(df)
  def test_bytestring_with_unicode(self):
      """Smoke-test byte conversion of a frame holding a non-ASCII value."""
      df = DataFrame({'A': [u("\u05d0")]})
      # ``bytes`` on Python 3, ``str`` otherwise
      to_bytes = bytes if compat.PY3 else str
      to_bytes(df)
  def test_very_wide_info_repr(self):
      """Smoke-test ``repr`` on a 10 x 20 frame with random string labels."""
      labels = tm.rands_array(10, 20)
      values = np.random.randn(10, 20)
      repr(DataFrame(values, columns=labels))
  120. def test_repr_column_name_unicode_truncation_bug(self):
  121. # #1906
  122. df = DataFrame({'Id': [7117434],
  123. 'StringCol': ('Is it possible to modify drop plot code'
  124. ' so that the output graph is displayed '
  125. 'in iphone simulator, Is it possible to '
  126. 'modify drop plot code so that the '
  127. 'output graph is \xe2\x80\xa8displayed '
  128. 'in iphone simulator.Now we are adding '
  129. 'the CSV file externally. I want to Call'
  130. ' the File through the code..')})
  131. with option_context('display.max_columns', 20):
  132. assert 'StringCol' in repr(df)
  def test_latex_repr(self):
      """Check ``_repr_latex_`` output and that it is off by default."""
      # Column padding is significant: each cell is padded to the width of
      # the widest entry in its column (`` $\alpha$`` for column 0), so the
      # lines are spelled out one by one instead of in a block literal.
      expected_lines = [
          r'\begin{tabular}{llll}',
          r'\toprule',
          r'{} &         0 &  1 &  2 \\',
          r'\midrule',
          r'0 &  $\alpha$ &  b &  c \\',
          r'1 &         1 &  2 &  3 \\',
          r'\bottomrule',
          r'\end{tabular}',
      ]
      result = '\n'.join(expected_lines) + '\n'

      with option_context("display.latex.escape", False,
                          'display.latex.repr', True):
          df = DataFrame([[r'$\alpha$', 'b', 'c'], [1, 2, 3]])
          assert result == df._repr_latex_()

      # GH 12182
      # outside the context the latex repr option is back to its prior
      # value, and no LaTeX repr is produced
      assert df._repr_latex_() is None
  def test_info(self):
      """Smoke-test ``info`` with a buffer and with default output."""
      sink = StringIO()
      for fixture in (self.frame, self.tsframe):
          fixture.info(buf=sink)

      # no buffer given: output goes to the default destination
      random_frame = DataFrame(np.random.randn(5, 3))
      random_frame.info()
      random_frame.info(verbose=False)
  def test_info_memory(self):
      """Check the full ``info`` text, including the memory-usage line."""
      # https://github.com/pandas-dev/pandas/issues/21056
      df = pd.DataFrame({'a': pd.Series([1, 2], dtype='i8')})
      buf = StringIO()
      df.info(buf=buf)
      result = buf.getvalue()

      # named ``n_bytes`` so the ``bytes`` builtin is not shadowed
      n_bytes = float(df.memory_usage().sum())

      # The column line pads the name to (longest name + 4) characters, so
      # four spaces separate ``a`` from the count.
      expected = textwrap.dedent("""\
      <class 'pandas.core.frame.DataFrame'>
      RangeIndex: 2 entries, 0 to 1
      Data columns (total 1 columns):
      a    2 non-null int64
      dtypes: int64(1)
      memory usage: {} bytes
      """.format(n_bytes))

      assert result == expected
  172. def test_info_wide(self):
  173. from pandas import set_option, reset_option
  174. io = StringIO()
  175. df = DataFrame(np.random.randn(5, 101))
  176. df.info(buf=io)
  177. io = StringIO()
  178. df.info(buf=io, max_cols=101)
  179. rs = io.getvalue()
  180. assert len(rs.splitlines()) > 100
  181. xp = rs
  182. set_option('display.max_info_columns', 101)
  183. io = StringIO()
  184. df.info(buf=io)
  185. assert rs == xp
  186. reset_option('display.max_info_columns')
  187. def test_info_duplicate_columns(self):
  188. io = StringIO()
  189. # it works!
  190. frame = DataFrame(np.random.randn(1500, 4),
  191. columns=['a', 'a', 'b', 'b'])
  192. frame.info(buf=io)
  def test_info_duplicate_columns_shows_correct_dtypes(self):
      """Check each duplicate-named column is listed with its own dtype."""
      # GH11761
      io = StringIO()

      frame = DataFrame([[1, 2.0]],
                        columns=['a', 'a'])
      frame.info(buf=io)
      io.seek(0)
      lines = io.readlines()

      # The column name is padded to (longest name + 4) characters, so the
      # gap is built with an explicit width.
      gap = ' ' * 4
      assert 'a' + gap + '1 non-null int64\n' == lines[3]
      assert 'a' + gap + '1 non-null float64\n' == lines[4]
  def test_info_shows_column_dtypes(self):
      """Check ``info`` lists every column with its count and dtype."""
      dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                'complex128', 'object', 'bool']
      n = 10
      data = {}
      for i, dtype in enumerate(dtypes):
          data[i] = np.random.randint(2, size=n).astype(dtype)
      df = DataFrame(data)

      buf = StringIO()
      df.info(buf=buf)
      res = buf.getvalue()

      # Single-digit labels are padded to (1 + 4) characters, so four
      # spaces separate the label from the count.
      gap = ' ' * 4
      for i, dtype in enumerate(dtypes):
          name = '%d%s%d non-null %s' % (i, gap, n, dtype)
          assert name in res
  def test_info_max_cols(self):
      """Check how ``max_info_columns``, ``verbose`` and ``max_cols``
      determine the number of lines ``info`` emits for a 5-column frame."""
      df = DataFrame(np.random.randn(10, 5))

      def info_line_count(limit, **info_kwargs):
          # lines written by ``info`` while the option is set to ``limit``
          with option_context('max_info_columns', limit):
              out = StringIO()
              df.info(buf=out, **info_kwargs)
              return len(out.getvalue().strip().split('\n'))

      # option below the column count: verbose=None follows the option
      # (summary), False summarizes, True gives the full listing
      for len_, verbose in [(5, None), (5, False), (10, True)]:
          assert info_line_count(4, verbose=verbose) == len_

      # max_cols not exceeded
      for len_, verbose in [(10, None), (5, False), (10, True)]:
          assert info_line_count(5, verbose=verbose) == len_

      for len_, max_cols in [(10, 5), (5, 4)]:
          # an explicit max_cols gives the same count whether the option
          # alone would truncate (4) or not (5)
          for limit in (4, 5):
              assert info_line_count(limit, max_cols=max_cols) == len_
  def test_info_memory_usage(self):
      """Check the memory-usage line of ``info`` and ``memory_usage`` sizes.

      One buffer is shared by all ``info`` calls; each check reads the
      final line of the accumulated text, which belongs to the latest call.
      """
      # Ensure memory usage is displayed, when asserted, on the last line
      dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                'complex128', 'object', 'bool']
      n = 10
      data = {}
      for i, dtype in enumerate(dtypes):
          data[i] = np.random.randint(2, size=n).astype(dtype)
      df = DataFrame(data)
      buf = StringIO()

      def last_info_line(frame, memory_usage):
          # run info() into the shared buffer and return its final line
          frame.info(buf=buf, memory_usage=memory_usage)
          return buf.getvalue().splitlines()[-1]

      lower_bound = r"memory usage: [^+]+\+"

      # display memory usage case
      assert "memory usage: " in last_info_line(df, True)

      # do not display memory usage case
      assert "memory usage: " not in last_info_line(df, False)

      # memory usage is a lower bound, so print it as XYZ+ MB
      assert re.match(lower_bound, last_info_line(df, True))

      # excluded column with object dtype, so estimate is accurate
      assert not re.match(lower_bound, last_info_line(df.iloc[:, :5], True))

      # Test a DataFrame with duplicate columns
      dtypes = ['int64', 'int64', 'int64', 'float64']
      n = 100
      data = {}
      for i, dtype in enumerate(dtypes):
          data[i] = np.random.randint(2, size=n).astype(dtype)
      df = DataFrame(data)
      df.columns = dtypes

      df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
      assert re.match(lower_bound,
                      last_info_line(df_with_object_index, True))
      assert re.match(r"memory usage: [^+]+$",
                      last_info_line(df_with_object_index, 'deep'))

      # Ensure df size is as expected
      # (cols * rows * bytes) + index size
      exp_size = len(dtypes) * n * 8 + df.index.nbytes
      assert df.memory_usage().sum() == exp_size

      # Ensure number of cols in memory_usage is the same as df
      # (+ 1 because the index is included by default)
      assert np.size(df.columns.values) + 1 == np.size(df.memory_usage())

      # assert deep works only on object
      assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

      # test for validity: these only need to run without raising
      single = DataFrame(1, index=['a'], columns=['A'])
      single.memory_usage(index=True)
      DataFrame(1, index=['a'], columns=['A']).index.nbytes

      multi = pd.MultiIndex.from_product([['a'], range(1000)])
      df = DataFrame(data=1, index=multi, columns=['A'])
      df.index.nbytes
      df.memory_usage(index=True)
      df.index.values.nbytes

      assert df.memory_usage(deep=True).sum() > 0
  @pytest.mark.skipif(PYPY,
                      reason="on PyPy deep=True doesn't change result")
  def test_info_memory_usage_deep_not_pypy(self):
      """Check ``deep=True`` reports more memory for object data."""
      df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
      deep = df_with_object_index.memory_usage(index=True, deep=True).sum()
      shallow = df_with_object_index.memory_usage(index=True).sum()
      assert deep > shallow

      df_object = pd.DataFrame({'a': ['a']})
      deep = df_object.memory_usage(deep=True).sum()
      shallow = df_object.memory_usage().sum()
      assert deep > shallow
  @pytest.mark.skipif(not PYPY,
                      reason="on PyPy deep=True does not change result")
  def test_info_memory_usage_deep_pypy(self):
      """Check ``deep=True`` reports the same memory as the shallow count."""
      df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
      deep = df_with_object_index.memory_usage(index=True, deep=True).sum()
      shallow = df_with_object_index.memory_usage(index=True).sum()
      assert deep == shallow

      df_object = pd.DataFrame({'a': ['a']})
      deep = df_object.memory_usage(deep=True).sum()
      shallow = df_object.memory_usage().sum()
      assert deep == shallow
  @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
  def test_usage_via_getsizeof(self):
      """Check ``sys.getsizeof`` stays close to the deep memory usage."""
      index = pd.MultiIndex.from_product([['a'], range(1000)])
      df = DataFrame(data=1, index=index, columns=['A'])
      deep_bytes = df.memory_usage(deep=True).sum()

      # sys.getsizeof will call the .memory_usage with
      # deep=True, and add on some GC overhead
      overhead = deep_bytes - sys.getsizeof(df)
      assert abs(overhead) < 100
  def test_info_memory_usage_qualified(self):
      """Check when ``info`` output contains a ``+`` for various indexes."""
      # (index, whether a '+' is expected anywhere in the info output)
      cases = [
          ([1, 2, 3], False),
          (list('ABC'), True),
          (pd.MultiIndex.from_product([range(3), range(3)]), False),
          (pd.MultiIndex.from_product([range(3), ['foo', 'bar']]), True),
      ]

      for index, qualified in cases:
          buf = StringIO()
          df = DataFrame(1, columns=list('ab'), index=index)
          df.info(buf=buf)
          assert ('+' in buf.getvalue()) is qualified
  def test_info_memory_usage_bug_on_multiindex(self):
      """Compare deep memory usage of a MultiIndexed frame and its unstack."""
      # GH 14308
      # memory usage introspection should not materialize .values
      from string import ascii_uppercase as uppercase

      def memory_usage(f):
          return f.memory_usage(deep=True).sum()

      n_dates = 100
      ids = list(uppercase)
      dates = pd.date_range('20160101', periods=n_dates)
      index = pd.MultiIndex.from_product([ids, dates],
                                         names=['id', 'date'])

      values = np.random.randn(n_dates * len(uppercase))
      df = DataFrame({'value': values}, index=index)
      unstacked = df.unstack('id')

      # same number of data bytes either way
      assert df.values.nbytes == unstacked.values.nbytes
      assert memory_usage(df) > memory_usage(unstacked)

      # high upper bound
      assert memory_usage(unstacked) - memory_usage(df) < 2000
  389. def test_info_categorical(self):
  390. # GH14298
  391. idx = pd.CategoricalIndex(['a', 'b'])
  392. df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
  393. buf = StringIO()
  394. df.info(buf=buf)
  def test_info_categorical_column(self):
      """Smoke-test ``info`` on a frame, and a filtered copy of it, that
      holds a categorical column."""
      # make sure it works
      n = 2500
      df = DataFrame({'int64': np.random.randint(100, size=n)})

      letters = np.array(list('abcdefghij'))
      picks = np.random.randint(0, 10, size=n)
      df['category'] = Series(letters.take(picks)).astype('category')
      df.isna()

      buf = StringIO()
      df.info(buf=buf)

      # rows of a single category only
      only_d = df[df['category'] == 'd']
      buf = compat.StringIO()
      only_d.info(buf=buf)
  def test_repr_categorical_dates_periods(self):
      """Check the repr of categorical tz-aware datetime and period columns."""
      dt = date_range('2011-01-01 09:00', freq='H', periods=5,
                      tz='US/Eastern')
      p = period_range('2011-01', freq='M', periods=5)

      # normal DataFrame: built only as a smoke check, its repr is not
      # asserted here
      DataFrame({'dt': dt, 'p': p})

      # Column padding is significant, so the header is built from explicit
      # widths: 'dt' ends at the right edge of the 25-character timestamps
      # (after the 1-character index and one separator), and 'p' ends at
      # the right edge of the period column.
      header = ' ' * 25 + 'dt' + ' ' * 8 + 'p'
      rows = [
          '0 2011-01-01 09:00:00-05:00  2011-01',
          '1 2011-01-01 10:00:00-05:00  2011-02',
          '2 2011-01-01 11:00:00-05:00  2011-03',
          '3 2011-01-01 12:00:00-05:00  2011-04',
          '4 2011-01-01 13:00:00-05:00  2011-05',
      ]
      exp = '\n'.join([header] + rows)

      df = DataFrame({'dt': Categorical(dt), 'p': Categorical(p)})
      assert repr(df) == exp