# Source file: test_chaining_and_caching.py

from string import ascii_letters

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame, Series, Timestamp, compat, date_range, option_context)
from pandas.core import common as com
from pandas.util import testing as tm


  8. class TestCaching(object):
  9. def test_slice_consolidate_invalidate_item_cache(self):
  10. # this is chained assignment, but will 'work'
  11. with option_context('chained_assignment', None):
  12. # #3970
  13. df = DataFrame({"aa": compat.lrange(5), "bb": [2.2] * 5})
  14. # Creates a second float block
  15. df["cc"] = 0.0
  16. # caches a reference to the 'bb' series
  17. df["bb"]
  18. # repr machinery triggers consolidation
  19. repr(df)
  20. # Assignment to wrong series
  21. df['bb'].iloc[0] = 0.17
  22. df._clear_item_cache()
  23. tm.assert_almost_equal(df['bb'][0], 0.17)
  24. def test_setitem_cache_updating(self):
  25. # GH 5424
  26. cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']
  27. for do_ref in [False, False]:
  28. df = DataFrame({'a': cont,
  29. "b": cont[3:] + cont[:3],
  30. 'c': np.arange(7)})
  31. # ref the cache
  32. if do_ref:
  33. df.loc[0, "c"]
  34. # set it
  35. df.loc[7, 'c'] = 1
  36. assert df.loc[0, 'c'] == 0.0
  37. assert df.loc[7, 'c'] == 1.0
  38. # GH 7084
  39. # not updating cache on series setting with slices
  40. expected = DataFrame({'A': [600, 600, 600]},
  41. index=date_range('5/7/2014', '5/9/2014'))
  42. out = DataFrame({'A': [0, 0, 0]},
  43. index=date_range('5/7/2014', '5/9/2014'))
  44. df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]})
  45. # loop through df to update out
  46. six = Timestamp('5/7/2014')
  47. eix = Timestamp('5/9/2014')
  48. for ix, row in df.iterrows():
  49. out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D']
  50. tm.assert_frame_equal(out, expected)
  51. tm.assert_series_equal(out['A'], expected['A'])
  52. # try via a chain indexing
  53. # this actually works
  54. out = DataFrame({'A': [0, 0, 0]},
  55. index=date_range('5/7/2014', '5/9/2014'))
  56. for ix, row in df.iterrows():
  57. v = out[row['C']][six:eix] + row['D']
  58. out[row['C']][six:eix] = v
  59. tm.assert_frame_equal(out, expected)
  60. tm.assert_series_equal(out['A'], expected['A'])
  61. out = DataFrame({'A': [0, 0, 0]},
  62. index=date_range('5/7/2014', '5/9/2014'))
  63. for ix, row in df.iterrows():
  64. out.loc[six:eix, row['C']] += row['D']
  65. tm.assert_frame_equal(out, expected)
  66. tm.assert_series_equal(out['A'], expected['A'])
  67. class TestChaining(object):
  68. def test_setitem_chained_setfault(self):
  69. # GH6026
  70. data = ['right', 'left', 'left', 'left', 'right', 'left', 'timeout']
  71. mdata = ['right', 'left', 'left', 'left', 'right', 'left', 'none']
  72. df = DataFrame({'response': np.array(data)})
  73. mask = df.response == 'timeout'
  74. df.response[mask] = 'none'
  75. tm.assert_frame_equal(df, DataFrame({'response': mdata}))
  76. recarray = np.rec.fromarrays([data], names=['response'])
  77. df = DataFrame(recarray)
  78. mask = df.response == 'timeout'
  79. df.response[mask] = 'none'
  80. tm.assert_frame_equal(df, DataFrame({'response': mdata}))
  81. df = DataFrame({'response': data, 'response1': data})
  82. mask = df.response == 'timeout'
  83. df.response[mask] = 'none'
  84. tm.assert_frame_equal(df, DataFrame({'response': mdata,
  85. 'response1': data}))
  86. # GH 6056
  87. expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar']))
  88. df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
  89. df['A'].iloc[0] = np.nan
  90. result = df.head()
  91. tm.assert_frame_equal(result, expected)
  92. df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar'])))
  93. df.A.iloc[0] = np.nan
  94. result = df.head()
  95. tm.assert_frame_equal(result, expected)
  96. def test_detect_chained_assignment(self):
  97. pd.set_option('chained_assignment', 'raise')
  98. # work with the chain
  99. expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB'))
  100. df = DataFrame(np.arange(4).reshape(2, 2),
  101. columns=list('AB'), dtype='int64')
  102. assert df._is_copy is None
  103. df['A'][0] = -5
  104. df['A'][1] = -6
  105. tm.assert_frame_equal(df, expected)
  106. # test with the chaining
  107. df = DataFrame({'A': Series(range(2), dtype='int64'),
  108. 'B': np.array(np.arange(2, 4), dtype=np.float64)})
  109. assert df._is_copy is None
  110. with pytest.raises(com.SettingWithCopyError):
  111. df['A'][0] = -5
  112. with pytest.raises(com.SettingWithCopyError):
  113. df['A'][1] = np.nan
  114. assert df['A']._is_copy is None
  115. # Using a copy (the chain), fails
  116. df = DataFrame({'A': Series(range(2), dtype='int64'),
  117. 'B': np.array(np.arange(2, 4), dtype=np.float64)})
  118. with pytest.raises(com.SettingWithCopyError):
  119. df.loc[0]['A'] = -5
  120. # Doc example
  121. df = DataFrame({'a': ['one', 'one', 'two', 'three',
  122. 'two', 'one', 'six'],
  123. 'c': Series(range(7), dtype='int64')})
  124. assert df._is_copy is None
  125. with pytest.raises(com.SettingWithCopyError):
  126. indexer = df.a.str.startswith('o')
  127. df[indexer]['c'] = 42
  128. expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]})
  129. df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
  130. with pytest.raises(com.SettingWithCopyError):
  131. df['A'][0] = 111
  132. with pytest.raises(com.SettingWithCopyError):
  133. df.loc[0]['A'] = 111
  134. df.loc[0, 'A'] = 111
  135. tm.assert_frame_equal(df, expected)
  136. # gh-5475: Make sure that is_copy is picked up reconstruction
  137. df = DataFrame({"A": [1, 2]})
  138. assert df._is_copy is None
  139. with tm.ensure_clean('__tmp__pickle') as path:
  140. df.to_pickle(path)
  141. df2 = pd.read_pickle(path)
  142. df2["B"] = df2["A"]
  143. df2["B"] = df2["A"]
  144. # gh-5597: a spurious raise as we are setting the entire column here
  145. from string import ascii_letters as letters
  146. def random_text(nobs=100):
  147. df = []
  148. for i in range(nobs):
  149. idx = np.random.randint(len(letters), size=2)
  150. idx.sort()
  151. df.append([letters[idx[0]:idx[1]]])
  152. return DataFrame(df, columns=['letters'])
  153. df = random_text(100000)
  154. # Always a copy
  155. x = df.iloc[[0, 1, 2]]
  156. assert x._is_copy is not None
  157. x = df.iloc[[0, 1, 2, 4]]
  158. assert x._is_copy is not None
  159. # Explicitly copy
  160. indexer = df.letters.apply(lambda x: len(x) > 10)
  161. df = df.loc[indexer].copy()
  162. assert df._is_copy is None
  163. df['letters'] = df['letters'].apply(str.lower)
  164. # Implicitly take
  165. df = random_text(100000)
  166. indexer = df.letters.apply(lambda x: len(x) > 10)
  167. df = df.loc[indexer]
  168. assert df._is_copy is not None
  169. df['letters'] = df['letters'].apply(str.lower)
  170. # Implicitly take 2
  171. df = random_text(100000)
  172. indexer = df.letters.apply(lambda x: len(x) > 10)
  173. df = df.loc[indexer]
  174. assert df._is_copy is not None
  175. df.loc[:, 'letters'] = df['letters'].apply(str.lower)
  176. # Should be ok even though it's a copy!
  177. assert df._is_copy is None
  178. df['letters'] = df['letters'].apply(str.lower)
  179. assert df._is_copy is None
  180. df = random_text(100000)
  181. indexer = df.letters.apply(lambda x: len(x) > 10)
  182. df.loc[indexer, 'letters'] = (
  183. df.loc[indexer, 'letters'].apply(str.lower))
  184. # an identical take, so no copy
  185. df = DataFrame({'a': [1]}).dropna()
  186. assert df._is_copy is None
  187. df['a'] += 1
  188. df = DataFrame(np.random.randn(10, 4))
  189. s = df.iloc[:, 0].sort_values()
  190. tm.assert_series_equal(s, df.iloc[:, 0].sort_values())
  191. tm.assert_series_equal(s, df[0].sort_values())
  192. # see gh-6025: false positives
  193. df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]})
  194. str(df)
  195. df['column1'] = df['column1'] + 'b'
  196. str(df)
  197. df = df[df['column2'] != 8]
  198. str(df)
  199. df['column1'] = df['column1'] + 'c'
  200. str(df)
  201. # from SO:
  202. # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
  203. df = DataFrame(np.arange(0, 9), columns=['count'])
  204. df['group'] = 'b'
  205. with pytest.raises(com.SettingWithCopyError):
  206. df.iloc[0:5]['group'] = 'a'
  207. # Mixed type setting but same dtype & changing dtype
  208. df = DataFrame(dict(A=date_range('20130101', periods=5),
  209. B=np.random.randn(5),
  210. C=np.arange(5, dtype='int64'),
  211. D=list('abcde')))
  212. with pytest.raises(com.SettingWithCopyError):
  213. df.loc[2]['D'] = 'foo'
  214. with pytest.raises(com.SettingWithCopyError):
  215. df.loc[2]['C'] = 'foo'
  216. with pytest.raises(com.SettingWithCopyError):
  217. df['C'][2] = 'foo'
  218. def test_setting_with_copy_bug(self):
  219. # operating on a copy
  220. df = DataFrame({'a': list(range(4)),
  221. 'b': list('ab..'),
  222. 'c': ['a', 'b', np.nan, 'd']})
  223. mask = pd.isna(df.c)
  224. def f():
  225. df[['c']][mask] = df[['b']][mask]
  226. pytest.raises(com.SettingWithCopyError, f)
  227. # invalid warning as we are returning a new object
  228. # GH 8730
  229. df1 = DataFrame({'x': Series(['a', 'b', 'c']),
  230. 'y': Series(['d', 'e', 'f'])})
  231. df2 = df1[['x']]
  232. # this should not raise
  233. df2['y'] = ['g', 'h', 'i']
  234. def test_detect_chained_assignment_warnings(self):
  235. with option_context("chained_assignment", "warn"):
  236. df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
  237. with tm.assert_produces_warning(com.SettingWithCopyWarning):
  238. df.loc[0]["A"] = 111
  239. def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self):
  240. # xref gh-13017.
  241. with option_context("chained_assignment", "warn"):
  242. df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]],
  243. columns=["a", "a", "c"])
  244. with tm.assert_produces_warning(com.SettingWithCopyWarning):
  245. df.c.loc[df.c > 0] = None
  246. expected = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]],
  247. columns=["a", "a", "c"])
  248. tm.assert_frame_equal(df, expected)
  249. def test_chained_getitem_with_lists(self):
  250. # GH6394
  251. # Regression in chained getitem indexing with embedded list-like from
  252. # 0.12
  253. def check(result, expected):
  254. tm.assert_numpy_array_equal(result, expected)
  255. assert isinstance(result, np.ndarray)
  256. df = DataFrame({'A': 5 * [np.zeros(3)], 'B': 5 * [np.ones(3)]})
  257. expected = df['A'].iloc[2]
  258. result = df.loc[2, 'A']
  259. check(result, expected)
  260. result2 = df.iloc[2]['A']
  261. check(result2, expected)
  262. result3 = df['A'].loc[2]
  263. check(result3, expected)
  264. result4 = df['A'].iloc[2]
  265. check(result4, expected)
  266. @pytest.mark.filterwarnings("ignore::DeprecationWarning")
  267. @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  268. def test_cache_updating(self):
  269. # GH 4939, make sure to update the cache on setitem
  270. df = tm.makeDataFrame()
  271. df['A'] # cache series
  272. df.ix["Hello Friend"] = df.ix[0]
  273. assert "Hello Friend" in df['A'].index
  274. assert "Hello Friend" in df['B'].index
  275. panel = tm.makePanel()
  276. panel.ix[0] # get first item into cache
  277. panel.ix[:, :, 'A+1'] = panel.ix[:, :, 'A'] + 1
  278. assert "A+1" in panel.ix[0].columns
  279. assert "A+1" in panel.ix[1].columns
  280. # 10264
  281. df = DataFrame(np.zeros((5, 5), dtype='int64'), columns=[
  282. 'a', 'b', 'c', 'd', 'e'], index=range(5))
  283. df['f'] = 0
  284. df.f.values[3] = 1
  285. # TODO(wesm): unused?
  286. # y = df.iloc[np.arange(2, len(df))]
  287. df.f.values[3] = 2
  288. expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[
  289. 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5))
  290. expected.at[3, 'f'] = 2
  291. tm.assert_frame_equal(df, expected)
  292. expected = Series([0, 0, 0, 2, 0], name='f')
  293. tm.assert_series_equal(df.f, expected)
  294. def test_deprecate_is_copy(self):
  295. # GH18801
  296. df = DataFrame({"A": [1, 2, 3]})
  297. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  298. # getter
  299. df.is_copy
  300. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  301. # setter
  302. df.is_copy = "test deprecated is_copy"