test_partial_slicing.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. """ test partial slicing on Series/Frame """
  2. from datetime import datetime
  3. import operator as op
  4. import numpy as np
  5. import pytest
  6. import pandas as pd
  7. from pandas import (
  8. DataFrame, DatetimeIndex, Index, Series, Timedelta, Timestamp, date_range)
  9. from pandas.core.indexing import IndexingError
  10. from pandas.util import testing as tm
  11. class TestSlicing(object):
  12. def test_dti_slicing(self):
  13. dti = date_range(start='1/1/2005', end='12/1/2005', freq='M')
  14. dti2 = dti[[1, 3, 5]]
  15. v1 = dti2[0]
  16. v2 = dti2[1]
  17. v3 = dti2[2]
  18. assert v1 == Timestamp('2/28/2005')
  19. assert v2 == Timestamp('4/30/2005')
  20. assert v3 == Timestamp('6/30/2005')
  21. # don't carry freq through irregular slicing
  22. assert dti2.freq is None
  23. def test_slice_keeps_name(self):
  24. # GH4226
  25. st = pd.Timestamp('2013-07-01 00:00:00', tz='America/Los_Angeles')
  26. et = pd.Timestamp('2013-07-02 00:00:00', tz='America/Los_Angeles')
  27. dr = pd.date_range(st, et, freq='H', name='timebucket')
  28. assert dr[1:].name == dr.name
  29. def test_slice_with_negative_step(self):
  30. ts = Series(np.arange(20),
  31. date_range('2014-01-01', periods=20, freq='MS'))
  32. SLC = pd.IndexSlice
  33. def assert_slices_equivalent(l_slc, i_slc):
  34. tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc])
  35. tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc])
  36. tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc])
  37. assert_slices_equivalent(SLC[Timestamp('2014-10-01')::-1], SLC[9::-1])
  38. assert_slices_equivalent(SLC['2014-10-01'::-1], SLC[9::-1])
  39. assert_slices_equivalent(SLC[:Timestamp('2014-10-01'):-1], SLC[:8:-1])
  40. assert_slices_equivalent(SLC[:'2014-10-01':-1], SLC[:8:-1])
  41. assert_slices_equivalent(SLC['2015-02-01':'2014-10-01':-1],
  42. SLC[13:8:-1])
  43. assert_slices_equivalent(SLC[Timestamp('2015-02-01'):Timestamp(
  44. '2014-10-01'):-1], SLC[13:8:-1])
  45. assert_slices_equivalent(SLC['2015-02-01':Timestamp('2014-10-01'):-1],
  46. SLC[13:8:-1])
  47. assert_slices_equivalent(SLC[Timestamp('2015-02-01'):'2014-10-01':-1],
  48. SLC[13:8:-1])
  49. assert_slices_equivalent(SLC['2014-10-01':'2015-02-01':-1], SLC[:0])
  50. def test_slice_with_zero_step_raises(self):
  51. ts = Series(np.arange(20),
  52. date_range('2014-01-01', periods=20, freq='MS'))
  53. with pytest.raises(ValueError, match='slice step cannot be zero'):
  54. ts[::0]
  55. with pytest.raises(ValueError, match='slice step cannot be zero'):
  56. ts.loc[::0]
  57. with pytest.raises(ValueError, match='slice step cannot be zero'):
  58. ts.loc[::0]
  59. def test_slice_bounds_empty(self):
  60. # GH#14354
  61. empty_idx = date_range(freq='1H', periods=0, end='2015')
  62. right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc')
  63. exp = Timestamp('2015-01-02 23:59:59.999999999')
  64. assert right == exp
  65. left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc')
  66. exp = Timestamp('2015-01-02 00:00:00')
  67. assert left == exp
  68. def test_slice_duplicate_monotonic(self):
  69. # https://github.com/pandas-dev/pandas/issues/16515
  70. idx = pd.DatetimeIndex(['2017', '2017'])
  71. result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc')
  72. expected = Timestamp('2017-01-01')
  73. assert result == expected
  74. def test_monotone_DTI_indexing_bug(self):
  75. # GH 19362
  76. # Testing accessing the first element in a montononic descending
  77. # partial string indexing.
  78. df = pd.DataFrame(list(range(5)))
  79. date_list = ['2018-01-02', '2017-02-10', '2016-03-10',
  80. '2015-03-15', '2014-03-16']
  81. date_index = pd.to_datetime(date_list)
  82. df['date'] = date_index
  83. expected = pd.DataFrame({0: list(range(5)), 'date': date_index})
  84. tm.assert_frame_equal(df, expected)
  85. df = pd.DataFrame({'A': [1, 2, 3]},
  86. index=pd.date_range('20170101',
  87. periods=3)[::-1])
  88. expected = pd.DataFrame({'A': 1},
  89. index=pd.date_range('20170103',
  90. periods=1))
  91. tm.assert_frame_equal(df.loc['2017-01-03'], expected)
  92. def test_slice_year(self):
  93. dti = date_range(freq='B', start=datetime(2005, 1, 1), periods=500)
  94. s = Series(np.arange(len(dti)), index=dti)
  95. result = s['2005']
  96. expected = s[s.index.year == 2005]
  97. tm.assert_series_equal(result, expected)
  98. df = DataFrame(np.random.rand(len(dti), 5), index=dti)
  99. result = df.loc['2005']
  100. expected = df[df.index.year == 2005]
  101. tm.assert_frame_equal(result, expected)
  102. rng = date_range('1/1/2000', '1/1/2010')
  103. result = rng.get_loc('2009')
  104. expected = slice(3288, 3653)
  105. assert result == expected
  106. def test_slice_quarter(self):
  107. dti = date_range(freq='D', start=datetime(2000, 6, 1), periods=500)
  108. s = Series(np.arange(len(dti)), index=dti)
  109. assert len(s['2001Q1']) == 90
  110. df = DataFrame(np.random.rand(len(dti), 5), index=dti)
  111. assert len(df.loc['1Q01']) == 90
  112. def test_slice_month(self):
  113. dti = date_range(freq='D', start=datetime(2005, 1, 1), periods=500)
  114. s = Series(np.arange(len(dti)), index=dti)
  115. assert len(s['2005-11']) == 30
  116. df = DataFrame(np.random.rand(len(dti), 5), index=dti)
  117. assert len(df.loc['2005-11']) == 30
  118. tm.assert_series_equal(s['2005-11'], s['11-2005'])
  119. def test_partial_slice(self):
  120. rng = date_range(freq='D', start=datetime(2005, 1, 1), periods=500)
  121. s = Series(np.arange(len(rng)), index=rng)
  122. result = s['2005-05':'2006-02']
  123. expected = s['20050501':'20060228']
  124. tm.assert_series_equal(result, expected)
  125. result = s['2005-05':]
  126. expected = s['20050501':]
  127. tm.assert_series_equal(result, expected)
  128. result = s[:'2006-02']
  129. expected = s[:'20060228']
  130. tm.assert_series_equal(result, expected)
  131. result = s['2005-1-1']
  132. assert result == s.iloc[0]
  133. pytest.raises(Exception, s.__getitem__, '2004-12-31')
  134. def test_partial_slice_daily(self):
  135. rng = date_range(freq='H', start=datetime(2005, 1, 31), periods=500)
  136. s = Series(np.arange(len(rng)), index=rng)
  137. result = s['2005-1-31']
  138. tm.assert_series_equal(result, s.iloc[:24])
  139. pytest.raises(Exception, s.__getitem__, '2004-12-31 00')
  140. def test_partial_slice_hourly(self):
  141. rng = date_range(freq='T', start=datetime(2005, 1, 1, 20, 0, 0),
  142. periods=500)
  143. s = Series(np.arange(len(rng)), index=rng)
  144. result = s['2005-1-1']
  145. tm.assert_series_equal(result, s.iloc[:60 * 4])
  146. result = s['2005-1-1 20']
  147. tm.assert_series_equal(result, s.iloc[:60])
  148. assert s['2005-1-1 20:00'] == s.iloc[0]
  149. pytest.raises(Exception, s.__getitem__, '2004-12-31 00:15')
  150. def test_partial_slice_minutely(self):
  151. rng = date_range(freq='S', start=datetime(2005, 1, 1, 23, 59, 0),
  152. periods=500)
  153. s = Series(np.arange(len(rng)), index=rng)
  154. result = s['2005-1-1 23:59']
  155. tm.assert_series_equal(result, s.iloc[:60])
  156. result = s['2005-1-1']
  157. tm.assert_series_equal(result, s.iloc[:60])
  158. assert s[Timestamp('2005-1-1 23:59:00')] == s.iloc[0]
  159. pytest.raises(Exception, s.__getitem__, '2004-12-31 00:00:00')
  160. def test_partial_slice_second_precision(self):
  161. rng = date_range(start=datetime(2005, 1, 1, 0, 0, 59,
  162. microsecond=999990),
  163. periods=20, freq='US')
  164. s = Series(np.arange(20), rng)
  165. tm.assert_series_equal(s['2005-1-1 00:00'], s.iloc[:10])
  166. tm.assert_series_equal(s['2005-1-1 00:00:59'], s.iloc[:10])
  167. tm.assert_series_equal(s['2005-1-1 00:01'], s.iloc[10:])
  168. tm.assert_series_equal(s['2005-1-1 00:01:00'], s.iloc[10:])
  169. assert s[Timestamp('2005-1-1 00:00:59.999990')] == s.iloc[0]
  170. with pytest.raises(KeyError, match='2005-1-1 00:00:00'):
  171. s['2005-1-1 00:00:00']
  172. def test_partial_slicing_dataframe(self):
  173. # GH14856
  174. # Test various combinations of string slicing resolution vs.
  175. # index resolution
  176. # - If string resolution is less precise than index resolution,
  177. # string is considered a slice
  178. # - If string resolution is equal to or more precise than index
  179. # resolution, string is considered an exact match
  180. formats = ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%d %H',
  181. '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S']
  182. resolutions = ['year', 'month', 'day', 'hour', 'minute', 'second']
  183. for rnum, resolution in enumerate(resolutions[2:], 2):
  184. # we check only 'day', 'hour', 'minute' and 'second'
  185. unit = Timedelta("1 " + resolution)
  186. middate = datetime(2012, 1, 1, 0, 0, 0)
  187. index = DatetimeIndex([middate - unit,
  188. middate, middate + unit])
  189. values = [1, 2, 3]
  190. df = DataFrame({'a': values}, index, dtype=np.int64)
  191. assert df.index.resolution == resolution
  192. # Timestamp with the same resolution as index
  193. # Should be exact match for Series (return scalar)
  194. # and raise KeyError for Frame
  195. for timestamp, expected in zip(index, values):
  196. ts_string = timestamp.strftime(formats[rnum])
  197. # make ts_string as precise as index
  198. result = df['a'][ts_string]
  199. assert isinstance(result, np.int64)
  200. assert result == expected
  201. pytest.raises(KeyError, df.__getitem__, ts_string)
  202. # Timestamp with resolution less precise than index
  203. for fmt in formats[:rnum]:
  204. for element, theslice in [[0, slice(None, 1)],
  205. [1, slice(1, None)]]:
  206. ts_string = index[element].strftime(fmt)
  207. # Series should return slice
  208. result = df['a'][ts_string]
  209. expected = df['a'][theslice]
  210. tm.assert_series_equal(result, expected)
  211. # Frame should return slice as well
  212. result = df[ts_string]
  213. expected = df[theslice]
  214. tm.assert_frame_equal(result, expected)
  215. # Timestamp with resolution more precise than index
  216. # Compatible with existing key
  217. # Should return scalar for Series
  218. # and raise KeyError for Frame
  219. for fmt in formats[rnum + 1:]:
  220. ts_string = index[1].strftime(fmt)
  221. result = df['a'][ts_string]
  222. assert isinstance(result, np.int64)
  223. assert result == 2
  224. pytest.raises(KeyError, df.__getitem__, ts_string)
  225. # Not compatible with existing key
  226. # Should raise KeyError
  227. for fmt, res in list(zip(formats, resolutions))[rnum + 1:]:
  228. ts = index[1] + Timedelta("1 " + res)
  229. ts_string = ts.strftime(fmt)
  230. pytest.raises(KeyError, df['a'].__getitem__, ts_string)
  231. pytest.raises(KeyError, df.__getitem__, ts_string)
  232. def test_partial_slicing_with_multiindex(self):
  233. # GH 4758
  234. # partial string indexing with a multi-index buggy
  235. df = DataFrame({'ACCOUNT': ["ACCT1", "ACCT1", "ACCT1", "ACCT2"],
  236. 'TICKER': ["ABC", "MNP", "XYZ", "XYZ"],
  237. 'val': [1, 2, 3, 4]},
  238. index=date_range("2013-06-19 09:30:00",
  239. periods=4, freq='5T'))
  240. df_multi = df.set_index(['ACCOUNT', 'TICKER'], append=True)
  241. expected = DataFrame([
  242. [1]
  243. ], index=Index(['ABC'], name='TICKER'), columns=['val'])
  244. result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1')]
  245. tm.assert_frame_equal(result, expected)
  246. expected = df_multi.loc[
  247. (pd.Timestamp('2013-06-19 09:30:00', tz=None), 'ACCT1', 'ABC')]
  248. result = df_multi.loc[('2013-06-19 09:30:00', 'ACCT1', 'ABC')]
  249. tm.assert_series_equal(result, expected)
  250. # this is an IndexingError as we don't do partial string selection on
  251. # multi-levels.
  252. def f():
  253. df_multi.loc[('2013-06-19', 'ACCT1', 'ABC')]
  254. pytest.raises(IndexingError, f)
  255. # GH 4294
  256. # partial slice on a series mi
  257. s = pd.DataFrame(np.random.rand(1000, 1000), index=pd.date_range(
  258. '2000-1-1', periods=1000)).stack()
  259. s2 = s[:-1].copy()
  260. expected = s2['2000-1-4']
  261. result = s2[pd.Timestamp('2000-1-4')]
  262. tm.assert_series_equal(result, expected)
  263. result = s[pd.Timestamp('2000-1-4')]
  264. expected = s['2000-1-4']
  265. tm.assert_series_equal(result, expected)
  266. df2 = pd.DataFrame(s)
  267. expected = df2.xs('2000-1-4')
  268. result = df2.loc[pd.Timestamp('2000-1-4')]
  269. tm.assert_frame_equal(result, expected)
  270. def test_partial_slice_doesnt_require_monotonicity(self):
  271. # For historical reasons.
  272. s = pd.Series(np.arange(10), pd.date_range('2014-01-01', periods=10))
  273. nonmonotonic = s[[3, 5, 4]]
  274. expected = nonmonotonic.iloc[:0]
  275. timestamp = pd.Timestamp('2014-01-10')
  276. tm.assert_series_equal(nonmonotonic['2014-01-10':], expected)
  277. with pytest.raises(KeyError,
  278. match=r"Timestamp\('2014-01-10 00:00:00'\)"):
  279. nonmonotonic[timestamp:]
  280. tm.assert_series_equal(nonmonotonic.loc['2014-01-10':], expected)
  281. with pytest.raises(KeyError,
  282. match=r"Timestamp\('2014-01-10 00:00:00'\)"):
  283. nonmonotonic.loc[timestamp:]
  284. def test_loc_datetime_length_one(self):
  285. # GH16071
  286. df = pd.DataFrame(columns=['1'],
  287. index=pd.date_range('2016-10-01T00:00:00',
  288. '2016-10-01T23:59:59'))
  289. result = df.loc[datetime(2016, 10, 1):]
  290. tm.assert_frame_equal(result, df)
  291. result = df.loc['2016-10-01T00:00:00':]
  292. tm.assert_frame_equal(result, df)
  293. @pytest.mark.parametrize('datetimelike', [
  294. Timestamp('20130101'), datetime(2013, 1, 1),
  295. np.datetime64('2013-01-01T00:00', 'ns')])
  296. @pytest.mark.parametrize('op,expected', [
  297. (op.lt, [True, False, False, False]),
  298. (op.le, [True, True, False, False]),
  299. (op.eq, [False, True, False, False]),
  300. (op.gt, [False, False, False, True])])
  301. def test_selection_by_datetimelike(self, datetimelike, op, expected):
  302. # GH issue #17965, test for ability to compare datetime64[ns] columns
  303. # to datetimelike
  304. df = DataFrame({'A': [pd.Timestamp('20120101'),
  305. pd.Timestamp('20130101'),
  306. np.nan, pd.Timestamp('20130103')]})
  307. result = op(df.A, datetimelike)
  308. expected = Series(expected, name='A')
  309. tm.assert_series_equal(result, expected)