test_nth.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. import numpy as np
  2. import pytest
  3. from pandas.compat import lrange
  4. import pandas as pd
  5. from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna
  6. from pandas.util.testing import (
  7. assert_frame_equal, assert_produces_warning, assert_series_equal)
  8. def test_first_last_nth(df):
  9. # tests for first / last / nth
  10. grouped = df.groupby('A')
  11. first = grouped.first()
  12. expected = df.loc[[1, 0], ['B', 'C', 'D']]
  13. expected.index = Index(['bar', 'foo'], name='A')
  14. expected = expected.sort_index()
  15. assert_frame_equal(first, expected)
  16. nth = grouped.nth(0)
  17. assert_frame_equal(nth, expected)
  18. last = grouped.last()
  19. expected = df.loc[[5, 7], ['B', 'C', 'D']]
  20. expected.index = Index(['bar', 'foo'], name='A')
  21. assert_frame_equal(last, expected)
  22. nth = grouped.nth(-1)
  23. assert_frame_equal(nth, expected)
  24. nth = grouped.nth(1)
  25. expected = df.loc[[2, 3], ['B', 'C', 'D']].copy()
  26. expected.index = Index(['foo', 'bar'], name='A')
  27. expected = expected.sort_index()
  28. assert_frame_equal(nth, expected)
  29. # it works!
  30. grouped['B'].first()
  31. grouped['B'].last()
  32. grouped['B'].nth(0)
  33. df.loc[df['A'] == 'foo', 'B'] = np.nan
  34. assert isna(grouped['B'].first()['foo'])
  35. assert isna(grouped['B'].last()['foo'])
  36. assert isna(grouped['B'].nth(0)['foo'])
  37. # v0.14.0 whatsnew
  38. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  39. g = df.groupby('A')
  40. result = g.first()
  41. expected = df.iloc[[1, 2]].set_index('A')
  42. assert_frame_equal(result, expected)
  43. expected = df.iloc[[1, 2]].set_index('A')
  44. result = g.nth(0, dropna='any')
  45. assert_frame_equal(result, expected)
  46. def test_first_last_nth_dtypes(df_mixed_floats):
  47. df = df_mixed_floats.copy()
  48. df['E'] = True
  49. df['F'] = 1
  50. # tests for first / last / nth
  51. grouped = df.groupby('A')
  52. first = grouped.first()
  53. expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']]
  54. expected.index = Index(['bar', 'foo'], name='A')
  55. expected = expected.sort_index()
  56. assert_frame_equal(first, expected)
  57. last = grouped.last()
  58. expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']]
  59. expected.index = Index(['bar', 'foo'], name='A')
  60. expected = expected.sort_index()
  61. assert_frame_equal(last, expected)
  62. nth = grouped.nth(1)
  63. expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']]
  64. expected.index = Index(['bar', 'foo'], name='A')
  65. expected = expected.sort_index()
  66. assert_frame_equal(nth, expected)
  67. # GH 2763, first/last shifting dtypes
  68. idx = lrange(10)
  69. idx.append(9)
  70. s = Series(data=lrange(11), index=idx, name='IntCol')
  71. assert s.dtype == 'int64'
  72. f = s.groupby(level=0).first()
  73. assert f.dtype == 'int64'
  74. def test_nth():
  75. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  76. g = df.groupby('A')
  77. assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
  78. assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
  79. assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
  80. assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
  81. assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
  82. assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
  83. assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
  84. assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
  85. assert_frame_equal(g[['B']].nth(0),
  86. df.loc[[0, 2], ['A', 'B']].set_index('A'))
  87. exp = df.set_index('A')
  88. assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
  89. assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])
  90. exp['B'] = np.nan
  91. assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
  92. assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])
  93. # out of bounds, regression from 0.13.1
  94. # GH 6621
  95. df = DataFrame({'color': {0: 'green',
  96. 1: 'green',
  97. 2: 'red',
  98. 3: 'red',
  99. 4: 'red'},
  100. 'food': {0: 'ham',
  101. 1: 'eggs',
  102. 2: 'eggs',
  103. 3: 'ham',
  104. 4: 'pork'},
  105. 'two': {0: 1.5456590000000001,
  106. 1: -0.070345000000000005,
  107. 2: -2.4004539999999999,
  108. 3: 0.46206000000000003,
  109. 4: 0.52350799999999997},
  110. 'one': {0: 0.56573799999999996,
  111. 1: -0.9742360000000001,
  112. 2: 1.033801,
  113. 3: -0.78543499999999999,
  114. 4: 0.70422799999999997}}).set_index(['color',
  115. 'food'])
  116. result = df.groupby(level=0, as_index=False).nth(2)
  117. expected = df.iloc[[-1]]
  118. assert_frame_equal(result, expected)
  119. result = df.groupby(level=0, as_index=False).nth(3)
  120. expected = df.loc[[]]
  121. assert_frame_equal(result, expected)
  122. # GH 7559
  123. # from the vbench
  124. df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64')
  125. s = df[1]
  126. g = df[0]
  127. expected = s.groupby(g).first()
  128. expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
  129. assert_series_equal(expected2, expected, check_names=False)
  130. assert expected.name == 1
  131. assert expected2.name == 1
  132. # validate first
  133. v = s[g == 1].iloc[0]
  134. assert expected.iloc[0] == v
  135. assert expected2.iloc[0] == v
  136. # this is NOT the same as .first (as sorted is default!)
  137. # as it keeps the order in the series (and not the group order)
  138. # related GH 7287
  139. expected = s.groupby(g, sort=False).first()
  140. result = s.groupby(g, sort=False).nth(0, dropna='all')
  141. assert_series_equal(result, expected)
  142. # doc example
  143. df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
  144. g = df.groupby('A')
  145. # PR 17493, related to issue 11038
  146. # test Series.nth with True for dropna produces FutureWarning
  147. with assert_produces_warning(FutureWarning):
  148. result = g.B.nth(0, dropna=True)
  149. expected = g.B.first()
  150. assert_series_equal(result, expected)
  151. # test multiple nth values
  152. df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]],
  153. columns=['A', 'B'])
  154. g = df.groupby('A')
  155. assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A'))
  156. assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A'))
  157. assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A'))
  158. assert_frame_equal(
  159. g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A'))
  160. assert_frame_equal(
  161. g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
  162. assert_frame_equal(
  163. g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
  164. assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
  165. assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))
  166. business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
  167. freq='B')
  168. df = DataFrame(1, index=business_dates, columns=['a', 'b'])
  169. # get the first, fourth and last two business days for each month
  170. key = [df.index.year, df.index.month]
  171. result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
  172. expected_dates = pd.to_datetime(
  173. ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
  174. '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',
  175. '2014/6/27', '2014/6/30'])
  176. expected = DataFrame(1, columns=['a', 'b'], index=expected_dates)
  177. assert_frame_equal(result, expected)
  178. def test_nth_multi_index(three_group):
  179. # PR 9090, related to issue 8979
  180. # test nth on MultiIndex, should match .first()
  181. grouped = three_group.groupby(['A', 'B'])
  182. result = grouped.nth(0)
  183. expected = grouped.first()
  184. assert_frame_equal(result, expected)
  185. @pytest.mark.parametrize('data, expected_first, expected_last', [
  186. ({'id': ['A'],
  187. 'time': Timestamp('2012-02-01 14:00:00',
  188. tz='US/Central'),
  189. 'foo': [1]},
  190. {'id': ['A'],
  191. 'time': Timestamp('2012-02-01 14:00:00',
  192. tz='US/Central'),
  193. 'foo': [1]},
  194. {'id': ['A'],
  195. 'time': Timestamp('2012-02-01 14:00:00',
  196. tz='US/Central'),
  197. 'foo': [1]}),
  198. ({'id': ['A', 'B', 'A'],
  199. 'time': [Timestamp('2012-01-01 13:00:00',
  200. tz='America/New_York'),
  201. Timestamp('2012-02-01 14:00:00',
  202. tz='US/Central'),
  203. Timestamp('2012-03-01 12:00:00',
  204. tz='Europe/London')],
  205. 'foo': [1, 2, 3]},
  206. {'id': ['A', 'B'],
  207. 'time': [Timestamp('2012-01-01 13:00:00',
  208. tz='America/New_York'),
  209. Timestamp('2012-02-01 14:00:00',
  210. tz='US/Central')],
  211. 'foo': [1, 2]},
  212. {'id': ['A', 'B'],
  213. 'time': [Timestamp('2012-03-01 12:00:00',
  214. tz='Europe/London'),
  215. Timestamp('2012-02-01 14:00:00',
  216. tz='US/Central')],
  217. 'foo': [3, 2]})
  218. ])
  219. def test_first_last_tz(data, expected_first, expected_last):
  220. # GH15884
  221. # Test that the timezone is retained when calling first
  222. # or last on groupby with as_index=False
  223. df = DataFrame(data)
  224. result = df.groupby('id', as_index=False).first()
  225. expected = DataFrame(expected_first)
  226. cols = ['id', 'time', 'foo']
  227. assert_frame_equal(result[cols], expected[cols])
  228. result = df.groupby('id', as_index=False)['time'].first()
  229. assert_frame_equal(result, expected[['id', 'time']])
  230. result = df.groupby('id', as_index=False).last()
  231. expected = DataFrame(expected_last)
  232. cols = ['id', 'time', 'foo']
  233. assert_frame_equal(result[cols], expected[cols])
  234. result = df.groupby('id', as_index=False)['time'].last()
  235. assert_frame_equal(result, expected[['id', 'time']])
  236. def test_nth_multi_index_as_expected():
  237. # PR 9090, related to issue 8979
  238. # test nth on MultiIndex
  239. three_group = DataFrame(
  240. {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
  241. 'foo', 'foo', 'foo'],
  242. 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
  243. 'two', 'two', 'one'],
  244. 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
  245. 'dull', 'shiny', 'shiny', 'shiny']})
  246. grouped = three_group.groupby(['A', 'B'])
  247. result = grouped.nth(0)
  248. expected = DataFrame(
  249. {'C': ['dull', 'dull', 'dull', 'dull']},
  250. index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'],
  251. ['one', 'two', 'one', 'two']],
  252. names=['A', 'B']))
  253. assert_frame_equal(result, expected)
  254. def test_groupby_head_tail():
  255. df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
  256. g_as = df.groupby('A', as_index=True)
  257. g_not_as = df.groupby('A', as_index=False)
  258. # as_index= False, much easier
  259. assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1))
  260. assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1))
  261. empty_not_as = DataFrame(columns=df.columns,
  262. index=pd.Index([], dtype=df.index.dtype))
  263. empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype)
  264. empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype)
  265. assert_frame_equal(empty_not_as, g_not_as.head(0))
  266. assert_frame_equal(empty_not_as, g_not_as.tail(0))
  267. assert_frame_equal(empty_not_as, g_not_as.head(-1))
  268. assert_frame_equal(empty_not_as, g_not_as.tail(-1))
  269. assert_frame_equal(df, g_not_as.head(7)) # contains all
  270. assert_frame_equal(df, g_not_as.tail(7))
  271. # as_index=True, (used to be different)
  272. df_as = df
  273. assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1))
  274. assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1))
  275. empty_as = DataFrame(index=df_as.index[:0], columns=df.columns)
  276. empty_as['A'] = empty_not_as['A'].astype(df.A.dtype)
  277. empty_as['B'] = empty_not_as['B'].astype(df.B.dtype)
  278. assert_frame_equal(empty_as, g_as.head(0))
  279. assert_frame_equal(empty_as, g_as.tail(0))
  280. assert_frame_equal(empty_as, g_as.head(-1))
  281. assert_frame_equal(empty_as, g_as.tail(-1))
  282. assert_frame_equal(df_as, g_as.head(7)) # contains all
  283. assert_frame_equal(df_as, g_as.tail(7))
  284. # test with selection
  285. assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []])
  286. assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
  287. assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
  288. assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
  289. assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []])
  290. assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']])
  291. assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']])
  292. assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]])
  293. def test_group_selection_cache():
  294. # GH 12839 nth, head, and tail should return same result consistently
  295. df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
  296. expected = df.iloc[[0, 2]].set_index('A')
  297. g = df.groupby('A')
  298. result1 = g.head(n=2)
  299. result2 = g.nth(0)
  300. assert_frame_equal(result1, df)
  301. assert_frame_equal(result2, expected)
  302. g = df.groupby('A')
  303. result1 = g.tail(n=2)
  304. result2 = g.nth(0)
  305. assert_frame_equal(result1, df)
  306. assert_frame_equal(result2, expected)
  307. g = df.groupby('A')
  308. result1 = g.nth(0)
  309. result2 = g.head(n=2)
  310. assert_frame_equal(result1, expected)
  311. assert_frame_equal(result2, df)
  312. g = df.groupby('A')
  313. result1 = g.nth(0)
  314. result2 = g.tail(n=2)
  315. assert_frame_equal(result1, expected)
  316. assert_frame_equal(result2, df)
  317. def test_nth_empty():
  318. # GH 16064
  319. df = DataFrame(index=[0], columns=['a', 'b', 'c'])
  320. result = df.groupby('a').nth(10)
  321. expected = DataFrame(index=Index([], name='a'), columns=['b', 'c'])
  322. assert_frame_equal(result, expected)
  323. result = df.groupby(['a', 'b']).nth(10)
  324. expected = DataFrame(index=MultiIndex([[], []], [[], []],
  325. names=['a', 'b']),
  326. columns=['c'])
  327. assert_frame_equal(result, expected)
  328. def test_nth_column_order():
  329. # GH 20760
  330. # Check that nth preserves column order
  331. df = DataFrame([[1, 'b', 100],
  332. [1, 'a', 50],
  333. [1, 'a', np.nan],
  334. [2, 'c', 200],
  335. [2, 'd', 150]],
  336. columns=['A', 'C', 'B'])
  337. result = df.groupby('A').nth(0)
  338. expected = DataFrame([['b', 100.0],
  339. ['c', 200.0]],
  340. columns=['C', 'B'],
  341. index=Index([1, 2], name='A'))
  342. assert_frame_equal(result, expected)
  343. result = df.groupby('A').nth(-1, dropna='any')
  344. expected = DataFrame([['a', 50.0],
  345. ['d', 150.0]],
  346. columns=['C', 'B'],
  347. index=Index([1, 2], name='A'))
  348. assert_frame_equal(result, expected)