test_timegrouper.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
  1. """ test with the TimeGrouper / grouping with datetimes """
  2. from datetime import datetime
  3. import numpy as np
  4. from numpy import nan
  5. import pytest
  6. import pytz
  7. from pandas.compat import StringIO
  8. import pandas as pd
  9. from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range
  10. from pandas.core.groupby.ops import BinGrouper
  11. from pandas.util import testing as tm
  12. from pandas.util.testing import assert_frame_equal, assert_series_equal
class TestGroupBy(object):
    """Tests for grouping with ``pd.Grouper(freq=...)`` / TimeGrouper
    and for groupby over datetime-like keys, indexes and levels."""

    def test_groupby_with_timegrouper(self):
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame({
            'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ]
        })

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by='Quantity')

        # run both the sorted and unsorted frame through the same checks:
        # resample, groupby on a pre-sorted index, and groupby on an
        # unsorted index must all agree
        for df in [df_original, df_reordered]:
            df = df.set_index(['Date'])

            # expected: one 5-day bin per row of the date_range; only three
            # bins (positions 0, 6, 18) actually contain data
            expected = DataFrame(
                {'Quantity': 0},
                index=date_range('20130901',
                                 '20131205', freq='5D',
                                 name='Date', closed='left'))
            expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64')

            result1 = df.resample('5D') .sum()
            assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result2, expected)

            # grouping must also work without pre-sorting the index
            result3 = df.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result3, expected)
    @pytest.mark.parametrize("should_sort", [True, False])
    def test_groupby_with_timegrouper_methods(self, should_sort):
        # GH 3881
        # make sure API of timegrouper conforms:
        # a time-based groupby must expose group_keys, a BinGrouper
        # grouper, and a dict of groups, sorted input or not
        df = pd.DataFrame({
            'Branch': 'A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 8, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        })

        if should_sort:
            df = df.sort_values(by='Quantity', ascending=False)

        # keep 'Date' both as index and as a column (drop=False)
        df = df.set_index('Date', drop=False)
        g = df.groupby(pd.Grouper(freq='6M'))
        assert g.group_keys

        # time-based grouping is implemented via a BinGrouper
        assert isinstance(g.grouper, BinGrouper)
        groups = g.groups
        assert isinstance(groups, dict)
        # the six dates span three 6-month bins
        assert len(groups) == 3
    def test_timegrouper_with_reg_groups(self):
        # GH 3794
        # allow combination of timegrouper/reg groups
        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)

        # grouping by [annual bin, Buyer] and [6-month-start bin, Buyer]
        # must work regardless of row order
        for df in [df_original, df_sorted]:
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({
                'Buyer': 'Carl Mark Carl Joe'.split(),
                'Quantity': [1, 3, 9, 18],
                'Date': [
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

        # second fixture: all dates within a single month, so daily and
        # monthly bins differ
        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
        for df in [df_original, df_sorted]:
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark Carl Joe'.split(),
                'Quantity': [6, 8, 3, 4, 10],
                'Date': [
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # passing the name (Grouper key= refers to a column)
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            assert_frame_equal(result, expected)

            # a key that is not a column must raise
            with pytest.raises(KeyError):
                df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

            # passing the level (by name and by position)
            df = df.set_index('Date')
            result = df.groupby([pd.Grouper(freq='1M', level='Date'), 'Buyer'
                                 ]).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum(
            )
            assert_frame_equal(result, expected)

            # a level name that does not exist must raise
            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', level='foo'),
                            'Buyer']).sum()

            # multi names: a 'Date' column distinct from the 'Date' index;
            # key= must pick the column (shifted two month-ends ahead)
            df = df.copy()
            df['Date'] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer'
                                 ]).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', key='Date',
                                       level='Date'), 'Buyer']).sum()

            # single groupers: bare Grouper and one-element list must agree;
            # with no key= the index is used (October bin) ...
            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 10, 31, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M')]).sum()
            assert_frame_equal(result, expected)

            # ... while key='Date' uses the shifted column (November bin)
            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 11, 30, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
            assert_frame_equal(result, expected)
    @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR'])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        # build the expected result the long way round: per-user resample,
        # drop empty bins, then reshape to (date, user_id) MultiIndex order
        expected = (
            df.groupby('user_id')['whole_cost']
            .resample(freq)
            .sum(min_count=1)  # XXX
            .dropna()
            .reorder_levels(['date', 'user_id'])
            .sort_index()
            .astype('int64')
        )
        expected.name = 'whole_cost'

        # grouping by [time bin, user_id] must match, sorted or not
        result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[
            'whole_cost'].sum()
        assert_series_equal(result2, expected)
    def test_timegrouper_get_group(self):
        # GH 6914
        # get_group must accept a Timestamp bin label for a time Grouper,
        # both as a single key and inside a composite key
        df_original = DataFrame({
            'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [datetime(2013, 9, 1, 13, 0),
                     datetime(2013, 9, 1, 13, 5),
                     datetime(2013, 10, 1, 20, 0),
                     datetime(2013, 10, 3, 10, 0),
                     datetime(2013, 12, 2, 12, 0),
                     datetime(2013, 9, 2, 14, 0), ]
        })
        df_reordered = df_original.sort_values(by='Quantity')

        # single grouping: each monthly bin label maps to specific rows
        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]
        dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping: key is a (Buyer, month-end Timestamp) tuple
        expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                         df_original.iloc[[4]]]
        g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
                  ('Joe', '2013-12-31')]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index: same checks with 'Date' as the index (no key=)
        df_original = df_original.set_index('Date')
        df_reordered = df_original.sort_values(by='Quantity')
        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)
  284. def test_timegrouper_apply_return_type_series(self):
  285. # Using `apply` with the `TimeGrouper` should give the
  286. # same return type as an `apply` with a `Grouper`.
  287. # Issue #11742
  288. df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
  289. 'value': [10, 13]})
  290. df_dt = df.copy()
  291. df_dt['date'] = pd.to_datetime(df_dt['date'])
  292. def sumfunc_series(x):
  293. return pd.Series([x['value'].sum()], ('sum',))
  294. expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series)
  295. result = (df_dt.groupby(pd.Grouper(freq='M', key='date'))
  296. .apply(sumfunc_series))
  297. assert_frame_equal(result.reset_index(drop=True),
  298. expected.reset_index(drop=True))
    def test_timegrouper_apply_return_type_value(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
                           'value': [10, 13]})
        df_dt = df.copy()
        df_dt['date'] = pd.to_datetime(df_dt['date'])

        def sumfunc_value(x):
            # scalar reducer: per-group sum of 'value'
            return x.value.sum()

        expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value)
        # pd.TimeGrouper is deprecated, so instantiating it must emit a
        # FutureWarning while still behaving like pd.Grouper(freq=...)
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date'))
                      .apply(sumfunc_value))
        # indexes differ (dates vs bins), so compare with index dropped
        assert_series_equal(result.reset_index(drop=True),
                            expected.reset_index(drop=True))
    def test_groupby_groups_datetimeindex(self):
        # GH#1430
        # grouping by a function of a DatetimeIndex must yield datetime keys
        periods = 1000
        ind = pd.date_range(start='2012/1/1', freq='5min', periods=periods)
        df = DataFrame({'high': np.arange(periods),
                        'low': np.arange(periods)}, index=ind)
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442: .groups on a level-grouped DatetimeIndex maps each
        # Timestamp to the DatetimeIndex of its rows
        index = pd.date_range('2015/01/01', periods=5, name='date')
        df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                           'B': [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level='date').groups
        dates = ['2015-01-05', '2015-01-04', '2015-01-03',
                 '2015-01-02', '2015-01-01']
        expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                    for date in dates}
        tm.assert_dict_equal(result, expected)

        # get_group must accept a date string and return the matching row
        grouped = df.groupby(level='date')
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
            expected_index = pd.DatetimeIndex([date], name='date')
            expected = pd.DataFrame(data,
                                    columns=list('AB'),
                                    index=expected_index)
            tm.assert_frame_equal(result, expected)
    def test_groupby_groups_datetimeindex_tz(self):
        # GH 3950
        # grouping by a tz-aware datetime column must preserve the timezone
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'datetime': dates,
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})
        df['datetime'] = df['datetime'].apply(
            lambda d: Timestamp(d, tz='US/Pacific'))

        # expected index: (tz-aware datetime, label) pairs
        exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                     '2011-07-19 07:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 09:00:00',
                                     '2011-07-19 09:00:00'],
                                    tz='US/Pacific', name='datetime')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])
        result = df.groupby(['datetime', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level: tz-aware index grouped by level=0 keeps the tz too
        didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)
        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                    '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='Asia/Tokyo')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])
        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)
  382. def test_frame_datetime64_handling_groupby(self):
  383. # it works!
  384. df = DataFrame([(3, np.datetime64('2012-07-03')),
  385. (3, np.datetime64('2012-07-04'))],
  386. columns=['a', 'date'])
  387. result = df.groupby('a').first()
  388. assert result['date'][3] == Timestamp('2012-07-03')
    def test_groupby_multi_timezone(self):
        # combining multiple / different timezones yields UTC
        data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

        df = pd.read_csv(StringIO(data), header=None,
                         names=['value', 'date', 'tz'])
        # localize each date to its own row's timezone, per tz-group
        result = df.groupby('tz').date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name))

        # mixed timezones cannot share a datetime64 dtype, so the
        # combined result is an object Series of tz-aware Timestamps
        expected = Series([Timestamp('2000-01-28 16:47:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-29 16:48:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-30 16:49:00-0800',
                                     tz='America/Los_Angeles'),
                           Timestamp('2000-01-31 16:50:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-01 16:50:00-0500',
                                     tz='America/New_York')],
                          name='date',
                          dtype=object)
        assert_series_equal(result, expected)

        # get_group for a single tz keeps the original row positions (0, 1, 3)
        tz = 'America/Chicago'
        res_values = df.groupby('tz').date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00',
                             '2000-01-31 16:50:00'],
                            index=[0, 1, 3], name='date')
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        assert_series_equal(result, expected)
    def test_groupby_groups_periods(self):
        # grouping by a Period column / PeriodIndex level must preserve
        # the period freq (mirrors the tz-aware datetime test above)
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'period': [pd.Period(d, freq='H') for d in dates],
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})

        # expected index: (hourly Period, label) pairs
        exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00',
                                   '2011-07-19 07:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 09:00:00',
                                   '2011-07-19 09:00:00'],
                                  freq='H', name='period')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])
        result = df.groupby(['period', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq='H')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)
        exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00',
                                  '2011-07-19 08:00:00',
                                  '2011-07-19 09:00:00'], freq='H')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])
        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)
  455. def test_groupby_first_datetime64(self):
  456. df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
  457. df[1] = df[1].view('M8[ns]')
  458. assert issubclass(df[1].dtype.type, np.datetime64)
  459. result = df.groupby(level=0).first()
  460. got_dt = result[1].dtype
  461. assert issubclass(got_dt.type, np.datetime64)
  462. result = df[1].groupby(level=0).first()
  463. got_dt = result.dtype
  464. assert issubclass(got_dt.type, np.datetime64)
  465. def test_groupby_max_datetime64(self):
  466. # GH 5869
  467. # datetimelike dtype conversion from int
  468. df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
  469. expected = df.groupby('A')['A'].apply(lambda x: x.max())
  470. result = df.groupby('A')['A'].max()
  471. assert_series_equal(result, expected)
  472. def test_groupby_datetime64_32_bit(self):
  473. # GH 6410 / numpy 4328
  474. # 32-bit under 1.9-dev indexing issue
  475. df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2})
  476. result = df.groupby("A")["B"].transform(min)
  477. expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B')
  478. assert_series_equal(result, expected)
  479. def test_groupby_with_timezone_selection(self):
  480. # GH 11616
  481. # Test that column selection returns output in correct timezone.
  482. np.random.seed(42)
  483. df = pd.DataFrame({
  484. 'factor': np.random.randint(0, 3, size=60),
  485. 'time': pd.date_range('01/01/2000 00:00', periods=60,
  486. freq='s', tz='UTC')
  487. })
  488. df1 = df.groupby('factor').max()['time']
  489. df2 = df.groupby('factor')['time'].max()
  490. tm.assert_series_equal(df1, df2)
  491. def test_timezone_info(self):
  492. # see gh-11682: Timezone info lost when broadcasting
  493. # scalar datetime to DataFrame
  494. df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]})
  495. assert df['b'][0].tzinfo == pytz.utc
  496. df = pd.DataFrame({'a': [1, 2, 3]})
  497. df['b'] = datetime.now(pytz.utc)
  498. assert df['b'][0].tzinfo == pytz.utc
  499. def test_datetime_count(self):
  500. df = DataFrame({'a': [1, 2, 3] * 2,
  501. 'dates': pd.date_range('now', periods=6, freq='T')})
  502. result = df.groupby('a').dates.count()
  503. expected = Series([
  504. 2, 2, 2
  505. ], index=Index([1, 2, 3], name='a'), name='dates')
  506. tm.assert_series_equal(result, expected)
    def test_first_last_max_min_on_time_data(self):
        # GH 10295
        # Verify that NaT is not in the result of max, min, first and last on
        # Dataframe with datetime or timedelta values.
        from datetime import timedelta as td
        df_test = DataFrame(
            {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11',
                    '2015-07-23 12:12', nan],
             'td': [nan, td(days=1), td(days=2), td(days=3), nan]})
        # nan entries become NaT after conversion
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test['group'] = 'A'
        # reference frame with the NaT/nan rows removed
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby('group')
        grouped_ref = df_ref.groupby('group')

        # aggregating with or without the NaT rows must give identical
        # results, i.e. NaT is skipped rather than propagated
        assert_frame_equal(grouped_ref.max(), grouped_test.max())
        assert_frame_equal(grouped_ref.min(), grouped_test.min())
        assert_frame_equal(grouped_ref.first(), grouped_test.first())
        assert_frame_equal(grouped_ref.last(), grouped_test.last())
  525. def test_nunique_with_timegrouper_and_nat(self):
  526. # GH 17575
  527. test = pd.DataFrame({
  528. 'time': [Timestamp('2016-06-28 09:35:35'),
  529. pd.NaT,
  530. Timestamp('2016-06-28 16:46:28')],
  531. 'data': ['1', '2', '3']})
  532. grouper = pd.Grouper(key='time', freq='h')
  533. result = test.groupby(grouper)['data'].nunique()
  534. expected = test[test.time.notnull()].groupby(grouper)['data'].nunique()
  535. tm.assert_series_equal(result, expected)
  536. def test_scalar_call_versus_list_call(self):
  537. # Issue: 17530
  538. data_frame = {
  539. 'location': ['shanghai', 'beijing', 'shanghai'],
  540. 'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15',
  541. '2017-08-11 22:23:15'],
  542. dtype='datetime64[ns]'),
  543. 'value': [1, 2, 3]
  544. }
  545. data_frame = pd.DataFrame(data_frame).set_index('time')
  546. grouper = pd.Grouper(freq='D')
  547. grouped = data_frame.groupby(grouper)
  548. result = grouped.count()
  549. grouped = data_frame.groupby([grouper])
  550. expected = grouped.count()
  551. assert_frame_equal(result, expected)