groupby.py 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110
"""
Provide the groupby split-apply-combine paradigm. Define the GroupBy
class providing the base-class of operations.

The SeriesGroupBy and DataFrameGroupBy sub-class
(defined in pandas.core.groupby.generic)
expose these user-facing objects to provide specific functionality.
"""
  8. import collections
  9. from contextlib import contextmanager
  10. import datetime
  11. from functools import partial, wraps
  12. import types
  13. import warnings
  14. import numpy as np
  15. from pandas._libs import Timestamp, groupby as libgroupby
  16. import pandas.compat as compat
  17. from pandas.compat import callable, range, set_function_name, zip
  18. from pandas.compat.numpy import function as nv
  19. from pandas.errors import AbstractMethodError
  20. from pandas.util._decorators import Appender, Substitution, cache_readonly
  21. from pandas.util._validators import validate_kwargs
  22. from pandas.core.dtypes.cast import maybe_downcast_to_dtype
  23. from pandas.core.dtypes.common import (
  24. ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
  25. from pandas.core.dtypes.missing import isna, notna
  26. import pandas.core.algorithms as algorithms
  27. from pandas.core.base import (
  28. DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError)
  29. import pandas.core.common as com
  30. from pandas.core.config import option_context
  31. from pandas.core.frame import DataFrame
  32. from pandas.core.generic import NDFrame
  33. from pandas.core.groupby import base
  34. from pandas.core.index import Index, MultiIndex
  35. from pandas.core.series import Series
  36. from pandas.core.sorting import get_group_index_sorter
# Shared "See Also" docstring fragment appended to several GroupBy methods;
# %(name)s is filled in with the method name via Substitution/Appender.
_common_see_also = """
See Also
--------
pandas.Series.%(name)s
pandas.DataFrame.%(name)s
pandas.Panel.%(name)s
"""
  44. _apply_docs = dict(
  45. template="""
  46. Apply function `func` group-wise and combine the results together.
  47. The function passed to `apply` must take a {input} as its first
  48. argument and return a DataFrame, Series or scalar. `apply` will
  49. then take care of combining the results back together into a single
  50. dataframe or series. `apply` is therefore a highly flexible
  51. grouping method.
  52. While `apply` is a very flexible method, its downside is that
  53. using it can be quite a bit slower than using more specific methods
  54. like `agg` or `transform`. Pandas offers a wide range of method that will
  55. be much faster than using `apply` for their specific purposes, so try to
  56. use them before reaching for `apply`.
  57. Parameters
  58. ----------
  59. func : callable
  60. A callable that takes a {input} as its first argument, and
  61. returns a dataframe, a series or a scalar. In addition the
  62. callable may take positional and keyword arguments.
  63. args, kwargs : tuple and dict
  64. Optional positional and keyword arguments to pass to `func`.
  65. Returns
  66. -------
  67. applied : Series or DataFrame
  68. See Also
  69. --------
  70. pipe : Apply function to the full GroupBy object instead of to each
  71. group.
  72. aggregate : Apply aggregate function to the GroupBy object.
  73. transform : Apply function column-by-column to the GroupBy object.
  74. Series.apply : Apply a function to a Series.
  75. DataFrame.apply : Apply a function to each row or column of a DataFrame.
  76. """,
  77. dataframe_examples="""
  78. >>> df = pd.DataFrame({'A': 'a a b'.split(),
  79. 'B': [1,2,3],
  80. 'C': [4,6, 5]})
  81. >>> g = df.groupby('A')
  82. Notice that ``g`` has two groups, ``a`` and ``b``.
  83. Calling `apply` in various ways, we can get different grouping results:
  84. Example 1: below the function passed to `apply` takes a DataFrame as
  85. its argument and returns a DataFrame. `apply` combines the result for
  86. each group together into a new DataFrame:
  87. >>> g[['B', 'C']].apply(lambda x: x / x.sum())
  88. B C
  89. 0 0.333333 0.4
  90. 1 0.666667 0.6
  91. 2 1.000000 1.0
  92. Example 2: The function passed to `apply` takes a DataFrame as
  93. its argument and returns a Series. `apply` combines the result for
  94. each group together into a new DataFrame:
  95. >>> g[['B', 'C']].apply(lambda x: x.max() - x.min())
  96. B C
  97. A
  98. a 1 2
  99. b 0 0
  100. Example 3: The function passed to `apply` takes a DataFrame as
  101. its argument and returns a scalar. `apply` combines the result for
  102. each group together into a Series, including setting the index as
  103. appropriate:
  104. >>> g.apply(lambda x: x.C.max() - x.B.min())
  105. A
  106. a 5
  107. b 2
  108. dtype: int64
  109. """,
  110. series_examples="""
  111. >>> s = pd.Series([0, 1, 2], index='a a b'.split())
  112. >>> g = s.groupby(s.index)
  113. From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
  114. Calling `apply` in various ways, we can get different grouping results:
  115. Example 1: The function passed to `apply` takes a Series as
  116. its argument and returns a Series. `apply` combines the result for
  117. each group together into a new Series:
  118. >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
  119. 0 0.0
  120. 1 0.5
  121. 2 4.0
  122. dtype: float64
  123. Example 2: The function passed to `apply` takes a Series as
  124. its argument and returns a scalar. `apply` combines the result for
  125. each group together into a Series, including setting the index as
  126. appropriate:
  127. >>> g.apply(lambda x: x.max() - x.min())
  128. a 1
  129. b 0
  130. dtype: int64
  131. Notes
  132. -----
  133. In the current implementation `apply` calls `func` twice on the
  134. first group to decide whether it can take a fast or slow code
  135. path. This can lead to unexpected behavior if `func` has
  136. side-effects, as they will take effect twice for the first
  137. group.
  138. Examples
  139. --------
  140. {examples}
  141. """)
# %-style docstring template shared by GroupBy/Resampler ``pipe``; filled in
# via Substitution(klass=..., versionadded=..., examples=...) + Appender.
_pipe_template = """\
Apply a function `func` with arguments to this %(klass)s object and return
the function's result.

%(versionadded)s

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, string)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    positional arguments passed into `func`.
kwargs : dict, optional
    a dictionary of keyword arguments passed into `func`.

Returns
-------
object : the return type of `func`.

See Also
--------
pandas.Series.pipe : Apply a function with arguments to a series.
pandas.DataFrame.pipe: Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<http://pandas.pydata.org/pandas-docs/stable/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""
# %-style docstring template for GroupBy.transform; %(klass)s is substituted
# per concrete class (Series/DataFrame) via Substitution + Appender.
_transform_template = """
Call function producing a like-indexed %(klass)s on each group and
return a %(klass)s having the same indexes as the original object
filled with the transformed values

Parameters
----------
f : function
    Function to apply to each group

Returns
-------
%(klass)s

See Also
--------
aggregate, transform

Notes
-----
Each group is endowed the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, f returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results.

Examples
--------

# Same shape
>>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
...                           'foo', 'bar'],
...                    'B' : ['one', 'one', 'two', 'three',
...                           'two', 'two'],
...                    'C' : [1, 5, 5, 2, 5, 5],
...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
>>> grouped = df.groupby('A')
>>> grouped.transform(lambda x: (x - x.mean()) / x.std())
          C         D
0 -1.154701 -0.577350
1  0.577350  0.000000
2  0.577350  1.154701
3 -1.154701 -1.000000
4  0.577350 -0.577350
5  0.577350  1.000000

# Broadcastable
>>> grouped.transform(lambda x: x.max() - x.min())
   C    D
0  4  6.0
1  3  8.0
2  4  6.0
3  3  8.0
4  4  6.0
5  3  8.0
"""
  240. class GroupByPlot(PandasObject):
  241. """
  242. Class implementing the .plot attribute for groupby objects.
  243. """
  244. def __init__(self, groupby):
  245. self._groupby = groupby
  246. def __call__(self, *args, **kwargs):
  247. def f(self):
  248. return self.plot(*args, **kwargs)
  249. f.__name__ = 'plot'
  250. return self._groupby.apply(f)
  251. def __getattr__(self, name):
  252. def attr(*args, **kwargs):
  253. def f(self):
  254. return getattr(self.plot, name)(*args, **kwargs)
  255. return self._groupby.apply(f)
  256. return attr
  257. @contextmanager
  258. def _group_selection_context(groupby):
  259. """
  260. Set / reset the _group_selection_context.
  261. """
  262. groupby._set_group_selection()
  263. yield groupby
  264. groupby._reset_group_selection()
class _GroupBy(PandasObject, SelectionMixin):
    """
    Base class implementing the split-apply-combine machinery; the
    user-facing SeriesGroupBy / DataFrameGroupBy subclasses live in
    pandas.core.groupby.generic.
    """

    # Active group-based column selection (list of labels), or None when no
    # grouper-derived selection is in effect; see _set_group_selection.
    _group_selection = None
    # Names of underlying-object methods that __getattr__ may dispatch
    # group-wise; subclasses override with a non-empty whitelist.
    _apply_whitelist = frozenset()
    def __init__(self, obj, keys=None, axis=0, level=None,
                 grouper=None, exclusions=None, selection=None, as_index=True,
                 sort=True, group_keys=True, squeeze=False,
                 observed=False, **kwargs):
        """
        Initialize the GroupBy object.

        Parameters mirror ``DataFrame.groupby``; internal callers may pass a
        pre-computed ``grouper``/``exclusions`` to skip grouper resolution.
        The only extra keyword accepted via **kwargs is ``mutated``.
        """
        self._selection = selection

        if isinstance(obj, NDFrame):
            obj._consolidate_inplace()

        self.level = level

        if not as_index:
            # as_index=False is only meaningful for a DataFrame on axis 0
            if not isinstance(obj, DataFrame):
                raise TypeError('as_index=False only valid with DataFrame')
            if axis != 0:
                raise ValueError('as_index=False only valid for axis=0')

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.squeeze = squeeze
        self.observed = observed
        self.mutated = kwargs.pop('mutated', False)

        if grouper is None:
            # local import to avoid a circular dependency with grouper.py
            from pandas.core.groupby.grouper import _get_grouper
            grouper, exclusions, obj = _get_grouper(obj, keys,
                                                    axis=axis,
                                                    level=level,
                                                    sort=sort,
                                                    observed=observed,
                                                    mutated=self.mutated)

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = set(exclusions) if exclusions else set()

        # we accept no other args
        validate_kwargs('group', kwargs, {})
    def __len__(self):
        # the length of a GroupBy is its number of groups
        return len(self.groups)
    def __unicode__(self):
        # TODO: Better unicode/repr for GroupBy object
        return object.__repr__(self)
    def _assure_grouper(self):
        """
        We create the grouper on instantiation sub-classes may have a
        different policy.
        """
        # intentionally a no-op here; hook for lazily-grouping subclasses
        pass
    @property
    def groups(self):
        """
        Dict {group name -> group labels}.
        """
        self._assure_grouper()
        return self.grouper.groups
    @property
    def ngroups(self):
        """
        Number of groups.
        """
        self._assure_grouper()
        return self.grouper.ngroups
    @property
    def indices(self):
        """
        Dict {group name -> group indices}.
        """
        self._assure_grouper()
        return self.grouper.indices
    def _get_indices(self, names):
        """
        Safe get multiple indices, translate keys for
        datelike to underlying repr.

        Parameters
        ----------
        names : sequence
            Group names; tuples when grouping by multiple keys.

        Returns
        -------
        list
            One entry per name: the group's positional indices, or an
            empty list when the name matches no group.
        """

        def get_converter(s):
            # possibly convert to the actual key types
            # in the indices, could be a Timestamp or a np.datetime64
            if isinstance(s, (Timestamp, datetime.datetime)):
                return lambda key: Timestamp(key)
            elif isinstance(s, np.datetime64):
                return lambda key: Timestamp(key).asm8
            else:
                return lambda key: key

        if len(names) == 0:
            return []

        if len(self.indices) > 0:
            index_sample = next(iter(self.indices))
        else:
            index_sample = None  # Dummy sample

        name_sample = names[0]
        if isinstance(index_sample, tuple):
            if not isinstance(name_sample, tuple):
                msg = ("must supply a tuple to get_group with multiple"
                       " grouping keys")
                raise ValueError(msg)
            if not len(name_sample) == len(index_sample):
                try:
                    # If the original grouper was a tuple
                    return [self.indices[name] for name in names]
                except KeyError:
                    # turns out it wasn't a tuple
                    msg = ("must supply a same-length tuple to get_group"
                           " with multiple grouping keys")
                    raise ValueError(msg)

            # convert each element of each name with the converter matched
            # to the corresponding element of the sampled index key
            converters = [get_converter(s) for s in index_sample]
            names = [tuple(f(n) for f, n in zip(converters, name))
                     for name in names]

        else:
            converter = get_converter(index_sample)
            names = [converter(name) for name in names]

        return [self.indices.get(name, []) for name in names]
  373. def _get_index(self, name):
  374. """
  375. Safe get index, translate keys for datelike to underlying repr.
  376. """
  377. return self._get_indices([name])[0]
    @cache_readonly
    def _selected_obj(self):
        """
        The object this groupby operates on, honoring any explicit or
        group-based column selection.

        Cached; the cache is invalidated whenever the group selection
        changes (see GH12839).
        """
        if self._selection is None or isinstance(self.obj, Series):
            if self._group_selection is not None:
                return self.obj[self._group_selection]
            return self.obj
        else:
            return self.obj[self._selection]
  386. def _reset_group_selection(self):
  387. """
  388. Clear group based selection.
  389. Used for methods needing to return info on each group regardless of
  390. whether a group selection was previously set.
  391. """
  392. if self._group_selection is not None:
  393. # GH12839 clear cached selection too when changing group selection
  394. self._group_selection = None
  395. self._reset_cache('_selected_obj')
  396. def _set_group_selection(self):
  397. """
  398. Create group based selection.
  399. Used when selection is not passed directly but instead via a grouper.
  400. NOTE: this should be paired with a call to _reset_group_selection
  401. """
  402. grp = self.grouper
  403. if not (self.as_index and
  404. getattr(grp, 'groupings', None) is not None and
  405. self.obj.ndim > 1 and
  406. self._group_selection is None):
  407. return
  408. ax = self.obj._info_axis
  409. groupers = [g.name for g in grp.groupings
  410. if g.level is None and g.in_axis]
  411. if len(groupers):
  412. # GH12839 clear selected obj cache when group selection changes
  413. self._group_selection = ax.difference(Index(groupers),
  414. sort=False).tolist()
  415. self._reset_cache('_selected_obj')
    def _set_result_index_ordered(self, result):
        # set the result index on the passed values object and
        # return the new object, xref 8046

        # the values/counts are repeated according to the group index
        # shortcut if we have an already ordered grouper
        if not self.grouper.is_monotonic:
            # reorder rows back to the original observation order by
            # concatenating each group's row positions, then sorting
            index = Index(np.concatenate(
                self._get_indices(self.grouper.result_index)))
            result.set_axis(index, axis=self.axis, inplace=True)
            result = result.sort_index(axis=self.axis)

        # finally re-attach the original axis labels in place
        result.set_axis(self.obj._get_axis(self.axis), axis=self.axis,
                        inplace=True)
        return result
  429. def _dir_additions(self):
  430. return self.obj._dir_additions() | self._apply_whitelist
    def __getattr__(self, attr):
        # only reached when normal attribute lookup fails
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            # column access, e.g. ``grouped.A`` -> ``grouped['A']``
            return self[attr]
        if hasattr(self.obj, attr):
            # dispatch the underlying object's method group-wise
            return self._make_wrapper(attr)

        raise AttributeError("%r object has no attribute %r" %
                             (type(self).__name__, attr))
    # pipe: docstring assembled from _pipe_template via Substitution/Appender
    @Substitution(klass='GroupBy',
                  versionadded='.. versionadded:: 0.21.0',
                  examples="""\
>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
>>> df
   A  B
0  a  1
1  b  2
2  a  3
3  b  4

To get the difference between each groups maximum and minimum value in one
pass, you can do

>>> df.groupby('A').pipe(lambda x: x.max() - x.min())
   B
A
a  2
b  2""")
    @Appender(_pipe_template)
    def pipe(self, func, *args, **kwargs):
        return com._pipe(self, func, *args, **kwargs)
    # ``.plot`` accessor: property() calls GroupByPlot(self) on access,
    # giving each instance a plotting proxy bound to this groupby
    plot = property(GroupByPlot)
    def _make_wrapper(self, name):
        """
        Build a group-wise wrapper around attribute ``name`` of the selected
        object.

        Raises AttributeError when ``name`` is not in the apply whitelist.
        Non-method attributes are fetched per group via ``apply``; methods
        get a wrapper that tries axis-aware application first and falls back
        progressively.
        """
        if name not in self._apply_whitelist:
            is_callable = callable(getattr(self._selected_obj, name, None))
            kind = ' callable ' if is_callable else ' '
            msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
                   "using the 'apply' method".format(kind, name,
                                                     type(self).__name__))
            raise AttributeError(msg)

        self._set_group_selection()

        # need to setup the selection
        # as are not passed directly but in the grouper
        f = getattr(self._selected_obj, name)
        if not isinstance(f, types.MethodType):
            # plain attribute (not a bound method): fetch it per group
            return self.apply(lambda self: getattr(self, name))

        # re-fetch as the unbound function so it can be applied to each group
        f = getattr(type(self._selected_obj), name)

        def wrapper(*args, **kwargs):
            # a little trickery for aggregation functions that need an axis
            # argument
            kwargs_with_axis = kwargs.copy()
            if ('axis' not in kwargs_with_axis or
                    kwargs_with_axis['axis'] is None):
                kwargs_with_axis['axis'] = self.axis

            def curried_with_axis(x):
                return f(x, *args, **kwargs_with_axis)

            def curried(x):
                return f(x, *args, **kwargs)

            # preserve the name so we can detect it when calling plot methods,
            # to avoid duplicates
            curried.__name__ = curried_with_axis.__name__ = name

            # special case otherwise extra plots are created when catching the
            # exception below
            if name in base.plotting_methods:
                return self.apply(curried)

            try:
                return self.apply(curried_with_axis)
            except Exception:
                try:
                    return self.apply(curried)
                except Exception:

                    # related to : GH3688
                    # try item-by-item
                    # this can be called recursively, so need to raise
                    # ValueError
                    # if we don't have this method to indicated to aggregate to
                    # mark this column as an error
                    try:
                        return self._aggregate_item_by_item(name,
                                                            *args, **kwargs)
                    except (AttributeError):
                        raise ValueError

        return wrapper
  512. def get_group(self, name, obj=None):
  513. """
  514. Constructs NDFrame from group with provided name.
  515. Parameters
  516. ----------
  517. name : object
  518. the name of the group to get as a DataFrame
  519. obj : NDFrame, default None
  520. the NDFrame to take the DataFrame out of. If
  521. it is None, the object groupby was called on will
  522. be used
  523. Returns
  524. -------
  525. group : same type as obj
  526. """
  527. if obj is None:
  528. obj = self._selected_obj
  529. inds = self._get_index(name)
  530. if not len(inds):
  531. raise KeyError(name)
  532. return obj._take(inds, axis=self.axis)
    def __iter__(self):
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        return self.grouper.get_iterator(self.obj, axis=self.axis)
    @Appender(_apply_docs['template']
              .format(input="dataframe",
                      examples=_apply_docs['dataframe_examples']))
    def apply(self, func, *args, **kwargs):

        func = self._is_builtin_func(func)

        # this is needed so we don't try and wrap strings. If we could
        # resolve functions to their callable functions prior, this
        # wouldn't be needed
        if args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    # silence numpy warnings inside the user function
                    with np.errstate(all='ignore'):
                        return func(g, *args, **kwargs)
            else:
                raise ValueError('func must be a callable if args or '
                                 'kwargs are supplied')
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context('mode.chained_assignment', None):
            try:
                result = self._python_apply_general(f)
            except Exception:

                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column

                with _group_selection_context(self):
                    return self._python_apply_general(f)

        return result
    def _python_apply_general(self, f):
        """
        Apply ``f`` to each group in Python and wrap the combined results.
        """
        keys, values, mutated = self.grouper.apply(f, self._selected_obj,
                                                   self.axis)

        return self._wrap_applied_output(
            keys,
            values,
            # re-indexing is required if either this groupby or the
            # apply itself mutated the data
            not_indexed_same=mutated or self.mutated)
    def _iterate_slices(self):
        # default: a single (name, object) slice; DataFrameGroupBy
        # overrides this to yield one slice per column
        yield self._selection_name, self._selected_obj
    def transform(self, func, *args, **kwargs):
        # abstract here; implemented by SeriesGroupBy / DataFrameGroupBy
        raise AbstractMethodError(self)
    def _cumcount_array(self, ascending=True):
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        this is currently implementing sort=False
        (though the default is sort=True) for groupby in general
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        # run: True at the first position of each group-id run (post-sort)
        run = np.r_[True, ids[:-1] != ids[1:]]
        # rep: the length of each run
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        # running position, not yet zeroed at group starts
        out = (~run).cumsum()

        if ascending:
            # subtract each run's starting offset -> 0..len(group)-1
            out -= np.repeat(out[run], rep)
        else:
            # subtract from each run's last offset -> len(group)-1..0
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        # invert the sort so results line up with the original row order
        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev].astype(np.int64, copy=False)
    def _try_cast(self, result, obj, numeric_only=False):
        """
        Try to cast the result to our obj original type,
        we may have roundtripped through object in the mean-time.

        If numeric_only is True, then only try to cast numerics
        and not datetimelikes.
        """
        if obj.ndim > 1:
            dtype = obj._values.dtype
        else:
            dtype = obj.dtype

        if not is_scalar(result):
            if is_extension_array_dtype(dtype):
                # The function can return something of any type, so check
                # if the type is compatible with the calling EA.
                try:
                    result = obj._values._from_sequence(result, dtype=dtype)
                except Exception:
                    # https://github.com/pandas-dev/pandas/issues/22850
                    # pandas has no control over what 3rd-party ExtensionArrays
                    # do in _values_from_sequence. We still want ops to work
                    # though, so we catch any regular Exception.
                    pass
            # NOTE: ``and`` binds tighter than ``or``, so this condition is
            # (numeric_only and is_numeric_dtype(dtype)) or (not numeric_only)
            elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
                result = maybe_downcast_to_dtype(result, dtype)

        return result
  639. def _transform_should_cast(self, func_nm):
  640. """
  641. Parameters:
  642. -----------
  643. func_nm: str
  644. The name of the aggregation function being performed
  645. Returns:
  646. --------
  647. bool
  648. Whether transform should attempt to cast the result of aggregation
  649. """
  650. return (self.size().fillna(0) > 0).any() and (
  651. func_nm not in base.cython_cast_blacklist)
def _cython_transform(self, how, numeric_only=True, **kwargs):
    """
    Apply a cython transformation kernel (e.g. cumsum, rank) to each
    selected slice and wrap the collected results.

    Parameters
    ----------
    how : str
        Name of the cython transform kernel.
    numeric_only : bool, default True
        If True, non-numeric slices are skipped.
    **kwargs
        Forwarded to ``self.grouper.transform``.

    Raises
    ------
    DataError
        If no slice could be transformed.
    """
    # OrderedDict keeps the output columns in iteration order.
    output = collections.OrderedDict()
    for name, obj in self._iterate_slices():
        is_numeric = is_numeric_dtype(obj.dtype)
        if numeric_only and not is_numeric:
            continue
        try:
            result, names = self.grouper.transform(obj.values, how,
                                                   **kwargs)
        except NotImplementedError:
            # kernel does not support this dtype; skip the slice
            continue
        except AssertionError as e:
            raise GroupByError(str(e))
        if self._transform_should_cast(how):
            output[name] = self._try_cast(result, obj)
        else:
            output[name] = result

    if len(output) == 0:
        raise DataError('No numeric types to aggregate')

    # `names` is always bound here: output being non-empty implies at
    # least one grouper.transform call succeeded.
    return self._wrap_transformed_output(output, names)
  672. def _cython_agg_general(self, how, alt=None, numeric_only=True,
  673. min_count=-1):
  674. output = {}
  675. for name, obj in self._iterate_slices():
  676. is_numeric = is_numeric_dtype(obj.dtype)
  677. if numeric_only and not is_numeric:
  678. continue
  679. try:
  680. result, names = self.grouper.aggregate(obj.values, how,
  681. min_count=min_count)
  682. except AssertionError as e:
  683. raise GroupByError(str(e))
  684. output[name] = self._try_cast(result, obj)
  685. if len(output) == 0:
  686. raise DataError('No numeric types to aggregate')
  687. return self._wrap_aggregated_output(output, names)
def _python_agg_general(self, func, *args, **kwargs):
    """
    Aggregate each selected slice with a python-level function,
    falling back to a generic apply when nothing could be aggregated.
    """
    func = self._is_builtin_func(func)
    f = lambda x: func(x, *args, **kwargs)

    # iterate through "columns" ex exclusions to populate output dict
    output = {}
    for name, obj in self._iterate_slices():
        try:
            result, counts = self.grouper.agg_series(obj, f)
            output[name] = self._try_cast(result, obj, numeric_only=True)
        except TypeError:
            # slice is incompatible with the function; skip it
            continue

    if len(output) == 0:
        # nothing aggregated -- fall back to applying f group-wise
        return self._python_apply_general(f)

    if self.grouper._filter_empty_groups:
        # NOTE(review): `counts` comes from the last successful slice;
        # presumably all slices share the same group counts -- confirm.
        mask = counts.ravel() > 0
        for name, result in compat.iteritems(output):

            # since we are masking, make sure that we have a float object
            values = result
            if is_numeric_dtype(values.dtype):
                values = ensure_float(values)

            output[name] = self._try_cast(values[mask], result)

    return self._wrap_aggregated_output(output)
def _wrap_applied_output(self, *args, **kwargs):
    """
    Wrap the output of a groupby ``apply``; concrete subclasses must
    override this.
    """
    raise AbstractMethodError(self)
def _concat_objects(self, keys, values, not_indexed_same=False):
    """
    Concatenate the per-group result objects back into a single
    Series/DataFrame, restoring the original index (or building a
    keyed MultiIndex) depending on ``not_indexed_same``, group_keys
    and as_index.
    """
    from pandas.core.reshape.concat import concat

    def reset_identity(values):
        # reset the identities of the components
        # of the values to prevent aliasing
        for v in com._not_none(*values):
            ax = v._get_axis(self.axis)
            ax._reset_identity()
        return values

    if not not_indexed_same:
        # results share the original index: concat then reindex back
        result = concat(values, axis=self.axis)
        ax = self._selected_obj._get_axis(self.axis)

        if isinstance(result, Series):
            result = result.reindex(ax)
        else:

            # this is a very unfortunate situation
            # we have a multi-index that is NOT lexsorted
            # and we have a result which is duplicated
            # we can't reindex, so we resort to this
            # GH 14776
            if isinstance(ax, MultiIndex) and not ax.is_unique:
                indexer = algorithms.unique1d(
                    result.index.get_indexer_for(ax.values))
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis)

    elif self.group_keys:

        values = reset_identity(values)
        if self.as_index:

            # possible MI return case
            group_keys = keys
            group_levels = self.grouper.levels
            group_names = self.grouper.names

            result = concat(values, axis=self.axis, keys=group_keys,
                            levels=group_levels, names=group_names,
                            sort=False)
        else:

            # GH5610, returns a MI, with the first level being a
            # range index
            keys = list(range(len(values)))
            result = concat(values, axis=self.axis, keys=keys)
    else:
        values = reset_identity(values)
        result = concat(values, axis=self.axis)

    # propagate the selection name for a selected-column groupby
    if (isinstance(result, Series) and
            getattr(self, '_selection_name', None) is not None):

        result.name = self._selection_name

    return result
  760. def _apply_filter(self, indices, dropna):
  761. if len(indices) == 0:
  762. indices = np.array([], dtype='int64')
  763. else:
  764. indices = np.sort(np.concatenate(indices))
  765. if dropna:
  766. filtered = self._selected_obj.take(indices, axis=self.axis)
  767. else:
  768. mask = np.empty(len(self._selected_obj.index), dtype=bool)
  769. mask.fill(False)
  770. mask[indices.astype(int)] = True
  771. # mask fails to broadcast when passed to where; broadcast manually.
  772. mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
  773. filtered = self._selected_obj.where(mask) # Fill with NaNs.
  774. return filtered
  775. class GroupBy(_GroupBy):
  776. """
  777. Class for grouping and aggregating relational data.
  778. See aggregate, transform, and apply functions on this object.
  779. It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
  780. ::
  781. grouped = groupby(obj, ...)
  782. Parameters
  783. ----------
  784. obj : pandas object
  785. axis : int, default 0
  786. level : int, default None
  787. Level of MultiIndex
  788. groupings : list of Grouping objects
  789. Most users should ignore this
  790. exclusions : array-like, optional
  791. List of columns to exclude
  792. name : string
  793. Most users should ignore this
  794. Returns
  795. -------
  796. **Attributes**
  797. groups : dict
  798. {group name -> group labels}
  799. len(grouped) : int
  800. Number of groups
  801. Notes
  802. -----
  803. After grouping, see aggregate, apply, and transform functions. Here are
  804. some other brief notes about usage. When grouping by multiple groups, the
  805. result index will be a MultiIndex (hierarchical) by default.
  806. Iteration produces (key, group) tuples, i.e. chunking the data by group. So
  807. you can write code like:
  808. ::
  809. grouped = obj.groupby(keys, axis=axis)
  810. for key, group in grouped:
  811. # do something with the data
  812. Function calls on GroupBy, if not specially implemented, "dispatch" to the
  813. grouped data. So if you group a DataFrame and wish to invoke the std()
  814. method on each group, you can simply do:
  815. ::
  816. df.groupby(mapper).std()
  817. rather than
  818. ::
  819. df.groupby(mapper).aggregate(np.std)
  820. You can pass arguments to these "wrapped" functions, too.
See the online documentation for full exposition on these topics and much
more.
"""
  824. def _bool_agg(self, val_test, skipna):
  825. """
  826. Shared func to call any / all Cython GroupBy implementations.
  827. """
  828. def objs_to_bool(vals):
  829. try:
  830. vals = vals.astype(np.bool)
  831. except ValueError: # for objects
  832. vals = np.array([bool(x) for x in vals])
  833. return vals.view(np.uint8)
  834. def result_to_bool(result):
  835. return result.astype(np.bool, copy=False)
  836. return self._get_cythonized_result('group_any_all', self.grouper,
  837. aggregate=True,
  838. cython_dtype=np.uint8,
  839. needs_values=True,
  840. needs_mask=True,
  841. pre_processing=objs_to_bool,
  842. post_processing=result_to_bool,
  843. val_test=val_test, skipna=skipna)
@Substitution(name='groupby')
@Appender(_common_see_also)
def any(self, skipna=True):
    """
    Returns True if any value in the group is truthful, else False.

    Parameters
    ----------
    skipna : bool, default True
        Flag to ignore nan values during truth testing.
    """
    return self._bool_agg('any', skipna)
@Substitution(name='groupby')
@Appender(_common_see_also)
def all(self, skipna=True):
    """
    Returns True if all values in the group are truthful, else False.

    Parameters
    ----------
    skipna : bool, default True
        Flag to ignore nan values during truth testing.
    """
    return self._bool_agg('all', skipna)
@Substitution(name='groupby')
@Appender(_common_see_also)
def count(self):
    """
    Compute count of group, excluding missing values.
    """
    # defined here for API doc only; concrete subclasses implement it
    raise NotImplementedError
@Substitution(name='groupby', see_also=_common_see_also)
def mean(self, *args, **kwargs):
    """
    Compute mean of groups, excluding missing values.

    Returns
    -------
    pandas.Series or pandas.DataFrame
    %(see_also)s
    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
    ...                    'B': [np.nan, 2, 3, 4, 5],
    ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

    Groupby one column and return the mean of the remaining columns in
    each group.

    >>> df.groupby('A').mean()
         B         C
    A
    1  3.0  1.333333
    2  4.0  1.500000

    Groupby two columns and return the mean of the remaining column.

    >>> df.groupby(['A', 'B']).mean()
           C
    A B
    1 2.0  2
      4.0  1
    2 3.0  1
      5.0  2

    Groupby one column and return the mean of only particular column in
    the group.

    >>> df.groupby('A')['B'].mean()
    A
    1    3.0
    2    4.0
    Name: B, dtype: float64
    """
    nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
    try:
        return self._cython_agg_general('mean', **kwargs)
    except GroupByError:
        # already a groupby-specific error; propagate unchanged
        raise
    except Exception:  # pragma: no cover
        # python fallback for dtypes the cython kernel rejects
        with _group_selection_context(self):
            f = lambda x: x.mean(axis=self.axis, **kwargs)
            return self._python_agg_general(f)
@Substitution(name='groupby')
@Appender(_common_see_also)
def median(self, **kwargs):
    """
    Compute median of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex
    """
    try:
        return self._cython_agg_general('median', **kwargs)
    except GroupByError:
        # already a groupby-specific error; propagate unchanged
        raise
    except Exception:  # pragma: no cover
        # python fallback for dtypes the cython kernel rejects
        def f(x):
            if isinstance(x, np.ndarray):
                x = Series(x)
            return x.median(axis=self.axis, **kwargs)
        with _group_selection_context(self):
            return self._python_agg_general(f)
  940. @Substitution(name='groupby')
  941. @Appender(_common_see_also)
  942. def std(self, ddof=1, *args, **kwargs):
  943. """
  944. Compute standard deviation of groups, excluding missing values.
  945. For multiple groupings, the result index will be a MultiIndex.
  946. Parameters
  947. ----------
  948. ddof : integer, default 1
  949. degrees of freedom
  950. """
  951. # TODO: implement at Cython level?
  952. nv.validate_groupby_func('std', args, kwargs)
  953. return np.sqrt(self.var(ddof=ddof, **kwargs))
  954. @Substitution(name='groupby')
  955. @Appender(_common_see_also)
  956. def var(self, ddof=1, *args, **kwargs):
  957. """
  958. Compute variance of groups, excluding missing values.
  959. For multiple groupings, the result index will be a MultiIndex.
  960. Parameters
  961. ----------
  962. ddof : integer, default 1
  963. degrees of freedom
  964. """
  965. nv.validate_groupby_func('var', args, kwargs)
  966. if ddof == 1:
  967. try:
  968. return self._cython_agg_general('var', **kwargs)
  969. except Exception:
  970. f = lambda x: x.var(ddof=ddof, **kwargs)
  971. with _group_selection_context(self):
  972. return self._python_agg_general(f)
  973. else:
  974. f = lambda x: x.var(ddof=ddof, **kwargs)
  975. with _group_selection_context(self):
  976. return self._python_agg_general(f)
@Substitution(name='groupby')
@Appender(_common_see_also)
def sem(self, ddof=1):
    """
    Compute standard error of the mean of groups, excluding missing values.

    For multiple groupings, the result index will be a MultiIndex.

    Parameters
    ----------
    ddof : integer, default 1
        degrees of freedom
    """
    # SEM = std / sqrt(n); count() excludes missing values, consistent
    # with std() above.
    return self.std(ddof=ddof) / np.sqrt(self.count())
  989. @Substitution(name='groupby')
  990. @Appender(_common_see_also)
  991. def size(self):
  992. """
  993. Compute group sizes.
  994. """
  995. result = self.grouper.size()
  996. if isinstance(self.obj, Series):
  997. result.name = getattr(self.obj, 'name', None)
  998. return result
@classmethod
def _add_numeric_operations(cls):
    """
    Add numeric operations to the GroupBy generically.
    """

    def groupby_function(name, alias, npfunc,
                         numeric_only=True, _convert=False,
                         min_count=-1):
        # Build a groupby aggregation method named `name`: it tries the
        # cython kernel `alias` first and falls back to applying
        # `npfunc` group-wise.

        _local_template = "Compute %(f)s of group values"

        @Substitution(name='groupby', f=name)
        @Appender(_common_see_also)
        @Appender(_local_template)
        def f(self, **kwargs):
            # inject factory defaults unless the caller overrode them
            if 'numeric_only' not in kwargs:
                kwargs['numeric_only'] = numeric_only
            if 'min_count' not in kwargs:
                kwargs['min_count'] = min_count

            self._set_group_selection()
            try:
                return self._cython_agg_general(
                    alias, alt=npfunc, **kwargs)
            except AssertionError as e:
                raise SpecificationError(str(e))
            except Exception:
                # cython path failed; aggregate with npfunc instead
                result = self.aggregate(
                    lambda x: npfunc(x, axis=self.axis))
                if _convert:
                    result = result._convert(datetime=True)
                return result

        set_function_name(f, name, cls)
        return f

    def first_compat(x, axis=0):
        # first non-null value (NaN when everything is null)

        def first(x):
            x = x.to_numpy()
            x = x[notna(x)]
            if len(x) == 0:
                return np.nan
            return x[0]

        if isinstance(x, DataFrame):
            return x.apply(first, axis=axis)
        else:
            return first(x)

    def last_compat(x, axis=0):
        # last non-null value (NaN when everything is null)

        def last(x):
            x = x.to_numpy()
            x = x[notna(x)]
            if len(x) == 0:
                return np.nan
            return x[-1]

        if isinstance(x, DataFrame):
            return x.apply(last, axis=axis)
        else:
            return last(x)

    cls.sum = groupby_function('sum', 'add', np.sum, min_count=0)
    cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0)
    cls.min = groupby_function('min', 'min', np.min, numeric_only=False)
    cls.max = groupby_function('max', 'max', np.max, numeric_only=False)
    cls.first = groupby_function('first', 'first', first_compat,
                                 numeric_only=False)
    cls.last = groupby_function('last', 'last', last_compat,
                                numeric_only=False)
@Substitution(name='groupby')
@Appender(_common_see_also)
def ohlc(self):
    """
    Compute open, high, low and close values of a group,
    excluding missing values.

    For multiple groupings, the result index will be a MultiIndex
    """
    return self._apply_to_column_groupbys(
        lambda x: x._cython_agg_general('ohlc'))
@Appender(DataFrame.describe.__doc__)
def describe(self, **kwargs):
    # Describe each group; the docstring is supplied by the Appender
    # decorator above (DataFrame.describe).
    with _group_selection_context(self):
        result = self.apply(lambda x: x.describe(**kwargs))
        if self.axis == 1:
            return result.T
        # apply stacks the per-group frames; unstack to one row per group
        return result.unstack()
def resample(self, rule, *args, **kwargs):
    """
    Provide resampling when using a TimeGrouper.

    Given a grouper, the function resamples it according to a string
    "string" -> "frequency".

    See the :ref:`frequency aliases <timeseries.offset_aliases>`
    documentation for more details.

    Parameters
    ----------
    rule : str or DateOffset
        The offset string or object representing target grouper conversion.
    *args, **kwargs
        Possible arguments are `how`, `fill_method`, `limit`, `kind` and
        `on`, and other arguments of `TimeGrouper`.

    Returns
    -------
    Grouper
        Return a new grouper with our resampler appended.

    See Also
    --------
    pandas.Grouper : Specify a frequency to resample with when
        grouping by a key.
    DatetimeIndex.resample : Frequency conversion and resampling of
        time series.

    Examples
    --------
    >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
    >>> df = pd.DataFrame(data=4 * [range(2)],
    ...                   index=idx,
    ...                   columns=['a', 'b'])
    >>> df.iloc[2, 0] = 5
    >>> df
                        a  b
    2000-01-01 00:00:00  0  1
    2000-01-01 00:01:00  0  1
    2000-01-01 00:02:00  5  1
    2000-01-01 00:03:00  0  1

    Downsample the DataFrame into 3 minute bins and sum the values of
    the timestamps falling into a bin.

    >>> df.groupby('a').resample('3T').sum()
                             a  b
    a
    0   2000-01-01 00:00:00  0  2
        2000-01-01 00:03:00  0  1
    5   2000-01-01 00:00:00  5  1

    Upsample the series into 30 second bins.

    >>> df.groupby('a').resample('30S').sum()
                        a  b
    a
    0   2000-01-01 00:00:00  0  1
        2000-01-01 00:00:30  0  0
        2000-01-01 00:01:00  0  1
        2000-01-01 00:01:30  0  0
        2000-01-01 00:02:00  0  0
        2000-01-01 00:02:30  0  0
        2000-01-01 00:03:00  0  1
    5   2000-01-01 00:02:00  5  1

    Resample by month. Values are assigned to the month of the period.

    >>> df.groupby('a').resample('M').sum()
                a  b
    a
    0   2000-01-31  0  3
    5   2000-01-31  5  1

    Downsample the series into 3 minute bins as above, but close the right
    side of the bin interval.

    >>> df.groupby('a').resample('3T', closed='right').sum()
                             a  b
    a
    0   1999-12-31 23:57:00  0  1
        2000-01-01 00:00:00  0  2
    5   2000-01-01 00:00:00  5  1

    Downsample the series into 3 minute bins and close the right side of
    the bin interval, but label each bin using the right edge instead of
    the left.

    >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                             a  b
    a
    0   2000-01-01 00:00:00  0  1
        2000-01-01 00:03:00  0  2
    5   2000-01-01 00:03:00  5  1

    Add an offset of twenty seconds.

    >>> df.groupby('a').resample('3T', loffset='20s').sum()
                             a  b
    a
    0   2000-01-01 00:00:20  0  2
        2000-01-01 00:03:20  0  1
    5   2000-01-01 00:00:20  5  1
    """
    # local import -- presumably avoids a circular import at module
    # load time; confirm before moving to the top of the file
    from pandas.core.resample import get_resampler_for_grouping
    return get_resampler_for_grouping(self, rule, *args, **kwargs)
@Substitution(name='groupby')
@Appender(_common_see_also)
def rolling(self, *args, **kwargs):
    """
    Return a rolling grouper, providing rolling functionality per group.
    """
    # local import -- presumably avoids a circular import; confirm
    from pandas.core.window import RollingGroupby
    return RollingGroupby(self, *args, **kwargs)
@Substitution(name='groupby')
@Appender(_common_see_also)
def expanding(self, *args, **kwargs):
    """
    Return an expanding grouper, providing expanding
    functionality per group.
    """
    # local import -- presumably avoids a circular import; confirm
    from pandas.core.window import ExpandingGroupby
    return ExpandingGroupby(self, *args, **kwargs)
  1183. def _fill(self, direction, limit=None):
  1184. """
  1185. Shared function for `pad` and `backfill` to call Cython method.
  1186. Parameters
  1187. ----------
  1188. direction : {'ffill', 'bfill'}
  1189. Direction passed to underlying Cython function. `bfill` will cause
  1190. values to be filled backwards. `ffill` and any other values will
  1191. default to a forward fill
  1192. limit : int, default None
  1193. Maximum number of consecutive values to fill. If `None`, this
  1194. method will convert to -1 prior to passing to Cython
  1195. Returns
  1196. -------
  1197. `Series` or `DataFrame` with filled values
  1198. See Also
  1199. --------
  1200. pad
  1201. backfill
  1202. """
  1203. # Need int value for Cython
  1204. if limit is None:
  1205. limit = -1
  1206. return self._get_cythonized_result('group_fillna_indexer',
  1207. self.grouper, needs_mask=True,
  1208. cython_dtype=np.int64,
  1209. result_is_index=True,
  1210. direction=direction, limit=limit)
@Substitution(name='groupby')
def pad(self, limit=None):
    """
    Forward fill the values.

    Parameters
    ----------
    limit : integer, optional
        limit of how many values to fill

    See Also
    --------
    Series.pad
    DataFrame.pad
    Series.fillna
    DataFrame.fillna
    """
    return self._fill('ffill', limit=limit)
# alias kept for API parity with Series/DataFrame.ffill
ffill = pad
@Substitution(name='groupby')
def backfill(self, limit=None):
    """
    Backward fill the values.

    Parameters
    ----------
    limit : integer, optional
        limit of how many values to fill

    See Also
    --------
    Series.backfill
    DataFrame.backfill
    Series.fillna
    DataFrame.fillna
    """
    return self._fill('bfill', limit=limit)
# alias kept for API parity with Series/DataFrame.bfill
bfill = backfill
@Substitution(name='groupby', see_also=_common_see_also)
def nth(self, n, dropna=None):
    """
    Take the nth row from each group if n is an int, or a subset of rows
    if n is a list of ints.

    If dropna, will take the nth non-null row, dropna is either
    Truthy (if a Series) or 'all', 'any' (if a DataFrame);
    this is equivalent to calling dropna(how=dropna) before the
    groupby.

    Parameters
    ----------
    n : int or list of ints
        a single nth value for the row or a list of nth values
    dropna : None or str, optional
        apply the specified dropna operation before counting which row is
        the nth row. Needs to be None, 'any' or 'all'
    %(see_also)s
    Examples
    --------

    >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
    ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
    >>> g = df.groupby('A')
    >>> g.nth(0)
         B
    A
    1  NaN
    2  3.0
    >>> g.nth(1)
         B
    A
    1  2.0
    2  5.0
    >>> g.nth(-1)
         B
    A
    1  4.0
    2  5.0
    >>> g.nth([0, 1])
         B
    A
    1  NaN
    1  2.0
    2  3.0
    2  5.0

    Specifying `dropna` allows count ignoring ``NaN``

    >>> g.nth(0, dropna='any')
         B
    A
    1  2.0
    2  3.0

    NaNs denote group exhausted when using dropna

    >>> g.nth(3, dropna='any')
        B
    A
    1 NaN
    2 NaN

    Specifying `as_index=False` in `groupby` keeps the original index.

    >>> df.groupby('A', as_index=False).nth(1)
       A    B
    1  1  2.0
    4  2  5.0
    """
    # normalize n into an array of requested positions
    if isinstance(n, int):
        nth_values = [n]
    elif isinstance(n, (set, list, tuple)):
        nth_values = list(set(n))
        if dropna is not None:
            raise ValueError(
                "dropna option with a list of nth values is not supported")
    else:
        raise TypeError("n needs to be an int or a list/set/tuple of ints")

    nth_values = np.array(nth_values, dtype=np.intp)
    self._set_group_selection()

    if not dropna:
        # fast path: positive positions match the cumcount from the
        # front, negative positions from the back
        mask_left = np.in1d(self._cumcount_array(), nth_values)
        mask_right = np.in1d(self._cumcount_array(ascending=False) + 1,
                             -nth_values)
        mask = mask_left | mask_right

        out = self._selected_obj[mask]
        if not self.as_index:
            return out

        ids, _, _ = self.grouper.group_info
        out.index = self.grouper.result_index[ids[mask]]

        return out.sort_index() if self.sort else out

    if dropna not in ['any', 'all']:
        if isinstance(self._selected_obj, Series) and dropna is True:
            warnings.warn("the dropna={dropna} keyword is deprecated,"
                          "use dropna='all' instead. "
                          "For a Series groupby, dropna must be "
                          "either None, 'any' or 'all'.".format(
                              dropna=dropna),
                          FutureWarning,
                          stacklevel=2)
            dropna = 'all'
        else:
            # Note: when agg-ing picker doesn't raise this,
            # just returns NaN
            raise ValueError("For a DataFrame groupby, dropna must be "
                             "either None, 'any' or 'all', "
                             "(was passed {dropna}).".format(
                                 dropna=dropna))

    # old behaviour, but with all and any support for DataFrames.
    # modified in GH 7559 to have better perf
    max_len = n if n >= 0 else - 1 - n
    dropped = self.obj.dropna(how=dropna, axis=self.axis)

    # get a new grouper for our dropped obj
    if self.keys is None and self.level is None:

        # we don't have the grouper info available
        # (e.g. we have selected out
        # a column that is not in the current object)
        axis = self.grouper.axis
        grouper = axis[axis.isin(dropped.index)]

    else:

        # create a grouper with the original parameters, but on the dropped
        # object
        from pandas.core.groupby.grouper import _get_grouper
        grouper, _, _ = _get_grouper(dropped, key=self.keys,
                                     axis=self.axis, level=self.level,
                                     sort=self.sort,
                                     mutated=self.mutated)

    grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
    sizes, result = grb.size(), grb.nth(n)
    # groups with fewer rows than requested get NaN
    mask = (sizes < max_len).values

    # set the results which don't meet the criteria
    if len(result) and mask.any():
        result.loc[mask] = np.nan

    # reset/reindex to the original groups
    if (len(self.obj) == len(dropped) or
            len(result) == len(self.grouper.result_index)):
        result.index = self.grouper.result_index
    else:
        result = result.reindex(self.grouper.result_index)

    return result
@Substitution(name='groupby')
def ngroup(self, ascending=True):
    """
    Number each group from 0 to the number of groups - 1.

    This is the enumerative complement of cumcount. Note that the
    numbers given to the groups match the order in which the groups
    would be seen when iterating over the groupby object, not the
    order they are first observed.

    .. versionadded:: 0.20.2

    Parameters
    ----------
    ascending : bool, default True
        If False, number in reverse, from number of group - 1 to 0.

    See Also
    --------
    .cumcount : Number the rows in each group.

    Examples
    --------

    >>> df = pd.DataFrame({"A": list("aaabba")})
    >>> df
       A
    0  a
    1  a
    2  a
    3  b
    4  b
    5  a
    >>> df.groupby('A').ngroup()
    0    0
    1    0
    2    0
    3    1
    4    1
    5    0
    dtype: int64
    >>> df.groupby('A').ngroup(ascending=False)
    0    1
    1    1
    2    1
    3    0
    4    0
    5    1
    dtype: int64
    >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
    0    0
    1    0
    2    1
    3    3
    4    2
    5    0
    dtype: int64
    """
    with _group_selection_context(self):
        index = self._selected_obj.index
        # group_info[0] holds the group number of each row
        result = Series(self.grouper.group_info[0], index)
        if not ascending:
            # reverse the numbering: ngroups-1 down to 0
            result = self.ngroups - 1 - result
        return result
@Substitution(name='groupby')
def cumcount(self, ascending=True):
    """
    Number each item in each group from 0 to the length of that group - 1.

    Essentially this is equivalent to

    >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))

    Parameters
    ----------
    ascending : bool, default True
        If False, number in reverse, from length of group - 1 to 0.

    See Also
    --------
    .ngroup : Number the groups themselves.

    Examples
    --------

    >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
    ...                   columns=['A'])
    >>> df
       A
    0  a
    1  a
    2  a
    3  b
    4  b
    5  a
    >>> df.groupby('A').cumcount()
    0    0
    1    1
    2    2
    3    0
    4    1
    5    3
    dtype: int64
    >>> df.groupby('A').cumcount(ascending=False)
    0    3
    1    2
    2    1
    3    1
    4    0
    5    0
    dtype: int64
    """
    with _group_selection_context(self):
        index = self._selected_obj.index
        # per-row within-group positions, computed in cython helper
        cumcounts = self._cumcount_array(ascending=ascending)
        return Series(cumcounts, index)
@Substitution(name='groupby')
@Appender(_common_see_also)
def rank(self, method='average', ascending=True, na_option='keep',
         pct=False, axis=0):
    """
    Provides the rank of values within each group.

    Parameters
    ----------
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        * average: average rank of group
        * min: lowest rank in group
        * max: highest rank in group
        * first: ranks assigned in order they appear in the array
        * dense: like 'min', but rank always increases by 1 between groups
    ascending : boolean, default True
        False for ranks by high (1) to low (N)
    na_option : {'keep', 'top', 'bottom'}, default 'keep'
        * keep: leave NA values where they are
        * top: smallest rank if ascending
        * bottom: smallest rank if descending
    pct : boolean, default False
        Compute percentage rank of data within each group
    axis : int, default 0
        The axis of the object over which to compute the rank.

    Returns
    -------
    DataFrame with ranking of values within each group
    """
    # validate here so the error surfaces before the cython dispatch
    if na_option not in {'keep', 'top', 'bottom'}:
        msg = "na_option must be one of 'keep', 'top', or 'bottom'"
        raise ValueError(msg)
    return self._cython_transform('rank', numeric_only=False,
                                  ties_method=method, ascending=ascending,
                                  na_option=na_option, pct=pct, axis=axis)
@Substitution(name='groupby')
@Appender(_common_see_also)
def cumprod(self, axis=0, *args, **kwargs):
    """
    Cumulative product for each group.
    """
    nv.validate_groupby_func('cumprod', args, kwargs,
                             ['numeric_only', 'skipna'])
    if axis != 0:
        # non-default axis: fall back to a group-wise apply
        return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))

    return self._cython_transform('cumprod', **kwargs)
@Substitution(name='groupby')
@Appender(_common_see_also)
def cumsum(self, axis=0, *args, **kwargs):
    """
    Cumulative sum for each group.
    """
    nv.validate_groupby_func('cumsum', args, kwargs,
                             ['numeric_only', 'skipna'])
    if axis != 0:
        # non-default axis: fall back to a group-wise apply
        return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))

    return self._cython_transform('cumsum', **kwargs)
@Substitution(name='groupby')
@Appender(_common_see_also)
def cummin(self, axis=0, **kwargs):
    """
    Cumulative min for each group.
    """
    # NOTE(review): **kwargs is accepted but not forwarded on the
    # cython path below -- confirm this is intended.
    if axis != 0:
        return self.apply(lambda x: np.minimum.accumulate(x, axis))

    return self._cython_transform('cummin', numeric_only=False)
@Substitution(name='groupby')
@Appender(_common_see_also)
def cummax(self, axis=0, **kwargs):
    """
    Cumulative max for each group.
    """
    # NOTE(review): **kwargs is accepted but not forwarded on the
    # cython path below -- confirm this is intended.
    if axis != 0:
        return self.apply(lambda x: np.maximum.accumulate(x, axis))

    return self._cython_transform('cummax', numeric_only=False)
  1556. def _get_cythonized_result(self, how, grouper, aggregate=False,
  1557. cython_dtype=None, needs_values=False,
  1558. needs_mask=False, needs_ngroups=False,
  1559. result_is_index=False,
  1560. pre_processing=None, post_processing=None,
  1561. **kwargs):
  1562. """
  1563. Get result for Cythonized functions.
  1564. Parameters
  1565. ----------
  1566. how : str, Cythonized function name to be called
  1567. grouper : Grouper object containing pertinent group info
  1568. aggregate : bool, default False
  1569. Whether the result should be aggregated to match the number of
  1570. groups
  1571. cython_dtype : default None
  1572. Type of the array that will be modified by the Cython call. If
  1573. `None`, the type will be inferred from the values of each slice
  1574. needs_values : bool, default False
  1575. Whether the values should be a part of the Cython call
  1576. signature
  1577. needs_mask : bool, default False
  1578. Whether boolean mask needs to be part of the Cython call
  1579. signature
  1580. needs_ngroups : bool, default False
  1581. Whether number of groups is part of the Cython call signature
  1582. result_is_index : bool, default False
  1583. Whether the result of the Cython operation is an index of
  1584. values to be retrieved, instead of the actual values themselves
  1585. pre_processing : function, default None
  1586. Function to be applied to `values` prior to passing to Cython
  1587. Raises if `needs_values` is False
  1588. post_processing : function, default None
  1589. Function to be applied to result of Cython function
  1590. **kwargs : dict
  1591. Extra arguments to be passed back to Cython funcs
  1592. Returns
  1593. -------
  1594. `Series` or `DataFrame` with filled values
  1595. """
  1596. if result_is_index and aggregate:
  1597. raise ValueError("'result_is_index' and 'aggregate' cannot both "
  1598. "be True!")
  1599. if post_processing:
  1600. if not callable(pre_processing):
  1601. raise ValueError("'post_processing' must be a callable!")
  1602. if pre_processing:
  1603. if not callable(pre_processing):
  1604. raise ValueError("'pre_processing' must be a callable!")
  1605. if not needs_values:
  1606. raise ValueError("Cannot use 'pre_processing' without "
  1607. "specifying 'needs_values'!")
  1608. labels, _, ngroups = grouper.group_info
  1609. output = collections.OrderedDict()
  1610. base_func = getattr(libgroupby, how)
  1611. for name, obj in self._iterate_slices():
  1612. if aggregate:
  1613. result_sz = ngroups
  1614. else:
  1615. result_sz = len(obj.values)
  1616. if not cython_dtype:
  1617. cython_dtype = obj.values.dtype
  1618. result = np.zeros(result_sz, dtype=cython_dtype)
  1619. func = partial(base_func, result, labels)
  1620. if needs_values:
  1621. vals = obj.values
  1622. if pre_processing:
  1623. vals = pre_processing(vals)
  1624. func = partial(func, vals)
  1625. if needs_mask:
  1626. mask = isna(obj.values).view(np.uint8)
  1627. func = partial(func, mask)
  1628. if needs_ngroups:
  1629. func = partial(func, ngroups)
  1630. func(**kwargs) # Call func to modify indexer values in place
  1631. if result_is_index:
  1632. result = algorithms.take_nd(obj.values, result)
  1633. if post_processing:
  1634. result = post_processing(result)
  1635. output[name] = result
  1636. if aggregate:
  1637. return self._wrap_aggregated_output(output)
  1638. else:
  1639. return self._wrap_transformed_output(output)
  1640. @Substitution(name='groupby')
  1641. @Appender(_common_see_also)
  1642. def shift(self, periods=1, freq=None, axis=0, fill_value=None):
  1643. """
  1644. Shift each group by periods observations.
  1645. Parameters
  1646. ----------
  1647. periods : integer, default 1
  1648. number of periods to shift
  1649. freq : frequency string
  1650. axis : axis to shift, default 0
  1651. fill_value : optional
  1652. .. versionadded:: 0.24.0
  1653. """
  1654. if freq is not None or axis != 0 or not isna(fill_value):
  1655. return self.apply(lambda x: x.shift(periods, freq,
  1656. axis, fill_value))
  1657. return self._get_cythonized_result('group_shift_indexer',
  1658. self.grouper, cython_dtype=np.int64,
  1659. needs_ngroups=True,
  1660. result_is_index=True,
  1661. periods=periods)
  1662. @Substitution(name='groupby')
  1663. @Appender(_common_see_also)
  1664. def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
  1665. axis=0):
  1666. """
  1667. Calculate pct_change of each value to previous entry in group.
  1668. """
  1669. if freq is not None or axis != 0:
  1670. return self.apply(lambda x: x.pct_change(periods=periods,
  1671. fill_method=fill_method,
  1672. limit=limit, freq=freq,
  1673. axis=axis))
  1674. filled = getattr(self, fill_method)(limit=limit)
  1675. filled = filled.drop(self.grouper.names, axis=1)
  1676. fill_grp = filled.groupby(self.grouper.labels)
  1677. shifted = fill_grp.shift(periods=periods, freq=freq)
  1678. return (filled / shifted) - 1
  1679. @Substitution(name='groupby', see_also=_common_see_also)
  1680. def head(self, n=5):
  1681. """
  1682. Returns first n rows of each group.
  1683. Essentially equivalent to ``.apply(lambda x: x.head(n))``,
  1684. except ignores as_index flag.
  1685. %(see_also)s
  1686. Examples
  1687. --------
  1688. >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
  1689. columns=['A', 'B'])
  1690. >>> df.groupby('A', as_index=False).head(1)
  1691. A B
  1692. 0 1 2
  1693. 2 5 6
  1694. >>> df.groupby('A').head(1)
  1695. A B
  1696. 0 1 2
  1697. 2 5 6
  1698. """
  1699. self._reset_group_selection()
  1700. mask = self._cumcount_array() < n
  1701. return self._selected_obj[mask]
  1702. @Substitution(name='groupby', see_also=_common_see_also)
  1703. def tail(self, n=5):
  1704. """
  1705. Returns last n rows of each group.
  1706. Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
  1707. except ignores as_index flag.
  1708. %(see_also)s
  1709. Examples
  1710. --------
  1711. >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
  1712. columns=['A', 'B'])
  1713. >>> df.groupby('A').tail(1)
  1714. A B
  1715. 1 a 2
  1716. 3 b 2
  1717. >>> df.groupby('A').head(1)
  1718. A B
  1719. 0 a 1
  1720. 2 b 1
  1721. """
  1722. self._reset_group_selection()
  1723. mask = self._cumcount_array(ascending=False) < n
  1724. return self._selected_obj[mask]
# NOTE(review): presumably installs the generated numeric aggregation
# methods onto GroupBy; defined earlier in this file — must run after the
# class body. Confirm against the class definition.
GroupBy._add_numeric_operations()
  1726. @Appender(GroupBy.__doc__)
  1727. def groupby(obj, by, **kwds):
  1728. if isinstance(obj, Series):
  1729. from pandas.core.groupby.generic import SeriesGroupBy
  1730. klass = SeriesGroupBy
  1731. elif isinstance(obj, DataFrame):
  1732. from pandas.core.groupby.generic import DataFrameGroupBy
  1733. klass = DataFrameGroupBy
  1734. else: # pragma: no cover
  1735. raise TypeError('invalid type: {}'.format(obj))
  1736. return klass(obj, by, **kwds)