12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110 |
"""
Provide the groupby split-apply-combine paradigm. Define the GroupBy
class providing the base-class of operations.

The SeriesGroupBy and DataFrameGroupBy sub-class
(defined in pandas.core.groupby.generic)
expose these user-facing objects to provide specific functionality.
"""
- import collections
- from contextlib import contextmanager
- import datetime
- from functools import partial, wraps
- import types
- import warnings
- import numpy as np
- from pandas._libs import Timestamp, groupby as libgroupby
- import pandas.compat as compat
- from pandas.compat import callable, range, set_function_name, zip
- from pandas.compat.numpy import function as nv
- from pandas.errors import AbstractMethodError
- from pandas.util._decorators import Appender, Substitution, cache_readonly
- from pandas.util._validators import validate_kwargs
- from pandas.core.dtypes.cast import maybe_downcast_to_dtype
- from pandas.core.dtypes.common import (
- ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
- from pandas.core.dtypes.missing import isna, notna
- import pandas.core.algorithms as algorithms
- from pandas.core.base import (
- DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError)
- import pandas.core.common as com
- from pandas.core.config import option_context
- from pandas.core.frame import DataFrame
- from pandas.core.generic import NDFrame
- from pandas.core.groupby import base
- from pandas.core.index import Index, MultiIndex
- from pandas.core.series import Series
- from pandas.core.sorting import get_group_index_sorter
- _common_see_also = """
- See Also
- --------
- pandas.Series.%(name)s
- pandas.DataFrame.%(name)s
- pandas.Panel.%(name)s
- """
- _apply_docs = dict(
- template="""
- Apply function `func` group-wise and combine the results together.
- The function passed to `apply` must take a {input} as its first
- argument and return a DataFrame, Series or scalar. `apply` will
- then take care of combining the results back together into a single
- dataframe or series. `apply` is therefore a highly flexible
- grouping method.
- While `apply` is a very flexible method, its downside is that
- using it can be quite a bit slower than using more specific methods
- like `agg` or `transform`. Pandas offers a wide range of method that will
- be much faster than using `apply` for their specific purposes, so try to
- use them before reaching for `apply`.
- Parameters
- ----------
- func : callable
- A callable that takes a {input} as its first argument, and
- returns a dataframe, a series or a scalar. In addition the
- callable may take positional and keyword arguments.
- args, kwargs : tuple and dict
- Optional positional and keyword arguments to pass to `func`.
- Returns
- -------
- applied : Series or DataFrame
- See Also
- --------
- pipe : Apply function to the full GroupBy object instead of to each
- group.
- aggregate : Apply aggregate function to the GroupBy object.
- transform : Apply function column-by-column to the GroupBy object.
- Series.apply : Apply a function to a Series.
- DataFrame.apply : Apply a function to each row or column of a DataFrame.
- """,
- dataframe_examples="""
- >>> df = pd.DataFrame({'A': 'a a b'.split(),
- 'B': [1,2,3],
- 'C': [4,6, 5]})
- >>> g = df.groupby('A')
- Notice that ``g`` has two groups, ``a`` and ``b``.
- Calling `apply` in various ways, we can get different grouping results:
- Example 1: below the function passed to `apply` takes a DataFrame as
- its argument and returns a DataFrame. `apply` combines the result for
- each group together into a new DataFrame:
- >>> g[['B', 'C']].apply(lambda x: x / x.sum())
- B C
- 0 0.333333 0.4
- 1 0.666667 0.6
- 2 1.000000 1.0
- Example 2: The function passed to `apply` takes a DataFrame as
- its argument and returns a Series. `apply` combines the result for
- each group together into a new DataFrame:
- >>> g[['B', 'C']].apply(lambda x: x.max() - x.min())
- B C
- A
- a 1 2
- b 0 0
- Example 3: The function passed to `apply` takes a DataFrame as
- its argument and returns a scalar. `apply` combines the result for
- each group together into a Series, including setting the index as
- appropriate:
- >>> g.apply(lambda x: x.C.max() - x.B.min())
- A
- a 5
- b 2
- dtype: int64
- """,
- series_examples="""
- >>> s = pd.Series([0, 1, 2], index='a a b'.split())
- >>> g = s.groupby(s.index)
- From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
- Calling `apply` in various ways, we can get different grouping results:
- Example 1: The function passed to `apply` takes a Series as
- its argument and returns a Series. `apply` combines the result for
- each group together into a new Series:
- >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2)
- 0 0.0
- 1 0.5
- 2 4.0
- dtype: float64
- Example 2: The function passed to `apply` takes a Series as
- its argument and returns a scalar. `apply` combines the result for
- each group together into a Series, including setting the index as
- appropriate:
- >>> g.apply(lambda x: x.max() - x.min())
- a 1
- b 0
- dtype: int64
- Notes
- -----
- In the current implementation `apply` calls `func` twice on the
- first group to decide whether it can take a fast or slow code
- path. This can lead to unexpected behavior if `func` has
- side-effects, as they will take effect twice for the first
- group.
- Examples
- --------
- {examples}
- """)
- _pipe_template = """\
- Apply a function `func` with arguments to this %(klass)s object and return
- the function's result.
- %(versionadded)s
- Use `.pipe` when you want to improve readability by chaining together
- functions that expect Series, DataFrames, GroupBy or Resampler objects.
- Instead of writing
- >>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)
- You can write
- >>> (df.groupby('group')
- ... .pipe(f)
- ... .pipe(g, arg1=a)
- ... .pipe(h, arg2=b, arg3=c))
- which is much more readable.
- Parameters
- ----------
- func : callable or tuple of (callable, string)
- Function to apply to this %(klass)s object or, alternatively,
- a `(callable, data_keyword)` tuple where `data_keyword` is a
- string indicating the keyword of `callable` that expects the
- %(klass)s object.
- args : iterable, optional
- positional arguments passed into `func`.
- kwargs : dict, optional
- a dictionary of keyword arguments passed into `func`.
- Returns
- -------
- object : the return type of `func`.
- See Also
- --------
- pandas.Series.pipe : Apply a function with arguments to a series.
- pandas.DataFrame.pipe: Apply a function with arguments to a dataframe.
- apply : Apply function to each group instead of to the
- full %(klass)s object.
- Notes
- -----
- See more `here
- <http://pandas.pydata.org/pandas-docs/stable/groupby.html#piping-function-calls>`_
- Examples
- --------
- %(examples)s
- """
- _transform_template = """
- Call function producing a like-indexed %(klass)s on each group and
- return a %(klass)s having the same indexes as the original object
- filled with the transformed values
- Parameters
- ----------
- f : function
- Function to apply to each group
- Returns
- -------
- %(klass)s
- See Also
- --------
- aggregate, transform
- Notes
- -----
- Each group is endowed the attribute 'name' in case you need to know
- which group you are working on.
- The current implementation imposes three requirements on f:
- * f must return a value that either has the same shape as the input
- subframe or can be broadcast to the shape of the input subframe.
- For example, f returns a scalar it will be broadcast to have the
- same shape as the input subframe.
- * if this is a DataFrame, f must support application column-by-column
- in the subframe. If f also supports application to the entire subframe,
- then a fast path is used starting from the second chunk.
- * f must not mutate groups. Mutation is not supported and may
- produce unexpected results.
- Examples
- --------
- # Same shape
- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
- ... 'foo', 'bar'],
- ... 'B' : ['one', 'one', 'two', 'three',
- ... 'two', 'two'],
- ... 'C' : [1, 5, 5, 2, 5, 5],
- ... 'D' : [2.0, 5., 8., 1., 2., 9.]})
- >>> grouped = df.groupby('A')
- >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
- C D
- 0 -1.154701 -0.577350
- 1 0.577350 0.000000
- 2 0.577350 1.154701
- 3 -1.154701 -1.000000
- 4 0.577350 -0.577350
- 5 0.577350 1.000000
- # Broadcastable
- >>> grouped.transform(lambda x: x.max() - x.min())
- C D
- 0 4 6.0
- 1 3 8.0
- 2 4 6.0
- 3 3 8.0
- 4 4 6.0
- 5 3 8.0
- """
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.

    Calling the instance, or any attribute of it, applies the
    corresponding plot call to each group via ``groupby.apply``.
    """

    def __init__(self, groupby):
        # The GroupBy object whose groups are to be plotted.
        self._groupby = groupby

    def __call__(self, *args, **kwargs):
        # Plot each group by applying ``obj.plot(*args, **kwargs)`` group-wise.
        def f(self):
            return self.plot(*args, **kwargs)
        # name matters downstream: apply special-cases plotting methods
        f.__name__ = 'plot'
        return self._groupby.apply(f)

    def __getattr__(self, name):
        # Delegate e.g. ``.plot.line(...)`` to each group's ``obj.plot.line``.
        def attr(*args, **kwargs):
            def f(self):
                return getattr(self.plot, name)(*args, **kwargs)
            return self._groupby.apply(f)
        return attr
- @contextmanager
- def _group_selection_context(groupby):
- """
- Set / reset the _group_selection_context.
- """
- groupby._set_group_selection()
- yield groupby
- groupby._reset_group_selection()
- class _GroupBy(PandasObject, SelectionMixin):
- _group_selection = None
- _apply_whitelist = frozenset()
- def __init__(self, obj, keys=None, axis=0, level=None,
- grouper=None, exclusions=None, selection=None, as_index=True,
- sort=True, group_keys=True, squeeze=False,
- observed=False, **kwargs):
- self._selection = selection
- if isinstance(obj, NDFrame):
- obj._consolidate_inplace()
- self.level = level
- if not as_index:
- if not isinstance(obj, DataFrame):
- raise TypeError('as_index=False only valid with DataFrame')
- if axis != 0:
- raise ValueError('as_index=False only valid for axis=0')
- self.as_index = as_index
- self.keys = keys
- self.sort = sort
- self.group_keys = group_keys
- self.squeeze = squeeze
- self.observed = observed
- self.mutated = kwargs.pop('mutated', False)
- if grouper is None:
- from pandas.core.groupby.grouper import _get_grouper
- grouper, exclusions, obj = _get_grouper(obj, keys,
- axis=axis,
- level=level,
- sort=sort,
- observed=observed,
- mutated=self.mutated)
- self.obj = obj
- self.axis = obj._get_axis_number(axis)
- self.grouper = grouper
- self.exclusions = set(exclusions) if exclusions else set()
- # we accept no other args
- validate_kwargs('group', kwargs, {})
- def __len__(self):
- return len(self.groups)
- def __unicode__(self):
- # TODO: Better unicode/repr for GroupBy object
- return object.__repr__(self)
- def _assure_grouper(self):
- """
- We create the grouper on instantiation sub-classes may have a
- different policy.
- """
- pass
- @property
- def groups(self):
- """
- Dict {group name -> group labels}.
- """
- self._assure_grouper()
- return self.grouper.groups
- @property
- def ngroups(self):
- self._assure_grouper()
- return self.grouper.ngroups
- @property
- def indices(self):
- """
- Dict {group name -> group indices}.
- """
- self._assure_grouper()
- return self.grouper.indices
- def _get_indices(self, names):
- """
- Safe get multiple indices, translate keys for
- datelike to underlying repr.
- """
- def get_converter(s):
- # possibly convert to the actual key types
- # in the indices, could be a Timestamp or a np.datetime64
- if isinstance(s, (Timestamp, datetime.datetime)):
- return lambda key: Timestamp(key)
- elif isinstance(s, np.datetime64):
- return lambda key: Timestamp(key).asm8
- else:
- return lambda key: key
- if len(names) == 0:
- return []
- if len(self.indices) > 0:
- index_sample = next(iter(self.indices))
- else:
- index_sample = None # Dummy sample
- name_sample = names[0]
- if isinstance(index_sample, tuple):
- if not isinstance(name_sample, tuple):
- msg = ("must supply a tuple to get_group with multiple"
- " grouping keys")
- raise ValueError(msg)
- if not len(name_sample) == len(index_sample):
- try:
- # If the original grouper was a tuple
- return [self.indices[name] for name in names]
- except KeyError:
- # turns out it wasn't a tuple
- msg = ("must supply a same-length tuple to get_group"
- " with multiple grouping keys")
- raise ValueError(msg)
- converters = [get_converter(s) for s in index_sample]
- names = [tuple(f(n) for f, n in zip(converters, name))
- for name in names]
- else:
- converter = get_converter(index_sample)
- names = [converter(name) for name in names]
- return [self.indices.get(name, []) for name in names]
- def _get_index(self, name):
- """
- Safe get index, translate keys for datelike to underlying repr.
- """
- return self._get_indices([name])[0]
- @cache_readonly
- def _selected_obj(self):
- if self._selection is None or isinstance(self.obj, Series):
- if self._group_selection is not None:
- return self.obj[self._group_selection]
- return self.obj
- else:
- return self.obj[self._selection]
- def _reset_group_selection(self):
- """
- Clear group based selection.
- Used for methods needing to return info on each group regardless of
- whether a group selection was previously set.
- """
- if self._group_selection is not None:
- # GH12839 clear cached selection too when changing group selection
- self._group_selection = None
- self._reset_cache('_selected_obj')
- def _set_group_selection(self):
- """
- Create group based selection.
- Used when selection is not passed directly but instead via a grouper.
- NOTE: this should be paired with a call to _reset_group_selection
- """
- grp = self.grouper
- if not (self.as_index and
- getattr(grp, 'groupings', None) is not None and
- self.obj.ndim > 1 and
- self._group_selection is None):
- return
- ax = self.obj._info_axis
- groupers = [g.name for g in grp.groupings
- if g.level is None and g.in_axis]
- if len(groupers):
- # GH12839 clear selected obj cache when group selection changes
- self._group_selection = ax.difference(Index(groupers),
- sort=False).tolist()
- self._reset_cache('_selected_obj')
- def _set_result_index_ordered(self, result):
- # set the result index on the passed values object and
- # return the new object, xref 8046
- # the values/counts are repeated according to the group index
- # shortcut if we have an already ordered grouper
- if not self.grouper.is_monotonic:
- index = Index(np.concatenate(
- self._get_indices(self.grouper.result_index)))
- result.set_axis(index, axis=self.axis, inplace=True)
- result = result.sort_index(axis=self.axis)
- result.set_axis(self.obj._get_axis(self.axis), axis=self.axis,
- inplace=True)
- return result
- def _dir_additions(self):
- return self.obj._dir_additions() | self._apply_whitelist
- def __getattr__(self, attr):
- if attr in self._internal_names_set:
- return object.__getattribute__(self, attr)
- if attr in self.obj:
- return self[attr]
- if hasattr(self.obj, attr):
- return self._make_wrapper(attr)
- raise AttributeError("%r object has no attribute %r" %
- (type(self).__name__, attr))
- @Substitution(klass='GroupBy',
- versionadded='.. versionadded:: 0.21.0',
- examples="""\
- >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
- >>> df
- A B
- 0 a 1
- 1 b 2
- 2 a 3
- 3 b 4
- To get the difference between each groups maximum and minimum value in one
- pass, you can do
- >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
- B
- A
- a 2
- b 2""")
- @Appender(_pipe_template)
- def pipe(self, func, *args, **kwargs):
- return com._pipe(self, func, *args, **kwargs)
- plot = property(GroupByPlot)
- def _make_wrapper(self, name):
- if name not in self._apply_whitelist:
- is_callable = callable(getattr(self._selected_obj, name, None))
- kind = ' callable ' if is_callable else ' '
- msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
- "using the 'apply' method".format(kind, name,
- type(self).__name__))
- raise AttributeError(msg)
- self._set_group_selection()
- # need to setup the selection
- # as are not passed directly but in the grouper
- f = getattr(self._selected_obj, name)
- if not isinstance(f, types.MethodType):
- return self.apply(lambda self: getattr(self, name))
- f = getattr(type(self._selected_obj), name)
- def wrapper(*args, **kwargs):
- # a little trickery for aggregation functions that need an axis
- # argument
- kwargs_with_axis = kwargs.copy()
- if ('axis' not in kwargs_with_axis or
- kwargs_with_axis['axis'] is None):
- kwargs_with_axis['axis'] = self.axis
- def curried_with_axis(x):
- return f(x, *args, **kwargs_with_axis)
- def curried(x):
- return f(x, *args, **kwargs)
- # preserve the name so we can detect it when calling plot methods,
- # to avoid duplicates
- curried.__name__ = curried_with_axis.__name__ = name
- # special case otherwise extra plots are created when catching the
- # exception below
- if name in base.plotting_methods:
- return self.apply(curried)
- try:
- return self.apply(curried_with_axis)
- except Exception:
- try:
- return self.apply(curried)
- except Exception:
- # related to : GH3688
- # try item-by-item
- # this can be called recursively, so need to raise
- # ValueError
- # if we don't have this method to indicated to aggregate to
- # mark this column as an error
- try:
- return self._aggregate_item_by_item(name,
- *args, **kwargs)
- except (AttributeError):
- raise ValueError
- return wrapper
- def get_group(self, name, obj=None):
- """
- Constructs NDFrame from group with provided name.
- Parameters
- ----------
- name : object
- the name of the group to get as a DataFrame
- obj : NDFrame, default None
- the NDFrame to take the DataFrame out of. If
- it is None, the object groupby was called on will
- be used
- Returns
- -------
- group : same type as obj
- """
- if obj is None:
- obj = self._selected_obj
- inds = self._get_index(name)
- if not len(inds):
- raise KeyError(name)
- return obj._take(inds, axis=self.axis)
- def __iter__(self):
- """
- Groupby iterator.
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- return self.grouper.get_iterator(self.obj, axis=self.axis)
- @Appender(_apply_docs['template']
- .format(input="dataframe",
- examples=_apply_docs['dataframe_examples']))
- def apply(self, func, *args, **kwargs):
- func = self._is_builtin_func(func)
- # this is needed so we don't try and wrap strings. If we could
- # resolve functions to their callable functions prior, this
- # wouldn't be needed
- if args or kwargs:
- if callable(func):
- @wraps(func)
- def f(g):
- with np.errstate(all='ignore'):
- return func(g, *args, **kwargs)
- else:
- raise ValueError('func must be a callable if args or '
- 'kwargs are supplied')
- else:
- f = func
- # ignore SettingWithCopy here in case the user mutates
- with option_context('mode.chained_assignment', None):
- try:
- result = self._python_apply_general(f)
- except Exception:
- # gh-20949
- # try again, with .apply acting as a filtering
- # operation, by excluding the grouping column
- # This would normally not be triggered
- # except if the udf is trying an operation that
- # fails on *some* columns, e.g. a numeric operation
- # on a string grouper column
- with _group_selection_context(self):
- return self._python_apply_general(f)
- return result
- def _python_apply_general(self, f):
- keys, values, mutated = self.grouper.apply(f, self._selected_obj,
- self.axis)
- return self._wrap_applied_output(
- keys,
- values,
- not_indexed_same=mutated or self.mutated)
- def _iterate_slices(self):
- yield self._selection_name, self._selected_obj
- def transform(self, func, *args, **kwargs):
- raise AbstractMethodError(self)
- def _cumcount_array(self, ascending=True):
- """
- Parameters
- ----------
- ascending : bool, default True
- If False, number in reverse, from length of group - 1 to 0.
- Notes
- -----
- this is currently implementing sort=False
- (though the default is sort=True) for groupby in general
- """
- ids, _, ngroups = self.grouper.group_info
- sorter = get_group_index_sorter(ids, ngroups)
- ids, count = ids[sorter], len(ids)
- if count == 0:
- return np.empty(0, dtype=np.int64)
- run = np.r_[True, ids[:-1] != ids[1:]]
- rep = np.diff(np.r_[np.nonzero(run)[0], count])
- out = (~run).cumsum()
- if ascending:
- out -= np.repeat(out[run], rep)
- else:
- out = np.repeat(out[np.r_[run[1:], True]], rep) - out
- rev = np.empty(count, dtype=np.intp)
- rev[sorter] = np.arange(count, dtype=np.intp)
- return out[rev].astype(np.int64, copy=False)
- def _try_cast(self, result, obj, numeric_only=False):
- """
- Try to cast the result to our obj original type,
- we may have roundtripped through object in the mean-time.
- If numeric_only is True, then only try to cast numerics
- and not datetimelikes.
- """
- if obj.ndim > 1:
- dtype = obj._values.dtype
- else:
- dtype = obj.dtype
- if not is_scalar(result):
- if is_extension_array_dtype(dtype):
- # The function can return something of any type, so check
- # if the type is compatible with the calling EA.
- try:
- result = obj._values._from_sequence(result, dtype=dtype)
- except Exception:
- # https://github.com/pandas-dev/pandas/issues/22850
- # pandas has no control over what 3rd-party ExtensionArrays
- # do in _values_from_sequence. We still want ops to work
- # though, so we catch any regular Exception.
- pass
- elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
- result = maybe_downcast_to_dtype(result, dtype)
- return result
- def _transform_should_cast(self, func_nm):
- """
- Parameters:
- -----------
- func_nm: str
- The name of the aggregation function being performed
- Returns:
- --------
- bool
- Whether transform should attempt to cast the result of aggregation
- """
- return (self.size().fillna(0) > 0).any() and (
- func_nm not in base.cython_cast_blacklist)
- def _cython_transform(self, how, numeric_only=True, **kwargs):
- output = collections.OrderedDict()
- for name, obj in self._iterate_slices():
- is_numeric = is_numeric_dtype(obj.dtype)
- if numeric_only and not is_numeric:
- continue
- try:
- result, names = self.grouper.transform(obj.values, how,
- **kwargs)
- except NotImplementedError:
- continue
- except AssertionError as e:
- raise GroupByError(str(e))
- if self._transform_should_cast(how):
- output[name] = self._try_cast(result, obj)
- else:
- output[name] = result
- if len(output) == 0:
- raise DataError('No numeric types to aggregate')
- return self._wrap_transformed_output(output, names)
- def _cython_agg_general(self, how, alt=None, numeric_only=True,
- min_count=-1):
- output = {}
- for name, obj in self._iterate_slices():
- is_numeric = is_numeric_dtype(obj.dtype)
- if numeric_only and not is_numeric:
- continue
- try:
- result, names = self.grouper.aggregate(obj.values, how,
- min_count=min_count)
- except AssertionError as e:
- raise GroupByError(str(e))
- output[name] = self._try_cast(result, obj)
- if len(output) == 0:
- raise DataError('No numeric types to aggregate')
- return self._wrap_aggregated_output(output, names)
- def _python_agg_general(self, func, *args, **kwargs):
- func = self._is_builtin_func(func)
- f = lambda x: func(x, *args, **kwargs)
- # iterate through "columns" ex exclusions to populate output dict
- output = {}
- for name, obj in self._iterate_slices():
- try:
- result, counts = self.grouper.agg_series(obj, f)
- output[name] = self._try_cast(result, obj, numeric_only=True)
- except TypeError:
- continue
- if len(output) == 0:
- return self._python_apply_general(f)
- if self.grouper._filter_empty_groups:
- mask = counts.ravel() > 0
- for name, result in compat.iteritems(output):
- # since we are masking, make sure that we have a float object
- values = result
- if is_numeric_dtype(values.dtype):
- values = ensure_float(values)
- output[name] = self._try_cast(values[mask], result)
- return self._wrap_aggregated_output(output)
- def _wrap_applied_output(self, *args, **kwargs):
- raise AbstractMethodError(self)
    def _concat_objects(self, keys, values, not_indexed_same=False):
        """
        Concatenate the per-group results in ``values`` back into a single
        object, restoring the original row order when the pieces still share
        the caller's index and optionally prepending the group keys.
        """
        from pandas.core.reshape.concat import concat

        def reset_identity(values):
            # reset the identities of the components
            # of the values to prevent aliasing
            for v in com._not_none(*values):
                ax = v._get_axis(self.axis)
                ax._reset_identity()
            return values

        if not not_indexed_same:
            # pieces keep the original index; concat then restore order
            result = concat(values, axis=self.axis)
            ax = self._selected_obj._get_axis(self.axis)

            if isinstance(result, Series):
                result = result.reindex(ax)
            else:

                # this is a very unfortunate situation
                # we have a multi-index that is NOT lexsorted
                # and we have a result which is duplicated
                # we can't reindex, so we resort to this
                # GH 14776
                if isinstance(ax, MultiIndex) and not ax.is_unique:
                    indexer = algorithms.unique1d(
                        result.index.get_indexer_for(ax.values))
                    result = result.take(indexer, axis=self.axis)
                else:
                    result = result.reindex(ax, axis=self.axis)

        elif self.group_keys:

            values = reset_identity(values)
            if self.as_index:

                # possible MI return case
                group_keys = keys
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(values, axis=self.axis, keys=group_keys,
                                levels=group_levels, names=group_names,
                                sort=False)
            else:

                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)
        else:
            values = reset_identity(values)
            result = concat(values, axis=self.axis)

        # propagate the selection name to a Series result, when we have one
        if (isinstance(result, Series) and
                getattr(self, '_selection_name', None) is not None):

            result.name = self._selection_name

        return result
    def _apply_filter(self, indices, dropna):
        """
        Build the filtered object from the per-group ``indices`` that passed.

        When ``dropna`` is True, rows not selected are dropped; otherwise the
        full shape is preserved and non-selected rows are NaN-masked.
        """
        if len(indices) == 0:
            indices = np.array([], dtype='int64')
        else:
            # restore original row order across the per-group index arrays
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered
class GroupBy(_GroupBy):
    """
    Class for grouping and aggregating relational data.

    See aggregate, transform, and apply functions on this object.

    It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:

    ::

        grouped = groupby(obj, ...)

    Parameters
    ----------
    obj : pandas object
    axis : int, default 0
    level : int, default None
        Level of MultiIndex
    groupings : list of Grouping objects
        Most users should ignore this
    exclusions : array-like, optional
        List of columns to exclude
    name : string
        Most users should ignore this

    Returns
    -------
    **Attributes**
    groups : dict
        {group name -> group labels}
    len(grouped) : int
        Number of groups

    Notes
    -----
    After grouping, see aggregate, apply, and transform functions. Here are
    some other brief notes about usage. When grouping by multiple groups, the
    result index will be a MultiIndex (hierarchical) by default.

    Iteration produces (key, group) tuples, i.e. chunking the data by group. So
    you can write code like:

    ::

        grouped = obj.groupby(keys, axis=axis)
        for key, group in grouped:
            # do something with the data

    Function calls on GroupBy, if not specially implemented, "dispatch" to the
    grouped data. So if you group a DataFrame and wish to invoke the std()
    method on each group, you can simply do:

    ::

        df.groupby(mapper).std()

    rather than

    ::

        df.groupby(mapper).aggregate(np.std)

    You can pass arguments to these "wrapped" functions, too.

    See the online documentation for full exposition on these topics and much
    more
    """
    def _bool_agg(self, val_test, skipna):
        """
        Shared func to call any / all Cython GroupBy implementations.
        """

        def objs_to_bool(vals):
            # coerce the values to a uint8 view for the Cython kernel
            try:
                vals = vals.astype(np.bool)
            except ValueError:  # for objects
                vals = np.array([bool(x) for x in vals])

            return vals.view(np.uint8)

        def result_to_bool(result):
            # map the uint8 Cython result back to a boolean array
            return result.astype(np.bool, copy=False)

        return self._get_cythonized_result('group_any_all', self.grouper,
                                           aggregate=True,
                                           cython_dtype=np.uint8,
                                           needs_values=True,
                                           needs_mask=True,
                                           pre_processing=objs_to_bool,
                                           post_processing=result_to_bool,
                                           val_test=val_test, skipna=skipna)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def any(self, skipna=True):
        """
        Returns True if any value in the group is truthful, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing
        """
        return self._bool_agg('any', skipna)

    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def all(self, skipna=True):
        """
        Returns True if all values in the group are truthful, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing
        """
        return self._bool_agg('all', skipna)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def count(self):
        """
        Compute count of group, excluding missing values.
        """
        # defined here for API doc; subclasses provide the implementation
        raise NotImplementedError
    @Substitution(name='groupby', see_also=_common_see_also)
    def mean(self, *args, **kwargs):
        """
        Compute mean of groups, excluding missing values.

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
               C
        A B
        1 2.0  2
          4.0  1
        2 3.0  1
          5.0  2

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """
        nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
        try:
            return self._cython_agg_general('mean', **kwargs)
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            # cython path failed; fall back to a python-space mean per group
            with _group_selection_context(self):
                f = lambda x: x.mean(axis=self.axis, **kwargs)
                return self._python_agg_general(f)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def median(self, **kwargs):
        """
        Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex
        """
        try:
            return self._cython_agg_general('median', **kwargs)
        except GroupByError:
            raise
        except Exception:  # pragma: no cover
            # cython path failed; compute the median in python space

            def f(x):
                if isinstance(x, np.ndarray):
                    x = Series(x)
                return x.median(axis=self.axis, **kwargs)
            with _group_selection_context(self):
                return self._python_agg_general(f)
- @Substitution(name='groupby')
- @Appender(_common_see_also)
- def std(self, ddof=1, *args, **kwargs):
- """
- Compute standard deviation of groups, excluding missing values.
- For multiple groupings, the result index will be a MultiIndex.
- Parameters
- ----------
- ddof : integer, default 1
- degrees of freedom
- """
- # TODO: implement at Cython level?
- nv.validate_groupby_func('std', args, kwargs)
- return np.sqrt(self.var(ddof=ddof, **kwargs))
- @Substitution(name='groupby')
- @Appender(_common_see_also)
- def var(self, ddof=1, *args, **kwargs):
- """
- Compute variance of groups, excluding missing values.
- For multiple groupings, the result index will be a MultiIndex.
- Parameters
- ----------
- ddof : integer, default 1
- degrees of freedom
- """
- nv.validate_groupby_func('var', args, kwargs)
- if ddof == 1:
- try:
- return self._cython_agg_general('var', **kwargs)
- except Exception:
- f = lambda x: x.var(ddof=ddof, **kwargs)
- with _group_selection_context(self):
- return self._python_agg_general(f)
- else:
- f = lambda x: x.var(ddof=ddof, **kwargs)
- with _group_selection_context(self):
- return self._python_agg_general(f)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def sem(self, ddof=1):
        """
        Compute standard error of the mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : integer, default 1
            degrees of freedom
        """
        # standard error = group std / sqrt(group count)
        return self.std(ddof=ddof) / np.sqrt(self.count())
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def size(self):
        """
        Compute group sizes.
        """
        result = self.grouper.size()

        # preserve the name of a Series caller on the result
        if isinstance(self.obj, Series):
            result.name = getattr(self.obj, 'name', None)
        return result
    @classmethod
    def _add_numeric_operations(cls):
        """
        Add numeric operations to the GroupBy generically.
        """

        def groupby_function(name, alias, npfunc,
                             numeric_only=True, _convert=False,
                             min_count=-1):
            # factory for the named aggregations (sum/prod/min/max/first/last):
            # the generated method tries the Cython path and falls back to a
            # python aggregation via ``npfunc``

            _local_template = "Compute %(f)s of group values"

            @Substitution(name='groupby', f=name)
            @Appender(_common_see_also)
            @Appender(_local_template)
            def f(self, **kwargs):
                if 'numeric_only' not in kwargs:
                    kwargs['numeric_only'] = numeric_only
                if 'min_count' not in kwargs:
                    kwargs['min_count'] = min_count

                self._set_group_selection()
                try:
                    return self._cython_agg_general(
                        alias, alt=npfunc, **kwargs)
                except AssertionError as e:
                    raise SpecificationError(str(e))
                except Exception:
                    # fall back to a python-space aggregate
                    result = self.aggregate(
                        lambda x: npfunc(x, axis=self.axis))
                    if _convert:
                        result = result._convert(datetime=True)
                    return result

            set_function_name(f, name, cls)

            return f

        def first_compat(x, axis=0):
            # first non-null value per column (NaN when all values are null)

            def first(x):
                x = x.to_numpy()

                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan
                return x[0]

            if isinstance(x, DataFrame):
                return x.apply(first, axis=axis)
            else:
                return first(x)

        def last_compat(x, axis=0):
            # last non-null value per column (NaN when all values are null)

            def last(x):
                x = x.to_numpy()

                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan
                return x[-1]

            if isinstance(x, DataFrame):
                return x.apply(last, axis=axis)
            else:
                return last(x)

        cls.sum = groupby_function('sum', 'add', np.sum, min_count=0)
        cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0)
        cls.min = groupby_function('min', 'min', np.min, numeric_only=False)
        cls.max = groupby_function('max', 'max', np.max, numeric_only=False)
        cls.first = groupby_function('first', 'first', first_compat,
                                     numeric_only=False)
        cls.last = groupby_function('last', 'last', last_compat,
                                    numeric_only=False)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def ohlc(self):
        """
        Compute open, high, low and close values of a group, excluding
        missing values.

        For multiple groupings, the result index will be a MultiIndex
        """
        return self._apply_to_column_groupbys(
            lambda x: x._cython_agg_general('ohlc'))
    @Appender(DataFrame.describe.__doc__)
    def describe(self, **kwargs):
        # describe each group, then reshape so each group is a single row
        with _group_selection_context(self):
            result = self.apply(lambda x: x.describe(**kwargs))
            if self.axis == 1:
                return result.T
            return result.unstack()
    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper.

        Given a grouper, the function resamples it according to a string
        "string" -> "frequency".

        See the :ref:`frequency aliases <timeseries.offset_aliases>`
        documentation for more details.

        Parameters
        ----------
        rule : str or DateOffset
            The offset string or object representing target grouper conversion.
        *args, **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
            `on`, and other arguments of `TimeGrouper`.

        Returns
        -------
        Grouper
            Return a new grouper with our resampler appended.

        See Also
        --------
        pandas.Grouper : Specify a frequency to resample with when
            grouping by a key.
        DatetimeIndex.resample : Frequency conversion and resampling of
            time series.

        Examples
        --------
        >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
        >>> df = pd.DataFrame(data=4 * [range(2)],
        ...                   index=idx,
        ...                   columns=['a', 'b'])
        >>> df.iloc[2, 0] = 5
        >>> df
                             a  b
        2000-01-01 00:00:00  0  1
        2000-01-01 00:01:00  0  1
        2000-01-01 00:02:00  5  1
        2000-01-01 00:03:00  0  1

        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

        >>> df.groupby('a').resample('3T').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  2
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:00:00  5  1

        Upsample the series into 30 second bins.

        >>> df.groupby('a').resample('30S').sum()
                            a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:00:30  0  0
            2000-01-01 00:01:00  0  1
            2000-01-01 00:01:30  0  0
            2000-01-01 00:02:00  0  0
            2000-01-01 00:02:30  0  0
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:02:00  5  1

        Resample by month. Values are assigned to the month of the period.

        >>> df.groupby('a').resample('M').sum()
                    a  b
        a
        0   2000-01-31  0  3
        5   2000-01-31  5  1

        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

        >>> df.groupby('a').resample('3T', closed='right').sum()
                                 a  b
        a
        0   1999-12-31 23:57:00  0  1
            2000-01-01 00:00:00  0  2
        5   2000-01-01 00:00:00  5  1

        Downsample the series into 3 minute bins and close the right side of
        the bin interval, but label each bin using the right edge instead of
        the left.

        >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:03:00  0  2
        5   2000-01-01 00:03:00  5  1

        Add an offset of twenty seconds.

        >>> df.groupby('a').resample('3T', loffset='20s').sum()
                                 a  b
        a
        0   2000-01-01 00:00:20  0  2
            2000-01-01 00:03:20  0  1
        5   2000-01-01 00:00:20  5  1
        """
        from pandas.core.resample import get_resampler_for_grouping
        return get_resampler_for_grouping(self, rule, *args, **kwargs)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def rolling(self, *args, **kwargs):
        """
        Return a rolling grouper, providing rolling functionality per group.
        """
        from pandas.core.window import RollingGroupby
        return RollingGroupby(self, *args, **kwargs)

    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def expanding(self, *args, **kwargs):
        """
        Return an expanding grouper, providing expanding
        functionality per group.
        """
        from pandas.core.window import ExpandingGroupby
        return ExpandingGroupby(self, *args, **kwargs)
    def _fill(self, direction, limit=None):
        """
        Shared function for `pad` and `backfill` to call Cython method.

        Parameters
        ----------
        direction : {'ffill', 'bfill'}
            Direction passed to underlying Cython function. `bfill` will cause
            values to be filled backwards. `ffill` and any other values will
            default to a forward fill
        limit : int, default None
            Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython

        Returns
        -------
        `Series` or `DataFrame` with filled values

        See Also
        --------
        pad
        backfill
        """
        # Need int value for Cython
        if limit is None:
            limit = -1

        return self._get_cythonized_result('group_fillna_indexer',
                                           self.grouper, needs_mask=True,
                                           cython_dtype=np.int64,
                                           result_is_index=True,
                                           direction=direction, limit=limit)
    @Substitution(name='groupby')
    def pad(self, limit=None):
        """
        Forward fill the values.

        Parameters
        ----------
        limit : integer, optional
            limit of how many values to fill

        See Also
        --------
        Series.pad
        DataFrame.pad
        Series.fillna
        DataFrame.fillna
        """
        return self._fill('ffill', limit=limit)
    ffill = pad  # alias

    @Substitution(name='groupby')
    def backfill(self, limit=None):
        """
        Backward fill the values.

        Parameters
        ----------
        limit : integer, optional
            limit of how many values to fill

        See Also
        --------
        Series.backfill
        DataFrame.backfill
        Series.fillna
        DataFrame.fillna
        """
        return self._fill('bfill', limit=limit)
    bfill = backfill  # alias
- @Substitution(name='groupby', see_also=_common_see_also)
- def nth(self, n, dropna=None):
- """
- Take the nth row from each group if n is an int, or a subset of rows
- if n is a list of ints.
- If dropna, will take the nth non-null row, dropna is either
- Truthy (if a Series) or 'all', 'any' (if a DataFrame);
- this is equivalent to calling dropna(how=dropna) before the
- groupby.
- Parameters
- ----------
- n : int or list of ints
- a single nth value for the row or a list of nth values
- dropna : None or str, optional
- apply the specified dropna operation before counting which row is
- the nth row. Needs to be None, 'any' or 'all'
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
- ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
- >>> g = df.groupby('A')
- >>> g.nth(0)
- B
- A
- 1 NaN
- 2 3.0
- >>> g.nth(1)
- B
- A
- 1 2.0
- 2 5.0
- >>> g.nth(-1)
- B
- A
- 1 4.0
- 2 5.0
- >>> g.nth([0, 1])
- B
- A
- 1 NaN
- 1 2.0
- 2 3.0
- 2 5.0
- Specifying `dropna` allows count ignoring ``NaN``
- >>> g.nth(0, dropna='any')
- B
- A
- 1 2.0
- 2 3.0
- NaNs denote group exhausted when using dropna
- >>> g.nth(3, dropna='any')
- B
- A
- 1 NaN
- 2 NaN
- Specifying `as_index=False` in `groupby` keeps the original index.
- >>> df.groupby('A', as_index=False).nth(1)
- A B
- 1 1 2.0
- 4 2 5.0
- """
- if isinstance(n, int):
- nth_values = [n]
- elif isinstance(n, (set, list, tuple)):
- nth_values = list(set(n))
- if dropna is not None:
- raise ValueError(
- "dropna option with a list of nth values is not supported")
- else:
- raise TypeError("n needs to be an int or a list/set/tuple of ints")
- nth_values = np.array(nth_values, dtype=np.intp)
- self._set_group_selection()
- if not dropna:
- mask_left = np.in1d(self._cumcount_array(), nth_values)
- mask_right = np.in1d(self._cumcount_array(ascending=False) + 1,
- -nth_values)
- mask = mask_left | mask_right
- out = self._selected_obj[mask]
- if not self.as_index:
- return out
- ids, _, _ = self.grouper.group_info
- out.index = self.grouper.result_index[ids[mask]]
- return out.sort_index() if self.sort else out
- if dropna not in ['any', 'all']:
- if isinstance(self._selected_obj, Series) and dropna is True:
- warnings.warn("the dropna={dropna} keyword is deprecated,"
- "use dropna='all' instead. "
- "For a Series groupby, dropna must be "
- "either None, 'any' or 'all'.".format(
- dropna=dropna),
- FutureWarning,
- stacklevel=2)
- dropna = 'all'
- else:
- # Note: when agg-ing picker doesn't raise this,
- # just returns NaN
- raise ValueError("For a DataFrame groupby, dropna must be "
- "either None, 'any' or 'all', "
- "(was passed {dropna}).".format(
- dropna=dropna))
- # old behaviour, but with all and any support for DataFrames.
- # modified in GH 7559 to have better perf
- max_len = n if n >= 0 else - 1 - n
- dropped = self.obj.dropna(how=dropna, axis=self.axis)
- # get a new grouper for our dropped obj
- if self.keys is None and self.level is None:
- # we don't have the grouper info available
- # (e.g. we have selected out
- # a column that is not in the current object)
- axis = self.grouper.axis
- grouper = axis[axis.isin(dropped.index)]
- else:
- # create a grouper with the original parameters, but on the dropped
- # object
- from pandas.core.groupby.grouper import _get_grouper
- grouper, _, _ = _get_grouper(dropped, key=self.keys,
- axis=self.axis, level=self.level,
- sort=self.sort,
- mutated=self.mutated)
- grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
- sizes, result = grb.size(), grb.nth(n)
- mask = (sizes < max_len).values
- # set the results which don't meet the criteria
- if len(result) and mask.any():
- result.loc[mask] = np.nan
- # reset/reindex to the original groups
- if (len(self.obj) == len(dropped) or
- len(result) == len(self.grouper.result_index)):
- result.index = self.grouper.result_index
- else:
- result = result.reindex(self.grouper.result_index)
- return result
    @Substitution(name='groupby')
    def ngroup(self, ascending=True):
        """
        Number each group from 0 to the number of groups - 1.

        This is the enumerative complement of cumcount. Note that the
        numbers given to the groups match the order in which the groups
        would be seen when iterating over the groupby object, not the
        order they are first observed.

        .. versionadded:: 0.20.2

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from number of group - 1 to 0.

        See Also
        --------
        .cumcount : Number the rows in each group.

        Examples
        --------
        >>> df = pd.DataFrame({"A": list("aaabba")})
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').ngroup()
        0    0
        1    0
        2    0
        3    1
        4    1
        5    0
        dtype: int64
        >>> df.groupby('A').ngroup(ascending=False)
        0    1
        1    1
        2    1
        3    0
        4    0
        5    1
        dtype: int64
        >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup()
        0    0
        1    0
        2    1
        3    3
        4    2
        5    0
        dtype: int64
        """
        with _group_selection_context(self):
            index = self._selected_obj.index
            # group_info[0] holds the group number of every row
            result = Series(self.grouper.group_info[0], index)
            if not ascending:
                result = self.ngroups - 1 - result
            return result
    @Substitution(name='groupby')
    def cumcount(self, ascending=True):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        >>> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        See Also
        --------
        .ngroup : Number the groups themselves.

        Examples
        --------
        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """
        with _group_selection_context(self):
            index = self._selected_obj.index
            cumcounts = self._cumcount_array(ascending=ascending)
            return Series(cumcounts, index)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def rank(self, method='average', ascending=True, na_option='keep',
             pct=False, axis=0):
        """
        Provides the rank of values within each group.

        Parameters
        ----------
        method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
            * average: average rank of group
            * min: lowest rank in group
            * max: highest rank in group
            * first: ranks assigned in order they appear in the array
            * dense: like 'min', but rank always increases by 1 between groups
        ascending : boolean, default True
            False for ranks by high (1) to low (N)
        na_option : {'keep', 'top', 'bottom'}, default 'keep'
            * keep: leave NA values where they are
            * top: smallest rank if ascending
            * bottom: smallest rank if descending
        pct : boolean, default False
            Compute percentage rank of data within each group
        axis : int, default 0
            The axis of the object over which to compute the rank.

        Returns
        -------
        DataFrame with ranking of values within each group
        """
        # validate here so a bad option fails before dispatching to cython
        if na_option not in {'keep', 'top', 'bottom'}:
            msg = "na_option must be one of 'keep', 'top', or 'bottom'"
            raise ValueError(msg)
        return self._cython_transform('rank', numeric_only=False,
                                      ties_method=method, ascending=ascending,
                                      na_option=na_option, pct=pct, axis=axis)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def cumprod(self, axis=0, *args, **kwargs):
        """
        Cumulative product for each group.
        """
        nv.validate_groupby_func('cumprod', args, kwargs,
                                 ['numeric_only', 'skipna'])
        if axis != 0:
            # non-default axis cannot use the cython transform path
            return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))

        return self._cython_transform('cumprod', **kwargs)

    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def cumsum(self, axis=0, *args, **kwargs):
        """
        Cumulative sum for each group.
        """
        nv.validate_groupby_func('cumsum', args, kwargs,
                                 ['numeric_only', 'skipna'])
        if axis != 0:
            # non-default axis cannot use the cython transform path
            return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))

        return self._cython_transform('cumsum', **kwargs)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def cummin(self, axis=0, **kwargs):
        """
        Cumulative min for each group.
        """
        if axis != 0:
            # non-default axis cannot use the cython transform path
            return self.apply(lambda x: np.minimum.accumulate(x, axis))

        return self._cython_transform('cummin', numeric_only=False)

    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def cummax(self, axis=0, **kwargs):
        """
        Cumulative max for each group.
        """
        if axis != 0:
            # non-default axis cannot use the cython transform path
            return self.apply(lambda x: np.maximum.accumulate(x, axis))

        return self._cython_transform('cummax', numeric_only=False)
- def _get_cythonized_result(self, how, grouper, aggregate=False,
- cython_dtype=None, needs_values=False,
- needs_mask=False, needs_ngroups=False,
- result_is_index=False,
- pre_processing=None, post_processing=None,
- **kwargs):
- """
- Get result for Cythonized functions.
- Parameters
- ----------
- how : str, Cythonized function name to be called
- grouper : Grouper object containing pertinent group info
- aggregate : bool, default False
- Whether the result should be aggregated to match the number of
- groups
- cython_dtype : default None
- Type of the array that will be modified by the Cython call. If
- `None`, the type will be inferred from the values of each slice
- needs_values : bool, default False
- Whether the values should be a part of the Cython call
- signature
- needs_mask : bool, default False
- Whether boolean mask needs to be part of the Cython call
- signature
- needs_ngroups : bool, default False
- Whether number of groups is part of the Cython call signature
- result_is_index : bool, default False
- Whether the result of the Cython operation is an index of
- values to be retrieved, instead of the actual values themselves
- pre_processing : function, default None
- Function to be applied to `values` prior to passing to Cython
- Raises if `needs_values` is False
- post_processing : function, default None
- Function to be applied to result of Cython function
- **kwargs : dict
- Extra arguments to be passed back to Cython funcs
- Returns
- -------
- `Series` or `DataFrame` with filled values
- """
- if result_is_index and aggregate:
- raise ValueError("'result_is_index' and 'aggregate' cannot both "
- "be True!")
- if post_processing:
- if not callable(pre_processing):
- raise ValueError("'post_processing' must be a callable!")
- if pre_processing:
- if not callable(pre_processing):
- raise ValueError("'pre_processing' must be a callable!")
- if not needs_values:
- raise ValueError("Cannot use 'pre_processing' without "
- "specifying 'needs_values'!")
- labels, _, ngroups = grouper.group_info
- output = collections.OrderedDict()
- base_func = getattr(libgroupby, how)
- for name, obj in self._iterate_slices():
- if aggregate:
- result_sz = ngroups
- else:
- result_sz = len(obj.values)
- if not cython_dtype:
- cython_dtype = obj.values.dtype
- result = np.zeros(result_sz, dtype=cython_dtype)
- func = partial(base_func, result, labels)
- if needs_values:
- vals = obj.values
- if pre_processing:
- vals = pre_processing(vals)
- func = partial(func, vals)
- if needs_mask:
- mask = isna(obj.values).view(np.uint8)
- func = partial(func, mask)
- if needs_ngroups:
- func = partial(func, ngroups)
- func(**kwargs) # Call func to modify indexer values in place
- if result_is_index:
- result = algorithms.take_nd(obj.values, result)
- if post_processing:
- result = post_processing(result)
- output[name] = result
- if aggregate:
- return self._wrap_aggregated_output(output)
- else:
- return self._wrap_transformed_output(output)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def shift(self, periods=1, freq=None, axis=0, fill_value=None):
        """
        Shift each group by periods observations.

        Parameters
        ----------
        periods : integer, default 1
            number of periods to shift
        freq : frequency string
        axis : axis to shift, default 0
        fill_value : optional

            .. versionadded:: 0.24.0
        """
        # anything beyond a simple integer row shift dispatches to apply
        if freq is not None or axis != 0 or not isna(fill_value):
            return self.apply(lambda x: x.shift(periods, freq,
                                                axis, fill_value))

        return self._get_cythonized_result('group_shift_indexer',
                                           self.grouper, cython_dtype=np.int64,
                                           needs_ngroups=True,
                                           result_is_index=True,
                                           periods=periods)
    @Substitution(name='groupby')
    @Appender(_common_see_also)
    def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
                   axis=0):
        """
        Calculate pct_change of each value to previous entry in group.
        """
        # non-default freq/axis cannot use the cython shift path; dispatch
        # the whole pct_change to each group instead
        if freq is not None or axis != 0:
            return self.apply(lambda x: x.pct_change(periods=periods,
                                                     fill_method=fill_method,
                                                     limit=limit, freq=freq,
                                                     axis=axis))

        # fill, drop the grouping columns, then compare against the
        # group-wise shifted values
        filled = getattr(self, fill_method)(limit=limit)
        filled = filled.drop(self.grouper.names, axis=1)
        fill_grp = filled.groupby(self.grouper.labels)
        shifted = fill_grp.shift(periods=periods, freq=freq)

        return (filled / shifted) - 1
    @Substitution(name='groupby', see_also=_common_see_also)
    def head(self, n=5):
        """
        Returns first n rows of each group.

        Essentially equivalent to ``.apply(lambda x: x.head(n))``,
        except ignores as_index flag.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A', as_index=False).head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        """
        self._reset_group_selection()
        # keep rows whose within-group position is < n
        mask = self._cumcount_array() < n
        return self._selected_obj[mask]

    @Substitution(name='groupby', see_also=_common_see_also)
    def tail(self, n=5):
        """
        Returns last n rows of each group.

        Essentially equivalent to ``.apply(lambda x: x.tail(n))``,
        except ignores as_index flag.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A').tail(1)
           A  B
        1  a  2
        3  b  2
        >>> df.groupby('A').head(1)
           A  B
        0  a  1
        2  b  1
        """
        self._reset_group_selection()
        # keep rows whose position from the end of the group is < n
        mask = self._cumcount_array(ascending=False) < n
        return self._selected_obj[mask]
GroupBy._add_numeric_operations()


@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
    # Dispatch to the concrete GroupBy subclass matching the type of ``obj``.
    # The imports are deferred to avoid circular imports at module load time.
    if isinstance(obj, Series):
        from pandas.core.groupby.generic import SeriesGroupBy as klass
    elif isinstance(obj, DataFrame):
        from pandas.core.groupby.generic import DataFrameGroupBy as klass
    else:  # pragma: no cover
        raise TypeError('invalid type: {}'.format(obj))

    return klass(obj, by, **kwds)
|