grouper.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632
  1. """
  2. Provide user facing operators for doing the split part of the
  3. split-apply-combine paradigm.
  4. """
  5. import warnings
  6. import numpy as np
  7. import pandas.compat as compat
  8. from pandas.compat import callable, zip
  9. from pandas.util._decorators import cache_readonly
  10. from pandas.core.dtypes.common import (
  11. ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable,
  12. is_list_like, is_scalar, is_timedelta64_dtype)
  13. from pandas.core.dtypes.generic import ABCSeries
  14. import pandas.core.algorithms as algorithms
  15. from pandas.core.arrays import Categorical, ExtensionArray
  16. import pandas.core.common as com
  17. from pandas.core.frame import DataFrame
  18. from pandas.core.groupby.ops import BaseGrouper
  19. from pandas.core.index import CategoricalIndex, Index, MultiIndex
  20. from pandas.core.series import Series
  21. from pandas.io.formats.printing import pprint_thing
  22. class Grouper(object):
  23. """
  24. A Grouper allows the user to specify a groupby instruction for a target
  25. object
  26. This specification will select a column via the key parameter, or if the
  27. level and/or axis parameters are given, a level of the index of the target
  28. object.
  29. These are local specifications and will override 'global' settings,
  30. that is the parameters axis and level which are passed to the groupby
  31. itself.
  32. Parameters
  33. ----------
  34. key : string, defaults to None
  35. groupby key, which selects the grouping column of the target
  36. level : name/number, defaults to None
  37. the level for the target index
  38. freq : string / frequency object, defaults to None
  39. This will groupby the specified frequency if the target selection
  40. (via key or level) is a datetime-like object. For full specification
  41. of available frequencies, please see `here
  42. <http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases>`_.
  43. axis : number/name of the axis, defaults to 0
  44. sort : boolean, default to False
  45. whether to sort the resulting labels
  46. additional kwargs to control time-like groupers (when `freq` is passed)
  47. closed : closed end of interval; 'left' or 'right'
  48. label : interval boundary to use for labeling; 'left' or 'right'
  49. convention : {'start', 'end', 'e', 's'}
  50. If grouper is PeriodIndex
  51. base, loffset
  52. Returns
  53. -------
  54. A specification for a groupby instruction
  55. Examples
  56. --------
  57. Syntactic sugar for ``df.groupby('A')``
  58. >>> df.groupby(Grouper(key='A'))
  59. Specify a resample operation on the column 'date'
  60. >>> df.groupby(Grouper(key='date', freq='60s'))
  61. Specify a resample operation on the level 'date' on the columns axis
  62. with a frequency of 60s
  63. >>> df.groupby(Grouper(level='date', freq='60s', axis=1))
  64. """
  65. _attributes = ('key', 'level', 'freq', 'axis', 'sort')
  66. def __new__(cls, *args, **kwargs):
  67. if kwargs.get('freq') is not None:
  68. from pandas.core.resample import TimeGrouper
  69. cls = TimeGrouper
  70. return super(Grouper, cls).__new__(cls)
  71. def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
  72. self.key = key
  73. self.level = level
  74. self.freq = freq
  75. self.axis = axis
  76. self.sort = sort
  77. self.grouper = None
  78. self.obj = None
  79. self.indexer = None
  80. self.binner = None
  81. self._grouper = None
  82. @property
  83. def ax(self):
  84. return self.grouper
  85. def _get_grouper(self, obj, validate=True):
  86. """
  87. Parameters
  88. ----------
  89. obj : the subject object
  90. validate : boolean, default True
  91. if True, validate the grouper
  92. Returns
  93. -------
  94. a tuple of binner, grouper, obj (possibly sorted)
  95. """
  96. self._set_grouper(obj)
  97. self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
  98. axis=self.axis,
  99. level=self.level,
  100. sort=self.sort,
  101. validate=validate)
  102. return self.binner, self.grouper, self.obj
  103. def _set_grouper(self, obj, sort=False):
  104. """
  105. given an object and the specifications, setup the internal grouper
  106. for this particular specification
  107. Parameters
  108. ----------
  109. obj : the subject object
  110. sort : bool, default False
  111. whether the resulting grouper should be sorted
  112. """
  113. if self.key is not None and self.level is not None:
  114. raise ValueError(
  115. "The Grouper cannot specify both a key and a level!")
  116. # Keep self.grouper value before overriding
  117. if self._grouper is None:
  118. self._grouper = self.grouper
  119. # the key must be a valid info item
  120. if self.key is not None:
  121. key = self.key
  122. # The 'on' is already defined
  123. if (getattr(self.grouper, 'name', None) == key and
  124. isinstance(obj, ABCSeries)):
  125. ax = self._grouper.take(obj.index)
  126. else:
  127. if key not in obj._info_axis:
  128. raise KeyError(
  129. "The grouper name {0} is not found".format(key))
  130. ax = Index(obj[key], name=key)
  131. else:
  132. ax = obj._get_axis(self.axis)
  133. if self.level is not None:
  134. level = self.level
  135. # if a level is given it must be a mi level or
  136. # equivalent to the axis name
  137. if isinstance(ax, MultiIndex):
  138. level = ax._get_level_number(level)
  139. ax = Index(ax._get_level_values(level),
  140. name=ax.names[level])
  141. else:
  142. if level not in (0, ax.name):
  143. raise ValueError(
  144. "The level {0} is not valid".format(level))
  145. # possibly sort
  146. if (self.sort or sort) and not ax.is_monotonic:
  147. # use stable sort to support first, last, nth
  148. indexer = self.indexer = ax.argsort(kind='mergesort')
  149. ax = ax.take(indexer)
  150. obj = obj._take(indexer, axis=self.axis, is_copy=False)
  151. self.obj = obj
  152. self.grouper = ax
  153. return self.grouper
  154. @property
  155. def groups(self):
  156. return self.grouper.groups
  157. def __repr__(self):
  158. attrs_list = ["{}={!r}".format(attr_name, getattr(self, attr_name))
  159. for attr_name in self._attributes
  160. if getattr(self, attr_name) is not None]
  161. attrs = ", ".join(attrs_list)
  162. cls_name = self.__class__.__name__
  163. return "{}({})".format(cls_name, attrs)
class Grouping(object):
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
        The axis being grouped. The grouper is validated against its length
        and, when the grouper is not array-like, it is applied to this index
        through ``index.map``.
    grouper :
        The object to group by. It is first passed through
        ``_convert_grouper``; the constructor then handles a MultiIndex,
        a Grouper, a list/tuple, a categorical, another Grouping, or an
        array-like (Series, Index, ExtensionArray, ndarray). Anything else
        is mapped over ``index``.
    obj :
        The object being grouped. It is indexed with ``name`` when no
        grouper is given, and handed to a passed Grouper.
    name :
        Name of the grouping. Defaults to ``grouper.name`` for a Series or
        Index grouper, or to the level name when ``level`` is given.
    level :
        Level of ``index`` to group by; either an int or a name present in
        ``index.names``.
    sort : boolean, default True
        Forwarded to the categorical recoding and to
        ``algorithms.factorize`` when labels are computed.
    observed : boolean, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        Groupby.exclusions list

    Returns
    -------
    **Attributes**:
      * indices : dict of {group -> index_list}
      * labels : ndarray, group labels
      * ids : mapping of label -> group
      * counts : array of group counts
      * group_index : unique groups
      * groups : dict of {group -> label_list}
    """

    def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 sort=True, observed=False, in_axis=False):

        self.name = name
        self.level = level
        # dict -> its ``get``; Series -> aligned values; others are
        # length-checked (see ``_convert_grouper``)
        self.grouper = _convert_grouper(index, grouper)
        # only set by the categorical branch below; used by result_index
        self.all_grouper = None
        self.index = index
        self.sort = sort
        self.obj = obj
        self.observed = observed
        self.in_axis = in_axis

        # right place for this?
        # NOTE: these two checks look at the *original* ``grouper`` argument,
        # not at the converted ``self.grouper``
        if isinstance(grouper, (Series, Index)) and name is None:
            self.name = grouper.name

        if isinstance(grouper, MultiIndex):
            self.grouper = grouper.values

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        if level is not None:
            # resolve a level name to its position in index.names
            if not isinstance(level, int):
                if level not in index.names:
                    raise AssertionError('Level {} not in index'.format(level))
                level = index.names.index(level)

            if self.name is None:
                self.name = index.names[level]

            # the index supplies grouper, labels and group index in one go,
            # so _make_labels has nothing left to compute
            self.grouper, self._labels, self._group_index = \
                index._get_grouper_for_level(self.grouper, level)

        # a passed Grouper like, directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get labels
        elif isinstance(self.grouper, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
            if self.name is None:
                self.name = grouper.result_index.name
            # take obj from the Grouper before self.grouper is replaced by
            # the grouper it produced
            self.obj = self.grouper.obj
            self.grouper = grouper

        else:
            # no grouper given: fall back to obj[name]
            if self.grouper is None and self.name is not None:
                self.grouper = self.obj[self.name]

            elif isinstance(self.grouper, (list, tuple)):
                self.grouper = com.asarray_tuplesafe(self.grouper)

            # a passed Categorical
            elif is_categorical_dtype(self.grouper):

                from pandas.core.groupby.categorical import recode_for_groupby
                self.grouper, self.all_grouper = recode_for_groupby(
                    self.grouper, self.sort, observed)
                categories = self.grouper.categories

                # we make a CategoricalIndex out of the cat grouper
                # preserving the categories / ordered attributes
                self._labels = self.grouper.codes
                if observed:
                    # keep only the codes that occur, dropping the -1 code
                    codes = algorithms.unique1d(self.grouper.codes)
                    codes = codes[codes != -1]
                else:
                    # one code per category, whether it occurs or not
                    codes = np.arange(len(categories))

                self._group_index = CategoricalIndex(
                    Categorical.from_codes(
                        codes=codes,
                        categories=categories,
                        ordered=self.grouper.ordered))

            # we are done
            # NOTE: this is a separate if/elif chain from the one above, so
            # it also runs after any of those branches has rebound
            # self.grouper
            if isinstance(self.grouper, Grouping):
                self.grouper = self.grouper.grouper

            # no level passed
            elif not isinstance(self.grouper,
                                (Series, Index, ExtensionArray, np.ndarray)):
                # an object without ``ndim`` is treated as 1-dimensional
                if getattr(self.grouper, 'ndim', 1) != 1:
                    t = self.name or str(type(self.grouper))
                    raise ValueError(
                        "Grouper for '{}' not 1-dimensional".format(t))
                self.grouper = self.index.map(self.grouper)
                # the mapped result must be sized and as long as the index
                if not (hasattr(self.grouper, "__len__") and
                        len(self.grouper) == len(self.index)):
                    errmsg = ('Grouper result violates len(labels) == '
                              'len(data)\nresult: %s' %
                              pprint_thing(self.grouper))
                    self.grouper = None  # Try for sanity
                    raise AssertionError(errmsg)

        # if we have a date/time-like grouper, make sure that we have
        # Timestamps like
        if getattr(self.grouper, 'dtype', None) is not None:
            if is_datetime64_dtype(self.grouper):
                from pandas import to_datetime
                self.grouper = to_datetime(self.grouper)
            elif is_timedelta64_dtype(self.grouper):
                from pandas import to_timedelta
                self.grouper = to_timedelta(self.grouper)

    def __repr__(self):
        return 'Grouping({0})'.format(self.name)

    def __iter__(self):
        """Iterate over ``self.indices``."""
        return iter(self.indices)

    # class-level defaults; filled by __init__ (level / categorical branches)
    # or lazily by _make_labels
    _labels = None
    _group_index = None

    @property
    def ngroups(self):
        """Number of entries in ``group_index``."""
        return len(self.group_index)

    @cache_readonly
    def indices(self):
        """
        Return ``grouper.indices`` for a BaseGrouper; otherwise the reverse
        indexer of the grouper converted with ``ensure_categorical``.
        """
        # we have a list of groupers
        if isinstance(self.grouper, BaseGrouper):
            return self.grouper.indices

        values = ensure_categorical(self.grouper)
        return values._reverse_indexer()

    @property
    def labels(self):
        """Group labels, computed through ``_make_labels`` if not yet set."""
        if self._labels is None:
            self._make_labels()
        return self._labels

    @cache_readonly
    def result_index(self):
        """
        Return ``group_index``, passed through ``recode_from_groupby`` when
        ``all_grouper`` was set by the categorical branch of ``__init__``.
        """
        if self.all_grouper is not None:
            from pandas.core.groupby.categorical import recode_from_groupby
            return recode_from_groupby(self.all_grouper,
                                       self.sort, self.group_index)
        return self.group_index

    @property
    def group_index(self):
        """Group index, computed through ``_make_labels`` if not yet set."""
        if self._group_index is None:
            self._make_labels()
        return self._group_index

    def _make_labels(self):
        """
        Fill ``_labels`` and ``_group_index`` if either is still None.

        For a BaseGrouper both come from the grouper itself; otherwise they
        come from ``algorithms.factorize`` with ``sort=self.sort``, the
        uniques being wrapped in an Index named ``self.name``.
        """
        if self._labels is None or self._group_index is None:
            # we have a list of groupers
            if isinstance(self.grouper, BaseGrouper):
                labels = self.grouper.label_info
                uniques = self.grouper.result_index
            else:
                labels, uniques = algorithms.factorize(
                    self.grouper, sort=self.sort)
                uniques = Index(uniques, name=self.name)
            # NOTE: both are overwritten, even if only one of them was None
            self._labels = labels
            self._group_index = uniques

    @cache_readonly
    def groups(self):
        """
        Group ``self.index`` by a Categorical built from ``labels`` (as
        codes) and ``group_index`` (as categories).
        """
        return self.index.groupby(Categorical.from_codes(self.labels,
                                                         self.group_index))
def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
                 observed=False, mutated=False, validate=True):
    """
    create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values

    If validate, then check for key/level overlaps

    Parameters
    ----------
    obj : the object to group; its axis is taken with ``obj._get_axis(axis)``
    key : a single key, a list of keys, a tuple, a Grouper or a BaseGrouper
    axis : axis of ``obj`` to group along, default 0
    level : level (or list-like of levels) of the group axis, default None
    sort : boolean, default True
        forwarded to every Grouping and to the BaseGrouper
    observed : boolean, default False
        forwarded to every Grouping
    mutated : boolean, default False
        forwarded to the BaseGrouper
    validate : boolean, default True
        if True, ``obj._check_label_or_level_ambiguity`` is called for
        every key that is found in ``obj``

    Returns
    -------
    a tuple of grouper, exclusions, obj
        ``exclusions`` is a list, except when ``key`` is a Grouper whose
        own ``key`` is set, in which case it is the set ``{key.key}``

    Raises
    ------
    ValueError
        for an invalid level on a non-MultiIndex axis, for a categorical
        grouper whose length differs from the axis, or when no group keys
        are left
    KeyError
        for a label-like key that is neither in ``obj`` nor a level
        reference
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            # unwrap a length-one list-like level
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError('No group keys passed!')
                else:
                    raise ValueError('multiple levels only valid with '
                                     'MultiIndex')

            if isinstance(level, compat.string_types):
                # NOTE(review): this checks obj.index.name rather than
                # group_axis.name, regardless of ``axis`` - confirm intended
                if obj.index.name != level:
                    raise ValueError('level name {} is not the name of the '
                                     'index'.format(level))
            elif level > 0 or level < -1:
                raise ValueError(
                    'level > 0 or level < -1 only valid with MultiIndex')

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        # ``binner`` is not used afterwards
        binner, grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, [], obj
        else:
            return grouper, {key.key}, obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, BaseGrouper):
        return key, [], obj

    # In the future, a tuple key will always mean an actual key,
    # not an iterable of keys. In the meantime, we attempt to provide
    # a warning. We can assume that the user wanted a list of keys when
    # the key is not in the index. We just have to be careful with
    # unhashble elements of `key`. Any unhashable elements implies that
    # they wanted a list of keys.
    # https://github.com/pandas-dev/pandas/issues/18314
    is_tuple = isinstance(key, tuple)
    all_hashable = is_tuple and is_hashable(key)

    if is_tuple:
        if ((all_hashable and key not in obj and set(key).issubset(obj))
                or not all_hashable):
            # column names ('a', 'b') -> ['a', 'b']
            # arrays like (a, b) -> [a, b]
            msg = ("Interpreting tuple 'by' as a list of keys, rather than "
                   "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
                   "the future, a tuple will always mean a single key.")
            warnings.warn(msg, FutureWarning, stacklevel=5)
            key = list(key)

    # normalise to a list of keys; only a key passed as a list can be
    # reinterpreted as one array-like key further down
    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, Grouper) for g in keys)
    any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray))
                        for g in keys)

    # any failure of the membership tests counts as "not all in columns/index"
    try:
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(g in obj.columns or g in obj.index.names
                                       for g in keys)
        else:
            all_in_columns_index = False
    except Exception:
        all_in_columns_index = False

    # a plain list as long as the axis, whose items are not callables,
    # groupers, array-likes or column/index names, is taken as a single
    # array-like key
    if (not any_callable and not all_in_columns_index and
            not any_arraylike and not any_groupers and
            match_axis_length and level is None):
        keys = [com.asarray_tuplesafe(keys)]

    # pair every key with a level
    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings = []
    exclusions = []

    # if the actual grouper should be obj[key]
    def is_in_axis(key):
        # label-like keys are accepted as is; anything else must be
        # locatable in obj._data.items
        if not _is_label_like(key):
            try:
                obj._data.items.get_loc(key)
            except Exception:
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr):
        # identity check; any failure (e.g. no ``name`` attribute) means "no"
        try:
            return id(gpr) == id(obj[gpr.name])
        except Exception:
            return False

    # NOTE(review): the index ``i`` is not used in the loop body
    for i, (gpr, level) in enumerate(zip(keys, levels)):

        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis, name = True, gpr.name
            exclusions.append(name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr)
                in_axis, name, gpr = True, gpr, obj[gpr]
                exclusions.append(name)
            elif obj._is_level_reference(gpr):
                # the key names a level: group by that level instead
                in_axis, name, level, gpr = False, None, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.append(gpr.key)
            in_axis, name = False, None
        else:
            in_axis, name = False, None

        if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
            raise ValueError(
                ("Length of grouper ({len_gpr}) and axis ({len_axis})"
                 " must be same length"
                 .format(len_gpr=len(gpr), len_axis=obj.shape[axis])))

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (Grouping(group_axis,
                         gpr,
                         obj=obj,
                         name=name,
                         level=level,
                         sort=sort,
                         observed=observed,
                         in_axis=in_axis)
                if not isinstance(gpr, Grouping) else gpr)

        groupings.append(ping)

    if len(groupings) == 0:
        raise ValueError('No group keys passed!')

    # create the internals grouper
    grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
    return grouper, exclusions, obj
  504. def _is_label_like(val):
  505. return (isinstance(val, (compat.string_types, tuple)) or
  506. (val is not None and is_scalar(val)))
  507. def _convert_grouper(axis, grouper):
  508. if isinstance(grouper, dict):
  509. return grouper.get
  510. elif isinstance(grouper, Series):
  511. if grouper.index.equals(axis):
  512. return grouper._values
  513. else:
  514. return grouper.reindex(axis)._values
  515. elif isinstance(grouper, (list, Series, Index, np.ndarray)):
  516. if len(grouper) != len(axis):
  517. raise ValueError('Grouper and axis must be same length')
  518. return grouper
  519. else:
  520. return grouper