  1. """
  2. Define the SeriesGroupBy, DataFrameGroupBy, and PanelGroupBy
  3. classes that hold the groupby interfaces (and some implementations).
  4. These are user facing as the result of the ``df.groupby(...)`` operations,
  5. which here returns a DataFrameGroupBy object.
  6. """
  7. import collections
  8. import copy
  9. from functools import partial
  10. from textwrap import dedent
  11. import warnings
  12. import numpy as np
  13. from pandas._libs import Timestamp, lib
  14. import pandas.compat as compat
  15. from pandas.compat import lzip, map
  16. from pandas.compat.numpy import _np_version_under1p13
  17. from pandas.errors import AbstractMethodError
  18. from pandas.util._decorators import Appender, Substitution
  19. from pandas.core.dtypes.cast import maybe_downcast_to_dtype
  20. from pandas.core.dtypes.common import (
  21. ensure_int64, ensure_platform_int, is_bool, is_datetimelike,
  22. is_integer_dtype, is_interval_dtype, is_numeric_dtype, is_scalar)
  23. from pandas.core.dtypes.missing import isna, notna
  24. import pandas.core.algorithms as algorithms
  25. from pandas.core.arrays import Categorical
  26. from pandas.core.base import DataError, SpecificationError
  27. import pandas.core.common as com
  28. from pandas.core.frame import DataFrame
  29. from pandas.core.generic import NDFrame, _shared_docs
  30. from pandas.core.groupby import base
  31. from pandas.core.groupby.groupby import (
  32. GroupBy, _apply_docs, _transform_template)
  33. from pandas.core.index import CategoricalIndex, Index, MultiIndex
  34. import pandas.core.indexes.base as ibase
  35. from pandas.core.internals import BlockManager, make_block
  36. from pandas.core.panel import Panel
  37. from pandas.core.series import Series
  38. from pandas.plotting._core import boxplot_frame_groupby


class NDFrameGroupBy(GroupBy):

    def _iterate_slices(self):
        if self.axis == 0:
            # kludge
            if self._selection is None:
                slice_axis = self.obj.columns
            else:
                slice_axis = self._selection_list
            slicer = lambda x: self.obj[x]
        else:
            slice_axis = self.obj.index
            slicer = self.obj.xs

        for val in slice_axis:
            if val in self.exclusions:
                continue
            yield val, slicer(val)

    def _cython_agg_general(self, how, alt=None, numeric_only=True,
                            min_count=-1):
        new_items, new_blocks = self._cython_agg_blocks(
            how, alt=alt, numeric_only=numeric_only, min_count=min_count)
        return self._wrap_agged_blocks(new_items, new_blocks)

    def _wrap_agged_blocks(self, items, blocks):
        obj = self._obj_with_exclusions

        new_axes = list(obj._data.axes)

        # more kludge
        if self.axis == 0:
            new_axes[0], new_axes[1] = new_axes[1], self.grouper.result_index
        else:
            new_axes[self.axis] = self.grouper.result_index

        # Make sure block manager integrity check passes.
        assert new_axes[0].equals(items)
        new_axes[0] = items

        mgr = BlockManager(blocks, new_axes)

        new_obj = type(obj)(mgr)

        return self._post_process_cython_aggregate(new_obj)

    _block_agg_axis = 0

    def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
                           min_count=-1):
        # TODO: the actual managing of mgr_locs is a PITA
        # here, it should happen via BlockManager.combine

        data, agg_axis = self._get_data_to_aggregate()

        if numeric_only:
            data = data.get_numeric_data(copy=False)

        new_blocks = []
        new_items = []
        deleted_items = []
        for block in data.blocks:

            locs = block.mgr_locs.as_array
            try:
                result, _ = self.grouper.aggregate(
                    block.values, how, axis=agg_axis, min_count=min_count)
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions
                # try to python agg

                if alt is None:
                    # we cannot perform the operation
                    # in an alternate way, exclude the block
                    deleted_items.append(locs)
                    continue

                # call our grouper again with only this block
                from pandas.core.groupby.groupby import groupby

                obj = self.obj[data.items[locs]]
                s = groupby(obj, self.grouper)
                result = s.aggregate(lambda x: alt(x, axis=self.axis))
            finally:
                # see if we can cast the block back to the original dtype
                result = block._try_coerce_and_cast_result(result)
                newb = block.make_block(result)

            new_items.append(locs)
            new_blocks.append(newb)

        if len(new_blocks) == 0:
            raise DataError('No numeric types to aggregate')

        # reset the locs in the blocks to correspond to our
        # current ordering
        indexer = np.concatenate(new_items)
        new_items = data.items.take(np.sort(indexer))

        if len(deleted_items):

            # we need to adjust the indexer to account for the
            # items we have removed
            # really should be done in internals :<

            deleted = np.concatenate(deleted_items)
            ai = np.arange(len(data))
            mask = np.zeros(len(data))
            mask[deleted] = 1
            indexer = (ai - mask.cumsum())[indexer]

        offset = 0
        for b in new_blocks:
            loc = len(b.mgr_locs)
            b.mgr_locs = indexer[offset:(offset + loc)]
            offset += loc

        return new_items, new_blocks

    def _get_data_to_aggregate(self):
        obj = self._obj_with_exclusions
        if self.axis == 0:
            return obj.swapaxes(0, 1)._data, 1
        else:
            return obj._data, self.axis

    def _post_process_cython_aggregate(self, obj):
        # undoing kludge from below
        if self.axis == 0:
            obj = obj.swapaxes(0, 1)
        return obj

    def aggregate(self, arg, *args, **kwargs):
        _level = kwargs.pop('_level', None)
        result, how = self._aggregate(arg, _level=_level, *args, **kwargs)
        if how is None:
            return result

        if result is None:

            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                return self._python_agg_general(arg, *args, **kwargs)
            else:

                # try to treat as if we are passing a list
                try:
                    assert not args and not kwargs
                    result = self._aggregate_multiple_funcs(
                        [arg], _level=_level, _axis=self.axis)
                    result.columns = Index(
                        result.columns.levels[0],
                        name=self._selected_obj.columns.name)
                except Exception:
                    result = self._aggregate_generic(arg, *args, **kwargs)

        if not self.as_index:
            self._insert_inaxis_grouper_inplace(result)
            result.index = np.arange(len(result))

        return result._convert(datetime=True)

    agg = aggregate

    def _aggregate_generic(self, func, *args, **kwargs):
        if self.grouper.nkeys != 1:
            raise AssertionError('Number of keys must be 1')

        axis = self.axis
        obj = self._obj_with_exclusions

        result = {}
        if axis != obj._info_axis_number:
            try:
                for name, data in self:
                    result[name] = self._try_cast(func(data, *args, **kwargs),
                                                  data)
            except Exception:
                return self._aggregate_item_by_item(func, *args, **kwargs)
        else:
            for name in self.indices:
                try:
                    data = self.get_group(name, obj=obj)
                    result[name] = self._try_cast(func(data, *args, **kwargs),
                                                  data)
                except Exception:
                    wrapper = lambda x: func(x, *args, **kwargs)
                    result[name] = data.apply(wrapper, axis=axis)

        return self._wrap_generic_output(result, obj)

    def _wrap_aggregated_output(self, output, names=None):
        raise AbstractMethodError(self)

    def _aggregate_item_by_item(self, func, *args, **kwargs):
        # only for axis==0

        obj = self._obj_with_exclusions
        result = {}
        cannot_agg = []
        errors = None
        for item in obj:
            try:
                data = obj[item]
                colg = SeriesGroupBy(data, selection=item,
                                     grouper=self.grouper)
                result[item] = self._try_cast(
                    colg.aggregate(func, *args, **kwargs), data)
            except ValueError:
                cannot_agg.append(item)
                continue
            except TypeError as e:
                cannot_agg.append(item)
                errors = e
                continue

        result_columns = obj.columns
        if cannot_agg:
            result_columns = result_columns.drop(cannot_agg)

            # GH6337
            if not len(result_columns) and errors is not None:
                raise errors

        return DataFrame(result, columns=result_columns)

    def _decide_output_index(self, output, labels):
        if len(output) == len(labels):
            output_keys = labels
        else:
            output_keys = sorted(output)
            try:
                output_keys.sort()
            except Exception:  # pragma: no cover
                pass

            if isinstance(labels, MultiIndex):
                output_keys = MultiIndex.from_tuples(output_keys,
                                                     names=labels.names)

        return output_keys

    def _wrap_applied_output(self, keys, values, not_indexed_same=False):
        from pandas.core.index import _all_indexes_same
        from pandas.core.tools.numeric import to_numeric

        if len(keys) == 0:
            return DataFrame(index=keys)

        key_names = self.grouper.names

        # GH12824.
        def first_not_none(values):
            try:
                return next(com._not_none(*values))
            except StopIteration:
                return None

        v = first_not_none(values)

        if v is None:
            # GH9684. If all values are None, then this will throw an error.
            # We'd prefer it return an empty dataframe.
            return DataFrame()
        elif isinstance(v, DataFrame):
            return self._concat_objects(keys, values,
                                        not_indexed_same=not_indexed_same)
        elif self.grouper.groupings is not None:
            if len(self.grouper.groupings) > 1:
                key_index = self.grouper.result_index

            else:
                ping = self.grouper.groupings[0]
                if len(keys) == ping.ngroups:
                    key_index = ping.group_index
                    key_index.name = key_names[0]

                    key_lookup = Index(keys)
                    indexer = key_lookup.get_indexer(key_index)

                    # reorder the values
                    values = [values[i] for i in indexer]
                else:
                    key_index = Index(keys, name=key_names[0])

                # don't use the key indexer
                if not self.as_index:
                    key_index = None

            # make Nones an empty object
            v = first_not_none(values)
            if v is None:
                return DataFrame()
            elif isinstance(v, NDFrame):
                values = [
                    x if x is not None else
                    v._constructor(**v._construct_axes_dict())
                    for x in values
                ]

            v = values[0]

            if isinstance(v, (np.ndarray, Index, Series)):
                if isinstance(v, Series):
                    applied_index = self._selected_obj._get_axis(self.axis)
                    all_indexed_same = _all_indexes_same([
                        x.index for x in values
                    ])
                    singular_series = (len(values) == 1 and
                                       applied_index.nlevels == 1)

                    # GH3596
                    # provide a reduction (Frame -> Series) if groups are
                    # unique
                    if self.squeeze:
                        # assign the name to this series
                        if singular_series:
                            values[0].name = keys[0]

                            # GH2893
                            # we have series in the values array, we want to
                            # produce a series:
                            # if any of the sub-series are not indexed the
                            # same OR we don't have a multi-index and we
                            # have only a single value
                            return self._concat_objects(
                                keys, values, not_indexed_same=not_indexed_same
                            )

                        # still a series
                        # path added as of GH 5545
                        elif all_indexed_same:
                            from pandas.core.reshape.concat import concat
                            return concat(values)

                    if not all_indexed_same:
                        # GH 8467
                        return self._concat_objects(
                            keys, values, not_indexed_same=True,
                        )

                try:
                    if self.axis == 0:
                        # GH6124 if the list of Series have a consistent name,
                        # then propagate that name to the result.
                        index = v.index.copy()
                        if index.name is None:
                            # Only propagate the series name to the result
                            # if all series have a consistent name. If the
                            # series do not have a consistent name, do
                            # nothing.
                            names = {v.name for v in values}
                            if len(names) == 1:
                                index.name = list(names)[0]

                        # normally use vstack as it's faster than concat
                        # and if we have mi-columns
                        if (isinstance(v.index, MultiIndex) or
                                key_index is None or
                                isinstance(key_index, MultiIndex)):
                            stacked_values = np.vstack([
                                np.asarray(v) for v in values
                            ])
                            result = DataFrame(stacked_values, index=key_index,
                                               columns=index)
                        else:
                            # GH5788 instead of stacking; concat gets the
                            # dtypes correct
                            from pandas.core.reshape.concat import concat
                            result = concat(values, keys=key_index,
                                            names=key_index.names,
                                            axis=self.axis).unstack()
                            result.columns = index
                    else:
                        stacked_values = np.vstack([np.asarray(v)
                                                    for v in values])
                        result = DataFrame(stacked_values.T, index=v.index,
                                           columns=key_index)
                except (ValueError, AttributeError):
                    # GH1738: values is a list of arrays of unequal lengths;
                    # fall through to the outer else clause
                    return Series(values, index=key_index,
                                  name=self._selection_name)

                # if we have date/time like in the original, then coerce dates
                # as we are stacking can easily have object dtypes here
                so = self._selected_obj
                if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()):
                    result = result.apply(
                        lambda x: to_numeric(x, errors='ignore'))
                    date_cols = self._selected_obj.select_dtypes(
                        include=['datetime', 'timedelta']).columns
                    date_cols = date_cols.intersection(result.columns)
                    result[date_cols] = (result[date_cols]
                                         ._convert(datetime=True,
                                                   coerce=True))
                else:
                    result = result._convert(datetime=True)

                return self._reindex_output(result)

            # values are not series or array-like but scalars
            else:
                # only coerce dates if we find at least 1 datetime
                coerce = any(isinstance(x, Timestamp) for x in values)
                # self._selection_name not passed through to Series as the
                # result should not take the name of original selection
                # of columns
                return (Series(values, index=key_index)
                        ._convert(datetime=True,
                                  coerce=coerce))

        else:
            # Handle cases like BinGrouper
            return self._concat_objects(keys, values,
                                        not_indexed_same=not_indexed_same)

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        path = None
        for name, group in gen:
            object.__setattr__(group, 'name', name)

            if path is None:
                # Try slow path and fast path.
                try:
                    path, res = self._choose_path(fast_path, slow_path, group)
                except TypeError:
                    return self._transform_item_by_item(obj, fast_path)
                except ValueError:
                    msg = 'transform must return a scalar value for each group'
                    raise ValueError(msg)
            else:
                res = path(group)

            if isinstance(res, Series):

                # we need to broadcast across the
                # other dimension; this will preserve dtypes
                # GH14457
                if not np.prod(group.shape):
                    continue
                elif res.index.is_(obj.index):
                    r = concat([res] * len(group.columns), axis=1)
                    r.columns = group.columns
                    r.index = group.index
                else:
                    r = DataFrame(
                        np.concatenate([res.values] * len(group.index)
                                       ).reshape(group.shape),
                        columns=group.columns, index=group.index)

                applied.append(r)
            else:
                applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        concatenated = concat(applied, join_axes=[concat_index],
                              axis=self.axis, verify_integrity=False)
        return self._set_result_index_ordered(concatenated)

    @Substitution(klass='DataFrame', selected='')
    @Appender(_transform_template)
    def transform(self, func, *args, **kwargs):

        # optimized transforms
        func = self._is_cython_func(func) or func

        if isinstance(func, compat.string_types):
            if func in base.cython_transforms:
                # cythonized transform
                return getattr(self, func)(*args, **kwargs)
            else:
                # cythonized aggregation and merge
                result = getattr(self, func)(*args, **kwargs)
        else:
            return self._transform_general(func, *args, **kwargs)

        # a reduction transform
        if not isinstance(result, DataFrame):
            return self._transform_general(func, *args, **kwargs)

        obj = self._obj_with_exclusions

        # nuisance columns
        if not result.columns.equals(obj.columns):
            return self._transform_general(func, *args, **kwargs)

        return self._transform_fast(result, obj, func)

    def _transform_fast(self, result, obj, func_nm):
        """
        Fast transform path for aggregations
        """
        # if there were groups with no observations (Categorical only?)
        # try casting data to original dtype
        cast = self._transform_should_cast(func_nm)

        # for each col, reshape to the size of the original frame
        # by a take operation
        ids, _, ngroup = self.grouper.group_info
        output = []
        for i, _ in enumerate(result.columns):
            res = algorithms.take_1d(result.iloc[:, i].values, ids)
            if cast:
                res = self._try_cast(res, obj.iloc[:, i])
            output.append(res)

        return DataFrame._from_arrays(output, columns=result.columns,
                                      index=obj.index)
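
    # Illustrative sketch (added comment, standalone numbers) of the
    # take-based broadcast above: each original row receives its group's
    # aggregate via its group position in ``ids``.
    #
    #   >>> agg = np.array([10., 20.])         # one aggregate per group
    #   >>> ids = np.array([0, 0, 1, 0, 1])    # row -> group position
    #   >>> agg.take(ids)
    #   array([10., 10., 20., 10., 20.])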

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, compat.string_types):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis)
        return fast_path, slow_path

    def _choose_path(self, fast_path, slow_path, group):
        path = slow_path
        res = slow_path(group)

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)

            # verify fast path does not change columns (and names), otherwise
            # its results cannot be joined with those of the slow path
            if res_fast.columns != group.columns:
                return path, res
            # verify numerical equality with the slow path
            if res.shape == res_fast.shape:
                res_r = res.values.ravel()
                res_fast_r = res_fast.values.ravel()
                mask = notna(res_r)
                if (res_r[mask] == res_fast_r[mask]).all():
                    path = fast_path
        except Exception:
            pass
        return path, res
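
    # Note added for clarity: the fast path calls ``func`` once on the whole
    # group, while the slow path applies it column-by-column via ``apply``;
    # the two only agree for genuinely column-wise functions. The column,
    # shape, and value comparisons above decide, using the first group,
    # whether the cheaper fast path is safe to reuse for later groups.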

    def _transform_item_by_item(self, obj, wrapper):
        # iterate through columns
        output = {}
        inds = []
        for i, col in enumerate(obj):
            try:
                output[col] = self[col].transform(wrapper)
                inds.append(i)
            except Exception:
                pass

        if len(output) == 0:  # pragma: no cover
            raise TypeError('Transform function invalid for data types')

        columns = obj.columns
        if len(output) < len(obj.columns):
            columns = columns.take(inds)

        return DataFrame(output, index=obj.index, columns=columns)

    def filter(self, func, dropna=True, *args, **kwargs):  # noqa
        """
        Return a copy of a DataFrame excluding elements from groups that
        do not satisfy the boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Function to apply to each subframe. Should return True or False.
        dropna : Drop groups that do not pass the filter. True by default;
            if False, groups that evaluate False are filled with NaNs.

        Returns
        -------
        filtered : DataFrame

        Notes
        -----
        Each subframe is endowed with the attribute 'name' in case you need
        to know which group you are working on.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """

        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            object.__setattr__(group, 'name', name)

            res = func(group, *args, **kwargs)
            try:
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                if res and notna(res):
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError("filter function returned a %s, "
                                "but expected a scalar bool" %
                                type(res).__name__)

        return self._apply_filter(indices, dropna)


class SeriesGroupBy(GroupBy):
    #
    # Make class defs of attributes on SeriesGroupBy whitelist

    _apply_whitelist = base.series_apply_whitelist
    for _def_str in base.whitelist_method_generator(
            GroupBy, Series, _apply_whitelist):
        exec(_def_str)

    @property
    def _selection_name(self):
        """
        since we are a series, we by definition only have
        a single name, but may be the result of a selection or
        the name of our object
        """
        if self._selection is None:
            return self.obj.name
        else:
            return self._selection

    _agg_see_also_doc = dedent("""
    See Also
    --------
    pandas.Series.groupby.apply
    pandas.Series.groupby.transform
    pandas.Series.aggregate
    """)

    _agg_examples_doc = dedent("""
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4
    """)

    @Appender(_apply_docs['template']
              .format(input='series',
                      examples=_apply_docs['series_examples']))
    def apply(self, func, *args, **kwargs):
        return super(SeriesGroupBy, self).apply(func, *args, **kwargs)

    @Substitution(see_also=_agg_see_also_doc,
                  examples=_agg_examples_doc,
                  versionadded='',
                  klass='Series',
                  axis='')
    @Appender(_shared_docs['aggregate'])
    def aggregate(self, func_or_funcs, *args, **kwargs):
        _level = kwargs.pop('_level', None)
        if isinstance(func_or_funcs, compat.string_types):
            return getattr(self, func_or_funcs)(*args, **kwargs)

        if isinstance(func_or_funcs, compat.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            ret = self._aggregate_multiple_funcs(func_or_funcs,
                                                 (_level or 0) + 1)
        else:
            cyfunc = self._is_cython_func(func_or_funcs)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)

            try:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)
            except Exception:
                result = self._aggregate_named(func_or_funcs, *args, **kwargs)

            index = Index(sorted(result), name=self.grouper.names[0])
            ret = Series(result, index=index)

        if not self.as_index:  # pragma: no cover
            print('Warning, ignoring as_index=True')

        # _level is handled at a higher level
        if not _level and isinstance(ret, dict):
            from pandas import concat
            ret = concat(ret, axis=1)
        return ret

    agg = aggregate

    def _aggregate_multiple_funcs(self, arg, _level):
        if isinstance(arg, dict):

            # show the deprecation, but only if we
            # have not shown a higher level one
            # GH 15931
            if isinstance(self._selected_obj, Series) and _level <= 1:
                warnings.warn(
                    ("using a dict on a Series for aggregation\n"
                     "is deprecated and will be removed in a future "
                     "version"),
                    FutureWarning, stacklevel=3)

            columns = list(arg.keys())
            arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x
                   for x in arg]

            # indicated column order
            columns = lzip(*arg)[0]
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                if isinstance(f, compat.string_types):
                    columns.append(f)
                else:
                    # protect against callables without names
                    columns.append(com.get_callable_name(f))
            arg = lzip(columns, arg)

        results = {}
        for name, func in arg:
            obj = self
            if name in results:
                raise SpecificationError(
                    'Function names must be unique, found multiple named '
                    '{}'.format(name))

            # reset the cache so that we
            # only include the named selection
            if name in self._selected_obj:
                obj = copy.copy(obj)
                obj._reset_cache()
                obj._selection = name
            results[name] = obj.aggregate(func)

        if any(isinstance(x, DataFrame) for x in compat.itervalues(results)):
            # let higher level handle
            if _level:
                return results

        return DataFrame(results, columns=columns)
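
    # Illustrative usage of the naming logic above (hypothetical data):
    # string functions keep their own name, other callables fall back to
    # ``com.get_callable_name``.
    #
    #   >>> s = pd.Series([1, 2, 3, 4]).groupby([1, 1, 2, 2])
    #   >>> s.agg(['min', np.max])   # second column named from np.max
    #      min  amax
    #   1    1     2
    #   2    3     4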

    def _wrap_output(self, output, index, names=None):
        """ common agg/transform wrapping logic """
        output = output[self._selection_name]

        if names is not None:
            return DataFrame(output, index=index, columns=names)
        else:
            name = self._selection_name
            if name is None:
                name = self._selected_obj.name
            return Series(output, index=index, name=name)

    def _wrap_aggregated_output(self, output, names=None):
        return self._wrap_output(output=output,
                                 index=self.grouper.result_index,
                                 names=names)

    def _wrap_transformed_output(self, output, names=None):
        return self._wrap_output(output=output,
                                 index=self.obj.index,
                                 names=names)

    def _wrap_applied_output(self, keys, values, not_indexed_same=False):
        if len(keys) == 0:
            # GH #6265
            return Series([], name=self._selection_name, index=keys)

        def _get_index():
            if self.grouper.nkeys > 1:
                index = MultiIndex.from_tuples(keys, names=self.grouper.names)
            else:
                index = Index(keys, name=self.grouper.names[0])
            return index

        if isinstance(values[0], dict):
            # GH #823
            index = _get_index()
            result = DataFrame(values, index=index).stack()
            result.name = self._selection_name
            return result

        if isinstance(values[0], (Series, dict)):
            return self._concat_objects(keys, values,
                                        not_indexed_same=not_indexed_same)
        elif isinstance(values[0], DataFrame):
            # possible that Series -> DataFrame by applied function
            return self._concat_objects(keys, values,
                                        not_indexed_same=not_indexed_same)
        else:
            # GH #6265
            return Series(values, index=_get_index(),
                          name=self._selection_name)

    def _aggregate_named(self, func, *args, **kwargs):
        result = {}

        for name, group in self:
            group.name = name
            output = func(group, *args, **kwargs)
            if isinstance(output, (Series, Index, np.ndarray)):
                raise Exception('Must produce aggregated value')
            result[name] = self._try_cast(output, group)

        return result

    @Substitution(klass='Series', selected='A.')
    @Appender(_transform_template)
    def transform(self, func, *args, **kwargs):
        func = self._is_cython_func(func) or func

        # if string function
        if isinstance(func, compat.string_types):
            if func in base.cython_transforms:
                # cythonized transform
                return getattr(self, func)(*args, **kwargs)
            else:
                # cythonized aggregation and merge
                return self._transform_fast(
                    lambda: getattr(self, func)(*args, **kwargs), func)

        # regular transform
        klass = self._selected_obj.__class__
        results = []
        wrapper = lambda x: func(x, *args, **kwargs)
        for name, group in self:
            object.__setattr__(group, 'name', name)
            res = wrapper(group)

            if hasattr(res, 'values'):
                res = res.values

            indexer = self._get_index(name)
            s = klass(res, indexer)
            results.append(s)

        from pandas.core.reshape.concat import concat
        result = concat(results).sort_index()

        # we will only try to coerce the result type if
        # we have a numeric dtype, as these are *always* udfs
        # (the cython functions take a different path and do
        # their own casting)
        dtype = self._selected_obj.dtype
        if is_numeric_dtype(dtype):
            result = maybe_downcast_to_dtype(result, dtype)

        result.name = self._selected_obj.name
        result.index = self._selected_obj.index
        return result

    def _transform_fast(self, func, func_nm):
        """
        fast version of transform, only applicable to
        builtin/cythonizable functions
        """
        if isinstance(func, compat.string_types):
            func = getattr(self, func)

        ids, _, ngroup = self.grouper.group_info
        cast = self._transform_should_cast(func_nm)
        out = algorithms.take_1d(func()._values, ids)
        if cast:
            out = self._try_cast(out, self.obj)
        return Series(out, index=self.obj.index, name=self.obj.name)

    def filter(self, func, dropna=True, *args, **kwargs):  # noqa
        """
        Return a copy of a Series excluding elements from groups that
        do not satisfy the boolean criterion specified by func.

        Parameters
        ----------
        func : function
            To apply to each group. Should return True or False.
        dropna : Drop groups that do not pass the filter. True by default;
            if False, groups that evaluate False are filled with NaNs.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64

        Returns
        -------
        filtered : Series
        """
        if isinstance(func, compat.string_types):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x, *args, **kwargs):
            b = wrapper(x, *args, **kwargs)
            return b and notna(b)

        try:
            indices = [self._get_index(name) for name, group in self
                       if true_and_notna(group)]
        except ValueError:
            raise TypeError("the filter must return a boolean result")
        except TypeError:
            raise TypeError("the filter must return a boolean result")

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna=True):
        """ Returns number of unique elements in the group """
        ids, _, _ = self.grouper.group_info

        val = self.obj.get_values()

        try:
            sorter = np.lexsort((val, ids))
        except TypeError:  # catches object dtypes
            msg = 'val.dtype must be object, got {}'.format(val.dtype)
            assert val.dtype == object, msg
            val, _ = algorithms.factorize(val, sort=False)
            sorter = np.lexsort((val, ids))
            _isna = lambda a: a == -1
        else:
            _isna = isna

        ids, val = ids[sorter], val[sorter]

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, val[1:] != val[:-1]]

        # 1st item of each group is a new unique observation
        mask = _isna(val)
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype('int64', copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]

        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            res[ids[idx]] = out

        return Series(res,
                      index=ri,
                      name=self._selection_name)
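
    # Worked micro-example of the reduceat trick above (standalone
    # numbers): with sorted ids = [0, 0, 1, 1, 1] and val = [3, 3, 5, 5, 7]:
    #   idx = [0, 2]               # positions where the group id changes
    #   inc = [1, 0, 1, 0, 1]      # 1 where the sorted value changes
    #   np.add.reduceat(inc, idx)  # -> [1, 2], unique count per group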

    @Appender(Series.describe.__doc__)
    def describe(self, **kwargs):
        result = self.apply(lambda x: x.describe(**kwargs))
        if self.axis == 1:
            return result.T
        return result.unstack()

    def value_counts(self, normalize=False, sort=True, ascending=False,
                     bins=None, dropna=True):

        from pandas.core.reshape.tile import cut
        from pandas.core.reshape.merge import _get_join_indexers

        if bins is not None and not np.iterable(bins):
            # scalar bins cannot be done at top level
            # in a backward compatible way
            return self.apply(Series.value_counts,
                              normalize=normalize,
                              sort=sort,
                              ascending=ascending,
                              bins=bins)

        ids, _, _ = self.grouper.group_info
        val = self.obj.get_values()

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:

            # lab is a Categorical with categories an IntervalIndex
            lab = cut(Series(val), bins, include_lowest=True)
            lev = lab.cat.categories
            lab = lev.take(lab.cat.codes)
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab):
            # TODO: should we do this inside II?
            sorter = np.lexsort((lab.left, lab.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
        names = self.grouper.names + [self._selection_name]

        if dropna:
            mask = labels[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, labels = out[mask], [label[mask] for label in labels]

        if normalize:
            out = out.astype('float')
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, labels[-1] = out[sorter], labels[-1][sorter]

        if bins is None:
            mi = MultiIndex(levels=levels, codes=labels, names=names,
                            verify_integrity=False)

            if is_integer_dtype(out):
                out = ensure_int64(out)
            return Series(out, index=mi, name=self._selection_name)

        # for compat. with libgroupby.value_counts need to ensure every
        # bin is present at every index level, null filled with zeros
        diff = np.zeros(len(out), dtype='bool')
        for lab in labels[:-1]:
            diff |= np.r_[True, lab[1:] != lab[:-1]]

        ncat, nbin = diff.sum(), len(levels[-1])

        left = [np.repeat(np.arange(ncat), nbin),
                np.tile(np.arange(nbin), ncat)]

        right = [diff.cumsum() - 1, labels[-1]]

        _, idx = _get_join_indexers(left, right, sort=False, how='left')
        out = np.where(idx != -1, out[idx], 0)

        if sort:
            sorter = np.lexsort((out if ascending else -out, left[0]))
            out, left[-1] = out[sorter], left[-1][sorter]

        # build the multi-index w/ full levels
        codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
        codes.append(left[-1])

        mi = MultiIndex(levels=levels, codes=codes, names=names,
                        verify_integrity=False)

        if is_integer_dtype(out):
            out = ensure_int64(out)
        return Series(out, index=mi, name=self._selection_name)
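
    # Worked micro-example of the counting core above (standalone numbers):
    # after masking and lexsorting, ids = [0, 0, 0, 1] with lab = [0, 0, 1, 1]
    # give idx = [0, 3] (group starts) and inc = [True, False, True, True]
    # (new (group, value) pairs), so
    #   np.diff(np.nonzero(np.r_[inc, True])[0]) -> [2, 1, 1]
    # i.e. value 0 appears twice in group 0, value 1 once in each group.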

    def count(self):
        """ Compute count of group, excluding missing values """
        ids, _, ngroups = self.grouper.group_info
        val = self.obj.get_values()

        mask = (ids != -1) & ~isna(val)
        ids = ensure_platform_int(ids)
        minlength = ngroups or (None if _np_version_under1p13 else 0)
        out = np.bincount(ids[mask], minlength=minlength)

        return Series(out,
                      index=self.grouper.result_index,
                      name=self._selection_name,
                      dtype='int64')
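
    # Illustrative sketch of the bincount above (standalone numbers, not
    # part of the pandas API):
    #
    #   >>> ids = np.array([0, 0, 1, -1])            # -1: null group key
    #   >>> val = np.array([1.0, np.nan, 2.0, 3.0])
    #   >>> mask = (ids != -1) & ~np.isnan(val)
    #   >>> np.bincount(ids[mask], minlength=2)
    #   array([1, 1])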

    def _apply_to_column_groupbys(self, func):
        """ return a pass thru """
        return func(self)

    def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):
        """Calculate pct_change of each value to previous entry in group"""
        # TODO: Remove this conditional when #23918 is fixed
        if freq:
            return self.apply(lambda x: x.pct_change(periods=periods,
                                                     fill_method=fill_method,
                                                     limit=limit, freq=freq))
        filled = getattr(self, fill_method)(limit=limit)
        fill_grp = filled.groupby(self.grouper.labels)
        shifted = fill_grp.shift(periods=periods, freq=freq)

        return (filled / shifted) - 1
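
    # Minimal usage sketch (hypothetical data): the shift that defines the
    # "previous entry" never crosses a group boundary, which is exactly
    # what grouping the filled values before shifting achieves.
    #
    #   >>> s = pd.Series([1, 2, 4, 10])
    #   >>> s.groupby([1, 1, 2, 2]).pct_change()
    #   0    NaN
    #   1    1.0
    #   2    NaN
    #   3    1.5
    #   dtype: float64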


class DataFrameGroupBy(NDFrameGroupBy):
    _apply_whitelist = base.dataframe_apply_whitelist

    #
    # Make class defs of attributes on DataFrameGroupBy whitelist.
    for _def_str in base.whitelist_method_generator(
            GroupBy, DataFrame, _apply_whitelist):
        exec(_def_str)

    _block_agg_axis = 1

    _agg_see_also_doc = dedent("""
    See Also
    --------
    pandas.DataFrame.groupby.apply
    pandas.DataFrame.groupby.transform
    pandas.DataFrame.aggregate
    """)

    _agg_examples_doc = dedent("""
    Examples
    --------

    >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
    ...                    'B': [1, 2, 3, 4],
    ...                    'C': np.random.randn(4)})

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590716
    2   3   4  0.704907
    """)

    @Substitution(see_also=_agg_see_also_doc,
                  examples=_agg_examples_doc,
                  versionadded='',
                  klass='DataFrame',
                  axis='')
    @Appender(_shared_docs['aggregate'])
    def aggregate(self, arg, *args, **kwargs):
        return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)

    agg = aggregate

    def _gotitem(self, key, ndim, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : 1,2
            requested ndim of result
        subset : object, default None
            subset to act on
        """

        if ndim == 2:
            if subset is None:
                subset = self.obj
            return DataFrameGroupBy(subset, self.grouper, selection=key,
                                    grouper=self.grouper,
                                    exclusions=self.exclusions,
                                    as_index=self.as_index,
                                    observed=self.observed)
        elif ndim == 1:
            if subset is None:
                subset = self.obj[key]
            return SeriesGroupBy(subset, selection=key,
                                 grouper=self.grouper)

        raise AssertionError("invalid ndim for _gotitem")

    def _wrap_generic_output(self, result, obj):
        result_index = self.grouper.levels[0]

        if self.axis == 0:
            return DataFrame(result, index=obj.columns,
                             columns=result_index).T
        else:
            return DataFrame(result, index=obj.index,
                             columns=result_index)

    def _get_data_to_aggregate(self):
        obj = self._obj_with_exclusions
        if self.axis == 1:
            return obj.T._data, 1
        else:
            return obj._data, 1

    def _insert_inaxis_grouper_inplace(self, result):
        # zip in reverse so we can always insert at loc 0
        izip = zip(* map(reversed, (
            self.grouper.names,
            self.grouper.get_group_levels(),
            [grp.in_axis for grp in self.grouper.groupings])))

        for name, lev, in_axis in izip:
            if in_axis:
                result.insert(0, name, lev)
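
    # Explanatory note (hypothetical names): for in-axis groupers
    # ['a', 'b'], iterating the reversed triples and inserting each at
    # position 0 yields column order ['a', 'b', <data columns...>];
    # reversing twice restores the original grouper order.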

    def _wrap_aggregated_output(self, output, names=None):
        agg_axis = 0 if self.axis == 1 else 1
        agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

        output_keys = self._decide_output_index(output, agg_labels)

        if not self.as_index:
            result = DataFrame(output, columns=output_keys)
            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
        else:
            index = self.grouper.result_index
            result = DataFrame(output, index=index, columns=output_keys)

        if self.axis == 1:
            result = result.T

        return self._reindex_output(result)._convert(datetime=True)

    def _wrap_transformed_output(self, output, names=None):
        return DataFrame(output, index=self.obj.index)

    def _wrap_agged_blocks(self, items, blocks):
        if not self.as_index:
            index = np.arange(blocks[0].values.shape[-1])
            mgr = BlockManager(blocks, [items, index])
            result = DataFrame(mgr)

            self._insert_inaxis_grouper_inplace(result)
            result = result._consolidate()
        else:
            index = self.grouper.result_index
            mgr = BlockManager(blocks, [items, index])
            result = DataFrame(mgr)

        if self.axis == 1:
            result = result.T

        return self._reindex_output(result)._convert(datetime=True)

    def _reindex_output(self, result):
        """
        If we have categorical groupers, then we want to make sure that
        we have a fully re-indexed output to the levels. These may have
        not participated in the groupings (e.g. may have all been
        nan groups);

        This can re-expand the output space
        """

        # we need to re-expand the output space to accommodate all values
        # whether observed or not in the cartesian product of our groups
        groupings = self.grouper.groupings
        if groupings is None:
            return result

        elif len(groupings) == 1:
            return result

        # if we only care about the observed values
        # we are done
        elif self.observed:
            return result

        # reindexing only applies to a Categorical grouper
        elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
                     for ping in groupings):
            return result

        levels_list = [ping.group_index for ping in groupings]
        index, _ = MultiIndex.from_product(
            levels_list, names=self.grouper.names).sortlevel()

        if self.as_index:
            d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
            return result.reindex(**d)

        # GH 13204
        # Here, the categorical in-axis groupers, which need to be fully
        # expanded, are columns in `result`. An idea is to do:
        # result = result.set_index(self.grouper.names)
        #                .reindex(index).reset_index()
        # but special care has to be taken because of possible not-in-axis
        # groupers.
        # So, we manually select and drop the in-axis grouper columns,
        # reindex `result`, and then reset the in-axis grouper columns.

        # Select in-axis groupers
        in_axis_grps = [(i, ping.name) for (i, ping)
                        in enumerate(groupings) if ping.in_axis]
        g_nums, g_names = zip(*in_axis_grps)

        result = result.drop(labels=list(g_names), axis=1)

        # Set a temp index and reindex (possibly expanding)
        result = result.set_index(self.grouper.result_index
                                  ).reindex(index, copy=False)

        # Reset in-axis grouper columns
        # (using level numbers `g_nums` because level names may not be unique)
        result = result.reset_index(level=g_nums)

        return result.reset_index(drop=True)
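
    # Illustrative sketch of the expansion above (hypothetical frame): with
    # two categorical groupers whose levels are ['a', 'b'] and [1, 2] but
    # with only ('a', 1) and ('b', 2) observed, the reindex against
    # MultiIndex.from_product inserts the missing ('a', 2) and ('b', 1)
    # rows (filled with NaN), so the result covers the full cartesian
    # product of the group levels.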

    def _iterate_column_groupbys(self):
        for i, colname in enumerate(self._selected_obj.columns):
            yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
                                         selection=colname,
                                         grouper=self.grouper,
                                         exclusions=self.exclusions)

    def _apply_to_column_groupbys(self, func):
        from pandas.core.reshape.concat import concat
        return concat(
            (func(col_groupby) for _, col_groupby
             in self._iterate_column_groupbys()),
            keys=self._selected_obj.columns, axis=1)

    def _fill(self, direction, limit=None):
        """Overridden method to join grouped columns in output"""
        res = super(DataFrameGroupBy, self)._fill(direction, limit=limit)
        output = collections.OrderedDict(
            (grp.name, grp.grouper) for grp in self.grouper.groupings)

        from pandas import concat
        return concat((self._wrap_transformed_output(output), res), axis=1)

    def count(self):
        """ Compute count of group, excluding missing values """
        from pandas.core.dtypes.missing import _isna_ndarraylike as _isna

        data, _ = self._get_data_to_aggregate()
        ids, _, ngroups = self.grouper.group_info
        mask = ids != -1

        val = ((mask & ~_isna(np.atleast_2d(blk.get_values())))
               for blk in data.blocks)
        loc = (blk.mgr_locs for blk in data.blocks)

        counter = partial(
            lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1)
        blk = map(make_block, map(counter, val), loc)

        return self._wrap_agged_blocks(data.items, list(blk))
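
    # Sketch of the blockwise count above (standalone numbers; my reading
    # of lib.count_level_2d): for a block whose validity mask is
    #   [[True, False, True],
    #    [True, True,  True]]
    # with labels ids = [0, 0, 1], summing the mask per (item, group)
    # gives [[1, 1], [2, 1]], i.e. per-column, per-group non-missing
    # counts, which are then rebuilt into blocks at the original locs.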

    def nunique(self, dropna=True):
        """
        Return DataFrame with number of distinct observations per group for
        each column.

        .. versionadded:: 0.20.0

        Parameters
        ----------
        dropna : boolean, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique : DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1 value2
        0  spam       1      a
        1   egg       5      b
        2   egg       5      b
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y

        >>> df.groupby('id').nunique()
              id  value1  value2
        id
        egg    1       1       1
        ham    1       1       2
        spam   1       2       1

        # check for rows with the same id but conflicting values
        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1 value2
        0  spam       1      a
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y
        """

        obj = self._selected_obj

        def groupby_series(obj, col=None):
            return SeriesGroupBy(obj,
                                 selection=col,
                                 grouper=self.grouper).nunique(dropna=dropna)

        if isinstance(obj, Series):
            results = groupby_series(obj)
        else:
            from pandas.core.reshape.concat import concat
            results = [groupby_series(obj[col], col) for col in obj.columns]
            results = concat(results, axis=1)

        if not self.as_index:
            results.index = ibase.default_index(len(results))
        return results

    boxplot = boxplot_frame_groupby


class PanelGroupBy(NDFrameGroupBy):

    def aggregate(self, arg, *args, **kwargs):
        return super(PanelGroupBy, self).aggregate(arg, *args, **kwargs)

    agg = aggregate

    def _iterate_slices(self):
        if self.axis == 0:
            # kludge
            if self._selection is None:
                slice_axis = self._selected_obj.items
            else:
                slice_axis = self._selection_list
            slicer = lambda x: self._selected_obj[x]
        else:
            raise NotImplementedError("axis other than 0 is not supported")

        for val in slice_axis:
            if val in self.exclusions:
                continue

            yield val, slicer(val)

    def aggregate(self, arg, *args, **kwargs):
        """
        Aggregate using input function or dict of {column -> function}

        Parameters
        ----------
        arg : function or dict
            Function to use for aggregating groups. If a function, must either
            work when passed a Panel or when passed to Panel.apply. If
            passed a dict, the keys must be DataFrame column names.

        Returns
        -------
        aggregated : Panel
        """
        if isinstance(arg, compat.string_types):
            return getattr(self, arg)(*args, **kwargs)

        return self._aggregate_generic(arg, *args, **kwargs)

    def _wrap_generic_output(self, result, obj):
        if self.axis == 0:
            new_axes = list(obj.axes)
            new_axes[0] = self.grouper.result_index
        elif self.axis == 1:
            x, y, z = obj.axes
            new_axes = [self.grouper.result_index, z, x]
        else:
            x, y, z = obj.axes
            new_axes = [self.grouper.result_index, y, x]

        result = Panel._from_axes(result, new_axes)

        if self.axis == 1:
            result = result.swapaxes(0, 1).swapaxes(0, 2)
        elif self.axis == 2:
            result = result.swapaxes(0, 2)

        return result

    def _aggregate_item_by_item(self, func, *args, **kwargs):
        obj = self._obj_with_exclusions
        result = {}

        if self.axis > 0:
            for item in obj:
                try:
                    itemg = DataFrameGroupBy(obj[item],
                                             axis=self.axis - 1,
                                             grouper=self.grouper)
                    result[item] = itemg.aggregate(func, *args, **kwargs)
                except (ValueError, TypeError):
                    raise
            new_axes = list(obj.axes)
            new_axes[self.axis] = self.grouper.result_index
            return Panel._from_axes(result, new_axes)
        else:
            raise ValueError("axis value must be greater than 0")

    def _wrap_aggregated_output(self, output, names=None):
        raise AbstractMethodError(self)