resample.py 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766
  1. import copy
  2. from datetime import timedelta
  3. from textwrap import dedent
  4. import warnings
  5. import numpy as np
  6. from pandas._libs import lib
  7. from pandas._libs.tslibs import NaT, Timestamp
  8. from pandas._libs.tslibs.frequencies import is_subperiod, is_superperiod
  9. from pandas._libs.tslibs.period import IncompatibleFrequency
  10. import pandas.compat as compat
  11. from pandas.compat.numpy import function as nv
  12. from pandas.errors import AbstractMethodError
  13. from pandas.util._decorators import Appender, Substitution
  14. from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
  15. import pandas as pd
  16. import pandas.core.algorithms as algos
  17. from pandas.core.generic import _shared_docs
  18. from pandas.core.groupby.base import GroupByMixin
  19. from pandas.core.groupby.generic import PanelGroupBy, SeriesGroupBy
  20. from pandas.core.groupby.groupby import (
  21. GroupBy, _GroupBy, _pipe_template, groupby)
  22. from pandas.core.groupby.grouper import Grouper
  23. from pandas.core.groupby.ops import BinGrouper
  24. from pandas.core.indexes.datetimes import DatetimeIndex, date_range
  25. from pandas.core.indexes.period import PeriodIndex
  26. from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
  27. from pandas.tseries.frequencies import to_offset
  28. from pandas.tseries.offsets import DateOffset, Day, Nano, Tick
  29. _shared_docs_kwargs = dict()
  30. class Resampler(_GroupBy):
  31. """
  32. Class for resampling datetimelike data, a groupby-like operation.
  33. See aggregate, transform, and apply functions on this object.
  34. It's easiest to use obj.resample(...) to use Resampler.
  35. Parameters
  36. ----------
  37. obj : pandas object
  38. groupby : a TimeGrouper object
  39. axis : int, default 0
  40. kind : str or None
  41. 'period', 'timestamp' to override default index treatement
  42. Returns
  43. -------
  44. a Resampler of the appropriate type
  45. Notes
  46. -----
  47. After resampling, see aggregate, apply, and transform functions.
  48. """
  49. # to the groupby descriptor
  50. _attributes = ['freq', 'axis', 'closed', 'label', 'convention',
  51. 'loffset', 'base', 'kind']
    def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs):
        """
        Parameters
        ----------
        obj : pandas object
            The object being resampled.
        groupby : TimeGrouper, optional
            Describes the resampling frequency / binning.
        axis : int, default 0
            Axis to resample on.
        kind : str or None
            'period' or 'timestamp' to override default index treatment.
        """
        self.groupby = groupby
        self.keys = None
        self.sort = True
        self.axis = axis
        self.kind = kind
        self.squeeze = False
        self.group_keys = True
        self.as_index = True
        self.exclusions = set()
        # binner/grouper are created lazily and cached by _set_binner
        self.binner = None
        self.grouper = None

        if self.groupby is not None:
            # attach (a consolidated copy of) obj to the grouper, sorted on
            # the resampling axis
            self.groupby._set_grouper(self._convert_obj(obj), sort=True)
  66. def __unicode__(self):
  67. """
  68. Provide a nice str repr of our rolling object.
  69. """
  70. attrs = ["{k}={v}".format(k=k, v=getattr(self.groupby, k))
  71. for k in self._attributes if
  72. getattr(self.groupby, k, None) is not None]
  73. return "{klass} [{attrs}]".format(klass=self.__class__.__name__,
  74. attrs=', '.join(attrs))
    def __getattr__(self, attr):
        # Dispatch order matters: internal names first, then grouper
        # attributes (freq, axis, closed, ...), then column selection on
        # the underlying object; anything else falls through to the
        # default lookup (raising AttributeError).
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self._attributes:
            # delegate grouper attributes to the TimeGrouper
            return getattr(self.groupby, attr)
        if attr in self.obj:
            # select a column of the resampled object
            return self[attr]

        return object.__getattribute__(self, attr)
    def __iter__(self):
        """
        Resampler iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group.

        See Also
        --------
        GroupBy.__iter__
        """
        # ensure the binner/grouper exist before delegating to GroupBy
        self._set_binner()
        return super(Resampler, self).__iter__()
  96. @property
  97. def obj(self):
  98. return self.groupby.obj
  99. @property
  100. def ax(self):
  101. return self.groupby.ax
  102. @property
  103. def _typ(self):
  104. """
  105. Masquerade for compat as a Series or a DataFrame.
  106. """
  107. if isinstance(self._selected_obj, pd.Series):
  108. return 'series'
  109. return 'dataframe'
  110. @property
  111. def _from_selection(self):
  112. """
  113. Is the resampling from a DataFrame column or MultiIndex level.
  114. """
  115. # upsampling and PeriodIndex resampling do not work
  116. # with selection, this state used to catch and raise an error
  117. return (self.groupby is not None and
  118. (self.groupby.key is not None or
  119. self.groupby.level is not None))
  120. def _convert_obj(self, obj):
  121. """
  122. Provide any conversions for the object in order to correctly handle.
  123. Parameters
  124. ----------
  125. obj : the object to be resampled
  126. Returns
  127. -------
  128. obj : converted object
  129. """
  130. obj = obj._consolidate()
  131. return obj
    def _get_binner_for_time(self):
        # Subclasses must return (binner, bins, binlabels) appropriate to
        # their index type (datetime / period / timedelta).
        raise AbstractMethodError(self)
  134. def _set_binner(self):
  135. """
  136. Setup our binners.
  137. Cache these as we are an immutable object
  138. """
  139. if self.binner is None:
  140. self.binner, self.grouper = self._get_binner()
    def _get_binner(self):
        """
        Create the BinGrouper, assume that self.set_grouper(obj)
        has already been called.

        Returns
        -------
        (binner, bin_grouper) : the bin edge index and the BinGrouper
            used to perform the groupby-like aggregation.
        """
        binner, bins, binlabels = self._get_binner_for_time()
        bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer)
        return binner, bin_grouper
    def _assure_grouper(self):
        """
        Make sure that we are creating our binner & grouper.
        """
        # thin alias kept for API symmetry with groupby internals
        self._set_binner()
    # NOTE: the examples string below is a runtime docstring template
    # injected via @Substitution; its content must not be altered.
    @Substitution(klass='Resampler',
                  versionadded='.. versionadded:: 0.23.0',
                  examples="""
    >>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
    ...                   index=pd.date_range('2012-08-02', periods=4))
    >>> df
                A
    2012-08-02  1
    2012-08-03  2
    2012-08-04  3
    2012-08-05  4

    To get the difference between each 2-day period's maximum and minimum
    value in one pass, you can do

    >>> df.resample('2D').pipe(lambda x: x.max() - x.min())
                A
    2012-08-02  1
    2012-08-04  1
    """)
    @Appender(_pipe_template)
    def pipe(self, func, *args, **kwargs):
        # documentation is assembled entirely from the decorators above
        return super(Resampler, self).pipe(func, *args, **kwargs)
    # Shared docstring fragments substituted into ``aggregate`` below via
    # the @Substitution decorator; the string contents are user-facing
    # documentation and must be kept intact.
    _agg_see_also_doc = dedent("""
    See Also
    --------
    pandas.DataFrame.groupby.aggregate
    pandas.DataFrame.resample.transform
    pandas.DataFrame.aggregate
    """)

    _agg_examples_doc = dedent("""
    Examples
    --------
    >>> s = pd.Series([1,2,3,4,5],
                      index=pd.date_range('20130101', periods=5,freq='s'))
    2013-01-01 00:00:00    1
    2013-01-01 00:00:01    2
    2013-01-01 00:00:02    3
    2013-01-01 00:00:03    4
    2013-01-01 00:00:04    5
    Freq: S, dtype: int64

    >>> r = s.resample('2s')
    DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left,
                            label=left, convention=start, base=0]

    >>> r.agg(np.sum)
    2013-01-01 00:00:00    3
    2013-01-01 00:00:02    7
    2013-01-01 00:00:04    5
    Freq: 2S, dtype: int64

    >>> r.agg(['sum','mean','max'])
                         sum  mean  max
    2013-01-01 00:00:00    3   1.5    2
    2013-01-01 00:00:02    7   3.5    4
    2013-01-01 00:00:04    5   5.0    5

    >>> r.agg({'result' : lambda x: x.mean() / x.std(),
               'total' : np.sum})
                         total    result
    2013-01-01 00:00:00      3  2.121320
    2013-01-01 00:00:02      7  4.949747
    2013-01-01 00:00:04      5       NaN
    """)
    @Substitution(see_also=_agg_see_also_doc,
                  examples=_agg_examples_doc,
                  versionadded='',
                  klass='DataFrame',
                  axis='')
    @Appender(_shared_docs['aggregate'])
    def aggregate(self, func, *args, **kwargs):
        # documentation is assembled from the shared 'aggregate' template
        self._set_binner()
        result, how = self._aggregate(func, *args, **kwargs)
        if result is None:
            # _aggregate could not handle func directly (e.g. a plain
            # callable); fall back to a groupby-style aggregation
            how = func
            grouper = None
            result = self._groupby_and_aggregate(how,
                                                 grouper,
                                                 *args,
                                                 **kwargs)

        result = self._apply_loffset(result)
        return result

    # aliases; .apply on a Resampler is aggregation, not elementwise apply
    agg = aggregate
    apply = aggregate
    def transform(self, arg, *args, **kwargs):
        """
        Call function producing a like-indexed Series on each group and return
        a Series with the transformed values.

        Parameters
        ----------
        arg : function
            To apply to each group. Should return a Series with the same
            index.

        Returns
        -------
        transformed : Series

        Examples
        --------
        >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
        """
        # delegate to a plain groupby-transform on the selected object
        return self._selected_obj.groupby(self.groupby).transform(
            arg, *args, **kwargs)
    def _downsample(self, f):
        # Subclasses implement frequency-reducing aggregation.
        raise AbstractMethodError(self)
    def _upsample(self, f, limit=None, fill_value=None):
        # Subclasses implement frequency-increasing reindex/fill.
        raise AbstractMethodError(self)
    def _gotitem(self, key, ndim, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : 1,2
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        self._set_binner()
        grouper = self.grouper
        if subset is None:
            subset = self.obj
        grouped = groupby(subset, by=None, grouper=grouper, axis=self.axis)

        # try the key selection; fall back to the grouped object when the
        # key is not a valid selection
        try:
            return grouped[key]
        except KeyError:
            return grouped
    def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs):
        """
        Re-evaluate the obj with a groupby aggregation.
        """
        if grouper is None:
            self._set_binner()
            grouper = self.grouper

        obj = self._selected_obj

        try:
            grouped = groupby(obj, by=None, grouper=grouper, axis=self.axis)
        except TypeError:
            # panel grouper: plain groupby() cannot handle Panel objects
            grouped = PanelGroupBy(obj, grouper=grouper, axis=self.axis)

        try:
            if isinstance(obj, ABCDataFrame) and compat.callable(how):
                # Check if the function is reducing or not.
                result = grouped._aggregate_item_by_item(how, *args, **kwargs)
            else:
                result = grouped.aggregate(how, *args, **kwargs)
        except Exception:
            # we have a non-reducing function
            # try to evaluate via apply instead
            result = grouped.apply(how, *args, **kwargs)

        result = self._apply_loffset(result)
        return self._wrap_result(result)
    def _apply_loffset(self, result):
        """
        If loffset is set, offset the result index.

        This is NOT an idempotent routine, it will be applied
        exactly once to the result (loffset is cleared afterwards).

        Parameters
        ----------
        result : Series or DataFrame
            the result of resample
        """
        # only shift when loffset is an offset-like and the result has a
        # non-empty DatetimeIndex
        needs_offset = (
            isinstance(self.loffset, (DateOffset, timedelta,
                                      np.timedelta64)) and
            isinstance(result.index, DatetimeIndex) and
            len(result.index) > 0
        )

        if needs_offset:
            result.index = result.index + self.loffset

        # clear so a second call becomes a no-op
        self.loffset = None
        return result
    def _get_resampler_for_grouping(self, groupby, **kwargs):
        """
        Return the correct class for resampling with groupby.
        """
        # _resampler_for_grouping is defined by each concrete subclass
        return self._resampler_for_grouping(self, groupby=groupby, **kwargs)
    def _wrap_result(self, result):
        """
        Potentially wrap any results.
        """
        # propagate a column selection as the Series name
        if isinstance(result, ABCSeries) and self._selection is not None:
            result.name = self._selection

        if isinstance(result, ABCSeries) and result.empty:
            # an empty aggregation still gets an index at the target freq
            obj = self.obj
            if isinstance(obj.index, PeriodIndex):
                result.index = obj.index.asfreq(self.freq)
            else:
                result.index = obj.index._shallow_copy(freq=self.freq)
            result.name = getattr(obj, 'name', None)

        return result
    def pad(self, limit=None):
        """
        Forward fill the values.

        Parameters
        ----------
        limit : integer, optional
            limit of how many values to fill

        Returns
        -------
        an upsampled Series

        See Also
        --------
        Series.fillna
        DataFrame.fillna
        """
        return self._upsample('pad', limit=limit)
    ffill = pad
    def nearest(self, limit=None):
        """
        Resample by using the nearest value.

        When resampling data, missing values may appear (e.g., when the
        resampling frequency is higher than the original frequency).
        The `nearest` method will replace ``NaN`` values that appeared in
        the resampled data with the value from the nearest member of the
        sequence, based on the index value.
        Missing values that existed in the original data will not be modified.
        If `limit` is given, fill only this many values in each direction for
        each of the original values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

            .. versionadded:: 0.21.0

        Returns
        -------
        Series or DataFrame
            An upsampled Series or DataFrame with ``NaN`` values filled with
            their nearest value.

        See Also
        --------
        backfill : Backward fill the new missing values in the resampled data.
        pad : Forward fill ``NaN`` values.

        Examples
        --------
        >>> s = pd.Series([1, 2],
        ...               index=pd.date_range('20180101',
        ...                                   periods=2,
        ...                                   freq='1h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        Freq: H, dtype: int64

        >>> s.resample('15min').nearest()
        2018-01-01 00:00:00    1
        2018-01-01 00:15:00    1
        2018-01-01 00:30:00    2
        2018-01-01 00:45:00    2
        2018-01-01 01:00:00    2
        Freq: 15T, dtype: int64

        Limit the number of upsampled values imputed by the nearest:

        >>> s.resample('15min').nearest(limit=1)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    1.0
        2018-01-01 00:30:00    NaN
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        Freq: 15T, dtype: float64
        """
        return self._upsample('nearest', limit=limit)
    def backfill(self, limit=None):
        """
        Backward fill the new missing values in the resampled data.

        In statistics, imputation is the process of replacing missing data with
        substituted values [1]_. When resampling data, missing values may
        appear (e.g., when the resampling frequency is higher than the original
        frequency). The backward fill will replace NaN values that appeared in
        the resampled data with the next value in the original sequence.
        Missing values that existed in the original data will not be modified.

        Parameters
        ----------
        limit : integer, optional
            Limit of how many values to fill.

        Returns
        -------
        Series, DataFrame
            An upsampled Series or DataFrame with backward filled NaN values.

        See Also
        --------
        bfill : Alias of backfill.
        fillna : Fill NaN values using the specified method, which can be
            'backfill'.
        nearest : Fill NaN values with nearest neighbor starting from center.
        pad : Forward fill NaN values.
        pandas.Series.fillna : Fill NaN values in the Series using the
            specified method, which can be 'backfill'.
        pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the
            specified method, which can be 'backfill'.

        References
        ----------
        .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)

        Examples
        --------
        Resampling a Series:

        >>> s = pd.Series([1, 2, 3],
        ...               index=pd.date_range('20180101', periods=3, freq='h'))
        >>> s
        2018-01-01 00:00:00    1
        2018-01-01 01:00:00    2
        2018-01-01 02:00:00    3
        Freq: H, dtype: int64

        >>> s.resample('30min').backfill()
        2018-01-01 00:00:00    1
        2018-01-01 00:30:00    2
        2018-01-01 01:00:00    2
        2018-01-01 01:30:00    3
        2018-01-01 02:00:00    3
        Freq: 30T, dtype: int64

        >>> s.resample('15min').backfill(limit=2)
        2018-01-01 00:00:00    1.0
        2018-01-01 00:15:00    NaN
        2018-01-01 00:30:00    2.0
        2018-01-01 00:45:00    2.0
        2018-01-01 01:00:00    2.0
        2018-01-01 01:15:00    NaN
        2018-01-01 01:30:00    3.0
        2018-01-01 01:45:00    3.0
        2018-01-01 02:00:00    3.0
        Freq: 15T, dtype: float64

        Resampling a DataFrame that has missing values:

        >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
        ...                   index=pd.date_range('20180101', periods=3,
        ...                                       freq='h'))
        >>> df
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 01:00:00  NaN  3
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('30min').backfill()
                               a  b
        2018-01-01 00:00:00  2.0  1
        2018-01-01 00:30:00  NaN  3
        2018-01-01 01:00:00  NaN  3
        2018-01-01 01:30:00  6.0  5
        2018-01-01 02:00:00  6.0  5

        >>> df.resample('15min').backfill(limit=2)
                               a    b
        2018-01-01 00:00:00  2.0  1.0
        2018-01-01 00:15:00  NaN  NaN
        2018-01-01 00:30:00  NaN  3.0
        2018-01-01 00:45:00  NaN  3.0
        2018-01-01 01:00:00  NaN  3.0
        2018-01-01 01:15:00  NaN  NaN
        2018-01-01 01:30:00  6.0  5.0
        2018-01-01 01:45:00  6.0  5.0
        2018-01-01 02:00:00  6.0  5.0
        """
        return self._upsample('backfill', limit=limit)
    bfill = backfill
  497. def fillna(self, method, limit=None):
  498. """
  499. Fill missing values introduced by upsampling.
  500. In statistics, imputation is the process of replacing missing data with
  501. substituted values [1]_. When resampling data, missing values may
  502. appear (e.g., when the resampling frequency is higher than the original
  503. frequency).
  504. Missing values that existed in the original data will
  505. not be modified.
  506. Parameters
  507. ----------
  508. method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'}
  509. Method to use for filling holes in resampled data
  510. * 'pad' or 'ffill': use previous valid observation to fill gap
  511. (forward fill).
  512. * 'backfill' or 'bfill': use next valid observation to fill gap.
  513. * 'nearest': use nearest valid observation to fill gap.
  514. limit : integer, optional
  515. Limit of how many consecutive missing values to fill.
  516. Returns
  517. -------
  518. Series or DataFrame
  519. An upsampled Series or DataFrame with missing values filled.
  520. See Also
  521. --------
  522. backfill : Backward fill NaN values in the resampled data.
  523. pad : Forward fill NaN values in the resampled data.
  524. nearest : Fill NaN values in the resampled data
  525. with nearest neighbor starting from center.
  526. interpolate : Fill NaN values using interpolation.
  527. pandas.Series.fillna : Fill NaN values in the Series using the
  528. specified method, which can be 'bfill' and 'ffill'.
  529. pandas.DataFrame.fillna : Fill NaN values in the DataFrame using the
  530. specified method, which can be 'bfill' and 'ffill'.
  531. References
  532. ----------
  533. .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
  534. Examples
  535. --------
  536. Resampling a Series:
  537. >>> s = pd.Series([1, 2, 3],
  538. ... index=pd.date_range('20180101', periods=3, freq='h'))
  539. >>> s
  540. 2018-01-01 00:00:00 1
  541. 2018-01-01 01:00:00 2
  542. 2018-01-01 02:00:00 3
  543. Freq: H, dtype: int64
  544. Without filling the missing values you get:
  545. >>> s.resample("30min").asfreq()
  546. 2018-01-01 00:00:00 1.0
  547. 2018-01-01 00:30:00 NaN
  548. 2018-01-01 01:00:00 2.0
  549. 2018-01-01 01:30:00 NaN
  550. 2018-01-01 02:00:00 3.0
  551. Freq: 30T, dtype: float64
  552. >>> s.resample('30min').fillna("backfill")
  553. 2018-01-01 00:00:00 1
  554. 2018-01-01 00:30:00 2
  555. 2018-01-01 01:00:00 2
  556. 2018-01-01 01:30:00 3
  557. 2018-01-01 02:00:00 3
  558. Freq: 30T, dtype: int64
  559. >>> s.resample('15min').fillna("backfill", limit=2)
  560. 2018-01-01 00:00:00 1.0
  561. 2018-01-01 00:15:00 NaN
  562. 2018-01-01 00:30:00 2.0
  563. 2018-01-01 00:45:00 2.0
  564. 2018-01-01 01:00:00 2.0
  565. 2018-01-01 01:15:00 NaN
  566. 2018-01-01 01:30:00 3.0
  567. 2018-01-01 01:45:00 3.0
  568. 2018-01-01 02:00:00 3.0
  569. Freq: 15T, dtype: float64
  570. >>> s.resample('30min').fillna("pad")
  571. 2018-01-01 00:00:00 1
  572. 2018-01-01 00:30:00 1
  573. 2018-01-01 01:00:00 2
  574. 2018-01-01 01:30:00 2
  575. 2018-01-01 02:00:00 3
  576. Freq: 30T, dtype: int64
  577. >>> s.resample('30min').fillna("nearest")
  578. 2018-01-01 00:00:00 1
  579. 2018-01-01 00:30:00 2
  580. 2018-01-01 01:00:00 2
  581. 2018-01-01 01:30:00 3
  582. 2018-01-01 02:00:00 3
  583. Freq: 30T, dtype: int64
  584. Missing values present before the upsampling are not affected.
  585. >>> sm = pd.Series([1, None, 3],
  586. ... index=pd.date_range('20180101', periods=3, freq='h'))
  587. >>> sm
  588. 2018-01-01 00:00:00 1.0
  589. 2018-01-01 01:00:00 NaN
  590. 2018-01-01 02:00:00 3.0
  591. Freq: H, dtype: float64
  592. >>> sm.resample('30min').fillna('backfill')
  593. 2018-01-01 00:00:00 1.0
  594. 2018-01-01 00:30:00 NaN
  595. 2018-01-01 01:00:00 NaN
  596. 2018-01-01 01:30:00 3.0
  597. 2018-01-01 02:00:00 3.0
  598. Freq: 30T, dtype: float64
  599. >>> sm.resample('30min').fillna('pad')
  600. 2018-01-01 00:00:00 1.0
  601. 2018-01-01 00:30:00 1.0
  602. 2018-01-01 01:00:00 NaN
  603. 2018-01-01 01:30:00 NaN
  604. 2018-01-01 02:00:00 3.0
  605. Freq: 30T, dtype: float64
  606. >>> sm.resample('30min').fillna('nearest')
  607. 2018-01-01 00:00:00 1.0
  608. 2018-01-01 00:30:00 NaN
  609. 2018-01-01 01:00:00 NaN
  610. 2018-01-01 01:30:00 3.0
  611. 2018-01-01 02:00:00 3.0
  612. Freq: 30T, dtype: float64
  613. DataFrame resampling is done column-wise. All the same options are
  614. available.
  615. >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
  616. ... index=pd.date_range('20180101', periods=3,
  617. ... freq='h'))
  618. >>> df
  619. a b
  620. 2018-01-01 00:00:00 2.0 1
  621. 2018-01-01 01:00:00 NaN 3
  622. 2018-01-01 02:00:00 6.0 5
  623. >>> df.resample('30min').fillna("bfill")
  624. a b
  625. 2018-01-01 00:00:00 2.0 1
  626. 2018-01-01 00:30:00 NaN 3
  627. 2018-01-01 01:00:00 NaN 3
  628. 2018-01-01 01:30:00 6.0 5
  629. 2018-01-01 02:00:00 6.0 5
  630. """
  631. return self._upsample(method, limit=limit)
    @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs)
    def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
                    limit_direction='forward', limit_area=None,
                    downcast=None, **kwargs):
        """
        Interpolate values according to different methods.

        .. versionadded:: 0.18.1
        """
        # upsample with no fill to expose the NaNs, then interpolate them
        result = self._upsample(None)
        return result.interpolate(method=method, axis=axis, limit=limit,
                                  inplace=inplace,
                                  limit_direction=limit_direction,
                                  limit_area=limit_area,
                                  downcast=downcast, **kwargs)
    def asfreq(self, fill_value=None):
        """
        Return the values at the new freq, essentially a reindex.

        Parameters
        ----------
        fill_value : scalar, optional
            Value to use for missing values, applied during upsampling (note
            this does not fill NaNs that already were present).

            .. versionadded:: 0.20.0

        See Also
        --------
        Series.asfreq
        DataFrame.asfreq
        """
        return self._upsample('asfreq', fill_value=fill_value)
    def std(self, ddof=1, *args, **kwargs):
        """
        Compute standard deviation of groups, excluding missing values.

        Parameters
        ----------
        ddof : integer, default 1
            degrees of freedom
        """
        # extra args/kwargs only validated for numpy compatibility, then
        # discarded
        nv.validate_resampler_func('std', args, kwargs)
        return self._downsample('std', ddof=ddof)
    def var(self, ddof=1, *args, **kwargs):
        """
        Compute variance of groups, excluding missing values.

        Parameters
        ----------
        ddof : integer, default 1
            degrees of freedom
        """
        # extra args/kwargs only validated for numpy compatibility, then
        # discarded
        nv.validate_resampler_func('var', args, kwargs)
        return self._downsample('var', ddof=ddof)
    @Appender(GroupBy.size.__doc__)
    def size(self):
        # It's a special case as higher level does return
        # a copy of 0-len objects. GH14962
        result = self._downsample('size')
        if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame):
            # empty DataFrame: force an empty int64 Series result
            result = pd.Series([], index=result.index, dtype='int64')
        return result
    def quantile(self, q=0.5, **kwargs):
        """
        Return value at the given quantile.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)

        See Also
        --------
        Series.quantile
        DataFrame.quantile
        DataFrameGroupBy.quantile
        """
        return self._downsample('quantile', q=q, **kwargs)
# Attach the standard aggregation methods to Resampler.  In each loop the
# current method name is bound early via the ``_method=method`` default
# argument, so every generated ``f`` closes over its own name rather than
# the final loop value.

# downsample methods that accept a min_count argument
for method in ['sum', 'prod']:

    def f(self, _method=method, min_count=0, *args, **kwargs):
        nv.validate_resampler_func(_method, args, kwargs)
        return self._downsample(_method, min_count=min_count)
    f.__doc__ = getattr(GroupBy, method).__doc__
    setattr(Resampler, method, f)

# downsample methods
for method in ['min', 'max', 'first', 'last', 'mean', 'sem',
               'median', 'ohlc']:

    def f(self, _method=method, *args, **kwargs):
        nv.validate_resampler_func(_method, args, kwargs)
        return self._downsample(_method)
    f.__doc__ = getattr(GroupBy, method).__doc__
    setattr(Resampler, method, f)

# groupby & aggregate methods
for method in ['count']:

    def f(self, _method=method):
        return self._downsample(_method)
    f.__doc__ = getattr(GroupBy, method).__doc__
    setattr(Resampler, method, f)

# series only methods
for method in ['nunique']:
    def f(self, _method=method):
        return self._downsample(_method)
    f.__doc__ = getattr(SeriesGroupBy, method).__doc__
    setattr(Resampler, method, f)
  730. def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None):
  731. """
  732. Potentially we might have a deprecation warning, show it
  733. but call the appropriate methods anyhow.
  734. """
  735. if how is not None:
  736. # .resample(..., how='sum')
  737. if isinstance(how, compat.string_types):
  738. method = "{0}()".format(how)
  739. # .resample(..., how=lambda x: ....)
  740. else:
  741. method = ".apply(<func>)"
  742. # if we have both a how and fill_method, then show
  743. # the following warning
  744. if fill_method is None:
  745. warnings.warn("how in .resample() is deprecated\n"
  746. "the new syntax is "
  747. ".resample(...).{method}".format(
  748. method=method),
  749. FutureWarning, stacklevel=3)
  750. r = r.aggregate(how)
  751. if fill_method is not None:
  752. # show the prior function call
  753. method = '.' + method if how is not None else ''
  754. args = "limit={0}".format(limit) if limit is not None else ""
  755. warnings.warn("fill_method is deprecated to .resample()\n"
  756. "the new syntax is .resample(...){method}"
  757. ".{fill_method}({args})".format(
  758. method=method,
  759. fill_method=fill_method,
  760. args=args),
  761. FutureWarning, stacklevel=3)
  762. if how is not None:
  763. r = getattr(r, fill_method)(limit=limit)
  764. else:
  765. r = r.aggregate(fill_method, limit=limit)
  766. return r
class _GroupByMixin(GroupByMixin):
    """
    Provide the groupby facilities.

    Mixed into the *ResamplerGroupby classes so that a resampler created
    from an existing groupby dispatches each operation through the
    underlying groupby's ``apply``.
    """

    def __init__(self, obj, *args, **kwargs):

        # `parent` (the originating resampler) and `groupby` (the wrapping
        # groupby object) are consumed here and not forwarded upward
        parent = kwargs.pop('parent', None)
        groupby = kwargs.pop('groupby', None)
        if parent is None:
            parent = obj

        # initialize our GroupByMixin object with
        # the resampler attributes
        for attr in self._attributes:
            setattr(self, attr, kwargs.get(attr, getattr(parent, attr)))

        super(_GroupByMixin, self).__init__(None)
        self._groupby = groupby
        # flag the groupby (and its grouper) as mutated so downstream
        # machinery does not assume an unchanged grouping
        self._groupby.mutated = True
        self._groupby.grouper.mutated = True
        self.groupby = copy.copy(parent.groupby)

    def _apply(self, f, grouper=None, *args, **kwargs):
        """
        Dispatch to _upsample; we are stripping all of the _upsample kwargs and
        performing the original function call on the grouped object.
        """

        def func(x):
            # re-wrap each group as a resampler before applying `f`
            x = self._shallow_copy(x, groupby=self.groupby)

            if isinstance(f, compat.string_types):
                return getattr(x, f)(**kwargs)

            return x.apply(f, *args, **kwargs)

        result = self._groupby.apply(func)
        return self._wrap_result(result)

    # all three operation families route through the same group-wise apply
    _upsample = _apply
    _downsample = _apply
    _groupby_and_aggregate = _apply
class DatetimeIndexResampler(Resampler):
    # Resampler for objects indexed by a DatetimeIndex.

    @property
    def _resampler_for_grouping(self):
        # companion class used when resampling within a groupby
        return DatetimeIndexResamplerGroupby

    def _get_binner_for_time(self):

        # this is how we are actually creating the bins
        if self.kind == 'period':
            return self.groupby._get_time_period_bins(self.ax)
        return self.groupby._get_time_bins(self.ax)

    def _downsample(self, how, **kwargs):
        """
        Downsample the cython defined function.

        Parameters
        ----------
        how : string / cython mapped function
        **kwargs : kw args passed to how function
        """
        self._set_binner()
        how = self._is_cython_func(how) or how
        ax = self.ax
        obj = self._selected_obj

        if not len(ax):
            # reset to the new freq
            obj = obj.copy()
            obj.index.freq = self.freq
            return obj

        # do we have a regular frequency
        if ax.freq is not None or ax.inferred_freq is not None:

            if len(self.grouper.binlabels) > len(ax) and how is None:

                # fast path: more bins than data points and no aggregation
                # requested -> let's do an asfreq
                return self.asfreq()

        # we are downsampling
        # we want to call the actual grouper method here
        result = obj.groupby(
            self.grouper, axis=self.axis).aggregate(how, **kwargs)

        result = self._apply_loffset(result)
        return self._wrap_result(result)

    def _adjust_binner_for_upsample(self, binner):
        """
        Adjust our binner when upsampling.

        The range of a new index should not be outside specified range
        """
        # drop the open edge so the upsampled index stays inside the bins
        if self.closed == 'right':
            binner = binner[1:]
        else:
            binner = binner[:-1]
        return binner

    def _upsample(self, method, limit=None, fill_value=None):
        """
        Parameters
        ----------
        method : string {'backfill', 'bfill', 'pad',
            'ffill', 'asfreq'} method for upsampling
        limit : int, default None
            Maximum size gap to fill when reindexing
        fill_value : scalar, default None
            Value to use for missing values

        See Also
        --------
        .fillna
        """
        self._set_binner()
        if self.axis:
            raise AssertionError('axis must be 0')
        if self._from_selection:
            raise ValueError("Upsampling from level= or on= selection"
                             " is not supported, use .set_index(...)"
                             " to explicitly set index to"
                             " datetime-like")

        ax = self.ax
        obj = self._selected_obj
        binner = self.binner
        res_index = self._adjust_binner_for_upsample(binner)

        # if we have the same frequency as our axis, then we are equal
        # sampling: just relabel the copy instead of reindexing
        if limit is None and to_offset(ax.inferred_freq) == self.freq:
            result = obj.copy()
            result.index = res_index
        else:
            result = obj.reindex(res_index, method=method,
                                 limit=limit, fill_value=fill_value)

        result = self._apply_loffset(result)
        return self._wrap_result(result)

    def _wrap_result(self, result):
        result = super(DatetimeIndexResampler, self)._wrap_result(result)

        # we may have a different kind that we were asked originally
        # convert if needed
        if self.kind == 'period' and not isinstance(result.index, PeriodIndex):
            result.index = result.index.to_period(self.freq)
        return result
  889. class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler):
  890. """
  891. Provides a resample of a groupby implementation
  892. .. versionadded:: 0.18.1
  893. """
  894. @property
  895. def _constructor(self):
  896. return DatetimeIndexResampler
class PeriodIndexResampler(DatetimeIndexResampler):
    # Resampler for objects indexed by a PeriodIndex; may fall back to
    # timestamp-based resampling when ``kind == 'timestamp'``.

    @property
    def _resampler_for_grouping(self):
        return PeriodIndexResamplerGroupby

    def _get_binner_for_time(self):
        if self.kind == 'timestamp':
            return super(PeriodIndexResampler, self)._get_binner_for_time()
        return self.groupby._get_period_bins(self.ax)

    def _convert_obj(self, obj):
        obj = super(PeriodIndexResampler, self)._convert_obj(obj)

        if self._from_selection:
            # see GH 14008, GH 12871
            msg = ("Resampling from level= or on= selection"
                   " with a PeriodIndex is not currently supported,"
                   " use .set_index(...) to explicitly set index")
            raise NotImplementedError(msg)

        if self.loffset is not None:
            # Cannot apply loffset/timedelta to PeriodIndex -> convert to
            # timestamps
            self.kind = 'timestamp'

        # convert to timestamp
        if self.kind == 'timestamp':
            obj = obj.to_timestamp(how=self.convention)

        return obj

    def _downsample(self, how, **kwargs):
        """
        Downsample the cython defined function.

        Parameters
        ----------
        how : string / cython mapped function
        **kwargs : kw args passed to how function
        """

        # we may need to actually resample as if we are timestamps
        if self.kind == 'timestamp':
            return super(PeriodIndexResampler, self)._downsample(how, **kwargs)

        how = self._is_cython_func(how) or how
        ax = self.ax

        if is_subperiod(ax.freq, self.freq):
            # Downsampling
            return self._groupby_and_aggregate(how, grouper=self.grouper,
                                               **kwargs)
        elif is_superperiod(ax.freq, self.freq):
            if how == 'ohlc':
                # GH #13083
                # upsampling to subperiods is handled as an asfreq, which works
                # for pure aggregating/reducing methods
                # OHLC reduces along the time dimension, but creates multiple
                # values for each period -> handle by _groupby_and_aggregate()
                return self._groupby_and_aggregate(how, grouper=self.grouper)
            return self.asfreq()
        elif ax.freq == self.freq:
            # same frequency: a simple relabel suffices
            return self.asfreq()

        raise IncompatibleFrequency(
            'Frequency {} cannot be resampled to {}, as they are not '
            'sub or super periods'.format(ax.freq, self.freq))

    def _upsample(self, method, limit=None, fill_value=None):
        """
        Parameters
        ----------
        method : string {'backfill', 'bfill', 'pad', 'ffill'}
            method for upsampling
        limit : int, default None
            Maximum size gap to fill when reindexing
        fill_value : scalar, default None
            Value to use for missing values

        See Also
        --------
        .fillna
        """
        # we may need to actually resample as if we are timestamps
        if self.kind == 'timestamp':
            return super(PeriodIndexResampler, self)._upsample(
                method, limit=limit, fill_value=fill_value)

        self._set_binner()
        ax = self.ax
        obj = self.obj
        new_index = self.binner

        # Start vs. end of period
        memb = ax.asfreq(self.freq, how=self.convention)

        # Get the fill indexer
        indexer = memb.get_indexer(new_index, method=method, limit=limit)
        return self._wrap_result(_take_new_index(
            obj, indexer, new_index, axis=self.axis))
  980. class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
  981. """
  982. Provides a resample of a groupby implementation.
  983. .. versionadded:: 0.18.1
  984. """
  985. @property
  986. def _constructor(self):
  987. return PeriodIndexResampler
  988. class TimedeltaIndexResampler(DatetimeIndexResampler):
  989. @property
  990. def _resampler_for_grouping(self):
  991. return TimedeltaIndexResamplerGroupby
  992. def _get_binner_for_time(self):
  993. return self.groupby._get_time_delta_bins(self.ax)
  994. def _adjust_binner_for_upsample(self, binner):
  995. """
  996. Adjust our binner when upsampling.
  997. The range of a new index is allowed to be greater than original range
  998. so we don't need to change the length of a binner, GH 13022
  999. """
  1000. return binner
  1001. class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler):
  1002. """
  1003. Provides a resample of a groupby implementation.
  1004. .. versionadded:: 0.18.1
  1005. """
  1006. @property
  1007. def _constructor(self):
  1008. return TimedeltaIndexResampler
  1009. def resample(obj, kind=None, **kwds):
  1010. """
  1011. Create a TimeGrouper and return our resampler.
  1012. """
  1013. tg = TimeGrouper(**kwds)
  1014. return tg._get_resampler(obj, kind=kind)
  1015. resample.__doc__ = Resampler.__doc__
  1016. def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None,
  1017. limit=None, kind=None, **kwargs):
  1018. """
  1019. Return our appropriate resampler when grouping as well.
  1020. """
  1021. # .resample uses 'on' similar to how .groupby uses 'key'
  1022. kwargs['key'] = kwargs.pop('on', None)
  1023. tg = TimeGrouper(freq=rule, **kwargs)
  1024. resampler = tg._get_resampler(groupby.obj, kind=kind)
  1025. r = resampler._get_resampler_for_grouping(groupby=groupby)
  1026. return _maybe_process_deprecations(r,
  1027. how=how,
  1028. fill_method=fill_method,
  1029. limit=limit)
class TimeGrouper(Grouper):
    """
    Custom groupby class for time-interval grouping.

    Parameters
    ----------
    freq : pandas date offset or offset alias for identifying bin edges
    closed : closed end of interval; 'left' or 'right'
    label : interval boundary to use for labeling; 'left' or 'right'
    convention : {'start', 'end', 'e', 's'}
        If axis is PeriodIndex
    """
    _attributes = Grouper._attributes + ('closed', 'label', 'how',
                                         'loffset', 'kind', 'convention',
                                         'base')

    def __init__(self, freq='Min', closed=None, label=None, how='mean',
                 axis=0, fill_method=None, limit=None, loffset=None,
                 kind=None, convention=None, base=0, **kwargs):
        # Check for correctness of the keyword arguments which would
        # otherwise silently use the default if misspelled
        if label not in {None, 'left', 'right'}:
            raise ValueError('Unsupported value {} for `label`'.format(label))
        if closed not in {None, 'left', 'right'}:
            raise ValueError('Unsupported value {} for `closed`'.format(
                closed))
        if convention not in {None, 'start', 'end', 'e', 's'}:
            raise ValueError('Unsupported value {} for `convention`'
                             .format(convention))

        freq = to_offset(freq)

        # period-end anchored frequencies default to right closed/labeled
        # bins; everything else defaults to left
        end_types = {'M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'}
        rule = freq.rule_code
        if (rule in end_types or
                ('-' in rule and rule[:rule.find('-')] in end_types)):
            if closed is None:
                closed = 'right'
            if label is None:
                label = 'right'
        else:
            if closed is None:
                closed = 'left'
            if label is None:
                label = 'left'

        self.closed = closed
        self.label = label
        self.kind = kind

        # normalize convention to lowercase; default is 'e' (end)
        self.convention = convention or 'E'
        self.convention = self.convention.lower()

        if isinstance(loffset, compat.string_types):
            loffset = to_offset(loffset)
        self.loffset = loffset

        self.how = how
        self.fill_method = fill_method
        self.limit = limit
        self.base = base

        # always sort time groupers
        kwargs['sort'] = True

        super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs)

    def _get_resampler(self, obj, kind=None):
        """
        Return my resampler or raise if we have an invalid axis.

        Parameters
        ----------
        obj : input object
        kind : string, optional
            'period','timestamp','timedelta' are valid

        Returns
        -------
        a Resampler

        Raises
        ------
        TypeError if incompatible axis
        """
        self._set_grouper(obj)

        # dispatch on the axis (index) type
        ax = self.ax
        if isinstance(ax, DatetimeIndex):
            return DatetimeIndexResampler(obj,
                                          groupby=self,
                                          kind=kind,
                                          axis=self.axis)
        elif isinstance(ax, PeriodIndex) or kind == 'period':
            return PeriodIndexResampler(obj,
                                        groupby=self,
                                        kind=kind,
                                        axis=self.axis)
        elif isinstance(ax, TimedeltaIndex):
            return TimedeltaIndexResampler(obj,
                                           groupby=self,
                                           axis=self.axis)

        raise TypeError("Only valid with DatetimeIndex, "
                        "TimedeltaIndex or PeriodIndex, "
                        "but got an instance of %r" % type(ax).__name__)

    def _get_grouper(self, obj, validate=True):
        # create the resampler and return our binner
        r = self._get_resampler(obj)
        r._set_binner()
        return r.binner, r.grouper, r.obj

    def _get_time_bins(self, ax):
        # Return (binner, bins, labels) for a DatetimeIndex axis.
        if not isinstance(ax, DatetimeIndex):
            raise TypeError('axis must be a DatetimeIndex, but got '
                            'an instance of %r' % type(ax).__name__)

        if len(ax) == 0:
            binner = labels = DatetimeIndex(
                data=[], freq=self.freq, name=ax.name)
            return binner, [], labels

        first, last = _get_timestamp_range_edges(ax.min(), ax.max(),
                                                 self.freq,
                                                 closed=self.closed,
                                                 base=self.base)
        # GH #12037
        # use first/last directly instead of call replace() on them
        # because replace() will swallow the nanosecond part
        # thus last bin maybe slightly before the end if the end contains
        # nanosecond part and lead to `Values falls after last bin` error
        binner = labels = date_range(freq=self.freq,
                                     start=first,
                                     end=last,
                                     tz=ax.tz,
                                     name=ax.name,
                                     ambiguous='infer',
                                     nonexistent='shift_forward')

        ax_values = ax.asi8
        binner, bin_edges = self._adjust_bin_edges(binner, ax_values)

        # general version, knowing nothing about relative frequencies
        bins = lib.generate_bins_dt64(
            ax_values, bin_edges, self.closed, hasnans=ax.hasnans)

        # choose which bin edge labels the interval
        if self.closed == 'right':
            labels = binner
            if self.label == 'right':
                labels = labels[1:]
        elif self.label == 'right':
            labels = labels[1:]

        if ax.hasnans:
            # NaT values get their own leading bin/label
            binner = binner.insert(0, NaT)
            labels = labels.insert(0, NaT)

        # if we end up with more labels than bins
        # adjust the labels
        # GH4076
        if len(bins) < len(labels):
            labels = labels[:len(bins)]

        return binner, bins, labels

    def _adjust_bin_edges(self, binner, ax_values):
        # Some hacks for > daily data, see #1471, #1458, #1483

        if self.freq != 'D' and is_superperiod(self.freq, 'D'):
            if self.closed == 'right':
                # GH 21459, GH 9119: Adjust the bins relative to the wall time
                bin_edges = binner.tz_localize(None)
                bin_edges = bin_edges + timedelta(1) - Nano(1)
                bin_edges = bin_edges.tz_localize(binner.tz).asi8
            else:
                bin_edges = binner.asi8

            # intraday values on last day
            if bin_edges[-2] > ax_values.max():
                bin_edges = bin_edges[:-1]
                binner = binner[:-1]
        else:
            bin_edges = binner.asi8
        return binner, bin_edges

    def _get_time_delta_bins(self, ax):
        # Return (binner, bins, labels) for a TimedeltaIndex axis.
        if not isinstance(ax, TimedeltaIndex):
            raise TypeError('axis must be a TimedeltaIndex, but got '
                            'an instance of %r' % type(ax).__name__)

        if not len(ax):
            binner = labels = TimedeltaIndex(
                data=[], freq=self.freq, name=ax.name)
            return binner, [], labels

        start, end = ax.min(), ax.max()
        labels = binner = timedelta_range(start=start,
                                          end=end,
                                          freq=self.freq,
                                          name=ax.name)

        end_stamps = labels + self.freq
        bins = ax.searchsorted(end_stamps, side='left')

        # Addresses GH #10530
        if self.base > 0:
            labels += type(self.freq)(self.base)

        return binner, bins, labels

    def _get_time_period_bins(self, ax):
        # Return (binner, bins, labels) with Period labels for a
        # DatetimeIndex axis (used when kind='period').
        if not isinstance(ax, DatetimeIndex):
            raise TypeError('axis must be a DatetimeIndex, but got '
                            'an instance of %r' % type(ax).__name__)

        freq = self.freq

        if not len(ax):
            binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name)
            return binner, [], labels

        labels = binner = pd.period_range(start=ax[0],
                                          end=ax[-1],
                                          freq=freq,
                                          name=ax.name)

        # bin edge = start of the *next* period, as a timestamp
        end_stamps = (labels + freq).asfreq(freq, 's').to_timestamp()
        if ax.tzinfo:
            end_stamps = end_stamps.tz_localize(ax.tzinfo)
        bins = ax.searchsorted(end_stamps, side='left')

        return binner, bins, labels

    def _get_period_bins(self, ax):
        # Return (binner, bins, labels) for a PeriodIndex axis.
        if not isinstance(ax, PeriodIndex):
            raise TypeError('axis must be a PeriodIndex, but got '
                            'an instance of %r' % type(ax).__name__)

        memb = ax.asfreq(self.freq, how=self.convention)

        # NaT handling as in pandas._lib.lib.generate_bins_dt64()
        nat_count = 0
        if memb.hasnans:
            nat_count = np.sum(memb._isnan)
            memb = memb[~memb._isnan]

        # if index contains no valid (non-NaT) values, return empty index
        if not len(memb):
            binner = labels = PeriodIndex(
                data=[], freq=self.freq, name=ax.name)
            return binner, [], labels

        freq_mult = self.freq.n

        start = ax.min().asfreq(self.freq, how=self.convention)
        end = ax.max().asfreq(self.freq, how='end')
        bin_shift = 0

        # GH 23882
        if self.base:
            # get base adjusted bin edge labels
            p_start, end = _get_period_range_edges(start,
                                                   end,
                                                   self.freq,
                                                   closed=self.closed,
                                                   base=self.base)

            # Get offset for bin edge (not label edge) adjustment
            start_offset = (pd.Period(start, self.freq)
                            - pd.Period(p_start, self.freq))
            bin_shift = start_offset.n % freq_mult
            start = p_start

        labels = binner = pd.period_range(start=start, end=end,
                                          freq=self.freq, name=ax.name)

        i8 = memb.asi8

        # when upsampling to subperiods, we need to generate enough bins
        expected_bins_count = len(binner) * freq_mult
        i8_extend = expected_bins_count - (i8[-1] - i8[0])
        rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
        rng += freq_mult
        # adjust bin edge indexes to account for base
        rng -= bin_shift
        bins = memb.searchsorted(rng, side='left')

        if nat_count > 0:
            # NaT handling as in pandas._lib.lib.generate_bins_dt64()
            # shift bins by the number of NaT
            bins += nat_count
            bins = np.insert(bins, 0, nat_count)
            binner = binner.insert(0, NaT)
            labels = labels.insert(0, NaT)

        return binner, bins, labels
  1273. def _take_new_index(obj, indexer, new_index, axis=0):
  1274. from pandas.core.api import Series, DataFrame
  1275. if isinstance(obj, Series):
  1276. new_values = algos.take_1d(obj.values, indexer)
  1277. return Series(new_values, index=new_index, name=obj.name)
  1278. elif isinstance(obj, DataFrame):
  1279. if axis == 1:
  1280. raise NotImplementedError("axis 1 is not supported")
  1281. return DataFrame(obj._data.reindex_indexer(
  1282. new_axis=new_index, indexer=indexer, axis=1))
  1283. else:
  1284. raise ValueError("'obj' should be either a Series or a DataFrame")
def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
    """
    Adjust the `first` Timestamp to the preceding Timestamp that resides on
    the provided offset. Adjust the `last` Timestamp to the following
    Timestamp that resides on the provided offset. Input Timestamps that
    already reside on the offset will be adjusted depending on the type of
    offset and the `closed` parameter.

    Parameters
    ----------
    first : pd.Timestamp
        The beginning Timestamp of the range to be adjusted.
    last : pd.Timestamp
        The ending Timestamp of the range to be adjusted.
    offset : pd.DateOffset
        The dateoffset to which the Timestamps will be adjusted.
    closed : {'right', 'left'}, default None
        Which side of bin interval is closed.
    base : int, default 0
        The "origin" of the adjusted Timestamps.

    Returns
    -------
    A tuple of length 2, containing the adjusted pd.Timestamp objects.
    """
    if isinstance(offset, Tick):
        # fixed-size (Tick) offsets: anchor to the offset grid
        if isinstance(offset, Day):
            # _adjust_dates_anchored assumes 'D' means 24H, but first/last
            # might contain a DST transition (23H, 24H, or 25H).
            # So "pretend" the dates are naive when adjusting the endpoints
            tz = first.tz
            first = first.tz_localize(None)
            last = last.tz_localize(None)

        first, last = _adjust_dates_anchored(first, last, offset,
                                             closed=closed, base=base)
        if isinstance(offset, Day):
            # restore the original timezone after the naive adjustment
            first = first.tz_localize(tz)
            last = last.tz_localize(tz)
        return first, last

    else:
        # calendar-style offsets: snap to day boundaries, then roll/shift
        first = first.normalize()
        last = last.normalize()

        if closed == 'left':
            first = Timestamp(offset.rollback(first))
        else:
            first = Timestamp(first - offset)

        last = Timestamp(last + offset)

    return first, last
  1331. def _get_period_range_edges(first, last, offset, closed='left', base=0):
  1332. """
  1333. Adjust the provided `first` and `last` Periods to the respective Period of
  1334. the given offset that encompasses them.
  1335. Parameters
  1336. ----------
  1337. first : pd.Period
  1338. The beginning Period of the range to be adjusted.
  1339. last : pd.Period
  1340. The ending Period of the range to be adjusted.
  1341. offset : pd.DateOffset
  1342. The dateoffset to which the Periods will be adjusted.
  1343. closed : {'right', 'left'}, default None
  1344. Which side of bin interval is closed.
  1345. base : int, default 0
  1346. The "origin" of the adjusted Periods.
  1347. Returns
  1348. -------
  1349. A tuple of length 2, containing the adjusted pd.Period objects.
  1350. """
  1351. if not all(isinstance(obj, pd.Period) for obj in [first, last]):
  1352. raise TypeError("'first' and 'last' must be instances of type Period")
  1353. # GH 23882
  1354. first = first.to_timestamp()
  1355. last = last.to_timestamp()
  1356. adjust_first = not offset.onOffset(first)
  1357. adjust_last = offset.onOffset(last)
  1358. first, last = _get_timestamp_range_edges(first, last, offset,
  1359. closed=closed, base=base)
  1360. first = (first + adjust_first * offset).to_period(offset)
  1361. last = (last - adjust_last * offset).to_period(offset)
  1362. return first, last
  1363. def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
  1364. # First and last offsets should be calculated from the start day to fix an
  1365. # error cause by resampling across multiple days when a one day period is
  1366. # not a multiple of the frequency.
  1367. #
  1368. # See https://github.com/pandas-dev/pandas/issues/8683
  1369. # GH 10117 & GH 19375. If first and last contain timezone information,
  1370. # Perform the calculation in UTC in order to avoid localizing on an
  1371. # Ambiguous or Nonexistent time.
  1372. first_tzinfo = first.tzinfo
  1373. last_tzinfo = last.tzinfo
  1374. start_day_nanos = first.normalize().value
  1375. if first_tzinfo is not None:
  1376. first = first.tz_convert('UTC')
  1377. if last_tzinfo is not None:
  1378. last = last.tz_convert('UTC')
  1379. base_nanos = (base % offset.n) * offset.nanos // offset.n
  1380. start_day_nanos += base_nanos
  1381. foffset = (first.value - start_day_nanos) % offset.nanos
  1382. loffset = (last.value - start_day_nanos) % offset.nanos
  1383. if closed == 'right':
  1384. if foffset > 0:
  1385. # roll back
  1386. fresult = first.value - foffset
  1387. else:
  1388. fresult = first.value - offset.nanos
  1389. if loffset > 0:
  1390. # roll forward
  1391. lresult = last.value + (offset.nanos - loffset)
  1392. else:
  1393. # already the end of the road
  1394. lresult = last.value
  1395. else: # closed == 'left'
  1396. if foffset > 0:
  1397. fresult = first.value - foffset
  1398. else:
  1399. # start of the road
  1400. fresult = first.value
  1401. if loffset > 0:
  1402. # roll forward
  1403. lresult = last.value + (offset.nanos - loffset)
  1404. else:
  1405. lresult = last.value + offset.nanos
  1406. fresult = Timestamp(fresult)
  1407. lresult = Timestamp(lresult)
  1408. if first_tzinfo is not None:
  1409. fresult = fresult.tz_localize('UTC').tz_convert(first_tzinfo)
  1410. if last_tzinfo is not None:
  1411. lresult = lresult.tz_localize('UTC').tz_convert(last_tzinfo)
  1412. return fresult, lresult
  1413. def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None):
  1414. """
  1415. Utility frequency conversion method for Series/DataFrame.
  1416. """
  1417. if isinstance(obj.index, PeriodIndex):
  1418. if method is not None:
  1419. raise NotImplementedError("'method' argument is not supported")
  1420. if how is None:
  1421. how = 'E'
  1422. new_obj = obj.copy()
  1423. new_obj.index = obj.index.asfreq(freq, how=how)
  1424. elif len(obj.index) == 0:
  1425. new_obj = obj.copy()
  1426. new_obj.index = obj.index._shallow_copy(freq=to_offset(freq))
  1427. else:
  1428. dti = date_range(obj.index[0], obj.index[-1], freq=freq)
  1429. dti.name = obj.index.name
  1430. new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
  1431. if normalize:
  1432. new_obj.index = new_obj.index.normalize()
  1433. return new_obj