123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748 |
- """
- Routines for filling missing data
- """
- from distutils.version import LooseVersion
- import operator
- import numpy as np
- from pandas._libs import algos, lib
- from pandas.compat import range, string_types
- from pandas.core.dtypes.cast import infer_dtype_from_array
- from pandas.core.dtypes.common import (
- ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype,
- is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar,
- is_timedelta64_dtype, needs_i8_conversion)
- from pandas.core.dtypes.missing import isna
- def mask_missing(arr, values_to_mask):
- """
- Return a masking array of same size/shape as arr
- with entries equaling any member of values_to_mask set to True
- """
- dtype, values_to_mask = infer_dtype_from_array(values_to_mask)
- try:
- values_to_mask = np.array(values_to_mask, dtype=dtype)
- except Exception:
- values_to_mask = np.array(values_to_mask, dtype=object)
- na_mask = isna(values_to_mask)
- nonna = values_to_mask[~na_mask]
- mask = None
- for x in nonna:
- if mask is None:
- # numpy elementwise comparison warning
- if is_numeric_v_string_like(arr, x):
- mask = False
- else:
- mask = arr == x
- # if x is a string and arr is not, then we get False and we must
- # expand the mask to size arr.shape
- if is_scalar(mask):
- mask = np.zeros(arr.shape, dtype=bool)
- else:
- # numpy elementwise comparison warning
- if is_numeric_v_string_like(arr, x):
- mask |= False
- else:
- mask |= arr == x
- if na_mask.any():
- if mask is None:
- mask = isna(arr)
- else:
- mask |= isna(arr)
- # GH 21977
- if mask is None:
- mask = np.zeros(arr.shape, dtype=bool)
- return mask
- def clean_fill_method(method, allow_nearest=False):
- # asfreq is compat for resampling
- if method in [None, 'asfreq']:
- return None
- if isinstance(method, string_types):
- method = method.lower()
- if method == 'ffill':
- method = 'pad'
- elif method == 'bfill':
- method = 'backfill'
- valid_methods = ['pad', 'backfill']
- expecting = 'pad (ffill) or backfill (bfill)'
- if allow_nearest:
- valid_methods.append('nearest')
- expecting = 'pad (ffill), backfill (bfill) or nearest'
- if method not in valid_methods:
- msg = ('Invalid fill method. Expecting {expecting}. Got {method}'
- .format(expecting=expecting, method=method))
- raise ValueError(msg)
- return method
- def clean_interp_method(method, **kwargs):
- order = kwargs.get('order')
- valid = ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear',
- 'quadratic', 'cubic', 'barycentric', 'polynomial', 'krogh',
- 'piecewise_polynomial', 'pchip', 'akima', 'spline',
- 'from_derivatives']
- if method in ('spline', 'polynomial') and order is None:
- raise ValueError("You must specify the order of the spline or "
- "polynomial.")
- if method not in valid:
- raise ValueError("method must be one of {valid}. Got '{method}' "
- "instead.".format(valid=valid, method=method))
- return method
- def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
- limit_direction='forward', limit_area=None, fill_value=None,
- bounds_error=False, order=None, **kwargs):
- """
- Logic for the 1-d interpolation. The result should be 1-d, inputs
- xvalues and yvalues will each be 1-d arrays of the same length.
- Bounds_error is currently hardcoded to False since non-scipy ones don't
- take it as an argumnet.
- """
- # Treat the original, non-scipy methods first.
- invalid = isna(yvalues)
- valid = ~invalid
- if not valid.any():
- # have to call np.asarray(xvalues) since xvalues could be an Index
- # which can't be mutated
- result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
- result.fill(np.nan)
- return result
- if valid.all():
- return yvalues
- if method == 'time':
- if not getattr(xvalues, 'is_all_dates', None):
- # if not issubclass(xvalues.dtype.type, np.datetime64):
- raise ValueError('time-weighted interpolation only works '
- 'on Series or DataFrames with a '
- 'DatetimeIndex')
- method = 'values'
- valid_limit_directions = ['forward', 'backward', 'both']
- limit_direction = limit_direction.lower()
- if limit_direction not in valid_limit_directions:
- msg = ('Invalid limit_direction: expecting one of {valid!r}, '
- 'got {invalid!r}.')
- raise ValueError(msg.format(valid=valid_limit_directions,
- invalid=limit_direction))
- if limit_area is not None:
- valid_limit_areas = ['inside', 'outside']
- limit_area = limit_area.lower()
- if limit_area not in valid_limit_areas:
- raise ValueError('Invalid limit_area: expecting one of {}, got '
- '{}.'.format(valid_limit_areas, limit_area))
- # default limit is unlimited GH #16282
- if limit is None:
- # limit = len(xvalues)
- pass
- elif not is_integer(limit):
- raise ValueError('Limit must be an integer')
- elif limit < 1:
- raise ValueError('Limit must be greater than 0')
- from pandas import Series
- ys = Series(yvalues)
- # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
- all_nans = set(np.flatnonzero(invalid))
- start_nans = set(range(ys.first_valid_index()))
- end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
- mid_nans = all_nans - start_nans - end_nans
- # Like the sets above, preserve_nans contains indices of invalid values,
- # but in this case, it is the final set of indices that need to be
- # preserved as NaN after the interpolation.
- # For example if limit_direction='forward' then preserve_nans will
- # contain indices of NaNs at the beginning of the series, and NaNs that
- # are more than'limit' away from the prior non-NaN.
- # set preserve_nans based on direction using _interp_limit
- if limit_direction == 'forward':
- preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
- elif limit_direction == 'backward':
- preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
- else:
- # both directions... just use _interp_limit
- preserve_nans = set(_interp_limit(invalid, limit, limit))
- # if limit_area is set, add either mid or outside indices
- # to preserve_nans GH #16284
- if limit_area == 'inside':
- # preserve NaNs on the outside
- preserve_nans |= start_nans | end_nans
- elif limit_area == 'outside':
- # preserve NaNs on the inside
- preserve_nans |= mid_nans
- # sort preserve_nans and covert to list
- preserve_nans = sorted(preserve_nans)
- xvalues = getattr(xvalues, 'values', xvalues)
- yvalues = getattr(yvalues, 'values', yvalues)
- result = yvalues.copy()
- if method in ['linear', 'time', 'index', 'values']:
- if method in ('values', 'index'):
- inds = np.asarray(xvalues)
- # hack for DatetimeIndex, #1646
- if needs_i8_conversion(inds.dtype.type):
- inds = inds.view(np.int64)
- if inds.dtype == np.object_:
- inds = lib.maybe_convert_objects(inds)
- else:
- inds = xvalues
- result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
- result[preserve_nans] = np.nan
- return result
- sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
- 'barycentric', 'krogh', 'spline', 'polynomial',
- 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']
- if method in sp_methods:
- inds = np.asarray(xvalues)
- # hack for DatetimeIndex, #1646
- if issubclass(inds.dtype.type, np.datetime64):
- inds = inds.view(np.int64)
- result[invalid] = _interpolate_scipy_wrapper(inds[valid],
- yvalues[valid],
- inds[invalid],
- method=method,
- fill_value=fill_value,
- bounds_error=bounds_error,
- order=order, **kwargs)
- result[preserve_nans] = np.nan
- return result
- def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None,
- bounds_error=False, order=None, **kwargs):
- """
- passed off to scipy.interpolate.interp1d. method is scipy's kind.
- Returns an array interpolated at new_x. Add any new methods to
- the list in _clean_interp_method
- """
- try:
- from scipy import interpolate
- # TODO: Why is DatetimeIndex being imported here?
- from pandas import DatetimeIndex # noqa
- except ImportError:
- raise ImportError('{method} interpolation requires SciPy'
- .format(method=method))
- new_x = np.asarray(new_x)
- # ignores some kwargs that could be passed along.
- alt_methods = {
- 'barycentric': interpolate.barycentric_interpolate,
- 'krogh': interpolate.krogh_interpolate,
- 'from_derivatives': _from_derivatives,
- 'piecewise_polynomial': _from_derivatives,
- }
- if getattr(x, 'is_all_dates', False):
- # GH 5975, scipy.interp1d can't hande datetime64s
- x, new_x = x._values.astype('i8'), new_x.astype('i8')
- if method == 'pchip':
- try:
- alt_methods['pchip'] = interpolate.pchip_interpolate
- except AttributeError:
- raise ImportError("Your version of Scipy does not support "
- "PCHIP interpolation.")
- elif method == 'akima':
- try:
- from scipy.interpolate import Akima1DInterpolator # noqa
- alt_methods['akima'] = _akima_interpolate
- except ImportError:
- raise ImportError("Your version of Scipy does not support "
- "Akima interpolation.")
- interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
- 'polynomial']
- if method in interp1d_methods:
- if method == 'polynomial':
- method = order
- terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value,
- bounds_error=bounds_error)
- new_y = terp(new_x)
- elif method == 'spline':
- # GH #10633
- if not order:
- raise ValueError("order needs to be specified and greater than 0")
- terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
- new_y = terp(new_x)
- else:
- # GH 7295: need to be able to write for some reason
- # in some circumstances: check all three
- if not x.flags.writeable:
- x = x.copy()
- if not y.flags.writeable:
- y = y.copy()
- if not new_x.flags.writeable:
- new_x = new_x.copy()
- method = alt_methods[method]
- new_y = method(x, y, new_x, **kwargs)
- return new_y
- def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
- """
- Convenience function for interpolate.BPoly.from_derivatives
- Construct a piecewise polynomial in the Bernstein basis, compatible
- with the specified values and derivatives at breakpoints.
- Parameters
- ----------
- xi : array_like
- sorted 1D array of x-coordinates
- yi : array_like or list of array-likes
- yi[i][j] is the j-th derivative known at xi[i]
- orders : None or int or array_like of ints. Default: None.
- Specifies the degree of local polynomials. If not None, some
- derivatives are ignored.
- der : int or list
- How many derivatives to extract; None for all potentially nonzero
- derivatives (that is a number equal to the number of points), or a
- list of derivatives to extract. This numberincludes the function
- value as 0th derivative.
- extrapolate : bool, optional
- Whether to extrapolate to ouf-of-bounds points based on first and last
- intervals, or to return NaNs. Default: True.
- See Also
- --------
- scipy.interpolate.BPoly.from_derivatives
- Returns
- -------
- y : scalar or array_like
- The result, of length R or length M or M by R,
- """
- import scipy
- from scipy import interpolate
- if LooseVersion(scipy.__version__) < LooseVersion('0.18.0'):
- try:
- method = interpolate.piecewise_polynomial_interpolate
- return method(xi, yi.reshape(-1, 1), x,
- orders=order, der=der)
- except AttributeError:
- pass
- # return the method for compat with scipy version & backwards compat
- method = interpolate.BPoly.from_derivatives
- m = method(xi, yi.reshape(-1, 1),
- orders=order, extrapolate=extrapolate)
- return m(x)
- def _akima_interpolate(xi, yi, x, der=0, axis=0):
- """
- Convenience function for akima interpolation.
- xi and yi are arrays of values used to approximate some function f,
- with ``yi = f(xi)``.
- See `Akima1DInterpolator` for details.
- Parameters
- ----------
- xi : array_like
- A sorted list of x-coordinates, of length N.
- yi : array_like
- A 1-D array of real values. `yi`'s length along the interpolation
- axis must be equal to the length of `xi`. If N-D array, use axis
- parameter to select correct axis.
- x : scalar or array_like
- Of length M.
- der : int or list, optional
- How many derivatives to extract; None for all potentially
- nonzero derivatives (that is a number equal to the number
- of points), or a list of derivatives to extract. This number
- includes the function value as 0th derivative.
- axis : int, optional
- Axis in the yi array corresponding to the x-coordinate values.
- See Also
- --------
- scipy.interpolate.Akima1DInterpolator
- Returns
- -------
- y : scalar or array_like
- The result, of length R or length M or M by R,
- """
- from scipy import interpolate
- try:
- P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
- except TypeError:
- # Scipy earlier than 0.17.0 missing axis
- P = interpolate.Akima1DInterpolator(xi, yi)
- if der == 0:
- return P(x)
- elif interpolate._isscalar(der):
- return P(x, der=der)
- else:
- return [P(x, nu) for nu in der]
- def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
- dtype=None):
- """ perform an actual interpolation of values, values will be make 2-d if
- needed fills inplace, returns the result
- """
- transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
- # reshape a 1 dim if needed
- ndim = values.ndim
- if values.ndim == 1:
- if axis != 0: # pragma: no cover
- raise AssertionError("cannot interpolate on a ndim == 1 with "
- "axis != 0")
- values = values.reshape(tuple((1,) + values.shape))
- if fill_value is None:
- mask = None
- else: # todo create faster fill func without masking
- mask = mask_missing(transf(values), fill_value)
- method = clean_fill_method(method)
- if method == 'pad':
- values = transf(pad_2d(
- transf(values), limit=limit, mask=mask, dtype=dtype))
- else:
- values = transf(backfill_2d(
- transf(values), limit=limit, mask=mask, dtype=dtype))
- # reshape back
- if ndim == 1:
- values = values[0]
- return values
- def _cast_values_for_fillna(values, dtype):
- """
- Cast values to a dtype that algos.pad and algos.backfill can handle.
- """
- # TODO: for int-dtypes we make a copy, but for everything else this
- # alters the values in-place. Is this intentional?
- if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
- is_timedelta64_dtype(dtype)):
- values = values.view(np.int64)
- elif is_integer_dtype(values):
- # NB: this check needs to come after the datetime64 check above
- values = ensure_float64(values)
- return values
- def _fillna_prep(values, mask=None, dtype=None):
- # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d
- if dtype is None:
- dtype = values.dtype
- if mask is None:
- # This needs to occur before datetime/timedeltas are cast to int64
- mask = isna(values)
- values = _cast_values_for_fillna(values, dtype)
- mask = mask.view(np.uint8)
- return values, mask
- def pad_1d(values, limit=None, mask=None, dtype=None):
- values, mask = _fillna_prep(values, mask, dtype)
- algos.pad_inplace(values, mask, limit=limit)
- return values
- def backfill_1d(values, limit=None, mask=None, dtype=None):
- values, mask = _fillna_prep(values, mask, dtype)
- algos.backfill_inplace(values, mask, limit=limit)
- return values
- def pad_2d(values, limit=None, mask=None, dtype=None):
- values, mask = _fillna_prep(values, mask, dtype)
- if np.all(values.shape):
- algos.pad_2d_inplace(values, mask, limit=limit)
- else:
- # for test coverage
- pass
- return values
- def backfill_2d(values, limit=None, mask=None, dtype=None):
- values, mask = _fillna_prep(values, mask, dtype)
- if np.all(values.shape):
- algos.backfill_2d_inplace(values, mask, limit=limit)
- else:
- # for test coverage
- pass
- return values
- _fill_methods = {'pad': pad_1d, 'backfill': backfill_1d}
- def get_fill_func(method):
- method = clean_fill_method(method)
- return _fill_methods[method]
- def clean_reindex_fill_method(method):
- return clean_fill_method(method, allow_nearest=True)
- def fill_zeros(result, x, y, name, fill):
- """
- if this is a reversed op, then flip x,y
- if we have an integer value (or array in y)
- and we have 0's, fill them with the fill,
- return the result
- mask the nan's from x
- """
- if fill is None or is_float_dtype(result):
- return result
- if name.startswith(('r', '__r')):
- x, y = y, x
- is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type'))
- is_scalar_type = is_scalar(y)
- if not is_variable_type and not is_scalar_type:
- return result
- if is_scalar_type:
- y = np.array(y)
- if is_integer_dtype(y):
- if (y == 0).any():
- # GH 7325, mask and nans must be broadcastable (also: PR 9308)
- # Raveling and then reshaping makes np.putmask faster
- mask = ((y == 0) & ~np.isnan(result)).ravel()
- shape = result.shape
- result = result.astype('float64', copy=False).ravel()
- np.putmask(result, mask, fill)
- # if we have a fill of inf, then sign it correctly
- # (GH 6178 and PR 9308)
- if np.isinf(fill):
- signs = y if name.startswith(('r', '__r')) else x
- signs = np.sign(signs.astype('float', copy=False))
- negative_inf_mask = (signs.ravel() < 0) & mask
- np.putmask(result, negative_inf_mask, -fill)
- if "floordiv" in name: # (PR 9308)
- nan_mask = ((y == 0) & (x == 0)).ravel()
- np.putmask(result, nan_mask, np.nan)
- result = result.reshape(shape)
- return result
- def mask_zero_div_zero(x, y, result, copy=False):
- """
- Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes
- of the numerator or the denominator.
- Parameters
- ----------
- x : ndarray
- y : ndarray
- result : ndarray
- copy : bool (default False)
- Whether to always create a new array or try to fill in the existing
- array if possible.
- Returns
- -------
- filled_result : ndarray
- Examples
- --------
- >>> x = np.array([1, 0, -1], dtype=np.int64)
- >>> y = 0 # int 0; numpy behavior is different with float
- >>> result = x / y
- >>> result # raw numpy result does not fill division by zero
- array([0, 0, 0])
- >>> mask_zero_div_zero(x, y, result)
- array([ inf, nan, -inf])
- """
- if is_scalar(y):
- y = np.array(y)
- zmask = y == 0
- if zmask.any():
- shape = result.shape
- nan_mask = (zmask & (x == 0)).ravel()
- neginf_mask = (zmask & (x < 0)).ravel()
- posinf_mask = (zmask & (x > 0)).ravel()
- if nan_mask.any() or neginf_mask.any() or posinf_mask.any():
- # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN
- result = result.astype('float64', copy=copy).ravel()
- np.putmask(result, nan_mask, np.nan)
- np.putmask(result, posinf_mask, np.inf)
- np.putmask(result, neginf_mask, -np.inf)
- result = result.reshape(shape)
- return result
- def dispatch_missing(op, left, right, result):
- """
- Fill nulls caused by division by zero, casting to a diffferent dtype
- if necessary.
- Parameters
- ----------
- op : function (operator.add, operator.div, ...)
- left : object (Index for non-reversed ops)
- right : object (Index fof reversed ops)
- result : ndarray
- Returns
- -------
- result : ndarray
- """
- opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__')
- if op in [operator.truediv, operator.floordiv,
- getattr(operator, 'div', None)]:
- result = mask_zero_div_zero(left, right, result)
- elif op is operator.mod:
- result = fill_zeros(result, left, right, opstr, np.nan)
- elif op is divmod:
- res0 = mask_zero_div_zero(left, right, result[0])
- res1 = fill_zeros(result[1], left, right, opstr, np.nan)
- result = (res0, res1)
- return result
- def _interp_limit(invalid, fw_limit, bw_limit):
- """
- Get indexers of values that won't be filled
- because they exceed the limits.
- Parameters
- ----------
- invalid : boolean ndarray
- fw_limit : int or None
- forward limit to index
- bw_limit : int or None
- backward limit to index
- Returns
- -------
- set of indexers
- Notes
- -----
- This is equivalent to the more readable, but slower
- .. code-block:: python
- def _interp_limit(invalid, fw_limit, bw_limit):
- for x in np.where(invalid)[0]:
- if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
- yield x
- """
- # handle forward first; the backward direction is the same except
- # 1. operate on the reversed array
- # 2. subtract the returned indices from N - 1
- N = len(invalid)
- f_idx = set()
- b_idx = set()
- def inner(invalid, limit):
- limit = min(limit, N)
- windowed = _rolling_window(invalid, limit + 1).all(1)
- idx = (set(np.where(windowed)[0] + limit) |
- set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))
- return idx
- if fw_limit is not None:
- if fw_limit == 0:
- f_idx = set(np.where(invalid)[0])
- else:
- f_idx = inner(invalid, fw_limit)
- if bw_limit is not None:
- if bw_limit == 0:
- # then we don't even need to care about backwards
- # just use forwards
- return f_idx
- else:
- b_idx = list(inner(invalid[::-1], bw_limit))
- b_idx = set(N - 1 - np.asarray(b_idx))
- if fw_limit == 0:
- return b_idx
- return f_idx & b_idx
- def _rolling_window(a, window):
- """
- [True, True, False, True, False], 2 ->
- [
- [True, True],
- [True, False],
- [False, True],
- [True, False],
- ]
- """
- # https://stackoverflow.com/a/6811241
- shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
- strides = a.strides + (a.strides[-1],)
- return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
|