- """
- Functions for preparing various inputs passed to the DataFrame or Series
- constructors before passing them to a BlockManager.
- """
from collections import OrderedDict

import numpy as np
import numpy.ma as ma

from pandas._libs import lib
from pandas._libs.tslibs import IncompatibleFrequency
import pandas.compat as compat
from pandas.compat import (
    get_range_parameters, lmap, lrange, raise_with_traceback, range)

from pandas.core.dtypes.cast import (
    construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
    construct_1d_object_array_from_listlike, infer_dtype_from_scalar,
    maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable,
    maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast)
from pandas.core.dtypes.common import (
    is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
    is_extension_array_dtype, is_extension_type, is_float_dtype,
    is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
from pandas.core.dtypes.generic import (
    ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPandasArray,
    ABCPeriodIndex, ABCSeries, ABCTimedeltaIndex)
from pandas.core.dtypes.missing import isna

from pandas.core import algorithms, common as com
from pandas.core.arrays import Categorical, ExtensionArray, period_array
from pandas.core.index import (
    Index, _get_objs_combined_axis, _union_indexes, ensure_index)
from pandas.core.indexes import base as ibase
from pandas.core.internals import (
    create_block_manager_from_arrays, create_block_manager_from_blocks)
from pandas.core.internals.arrays import extract_array

# ---------------------------------------------------------------------
# BlockManager Interface

def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.
    """
    # figure out the index, if necessary
    if index is None:
        index = extract_index(arrays)
    else:
        index = ensure_index(index)

    # don't force copy because getting jammed in an ndarray anyway
    arrays = _homogenize(arrays, index, dtype)

    # from BlockManager perspective
    axes = [ensure_index(columns), index]

    return create_block_manager_from_arrays(arrays, arr_names, axes)
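
# Illustrative sketch (not part of the original module): two equal-length
# arrays plus matching labels yield a manager whose axes are
# [columns, index]. This is a private API, so details can differ across
# pandas versions.
#
#   >>> import numpy as np
#   >>> mgr = arrays_to_mgr([np.array([1, 2]), np.array([3., 4.])],
#   ...                     ['a', 'b'], index=None, columns=['a', 'b'])
#   >>> mgr.shape  # (n_columns, n_rows), the BlockManager's view
#   (2, 2)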

def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr

# ---------------------------------------------------------------------
# DataFrame Constructor Interface

def init_ndarray(values, index, columns, dtype=None, copy=False):
    # input must be a ndarray, list, Series, index

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = [values.name]
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # we could have a categorical type passed or coerced to 'category'
    # recast this to an arrays_to_mgr
    if (is_categorical_dtype(getattr(values, 'dtype', None)) or
            is_categorical_dtype(dtype)):

        if not hasattr(values, 'dtype'):
            values = prep_ndarray(values, copy=copy)
            values = values.ravel()
        elif copy:
            values = values.copy()

        index, columns = _get_axes(len(values), 1, index, columns)
        return arrays_to_mgr([values], columns, index, columns,
                             dtype=dtype)
    elif (is_datetime64tz_dtype(values) or
          is_extension_array_dtype(values)):
        # GH#19157
        if columns is None:
            columns = [0]
        return arrays_to_mgr([values], columns, index, columns,
                             dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = prep_ndarray(values, copy=copy)

    if dtype is not None:
        if not is_dtype_equal(values.dtype, dtype):
            try:
                values = values.astype(dtype)
            except Exception as orig:
                e = ValueError("failed to cast to '{dtype}' (Exception "
                               "was: {orig})".format(dtype=dtype,
                                                     orig=orig))
                raise_with_traceback(e)

    index, columns = _get_axes(*values.shape, index=index, columns=columns)
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values):
        values = maybe_infer_to_datetimelike(values)

    return create_block_manager_from_blocks([values], [columns, index])
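
# Illustrative sketch (not part of the original module; private API, so
# output details may vary by version): a 2-D ndarray is transposed so each
# input column becomes a block row, and missing axes fall back to default
# RangeIndex labels.
#
#   >>> import numpy as np
#   >>> mgr = init_ndarray(np.arange(6).reshape(3, 2),
#   ...                    index=None, columns=None)
#   >>> [len(ax) for ax in mgr.axes]  # [n_columns, n_rows]
#   [2, 3]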

def init_dict(data, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.
    """
    if columns is not None:
        from pandas.core.series import Series
        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isnull()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            if dtype is None or np.issubdtype(dtype, np.flexible):
                # GH#1783
                nan_dtype = object
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                     nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

    else:
        for key in data:
            if (isinstance(data[key], ABCDatetimeIndex) and
                    data[key].tz is not None):
                # GH#24096 need copy to be deep for datetime64tz case
                # TODO: See if we can avoid these copies
                data[key] = data[key].copy(deep=True)

        keys = com.dict_keys_to_ordered_list(data)
        columns = data_names = Index(keys)
        arrays = [data[k] for k in keys]

    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
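
# Illustrative sketch (not part of the original module): without explicit
# columns, the dict's key order supplies the column labels (insertion order
# on Python >= 3.6, sorted otherwise). Private API; hedged accordingly.
#
#   >>> mgr = init_dict({'a': [1, 2], 'b': [3., 4.]},
#   ...                 index=None, columns=None)
#   >>> list(mgr.axes[0])
#   ['a', 'b']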

# ---------------------------------------------------------------------

def prep_ndarray(values, copy=True):
    if not isinstance(values, (np.ndarray, ABCSeries, Index)):
        if len(values) == 0:
            return np.empty((0, 0), dtype=object)

        def convert(v):
            return maybe_convert_platform(v)

        # we could have a 1-dim or 2-dim list here
        # this is equiv of np.asarray, but does object conversion
        # and platform dtype preservation
        try:
            if is_list_like(values[0]) or hasattr(values[0], 'len'):
                values = np.array([convert(v) for v in values])
            elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
                # GH#21861
                values = np.array([convert(v) for v in values])
            else:
                values = convert(values)
        except (ValueError, TypeError):
            values = convert(values)

    else:

        # drop subclass info, do not copy data
        values = np.asarray(values)
        if copy:
            values = values.copy()

    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError('Must pass 2-d input')

    return values
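
# Illustrative sketch (not part of the original module): 1-D inputs are
# promoted to a single-column 2-D array, and anything that is not 1-D or
# 2-D is rejected.
#
#   >>> prep_ndarray([1, 2, 3]).shape
#   (3, 1)
#   >>> prep_ndarray(np.zeros((2, 2, 2)))
#   Traceback (most recent call last):
#       ...
#   ValueError: Must pass 2-d input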

def _homogenize(data, index, dtype=None):
    oindex = None
    homogenized = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
        else:
            if isinstance(val, dict):
                if oindex is None:
                    oindex = index.astype('O')

                if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)):
                    val = com.dict_compat(val)
                else:
                    val = dict(val)
                val = lib.fast_multiget(val, oindex.values, default=np.nan)
            val = sanitize_array(val, index, dtype=dtype, copy=False,
                                 raise_cast_failure=False)

        homogenized.append(val)

    return homogenized
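
# Illustrative sketch (not part of the original module): each Series is
# aligned to the target index, with missing labels becoming NaN, so every
# returned column has the same length.
#
#   >>> import pandas as pd
#   >>> [col] = _homogenize([pd.Series([1., 2.], index=['a', 'b'])],
#   ...                     ensure_index(['b', 'c']))
#   >>> list(col)
#   [2.0, nan]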

def extract_index(data):
    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, 'ndim', 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))

        if not indexes and not raw_lengths:
            raise ValueError('If using all scalar values, you must pass'
                             ' an index')

        if have_series or have_dicts:
            index = _union_indexes(indexes)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError('arrays must all be same length')

            if have_dicts:
                raise ValueError('Mixing dicts with non-Series may lead to '
                                 'ambiguous ordering.')

            if have_series:
                if lengths[0] != len(index):
                    msg = ('array length {length} does not match index '
                           'length {idx_len}'
                           .format(length=lengths[0], idx_len=len(index)))
                    raise ValueError(msg)
            else:
                index = ibase.default_index(lengths[0])

    return ensure_index(index)
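
# Illustrative sketch (not part of the original module): raw arrays must
# agree on length, Series contribute their indexes, and bare scalars are
# rejected outright.
#
#   >>> extract_index([[1, 2, 3], [4, 5, 6]])
#   RangeIndex(start=0, stop=3, step=1)
#   >>> extract_index([1, 2])
#   Traceback (most recent call last):
#       ...
#   ValueError: If using all scalar values, you must pass an index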

def reorder_arrays(arrays, arr_columns, columns):
    # reorder according to the columns
    if (columns is not None and len(columns) and arr_columns is not None and
            len(arr_columns)):
        indexer = ensure_index(arr_columns).get_indexer(columns)
        arr_columns = ensure_index([arr_columns[i] for i in indexer])
        arrays = [arrays[i] for i in indexer]
    return arrays, arr_columns

def get_names_from_index(data):
    has_some_name = any(getattr(s, 'name', None) is not None for s in data)
    if not has_some_name:
        return ibase.default_index(len(data))

    index = lrange(len(data))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, 'name', None)
        if n is not None:
            index[i] = n
        else:
            index[i] = 'Unnamed {count}'.format(count=count)
            count += 1

    return index
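
# Illustrative sketch (not part of the original module): once any input
# carries a name, unnamed entries get 'Unnamed {n}' placeholders instead of
# falling back to a default integer index.
#
#   >>> import pandas as pd
#   >>> get_names_from_index([pd.Series([1], name='x'), pd.Series([2])])
#   ['x', 'Unnamed 0']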

def _get_axes(N, K, index, columns):
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = ibase.default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = ibase.default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns
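
# Illustrative sketch (not part of the original module): missing axes become
# default RangeIndexes sized to the data; supplied axes just pass through
# ensure_index.
#
#   >>> _get_axes(3, 2, index=None, columns=['a', 'b'])
#   (RangeIndex(start=0, stop=3, step=1), Index(['a', 'b'], dtype='object'))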

# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays

def to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            arrays = [data._ixs(i, axis=1).values
                      for i, col in enumerate(data.columns) if col in columns]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []

    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
    elif isinstance(data[0], compat.Mapping):
        return _list_of_dict_to_arrays(data, columns,
                                       coerce_float=coerce_float, dtype=dtype)
    elif isinstance(data[0], ABCSeries):
        return _list_of_series_to_arrays(data, columns,
                                         coerce_float=coerce_float,
                                         dtype=dtype)
    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns
    elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and
          data.dtype.names is not None):
        columns = list(data.dtype.names)
        arrays = [data[k] for k in columns]
        return arrays, columns
    else:
        # last ditch effort
        data = lmap(tuple, data)
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
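
# Illustrative sketch (not part of the original module): a list of records
# (tuples here) is split into one array per column, with default integer
# column labels; per-column dtypes are then soft-inferred.
#
#   >>> arrays, cols = to_arrays([(1, 'a'), (2, 'b')], columns=None)
#   >>> [list(a) for a in arrays], list(cols)
#   ([[1, 2], ['a', 'b']], [0, 1])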

def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
    if len(data) > 0 and isinstance(data[0], tuple):
        content = list(lib.to_object_array_tuples(data).T)
    else:
        # list of lists
        content = list(lib.to_object_array(data).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)

def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
    if columns is None:
        columns = _get_objs_combined_axis(data, sort=False)

    indexer_cache = {}

    aligned_values = []
    for s in data:
        index = getattr(s, 'index', None)
        if index is None:
            index = ibase.default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = com.values_from_object(s)
        aligned_values.append(algorithms.take_1d(values, indexer))

    values = np.vstack(aligned_values)

    if values.dtype == np.object_:
        content = list(values.T)
        return _convert_object_array(content, columns, dtype=dtype,
                                     coerce_float=coerce_float)
    else:
        return values.T, columns

def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, OrderedDict) for d in data)
        columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)

    # assure that they are of the base dict class and not of derived
    # classes
    data = [d if type(d) is dict else dict(d) for d in data]

    content = list(lib.dicts_to_array(data, list(columns)).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)

def _convert_object_array(content, columns, coerce_float=False, dtype=None):
    if columns is None:
        columns = ibase.default_index(len(content))
    else:
        if len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError('{col:d} columns passed, passed data had '
                                 '{con} columns'.format(col=len(columns),
                                                        con=len(content)))

    # provide soft conversion of object dtypes
    def convert(arr):
        if dtype != object and dtype != np.object:
            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
            arr = maybe_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays, columns
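
# Illustrative sketch (not part of the original module): object columns are
# soft-converted, so integers stored as objects come back as an integer
# dtype (int64 on most platforms) while strings stay object.
#
#   >>> arrs, cols = _convert_object_array(
#   ...     [np.array([1, 2], dtype=object)], columns=['x'])
#   >>> arrs[0].dtype
#   dtype('int64')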

# ---------------------------------------------------------------------
# Series-Based

def sanitize_index(data, index, copy=False):
    """
    Sanitize an index type to return an ndarray of the underlying, pass
    through a non-Index.
    """
    if index is None:
        return data

    if len(data) != len(index):
        raise ValueError('Length of values does not match length of index')

    if isinstance(data, ABCIndexClass) and not copy:
        pass
    elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)):
        data = data._values
        if copy:
            data = data.copy()

    elif isinstance(data, np.ndarray):

        # coerce datetimelike types
        if data.dtype.kind in ['M', 'm']:
            data = sanitize_array(data, index, copy=copy)

    return data
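
# Illustrative sketch (not part of the original module): the length check
# fires before any coercion, which is the error users hit when Series data
# and index disagree in size.
#
#   >>> sanitize_index([1, 2, 3], ibase.default_index(2))
#   Traceback (most recent call last):
#       ...
#   ValueError: Length of values does not match length of index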

def sanitize_array(data, index, dtype=None, copy=False,
                   raise_cast_failure=False):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                try:
                    subarr = _try_cast(data, True, dtype, copy,
                                       True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
            else:
                subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)
        else:

            # we will try to copy by-definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
        else:
            subarr = data

        # everything else in this block must also handle ndarray's,
        # because we've unwrapped PandasArray into an ndarray.

        if dtype is not None:
            subarr = subarr.astype(dtype)

        if copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            try:
                subarr = _try_cast(data, False, dtype, copy,
                                   raise_cast_failure)
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                    raise
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)

        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        start, stop, step = get_range_parameters(data)
        arr = np.arange(start, stop, step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception('Data must be 1-dimensional')
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, compat.string_types):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if is_object_dtype(subarr.dtype) and dtype != 'object':
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
            try:
                subarr = period_array(subarr)
            except IncompatibleFrequency:
                pass

    return subarr
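
# Illustrative sketches (not part of the original module; exact dtypes are
# platform-dependent): a scalar is broadcast against the index, while list
# input is converted to a platform-appropriate ndarray.
#
#   >>> list(sanitize_array(5, ibase.default_index(3)))
#   [5, 5, 5]
#   >>> sanitize_array([1., 2., 3.], index=None).dtype
#   dtype('float64')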

def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure):

    # perf shortcut as this is the most common case
    if take_fast_path:
        if maybe_castable(arr) and not copy and dtype is None:
            return arr

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # result is intentionally discarded below; the call is only
            # for its validation (it raises on lossy integer conversion)
            subarr = maybe_cast_to_integer_array(arr, dtype)

        subarr = maybe_cast_to_datetime(arr, dtype)
        # Take care in creating object arrays (but iterators are not
        # supported):
        if is_object_dtype(dtype) and (is_list_like(subarr) and
                                       not (is_iterator(subarr) or
                                            isinstance(subarr, np.ndarray))):
            subarr = construct_1d_object_array_from_listlike(subarr)
        elif not is_extension_type(subarr):
            subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
                                                        copy=copy)
    except (ValueError, TypeError):
        if is_categorical_dtype(dtype):
            # We *do* allow casting to categorical, since we know
            # that Categorical is the only array type for 'category'.
            subarr = Categorical(arr, dtype.categories,
                                 ordered=dtype.ordered)
        elif is_extension_array_dtype(dtype):
            # create an extension array from its dtype
            array_type = dtype.construct_array_type()._from_sequence
            subarr = array_type(arr, dtype=dtype, copy=copy)
        elif dtype is not None and raise_cast_failure:
            raise
        else:
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr
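
# Illustrative sketch (not part of the original module): a successful cast
# lands in the requested dtype, while a failed cast with
# raise_cast_failure=False degrades to object dtype.
#
#   >>> _try_cast([1, 2], False, np.dtype('float64'), False, False).dtype
#   dtype('float64')
#   >>> _try_cast(['a'], False, np.dtype('int64'), False, False).dtype
#   dtype('O')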