array.py
  1. """Test extension array for storing nested data in a pandas container.
  2. The JSONArray stores lists of dictionaries. The storage mechanism is a list,
  3. not an ndarray.
  4. Note:
  5. We currently store lists of UserDicts (Py3 only). Pandas has a few places
  6. internally that specifically check for dicts, and does non-scalar things
  7. in that case. We *want* the dictionaries to be treated as scalars, so we
  8. hack around pandas by using UserDicts.
  9. """
import collections
import itertools
import numbers
import random
import string
import sys

import numpy as np

from pandas.core.dtypes.base import ExtensionDtype
from pandas import compat
from pandas.core.arrays import ExtensionArray


class JSONDtype(ExtensionDtype):
    type = compat.Mapping
    name = 'json'

    try:
        na_value = collections.UserDict()
    except AttributeError:
        # source compatibility with Py2.
        na_value = {}

    @classmethod
    def construct_array_type(cls):
        """Return the array type associated with this dtype

        Returns
        -------
        type
        """
        return JSONArray

    @classmethod
    def construct_from_string(cls, string):
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))


class JSONArray(ExtensionArray):
    dtype = JSONDtype()
    __array_priority__ = 1000

    def __init__(self, values, dtype=None, copy=False):
        for val in values:
            if not isinstance(val, self.dtype.type):
                raise TypeError("All values must be of type " +
                                str(self.dtype.type))
        self.data = values

        # Some aliases for common attribute names to ensure pandas supports
        # these
        self._items = self._data = self.data
        # those aliases are currently not working due to assumptions
        # in internal code (GH-20735)
        # self._values = self.values = self.data
    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        return cls(scalars)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls([collections.UserDict(x) for x in values if x != ()])
    def __getitem__(self, item):
        if isinstance(item, numbers.Integral):
            return self.data[item]
        elif isinstance(item, np.ndarray) and item.dtype == 'bool':
            return self._from_sequence([x for x, m in zip(self, item) if m])
        elif isinstance(item, compat.Iterable):
            # fancy indexing
            return type(self)([self.data[i] for i in item])
        else:
            # slice
            return type(self)(self.data[item])
    def __setitem__(self, key, value):
        if isinstance(key, numbers.Integral):
            self.data[key] = value
        else:
            if not isinstance(value, (type(self), compat.Sequence)):
                # broadcast value
                value = itertools.cycle([value])

            if isinstance(key, np.ndarray) and key.dtype == 'bool':
                # masking
                for i, (k, v) in enumerate(zip(key, value)):
                    if k:
                        assert isinstance(v, self.dtype.type)
                        self.data[i] = v
            else:
                for k, v in zip(key, value):
                    assert isinstance(v, self.dtype.type)
                    self.data[k] = v
    def __len__(self):
        return len(self.data)

    @property
    def nbytes(self):
        return sys.getsizeof(self.data)

    def isna(self):
        return np.array([x == self.dtype.na_value for x in self.data],
                        dtype=bool)
    def take(self, indexer, allow_fill=False, fill_value=None):
        # Re-implement here, since NumPy has trouble setting
        # sized objects like UserDicts into scalar slots of
        # an ndarray.
        indexer = np.asarray(indexer)
        msg = ("Index is out of bounds or cannot do a "
               "non-empty take from an empty array.")

        if allow_fill:
            if fill_value is None:
                fill_value = self.dtype.na_value
            # bounds check
            if (indexer < -1).any():
                raise ValueError
            try:
                output = [self.data[loc] if loc != -1 else fill_value
                          for loc in indexer]
            except IndexError:
                raise IndexError(msg)
        else:
            try:
                output = [self.data[loc] for loc in indexer]
            except IndexError:
                raise IndexError(msg)

        return self._from_sequence(output)
    def copy(self, deep=False):
        return type(self)(self.data[:])

    def astype(self, dtype, copy=True):
        # NumPy has issues when all the dicts are the same length.
        # np.array([UserDict(...), UserDict(...)]) fails,
        # but np.array([{...}, {...}]) works, so cast.

        # needed to add this check for the Series constructor
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
    def unique(self):
        # Parent method doesn't work since np.array will try to infer
        # a 2-dim object.
        return type(self)([
            dict(x) for x in list({tuple(d.items()) for d in self.data})
        ])
    @classmethod
    def _concat_same_type(cls, to_concat):
        data = list(itertools.chain.from_iterable(
            [x.data for x in to_concat]))
        return cls(data)
    def _values_for_factorize(self):
        frozen = self._values_for_argsort()
        if len(frozen) == 0:
            # _factorize_array expects 1-d array, this is a len-0 2-d array.
            frozen = frozen.ravel()
        return frozen, ()
    def _values_for_argsort(self):
        # Disable NumPy's shape inference by including an empty tuple...
        # If all the elements of self are the same size P, NumPy will
        # cast them to an (N, P) array, instead of an (N,) array of tuples.
        frozen = [()] + [tuple(x.items()) for x in self]
        return np.array(frozen, dtype=object)[1:]
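
# Note on _values_for_argsort/_values_for_factorize above: NumPy's shape
# inference turns a list of equal-length tuples into a 2-D array
# (np.array([(1, 2), (3, 4)]) has shape (2, 2)); prepending an empty tuple
# makes the list ragged, so a 1-D object array of tuples is built instead,
# and the leading placeholder is then sliced back off.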


def make_data():
    # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
    return [collections.UserDict([
        (random.choice(string.ascii_letters), random.randint(0, 100))
        for _ in range(random.randint(0, 10))]) for _ in range(100)]
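

if __name__ == "__main__":
    # Minimal usage sketch: construct a JSONArray from random nested records
    # and exercise a few of the methods defined above. This assumes an older
    # pandas that still exposes ``pandas.compat.Mapping`` (the version this
    # module targets); it is illustrative, not part of the test suite.
    import pandas as pd

    arr = JSONArray(make_data())
    print(len(arr), arr.dtype.name)                 # e.g. "100 json"
    print(arr.isna().sum())                         # count of empty records
    print(arr.take([0, -1], allow_fill=True).data)  # -1 filled with na_value
    print(pd.Series(arr).head())                    # wrap in a pandas Series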