methods.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. import pandas.util.testing as tm
  5. from .base import BaseExtensionTests
  6. class BaseMethodsTests(BaseExtensionTests):
  7. """Various Series and DataFrame methods."""
  8. @pytest.mark.parametrize('dropna', [True, False])
  9. def test_value_counts(self, all_data, dropna):
  10. all_data = all_data[:10]
  11. if dropna:
  12. other = np.array(all_data[~all_data.isna()])
  13. else:
  14. other = all_data
  15. result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
  16. expected = pd.Series(other).value_counts(
  17. dropna=dropna).sort_index()
  18. self.assert_series_equal(result, expected)
  19. def test_count(self, data_missing):
  20. df = pd.DataFrame({"A": data_missing})
  21. result = df.count(axis='columns')
  22. expected = pd.Series([0, 1])
  23. self.assert_series_equal(result, expected)
  24. def test_apply_simple_series(self, data):
  25. result = pd.Series(data).apply(id)
  26. assert isinstance(result, pd.Series)
  27. def test_argsort(self, data_for_sorting):
  28. result = pd.Series(data_for_sorting).argsort()
  29. expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
  30. self.assert_series_equal(result, expected)
  31. def test_argsort_missing(self, data_missing_for_sorting):
  32. result = pd.Series(data_missing_for_sorting).argsort()
  33. expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
  34. self.assert_series_equal(result, expected)
  35. @pytest.mark.parametrize('ascending', [True, False])
  36. def test_sort_values(self, data_for_sorting, ascending):
  37. ser = pd.Series(data_for_sorting)
  38. result = ser.sort_values(ascending=ascending)
  39. expected = ser.iloc[[2, 0, 1]]
  40. if not ascending:
  41. expected = expected[::-1]
  42. self.assert_series_equal(result, expected)
  43. @pytest.mark.parametrize('ascending', [True, False])
  44. def test_sort_values_missing(self, data_missing_for_sorting, ascending):
  45. ser = pd.Series(data_missing_for_sorting)
  46. result = ser.sort_values(ascending=ascending)
  47. if ascending:
  48. expected = ser.iloc[[2, 0, 1]]
  49. else:
  50. expected = ser.iloc[[0, 2, 1]]
  51. self.assert_series_equal(result, expected)
  52. @pytest.mark.parametrize('ascending', [True, False])
  53. def test_sort_values_frame(self, data_for_sorting, ascending):
  54. df = pd.DataFrame({"A": [1, 2, 1],
  55. "B": data_for_sorting})
  56. result = df.sort_values(['A', 'B'])
  57. expected = pd.DataFrame({"A": [1, 1, 2],
  58. 'B': data_for_sorting.take([2, 0, 1])},
  59. index=[2, 0, 1])
  60. self.assert_frame_equal(result, expected)
  61. @pytest.mark.parametrize('box', [pd.Series, lambda x: x])
  62. @pytest.mark.parametrize('method', [lambda x: x.unique(), pd.unique])
  63. def test_unique(self, data, box, method):
  64. duplicated = box(data._from_sequence([data[0], data[0]]))
  65. result = method(duplicated)
  66. assert len(result) == 1
  67. assert isinstance(result, type(data))
  68. assert result[0] == duplicated[0]
  69. @pytest.mark.parametrize('na_sentinel', [-1, -2])
  70. def test_factorize(self, data_for_grouping, na_sentinel):
  71. labels, uniques = pd.factorize(data_for_grouping,
  72. na_sentinel=na_sentinel)
  73. expected_labels = np.array([0, 0, na_sentinel,
  74. na_sentinel, 1, 1, 0, 2],
  75. dtype=np.intp)
  76. expected_uniques = data_for_grouping.take([0, 4, 7])
  77. tm.assert_numpy_array_equal(labels, expected_labels)
  78. self.assert_extension_array_equal(uniques, expected_uniques)
  79. @pytest.mark.parametrize('na_sentinel', [-1, -2])
  80. def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
  81. l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
  82. l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
  83. tm.assert_numpy_array_equal(l1, l2)
  84. self.assert_extension_array_equal(u1, u2)
  85. def test_factorize_empty(self, data):
  86. labels, uniques = pd.factorize(data[:0])
  87. expected_labels = np.array([], dtype=np.intp)
  88. expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
  89. tm.assert_numpy_array_equal(labels, expected_labels)
  90. self.assert_extension_array_equal(uniques, expected_uniques)
  91. def test_fillna_copy_frame(self, data_missing):
  92. arr = data_missing.take([1, 1])
  93. df = pd.DataFrame({"A": arr})
  94. filled_val = df.iloc[0, 0]
  95. result = df.fillna(filled_val)
  96. assert df.A.values is not result.A.values
  97. def test_fillna_copy_series(self, data_missing):
  98. arr = data_missing.take([1, 1])
  99. ser = pd.Series(arr)
  100. filled_val = ser[0]
  101. result = ser.fillna(filled_val)
  102. assert ser._values is not result._values
  103. assert ser._values is arr
  104. def test_fillna_length_mismatch(self, data_missing):
  105. msg = "Length of 'value' does not match."
  106. with pytest.raises(ValueError, match=msg):
  107. data_missing.fillna(data_missing.take([1]))
  108. def test_combine_le(self, data_repeated):
  109. # GH 20825
  110. # Test that combine works when doing a <= (le) comparison
  111. orig_data1, orig_data2 = data_repeated(2)
  112. s1 = pd.Series(orig_data1)
  113. s2 = pd.Series(orig_data2)
  114. result = s1.combine(s2, lambda x1, x2: x1 <= x2)
  115. expected = pd.Series([a <= b for (a, b) in
  116. zip(list(orig_data1), list(orig_data2))])
  117. self.assert_series_equal(result, expected)
  118. val = s1.iloc[0]
  119. result = s1.combine(val, lambda x1, x2: x1 <= x2)
  120. expected = pd.Series([a <= val for a in list(orig_data1)])
  121. self.assert_series_equal(result, expected)
  122. def test_combine_add(self, data_repeated):
  123. # GH 20825
  124. orig_data1, orig_data2 = data_repeated(2)
  125. s1 = pd.Series(orig_data1)
  126. s2 = pd.Series(orig_data2)
  127. result = s1.combine(s2, lambda x1, x2: x1 + x2)
  128. with np.errstate(over='ignore'):
  129. expected = pd.Series(
  130. orig_data1._from_sequence([a + b for (a, b) in
  131. zip(list(orig_data1),
  132. list(orig_data2))]))
  133. self.assert_series_equal(result, expected)
  134. val = s1.iloc[0]
  135. result = s1.combine(val, lambda x1, x2: x1 + x2)
  136. expected = pd.Series(
  137. orig_data1._from_sequence([a + val for a in list(orig_data1)]))
  138. self.assert_series_equal(result, expected)
  139. def test_combine_first(self, data):
  140. # https://github.com/pandas-dev/pandas/issues/24147
  141. a = pd.Series(data[:3])
  142. b = pd.Series(data[2:5], index=[2, 3, 4])
  143. result = a.combine_first(b)
  144. expected = pd.Series(data[:5])
  145. self.assert_series_equal(result, expected)
  146. @pytest.mark.parametrize('frame', [True, False])
  147. @pytest.mark.parametrize('periods, indices', [
  148. (-2, [2, 3, 4, -1, -1]),
  149. (0, [0, 1, 2, 3, 4]),
  150. (2, [-1, -1, 0, 1, 2]),
  151. ])
  152. def test_container_shift(self, data, frame, periods, indices):
  153. # https://github.com/pandas-dev/pandas/issues/22386
  154. subset = data[:5]
  155. data = pd.Series(subset, name='A')
  156. expected = pd.Series(subset.take(indices, allow_fill=True), name='A')
  157. if frame:
  158. result = data.to_frame(name='A').assign(B=1).shift(periods)
  159. expected = pd.concat([
  160. expected,
  161. pd.Series([1] * 5, name='B').shift(periods)
  162. ], axis=1)
  163. compare = self.assert_frame_equal
  164. else:
  165. result = data.shift(periods)
  166. compare = self.assert_series_equal
  167. compare(result, expected)
  168. @pytest.mark.parametrize('periods, indices', [
  169. [-4, [-1, -1]],
  170. [-1, [1, -1]],
  171. [0, [0, 1]],
  172. [1, [-1, 0]],
  173. [4, [-1, -1]]
  174. ])
  175. def test_shift_non_empty_array(self, data, periods, indices):
  176. # https://github.com/pandas-dev/pandas/issues/23911
  177. subset = data[:2]
  178. result = subset.shift(periods)
  179. expected = subset.take(indices, allow_fill=True)
  180. self.assert_extension_array_equal(result, expected)
  181. @pytest.mark.parametrize('periods', [
  182. -4, -1, 0, 1, 4
  183. ])
  184. def test_shift_empty_array(self, data, periods):
  185. # https://github.com/pandas-dev/pandas/issues/23911
  186. empty = data[:0]
  187. result = empty.shift(periods)
  188. expected = empty
  189. self.assert_extension_array_equal(result, expected)
  190. def test_shift_fill_value(self, data):
  191. arr = data[:4]
  192. fill_value = data[0]
  193. result = arr.shift(1, fill_value=fill_value)
  194. expected = data.take([0, 0, 1, 2])
  195. self.assert_extension_array_equal(result, expected)
  196. result = arr.shift(-2, fill_value=fill_value)
  197. expected = data.take([2, 3, 0, 0])
  198. self.assert_extension_array_equal(result, expected)
  199. @pytest.mark.parametrize("as_frame", [True, False])
  200. def test_hash_pandas_object_works(self, data, as_frame):
  201. # https://github.com/pandas-dev/pandas/issues/23066
  202. data = pd.Series(data)
  203. if as_frame:
  204. data = data.to_frame()
  205. a = pd.util.hash_pandas_object(data)
  206. b = pd.util.hash_pandas_object(data)
  207. self.assert_equal(a, b)
  208. @pytest.mark.parametrize("as_series", [True, False])
  209. def test_searchsorted(self, data_for_sorting, as_series):
  210. b, c, a = data_for_sorting
  211. arr = type(data_for_sorting)._from_sequence([a, b, c])
  212. if as_series:
  213. arr = pd.Series(arr)
  214. assert arr.searchsorted(a) == 0
  215. assert arr.searchsorted(a, side="right") == 1
  216. assert arr.searchsorted(b) == 1
  217. assert arr.searchsorted(b, side="right") == 2
  218. assert arr.searchsorted(c) == 2
  219. assert arr.searchsorted(c, side="right") == 3
  220. result = arr.searchsorted(arr.take([0, 2]))
  221. expected = np.array([0, 2], dtype=np.intp)
  222. tm.assert_numpy_array_equal(result, expected)
  223. # sorter
  224. sorter = np.array([1, 2, 0])
  225. assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
  226. @pytest.mark.parametrize("as_frame", [True, False])
  227. def test_where_series(self, data, na_value, as_frame):
  228. assert data[0] != data[1]
  229. cls = type(data)
  230. a, b = data[:2]
  231. ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
  232. cond = np.array([True, True, False, False])
  233. if as_frame:
  234. ser = ser.to_frame(name='a')
  235. cond = cond.reshape(-1, 1)
  236. result = ser.where(cond)
  237. expected = pd.Series(cls._from_sequence([a, a, na_value, na_value],
  238. dtype=data.dtype))
  239. if as_frame:
  240. expected = expected.to_frame(name='a')
  241. self.assert_equal(result, expected)
  242. # array other
  243. cond = np.array([True, False, True, True])
  244. other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
  245. if as_frame:
  246. other = pd.DataFrame({"a": other})
  247. cond = pd.DataFrame({"a": cond})
  248. result = ser.where(cond, other)
  249. expected = pd.Series(cls._from_sequence([a, b, b, b],
  250. dtype=data.dtype))
  251. if as_frame:
  252. expected = expected.to_frame(name='a')
  253. self.assert_equal(result, expected)
  254. @pytest.mark.parametrize("use_numpy", [True, False])
  255. @pytest.mark.parametrize("as_series", [True, False])
  256. @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
  257. def test_repeat(self, data, repeats, as_series, use_numpy):
  258. arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
  259. if as_series:
  260. arr = pd.Series(arr)
  261. result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)
  262. repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
  263. expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
  264. expected = type(data)._from_sequence(expected, dtype=data.dtype)
  265. if as_series:
  266. expected = pd.Series(expected, index=arr.index.repeat(repeats))
  267. self.assert_equal(result, expected)
  268. @pytest.mark.parametrize("use_numpy", [True, False])
  269. @pytest.mark.parametrize('repeats, kwargs, error, msg', [
  270. (2, dict(axis=1), ValueError, "'axis"),
  271. (-1, dict(), ValueError, "negative"),
  272. ([1, 2], dict(), ValueError, "shape"),
  273. (2, dict(foo='bar'), TypeError, "'foo'")])
  274. def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
  275. with pytest.raises(error, match=msg):
  276. if use_numpy:
  277. np.repeat(data, repeats, **kwargs)
  278. else:
  279. data.repeat(repeats, **kwargs)