test_algos.py 71 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881
  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime
  3. from itertools import permutations
  4. import struct
  5. import numpy as np
  6. from numpy import nan
  7. from numpy.random import RandomState
  8. import pytest
  9. from pandas._libs import (
  10. algos as libalgos, groupby as libgroupby, hashtable as ht)
  11. from pandas.compat import lrange, range
  12. from pandas.compat.numpy import np_array_datetime64_compat
  13. import pandas.util._test_decorators as td
  14. from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
  15. import pandas as pd
  16. from pandas import (
  17. Categorical, CategoricalIndex, DatetimeIndex, Index, IntervalIndex, Series,
  18. Timestamp, compat)
  19. import pandas.core.algorithms as algos
  20. from pandas.core.arrays import DatetimeArray
  21. import pandas.core.common as com
  22. import pandas.util.testing as tm
  23. from pandas.util.testing import assert_almost_equal
  24. class TestMatch(object):
  25. def test_ints(self):
  26. values = np.array([0, 2, 1])
  27. to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0])
  28. result = algos.match(to_match, values)
  29. expected = np.array([0, 2, 1, 1, 0, 2, -1, 0], dtype=np.int64)
  30. tm.assert_numpy_array_equal(result, expected)
  31. result = Series(algos.match(to_match, values, np.nan))
  32. expected = Series(np.array([0, 2, 1, 1, 0, 2, np.nan, 0]))
  33. tm.assert_series_equal(result, expected)
  34. s = Series(np.arange(5), dtype=np.float32)
  35. result = algos.match(s, [2, 4])
  36. expected = np.array([-1, -1, 0, -1, 1], dtype=np.int64)
  37. tm.assert_numpy_array_equal(result, expected)
  38. result = Series(algos.match(s, [2, 4], np.nan))
  39. expected = Series(np.array([np.nan, np.nan, 0, np.nan, 1]))
  40. tm.assert_series_equal(result, expected)
  41. def test_strings(self):
  42. values = ['foo', 'bar', 'baz']
  43. to_match = ['bar', 'foo', 'qux', 'foo', 'bar', 'baz', 'qux']
  44. result = algos.match(to_match, values)
  45. expected = np.array([1, 0, -1, 0, 1, 2, -1], dtype=np.int64)
  46. tm.assert_numpy_array_equal(result, expected)
  47. result = Series(algos.match(to_match, values, np.nan))
  48. expected = Series(np.array([1, 0, np.nan, 0, 1, 2, np.nan]))
  49. tm.assert_series_equal(result, expected)
  50. class TestFactorize(object):
  51. def test_basic(self):
  52. labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c',
  53. 'c'])
  54. tm.assert_numpy_array_equal(
  55. uniques, np.array(['a', 'b', 'c'], dtype=object))
  56. labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
  57. 'a', 'c', 'c', 'c'], sort=True)
  58. exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
  59. tm.assert_numpy_array_equal(labels, exp)
  60. exp = np.array(['a', 'b', 'c'], dtype=object)
  61. tm.assert_numpy_array_equal(uniques, exp)
  62. labels, uniques = algos.factorize(list(reversed(range(5))))
  63. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  64. tm.assert_numpy_array_equal(labels, exp)
  65. exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
  66. tm.assert_numpy_array_equal(uniques, exp)
  67. labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
  68. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  69. tm.assert_numpy_array_equal(labels, exp)
  70. exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
  71. tm.assert_numpy_array_equal(uniques, exp)
  72. labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
  73. exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  74. tm.assert_numpy_array_equal(labels, exp)
  75. exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64)
  76. tm.assert_numpy_array_equal(uniques, exp)
  77. labels, uniques = algos.factorize(list(reversed(np.arange(5.))),
  78. sort=True)
  79. exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
  80. tm.assert_numpy_array_equal(labels, exp)
  81. exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64)
  82. tm.assert_numpy_array_equal(uniques, exp)
  83. def test_mixed(self):
  84. # doc example reshaping.rst
  85. x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
  86. labels, uniques = algos.factorize(x)
  87. exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
  88. tm.assert_numpy_array_equal(labels, exp)
  89. exp = Index(['A', 'B', 3.14, np.inf])
  90. tm.assert_index_equal(uniques, exp)
  91. labels, uniques = algos.factorize(x, sort=True)
  92. exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
  93. tm.assert_numpy_array_equal(labels, exp)
  94. exp = Index([3.14, np.inf, 'A', 'B'])
  95. tm.assert_index_equal(uniques, exp)
  96. def test_datelike(self):
  97. # M8
  98. v1 = Timestamp('20130101 09:00:00.00004')
  99. v2 = Timestamp('20130101')
  100. x = Series([v1, v1, v1, v2, v2, v1])
  101. labels, uniques = algos.factorize(x)
  102. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  103. tm.assert_numpy_array_equal(labels, exp)
  104. exp = DatetimeIndex([v1, v2])
  105. tm.assert_index_equal(uniques, exp)
  106. labels, uniques = algos.factorize(x, sort=True)
  107. exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
  108. tm.assert_numpy_array_equal(labels, exp)
  109. exp = DatetimeIndex([v2, v1])
  110. tm.assert_index_equal(uniques, exp)
  111. # period
  112. v1 = pd.Period('201302', freq='M')
  113. v2 = pd.Period('201303', freq='M')
  114. x = Series([v1, v1, v1, v2, v2, v1])
  115. # periods are not 'sorted' as they are converted back into an index
  116. labels, uniques = algos.factorize(x)
  117. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  118. tm.assert_numpy_array_equal(labels, exp)
  119. tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
  120. labels, uniques = algos.factorize(x, sort=True)
  121. exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
  122. tm.assert_numpy_array_equal(labels, exp)
  123. tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
  124. # GH 5986
  125. v1 = pd.to_timedelta('1 day 1 min')
  126. v2 = pd.to_timedelta('1 day')
  127. x = Series([v1, v2, v1, v1, v2, v2, v1])
  128. labels, uniques = algos.factorize(x)
  129. exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
  130. tm.assert_numpy_array_equal(labels, exp)
  131. tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))
  132. labels, uniques = algos.factorize(x, sort=True)
  133. exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
  134. tm.assert_numpy_array_equal(labels, exp)
  135. tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))
  136. def test_factorize_nan(self):
  137. # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
  138. # rizer.factorize should not raise an exception if na_sentinel indexes
  139. # outside of reverse_indexer
  140. key = np.array([1, 2, 1, np.nan], dtype='O')
  141. rizer = ht.Factorizer(len(key))
  142. for na_sentinel in (-1, 20):
  143. ids = rizer.factorize(key, sort=True, na_sentinel=na_sentinel)
  144. expected = np.array([0, 1, 0, na_sentinel], dtype='int32')
  145. assert len(set(key)) == len(set(expected))
  146. tm.assert_numpy_array_equal(pd.isna(key),
  147. expected == na_sentinel)
  148. # nan still maps to na_sentinel when sort=False
  149. key = np.array([0, np.nan, 1], dtype='O')
  150. na_sentinel = -1
  151. # TODO(wesm): unused?
  152. ids = rizer.factorize(key, sort=False, na_sentinel=na_sentinel) # noqa
  153. expected = np.array([2, -1, 0], dtype='int32')
  154. assert len(set(key)) == len(set(expected))
  155. tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
  156. @pytest.mark.parametrize("data,expected_label,expected_level", [
  157. (
  158. [(1, 1), (1, 2), (0, 0), (1, 2), 'nonsense'],
  159. [0, 1, 2, 1, 3],
  160. [(1, 1), (1, 2), (0, 0), 'nonsense']
  161. ),
  162. (
  163. [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
  164. [0, 1, 2, 1, 3],
  165. [(1, 1), (1, 2), (0, 0), (1, 2, 3)]
  166. ),
  167. (
  168. [(1, 1), (1, 2), (0, 0), (1, 2)],
  169. [0, 1, 2, 1],
  170. [(1, 1), (1, 2), (0, 0)]
  171. )
  172. ])
  173. def test_factorize_tuple_list(self, data, expected_label, expected_level):
  174. # GH9454
  175. result = pd.factorize(data)
  176. tm.assert_numpy_array_equal(result[0],
  177. np.array(expected_label, dtype=np.intp))
  178. expected_level_array = com.asarray_tuplesafe(expected_level,
  179. dtype=object)
  180. tm.assert_numpy_array_equal(result[1], expected_level_array)
  181. def test_complex_sorting(self):
  182. # gh 12666 - check no segfault
  183. x17 = np.array([complex(i) for i in range(17)], dtype=object)
  184. pytest.raises(TypeError, algos.factorize, x17[::-1], sort=True)
  185. def test_float64_factorize(self, writable):
  186. data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
  187. data.setflags(write=writable)
  188. exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
  189. exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
  190. labels, uniques = algos.factorize(data)
  191. tm.assert_numpy_array_equal(labels, exp_labels)
  192. tm.assert_numpy_array_equal(uniques, exp_uniques)
  193. def test_uint64_factorize(self, writable):
  194. data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
  195. data.setflags(write=writable)
  196. exp_labels = np.array([0, 1, 0], dtype=np.intp)
  197. exp_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)
  198. labels, uniques = algos.factorize(data)
  199. tm.assert_numpy_array_equal(labels, exp_labels)
  200. tm.assert_numpy_array_equal(uniques, exp_uniques)
  201. def test_int64_factorize(self, writable):
  202. data = np.array([2**63 - 1, -2**63, 2**63 - 1], dtype=np.int64)
  203. data.setflags(write=writable)
  204. exp_labels = np.array([0, 1, 0], dtype=np.intp)
  205. exp_uniques = np.array([2**63 - 1, -2**63], dtype=np.int64)
  206. labels, uniques = algos.factorize(data)
  207. tm.assert_numpy_array_equal(labels, exp_labels)
  208. tm.assert_numpy_array_equal(uniques, exp_uniques)
  209. def test_string_factorize(self, writable):
  210. data = np.array(['a', 'c', 'a', 'b', 'c'],
  211. dtype=object)
  212. data.setflags(write=writable)
  213. exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp)
  214. exp_uniques = np.array(['a', 'c', 'b'], dtype=object)
  215. labels, uniques = algos.factorize(data)
  216. tm.assert_numpy_array_equal(labels, exp_labels)
  217. tm.assert_numpy_array_equal(uniques, exp_uniques)
  218. def test_object_factorize(self, writable):
  219. data = np.array(['a', 'c', None, np.nan, 'a', 'b', pd.NaT, 'c'],
  220. dtype=object)
  221. data.setflags(write=writable)
  222. exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
  223. exp_uniques = np.array(['a', 'c', 'b'], dtype=object)
  224. labels, uniques = algos.factorize(data)
  225. tm.assert_numpy_array_equal(labels, exp_labels)
  226. tm.assert_numpy_array_equal(uniques, exp_uniques)
  227. def test_deprecate_order(self):
  228. # gh 19727 - check warning is raised for deprecated keyword, order.
  229. # Test not valid once order keyword is removed.
  230. data = np.array([2**63, 1, 2**63], dtype=np.uint64)
  231. with tm.assert_produces_warning(expected_warning=FutureWarning):
  232. algos.factorize(data, order=True)
  233. with tm.assert_produces_warning(False):
  234. algos.factorize(data)
  235. @pytest.mark.parametrize('data', [
  236. np.array([0, 1, 0], dtype='u8'),
  237. np.array([-2**63, 1, -2**63], dtype='i8'),
  238. np.array(['__nan__', 'foo', '__nan__'], dtype='object'),
  239. ])
  240. def test_parametrized_factorize_na_value_default(self, data):
  241. # arrays that include the NA default for that type, but isn't used.
  242. l, u = algos.factorize(data)
  243. expected_uniques = data[[0, 1]]
  244. expected_labels = np.array([0, 1, 0], dtype=np.intp)
  245. tm.assert_numpy_array_equal(l, expected_labels)
  246. tm.assert_numpy_array_equal(u, expected_uniques)
  247. @pytest.mark.parametrize('data, na_value', [
  248. (np.array([0, 1, 0, 2], dtype='u8'), 0),
  249. (np.array([1, 0, 1, 2], dtype='u8'), 1),
  250. (np.array([-2**63, 1, -2**63, 0], dtype='i8'), -2**63),
  251. (np.array([1, -2**63, 1, 0], dtype='i8'), 1),
  252. (np.array(['a', '', 'a', 'b'], dtype=object), 'a'),
  253. (np.array([(), ('a', 1), (), ('a', 2)], dtype=object), ()),
  254. (np.array([('a', 1), (), ('a', 1), ('a', 2)], dtype=object),
  255. ('a', 1)),
  256. ])
  257. def test_parametrized_factorize_na_value(self, data, na_value):
  258. l, u = algos._factorize_array(data, na_value=na_value)
  259. expected_uniques = data[[1, 3]]
  260. expected_labels = np.array([-1, 0, -1, 1], dtype=np.intp)
  261. tm.assert_numpy_array_equal(l, expected_labels)
  262. tm.assert_numpy_array_equal(u, expected_uniques)
  263. @pytest.mark.parametrize('sort', [True, False])
  264. @pytest.mark.parametrize('na_sentinel', [-1, -10, 100])
  265. def test_factorize_na_sentinel(self, sort, na_sentinel):
  266. data = np.array(['b', 'a', None, 'b'], dtype=object)
  267. labels, uniques = algos.factorize(data, sort=sort,
  268. na_sentinel=na_sentinel)
  269. if sort:
  270. expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
  271. expected_uniques = np.array(['a', 'b'], dtype=object)
  272. else:
  273. expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
  274. expected_uniques = np.array(['b', 'a'], dtype=object)
  275. tm.assert_numpy_array_equal(labels, expected_labels)
  276. tm.assert_numpy_array_equal(uniques, expected_uniques)
  277. class TestUnique(object):
  278. def test_ints(self):
  279. arr = np.random.randint(0, 100, size=50)
  280. result = algos.unique(arr)
  281. assert isinstance(result, np.ndarray)
  282. def test_objects(self):
  283. arr = np.random.randint(0, 100, size=50).astype('O')
  284. result = algos.unique(arr)
  285. assert isinstance(result, np.ndarray)
  286. def test_object_refcount_bug(self):
  287. lst = ['A', 'B', 'C', 'D', 'E']
  288. for i in range(1000):
  289. len(algos.unique(lst))
  290. def test_on_index_object(self):
  291. mindex = pd.MultiIndex.from_arrays([np.arange(5).repeat(5), np.tile(
  292. np.arange(5), 5)])
  293. expected = mindex.values
  294. expected.sort()
  295. mindex = mindex.repeat(2)
  296. result = pd.unique(mindex)
  297. result.sort()
  298. tm.assert_almost_equal(result, expected)
  299. def test_datetime64_dtype_array_returned(self):
  300. # GH 9431
  301. expected = np_array_datetime64_compat(
  302. ['2015-01-03T00:00:00.000000000+0000',
  303. '2015-01-01T00:00:00.000000000+0000'],
  304. dtype='M8[ns]')
  305. dt_index = pd.to_datetime(['2015-01-03T00:00:00.000000000',
  306. '2015-01-01T00:00:00.000000000',
  307. '2015-01-01T00:00:00.000000000'])
  308. result = algos.unique(dt_index)
  309. tm.assert_numpy_array_equal(result, expected)
  310. assert result.dtype == expected.dtype
  311. s = Series(dt_index)
  312. result = algos.unique(s)
  313. tm.assert_numpy_array_equal(result, expected)
  314. assert result.dtype == expected.dtype
  315. arr = s.values
  316. result = algos.unique(arr)
  317. tm.assert_numpy_array_equal(result, expected)
  318. assert result.dtype == expected.dtype
  319. def test_timedelta64_dtype_array_returned(self):
  320. # GH 9431
  321. expected = np.array([31200, 45678, 10000], dtype='m8[ns]')
  322. td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
  323. result = algos.unique(td_index)
  324. tm.assert_numpy_array_equal(result, expected)
  325. assert result.dtype == expected.dtype
  326. s = Series(td_index)
  327. result = algos.unique(s)
  328. tm.assert_numpy_array_equal(result, expected)
  329. assert result.dtype == expected.dtype
  330. arr = s.values
  331. result = algos.unique(arr)
  332. tm.assert_numpy_array_equal(result, expected)
  333. assert result.dtype == expected.dtype
  334. def test_uint64_overflow(self):
  335. s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
  336. exp = np.array([1, 2, 2**63], dtype=np.uint64)
  337. tm.assert_numpy_array_equal(algos.unique(s), exp)
  338. def test_nan_in_object_array(self):
  339. duplicated_items = ['a', np.nan, 'c', 'c']
  340. result = pd.unique(duplicated_items)
  341. expected = np.array(['a', np.nan, 'c'], dtype=object)
  342. tm.assert_numpy_array_equal(result, expected)
  343. def test_categorical(self):
  344. # we are expecting to return in the order
  345. # of appearance
  346. expected = Categorical(list('bac'), categories=list('bac'))
  347. # we are expecting to return in the order
  348. # of the categories
  349. expected_o = Categorical(
  350. list('bac'), categories=list('abc'), ordered=True)
  351. # GH 15939
  352. c = Categorical(list('baabc'))
  353. result = c.unique()
  354. tm.assert_categorical_equal(result, expected)
  355. result = algos.unique(c)
  356. tm.assert_categorical_equal(result, expected)
  357. c = Categorical(list('baabc'), ordered=True)
  358. result = c.unique()
  359. tm.assert_categorical_equal(result, expected_o)
  360. result = algos.unique(c)
  361. tm.assert_categorical_equal(result, expected_o)
  362. # Series of categorical dtype
  363. s = Series(Categorical(list('baabc')), name='foo')
  364. result = s.unique()
  365. tm.assert_categorical_equal(result, expected)
  366. result = pd.unique(s)
  367. tm.assert_categorical_equal(result, expected)
  368. # CI -> return CI
  369. ci = CategoricalIndex(Categorical(list('baabc'),
  370. categories=list('bac')))
  371. expected = CategoricalIndex(expected)
  372. result = ci.unique()
  373. tm.assert_index_equal(result, expected)
  374. result = pd.unique(ci)
  375. tm.assert_index_equal(result, expected)
  376. def test_datetime64tz_aware(self):
  377. # GH 15939
  378. result = Series(
  379. Index([Timestamp('20160101', tz='US/Eastern'),
  380. Timestamp('20160101', tz='US/Eastern')])).unique()
  381. expected = DatetimeArray._from_sequence(np.array([
  382. Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern")
  383. ]))
  384. tm.assert_extension_array_equal(result, expected)
  385. result = Index([Timestamp('20160101', tz='US/Eastern'),
  386. Timestamp('20160101', tz='US/Eastern')]).unique()
  387. expected = DatetimeIndex(['2016-01-01 00:00:00'],
  388. dtype='datetime64[ns, US/Eastern]', freq=None)
  389. tm.assert_index_equal(result, expected)
  390. result = pd.unique(
  391. Series(Index([Timestamp('20160101', tz='US/Eastern'),
  392. Timestamp('20160101', tz='US/Eastern')])))
  393. expected = DatetimeArray._from_sequence(np.array([
  394. Timestamp('2016-01-01', tz="US/Eastern"),
  395. ]))
  396. tm.assert_extension_array_equal(result, expected)
  397. result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
  398. Timestamp('20160101', tz='US/Eastern')]))
  399. expected = DatetimeIndex(['2016-01-01 00:00:00'],
  400. dtype='datetime64[ns, US/Eastern]', freq=None)
  401. tm.assert_index_equal(result, expected)
  402. def test_order_of_appearance(self):
  403. # 9346
  404. # light testing of guarantee of order of appearance
  405. # these also are the doc-examples
  406. result = pd.unique(Series([2, 1, 3, 3]))
  407. tm.assert_numpy_array_equal(result,
  408. np.array([2, 1, 3], dtype='int64'))
  409. result = pd.unique(Series([2] + [1] * 5))
  410. tm.assert_numpy_array_equal(result,
  411. np.array([2, 1], dtype='int64'))
  412. result = pd.unique(Series([Timestamp('20160101'),
  413. Timestamp('20160101')]))
  414. expected = np.array(['2016-01-01T00:00:00.000000000'],
  415. dtype='datetime64[ns]')
  416. tm.assert_numpy_array_equal(result, expected)
  417. result = pd.unique(Index(
  418. [Timestamp('20160101', tz='US/Eastern'),
  419. Timestamp('20160101', tz='US/Eastern')]))
  420. expected = DatetimeIndex(['2016-01-01 00:00:00'],
  421. dtype='datetime64[ns, US/Eastern]',
  422. freq=None)
  423. tm.assert_index_equal(result, expected)
  424. result = pd.unique(list('aabc'))
  425. expected = np.array(['a', 'b', 'c'], dtype=object)
  426. tm.assert_numpy_array_equal(result, expected)
  427. result = pd.unique(Series(Categorical(list('aabc'))))
  428. expected = Categorical(list('abc'))
  429. tm.assert_categorical_equal(result, expected)
  430. @pytest.mark.parametrize("arg ,expected", [
  431. (('1', '1', '2'), np.array(['1', '2'], dtype=object)),
  432. (('foo',), np.array(['foo'], dtype=object))
  433. ])
  434. def test_tuple_with_strings(self, arg, expected):
  435. # see GH 17108
  436. result = pd.unique(arg)
  437. tm.assert_numpy_array_equal(result, expected)
  438. def test_obj_none_preservation(self):
  439. # GH 20866
  440. arr = np.array(['foo', None], dtype=object)
  441. result = pd.unique(arr)
  442. expected = np.array(['foo', None], dtype=object)
  443. tm.assert_numpy_array_equal(result, expected, strict_nan=True)
  444. def test_signed_zero(self):
  445. # GH 21866
  446. a = np.array([-0.0, 0.0])
  447. result = pd.unique(a)
  448. expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
  449. tm.assert_numpy_array_equal(result, expected)
  450. def test_different_nans(self):
  451. # GH 21866
  452. # create different nans from bit-patterns:
  453. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
  454. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
  455. assert NAN1 != NAN1
  456. assert NAN2 != NAN2
  457. a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
  458. result = pd.unique(a)
  459. expected = np.array([np.nan])
  460. tm.assert_numpy_array_equal(result, expected)
  461. def test_first_nan_kept(self):
  462. # GH 22295
  463. # create different nans from bit-patterns:
  464. bits_for_nan1 = 0xfff8000000000001
  465. bits_for_nan2 = 0x7ff8000000000001
  466. NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
  467. NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
  468. assert NAN1 != NAN1
  469. assert NAN2 != NAN2
  470. for el_type in [np.float64, np.object]:
  471. a = np.array([NAN1, NAN2], dtype=el_type)
  472. result = pd.unique(a)
  473. assert result.size == 1
  474. # use bit patterns to identify which nan was kept:
  475. result_nan_bits = struct.unpack("=Q",
  476. struct.pack("d", result[0]))[0]
  477. assert result_nan_bits == bits_for_nan1
  478. def test_do_not_mangle_na_values(self, unique_nulls_fixture,
  479. unique_nulls_fixture2):
  480. # GH 22295
  481. if unique_nulls_fixture is unique_nulls_fixture2:
  482. return # skip it, values not unique
  483. a = np.array([unique_nulls_fixture,
  484. unique_nulls_fixture2], dtype=np.object)
  485. result = pd.unique(a)
  486. assert result.size == 2
  487. assert a[0] is unique_nulls_fixture
  488. assert a[1] is unique_nulls_fixture2
  489. class TestIsin(object):
  490. def test_invalid(self):
  491. pytest.raises(TypeError, lambda: algos.isin(1, 1))
  492. pytest.raises(TypeError, lambda: algos.isin(1, [1]))
  493. pytest.raises(TypeError, lambda: algos.isin([1], 1))
  494. def test_basic(self):
  495. result = algos.isin([1, 2], [1])
  496. expected = np.array([True, False])
  497. tm.assert_numpy_array_equal(result, expected)
  498. result = algos.isin(np.array([1, 2]), [1])
  499. expected = np.array([True, False])
  500. tm.assert_numpy_array_equal(result, expected)
  501. result = algos.isin(Series([1, 2]), [1])
  502. expected = np.array([True, False])
  503. tm.assert_numpy_array_equal(result, expected)
  504. result = algos.isin(Series([1, 2]), Series([1]))
  505. expected = np.array([True, False])
  506. tm.assert_numpy_array_equal(result, expected)
  507. result = algos.isin(Series([1, 2]), {1})
  508. expected = np.array([True, False])
  509. tm.assert_numpy_array_equal(result, expected)
  510. result = algos.isin(['a', 'b'], ['a'])
  511. expected = np.array([True, False])
  512. tm.assert_numpy_array_equal(result, expected)
  513. result = algos.isin(Series(['a', 'b']), Series(['a']))
  514. expected = np.array([True, False])
  515. tm.assert_numpy_array_equal(result, expected)
  516. result = algos.isin(Series(['a', 'b']), {'a'})
  517. expected = np.array([True, False])
  518. tm.assert_numpy_array_equal(result, expected)
  519. result = algos.isin(['a', 'b'], [1])
  520. expected = np.array([False, False])
  521. tm.assert_numpy_array_equal(result, expected)
  522. def test_i8(self):
  523. arr = pd.date_range('20130101', periods=3).values
  524. result = algos.isin(arr, [arr[0]])
  525. expected = np.array([True, False, False])
  526. tm.assert_numpy_array_equal(result, expected)
  527. result = algos.isin(arr, arr[0:2])
  528. expected = np.array([True, True, False])
  529. tm.assert_numpy_array_equal(result, expected)
  530. result = algos.isin(arr, set(arr[0:2]))
  531. expected = np.array([True, True, False])
  532. tm.assert_numpy_array_equal(result, expected)
  533. arr = pd.timedelta_range('1 day', periods=3).values
  534. result = algos.isin(arr, [arr[0]])
  535. expected = np.array([True, False, False])
  536. tm.assert_numpy_array_equal(result, expected)
  537. result = algos.isin(arr, arr[0:2])
  538. expected = np.array([True, True, False])
  539. tm.assert_numpy_array_equal(result, expected)
  540. result = algos.isin(arr, set(arr[0:2]))
  541. expected = np.array([True, True, False])
  542. tm.assert_numpy_array_equal(result, expected)
  543. def test_large(self):
  544. s = pd.date_range('20000101', periods=2000000, freq='s').values
  545. result = algos.isin(s, s[0:2])
  546. expected = np.zeros(len(s), dtype=bool)
  547. expected[0] = True
  548. expected[1] = True
  549. tm.assert_numpy_array_equal(result, expected)
  550. def test_categorical_from_codes(self):
  551. # GH 16639
  552. vals = np.array([0, 1, 2, 0])
  553. cats = ['a', 'b', 'c']
  554. Sd = Series(Categorical(1).from_codes(vals, cats))
  555. St = Series(Categorical(1).from_codes(np.array([0, 1]), cats))
  556. expected = np.array([True, True, False, True])
  557. result = algos.isin(Sd, St)
  558. tm.assert_numpy_array_equal(expected, result)
  559. def test_same_nan_is_in(self):
  560. # GH 22160
  561. # nan is special, because from " a is b" doesn't follow "a == b"
  562. # at least, isin() should follow python's "np.nan in [nan] == True"
  563. # casting to -> np.float64 -> another float-object somewher on
  564. # the way could lead jepardize this behavior
  565. comps = [np.nan] # could be casted to float64
  566. values = [np.nan]
  567. expected = np.array([True])
  568. result = algos.isin(comps, values)
  569. tm.assert_numpy_array_equal(expected, result)
  570. def test_same_object_is_in(self):
  571. # GH 22160
  572. # there could be special treatment for nans
  573. # the user however could define a custom class
  574. # with similar behavior, then we at least should
  575. # fall back to usual python's behavior: "a in [a] == True"
  576. class LikeNan(object):
  577. def __eq__(self):
  578. return False
  579. def __hash__(self):
  580. return 0
  581. a, b = LikeNan(), LikeNan()
  582. # same object -> True
  583. tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True]))
  584. # different objects -> False
  585. tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False]))
  586. def test_different_nans(self):
  587. # GH 22160
  588. # all nans are handled as equivalent
  589. comps = [float('nan')]
  590. values = [float('nan')]
  591. assert comps[0] is not values[0] # different nan-objects
  592. # as list of python-objects:
  593. result = algos.isin(comps, values)
  594. tm.assert_numpy_array_equal(np.array([True]), result)
  595. # as object-array:
  596. result = algos.isin(np.asarray(comps, dtype=np.object),
  597. np.asarray(values, dtype=np.object))
  598. tm.assert_numpy_array_equal(np.array([True]), result)
  599. # as float64-array:
  600. result = algos.isin(np.asarray(comps, dtype=np.float64),
  601. np.asarray(values, dtype=np.float64))
  602. tm.assert_numpy_array_equal(np.array([True]), result)
  603. def test_no_cast(self):
  604. # GH 22160
  605. # ensure 42 is not casted to a string
  606. comps = ['ss', 42]
  607. values = ['42']
  608. expected = np.array([False, False])
  609. result = algos.isin(comps, values)
  610. tm.assert_numpy_array_equal(expected, result)
  611. @pytest.mark.parametrize("empty", [[], Series(), np.array([])])
  612. def test_empty(self, empty):
  613. # see gh-16991
  614. vals = Index(["a", "b"])
  615. expected = np.array([False, False])
  616. result = algos.isin(vals, empty)
  617. tm.assert_numpy_array_equal(expected, result)
  618. def test_different_nan_objects(self):
  619. # GH 22119
  620. comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object)
  621. vals = np.array([float('nan')], dtype=np.object)
  622. expected = np.array([False, False, True])
  623. result = algos.isin(comps, vals)
  624. tm.assert_numpy_array_equal(expected, result)
  625. def test_different_nans_as_float64(self):
  626. # GH 21866
  627. # create different nans from bit-patterns,
  628. # these nans will land in different buckets in the hash-table
  629. # if no special care is taken
  630. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
  631. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
  632. assert NAN1 != NAN1
  633. assert NAN2 != NAN2
  634. # check that NAN1 and NAN2 are equivalent:
  635. arr = np.array([NAN1, NAN2], dtype=np.float64)
  636. lookup1 = np.array([NAN1], dtype=np.float64)
  637. result = algos.isin(arr, lookup1)
  638. expected = np.array([True, True])
  639. tm.assert_numpy_array_equal(result, expected)
  640. lookup2 = np.array([NAN2], dtype=np.float64)
  641. result = algos.isin(arr, lookup2)
  642. expected = np.array([True, True])
  643. tm.assert_numpy_array_equal(result, expected)
  644. class TestValueCounts(object):
  645. def test_value_counts(self):
  646. np.random.seed(1234)
  647. from pandas.core.reshape.tile import cut
  648. arr = np.random.randn(4)
  649. factor = cut(arr, 4)
  650. # assert isinstance(factor, n)
  651. result = algos.value_counts(factor)
  652. breaks = [-1.194, -0.535, 0.121, 0.777, 1.433]
  653. index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True))
  654. expected = Series([1, 1, 1, 1], index=index)
  655. tm.assert_series_equal(result.sort_index(), expected.sort_index())
  656. def test_value_counts_bins(self):
  657. s = [1, 2, 3, 4]
  658. result = algos.value_counts(s, bins=1)
  659. expected = Series([4],
  660. index=IntervalIndex.from_tuples([(0.996, 4.0)]))
  661. tm.assert_series_equal(result, expected)
  662. result = algos.value_counts(s, bins=2, sort=False)
  663. expected = Series([2, 2],
  664. index=IntervalIndex.from_tuples([(0.996, 2.5),
  665. (2.5, 4.0)]))
  666. tm.assert_series_equal(result, expected)
  667. def test_value_counts_dtypes(self):
  668. result = algos.value_counts([1, 1.])
  669. assert len(result) == 1
  670. result = algos.value_counts([1, 1.], bins=1)
  671. assert len(result) == 1
  672. result = algos.value_counts(Series([1, 1., '1'])) # object
  673. assert len(result) == 2
  674. pytest.raises(TypeError, lambda s: algos.value_counts(s, bins=1),
  675. ['1', 1])
  676. def test_value_counts_nat(self):
  677. td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]')
  678. dt = pd.to_datetime(['NaT', '2014-01-01'])
  679. for s in [td, dt]:
  680. vc = algos.value_counts(s)
  681. vc_with_na = algos.value_counts(s, dropna=False)
  682. assert len(vc) == 1
  683. assert len(vc_with_na) == 2
  684. exp_dt = Series({Timestamp('2014-01-01 00:00:00'): 1})
  685. tm.assert_series_equal(algos.value_counts(dt), exp_dt)
  686. # TODO same for (timedelta)
  687. def test_value_counts_datetime_outofbounds(self):
  688. # GH 13663
  689. s = Series([datetime(3000, 1, 1), datetime(5000, 1, 1),
  690. datetime(5000, 1, 1), datetime(6000, 1, 1),
  691. datetime(3000, 1, 1), datetime(3000, 1, 1)])
  692. res = s.value_counts()
  693. exp_index = Index([datetime(3000, 1, 1), datetime(5000, 1, 1),
  694. datetime(6000, 1, 1)], dtype=object)
  695. exp = Series([3, 2, 1], index=exp_index)
  696. tm.assert_series_equal(res, exp)
  697. # GH 12424
  698. res = pd.to_datetime(Series(['2362-01-01', np.nan]),
  699. errors='ignore')
  700. exp = Series(['2362-01-01', np.nan], dtype=object)
  701. tm.assert_series_equal(res, exp)
  702. def test_categorical(self):
  703. s = Series(Categorical(list('aaabbc')))
  704. result = s.value_counts()
  705. expected = Series([3, 2, 1], index=CategoricalIndex(['a', 'b', 'c']))
  706. tm.assert_series_equal(result, expected, check_index_type=True)
  707. # preserve order?
  708. s = s.cat.as_ordered()
  709. result = s.value_counts()
  710. expected.index = expected.index.as_ordered()
  711. tm.assert_series_equal(result, expected, check_index_type=True)
  712. def test_categorical_nans(self):
  713. s = Series(Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan)
  714. s.iloc[1] = np.nan
  715. result = s.value_counts()
  716. expected = Series([4, 3, 2], index=CategoricalIndex(
  717. ['a', 'b', 'c'], categories=['a', 'b', 'c']))
  718. tm.assert_series_equal(result, expected, check_index_type=True)
  719. result = s.value_counts(dropna=False)
  720. expected = Series([
  721. 4, 3, 2, 1
  722. ], index=CategoricalIndex(['a', 'b', 'c', np.nan]))
  723. tm.assert_series_equal(result, expected, check_index_type=True)
  724. # out of order
  725. s = Series(Categorical(
  726. list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c']))
  727. s.iloc[1] = np.nan
  728. result = s.value_counts()
  729. expected = Series([4, 3, 2], index=CategoricalIndex(
  730. ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True))
  731. tm.assert_series_equal(result, expected, check_index_type=True)
  732. result = s.value_counts(dropna=False)
  733. expected = Series([4, 3, 2, 1], index=CategoricalIndex(
  734. ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
  735. tm.assert_series_equal(result, expected, check_index_type=True)
  736. def test_categorical_zeroes(self):
  737. # keep the `d` category with 0
  738. s = Series(Categorical(
  739. list('bbbaac'), categories=list('abcd'), ordered=True))
  740. result = s.value_counts()
  741. expected = Series([3, 2, 1, 0], index=Categorical(
  742. ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
  743. tm.assert_series_equal(result, expected, check_index_type=True)
  744. def test_dropna(self):
  745. # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328
  746. tm.assert_series_equal(
  747. Series([True, True, False]).value_counts(dropna=True),
  748. Series([2, 1], index=[True, False]))
  749. tm.assert_series_equal(
  750. Series([True, True, False]).value_counts(dropna=False),
  751. Series([2, 1], index=[True, False]))
  752. tm.assert_series_equal(
  753. Series([True, True, False, None]).value_counts(dropna=True),
  754. Series([2, 1], index=[True, False]))
  755. tm.assert_series_equal(
  756. Series([True, True, False, None]).value_counts(dropna=False),
  757. Series([2, 1, 1], index=[True, False, np.nan]))
  758. tm.assert_series_equal(
  759. Series([10.3, 5., 5.]).value_counts(dropna=True),
  760. Series([2, 1], index=[5., 10.3]))
  761. tm.assert_series_equal(
  762. Series([10.3, 5., 5.]).value_counts(dropna=False),
  763. Series([2, 1], index=[5., 10.3]))
  764. tm.assert_series_equal(
  765. Series([10.3, 5., 5., None]).value_counts(dropna=True),
  766. Series([2, 1], index=[5., 10.3]))
  767. # 32-bit linux has a different ordering
  768. if not compat.is_platform_32bit():
  769. result = Series([10.3, 5., 5., None]).value_counts(dropna=False)
  770. expected = Series([2, 1, 1], index=[5., 10.3, np.nan])
  771. tm.assert_series_equal(result, expected)
  772. def test_value_counts_normalized(self):
  773. # GH12558
  774. s = Series([1, 2, np.nan, np.nan, np.nan])
  775. dtypes = (np.float64, np.object, 'M8[ns]')
  776. for t in dtypes:
  777. s_typed = s.astype(t)
  778. result = s_typed.value_counts(normalize=True, dropna=False)
  779. expected = Series([0.6, 0.2, 0.2],
  780. index=Series([np.nan, 2.0, 1.0], dtype=t))
  781. tm.assert_series_equal(result, expected)
  782. result = s_typed.value_counts(normalize=True, dropna=True)
  783. expected = Series([0.5, 0.5],
  784. index=Series([2.0, 1.0], dtype=t))
  785. tm.assert_series_equal(result, expected)
  786. def test_value_counts_uint64(self):
  787. arr = np.array([2**63], dtype=np.uint64)
  788. expected = Series([1], index=[2**63])
  789. result = algos.value_counts(arr)
  790. tm.assert_series_equal(result, expected)
  791. arr = np.array([-1, 2**63], dtype=object)
  792. expected = Series([1, 1], index=[-1, 2**63])
  793. result = algos.value_counts(arr)
  794. # 32-bit linux has a different ordering
  795. if not compat.is_platform_32bit():
  796. tm.assert_series_equal(result, expected)
  797. class TestDuplicated(object):
  798. def test_duplicated_with_nas(self):
  799. keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
  800. result = algos.duplicated(keys)
  801. expected = np.array([False, False, False, True, False, True])
  802. tm.assert_numpy_array_equal(result, expected)
  803. result = algos.duplicated(keys, keep='first')
  804. expected = np.array([False, False, False, True, False, True])
  805. tm.assert_numpy_array_equal(result, expected)
  806. result = algos.duplicated(keys, keep='last')
  807. expected = np.array([True, False, True, False, False, False])
  808. tm.assert_numpy_array_equal(result, expected)
  809. result = algos.duplicated(keys, keep=False)
  810. expected = np.array([True, False, True, True, False, True])
  811. tm.assert_numpy_array_equal(result, expected)
  812. keys = np.empty(8, dtype=object)
  813. for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2,
  814. [0, np.nan, 0, np.nan] * 2)):
  815. keys[i] = t
  816. result = algos.duplicated(keys)
  817. falses = [False] * 4
  818. trues = [True] * 4
  819. expected = np.array(falses + trues)
  820. tm.assert_numpy_array_equal(result, expected)
  821. result = algos.duplicated(keys, keep='last')
  822. expected = np.array(trues + falses)
  823. tm.assert_numpy_array_equal(result, expected)
  824. result = algos.duplicated(keys, keep=False)
  825. expected = np.array(trues + trues)
  826. tm.assert_numpy_array_equal(result, expected)
  827. @pytest.mark.parametrize('case', [
  828. np.array([1, 2, 1, 5, 3,
  829. 2, 4, 1, 5, 6]),
  830. np.array([1.1, 2.2, 1.1, np.nan, 3.3,
  831. 2.2, 4.4, 1.1, np.nan, 6.6]),
  832. np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j,
  833. 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]),
  834. np.array(['a', 'b', 'a', 'e', 'c',
  835. 'b', 'd', 'a', 'e', 'f'], dtype=object),
  836. np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7],
  837. dtype=np.uint64),
  838. ])
  839. def test_numeric_object_likes(self, case):
  840. exp_first = np.array([False, False, True, False, False,
  841. True, False, True, True, False])
  842. exp_last = np.array([True, True, True, True, False,
  843. False, False, False, False, False])
  844. exp_false = exp_first | exp_last
  845. res_first = algos.duplicated(case, keep='first')
  846. tm.assert_numpy_array_equal(res_first, exp_first)
  847. res_last = algos.duplicated(case, keep='last')
  848. tm.assert_numpy_array_equal(res_last, exp_last)
  849. res_false = algos.duplicated(case, keep=False)
  850. tm.assert_numpy_array_equal(res_false, exp_false)
  851. # index
  852. for idx in [Index(case), Index(case, dtype='category')]:
  853. res_first = idx.duplicated(keep='first')
  854. tm.assert_numpy_array_equal(res_first, exp_first)
  855. res_last = idx.duplicated(keep='last')
  856. tm.assert_numpy_array_equal(res_last, exp_last)
  857. res_false = idx.duplicated(keep=False)
  858. tm.assert_numpy_array_equal(res_false, exp_false)
  859. # series
  860. for s in [Series(case), Series(case, dtype='category')]:
  861. res_first = s.duplicated(keep='first')
  862. tm.assert_series_equal(res_first, Series(exp_first))
  863. res_last = s.duplicated(keep='last')
  864. tm.assert_series_equal(res_last, Series(exp_last))
  865. res_false = s.duplicated(keep=False)
  866. tm.assert_series_equal(res_false, Series(exp_false))
  867. def test_datetime_likes(self):
  868. dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03',
  869. '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06']
  870. td = ['1 days', '2 days', '1 days', 'NaT', '3 days',
  871. '2 days', '4 days', '1 days', 'NaT', '6 days']
  872. cases = [np.array([Timestamp(d) for d in dt]),
  873. np.array([Timestamp(d, tz='US/Eastern') for d in dt]),
  874. np.array([pd.Period(d, freq='D') for d in dt]),
  875. np.array([np.datetime64(d) for d in dt]),
  876. np.array([pd.Timedelta(d) for d in td])]
  877. exp_first = np.array([False, False, True, False, False,
  878. True, False, True, True, False])
  879. exp_last = np.array([True, True, True, True, False,
  880. False, False, False, False, False])
  881. exp_false = exp_first | exp_last
  882. for case in cases:
  883. res_first = algos.duplicated(case, keep='first')
  884. tm.assert_numpy_array_equal(res_first, exp_first)
  885. res_last = algos.duplicated(case, keep='last')
  886. tm.assert_numpy_array_equal(res_last, exp_last)
  887. res_false = algos.duplicated(case, keep=False)
  888. tm.assert_numpy_array_equal(res_false, exp_false)
  889. # index
  890. for idx in [Index(case), Index(case, dtype='category'),
  891. Index(case, dtype=object)]:
  892. res_first = idx.duplicated(keep='first')
  893. tm.assert_numpy_array_equal(res_first, exp_first)
  894. res_last = idx.duplicated(keep='last')
  895. tm.assert_numpy_array_equal(res_last, exp_last)
  896. res_false = idx.duplicated(keep=False)
  897. tm.assert_numpy_array_equal(res_false, exp_false)
  898. # series
  899. for s in [Series(case), Series(case, dtype='category'),
  900. Series(case, dtype=object)]:
  901. res_first = s.duplicated(keep='first')
  902. tm.assert_series_equal(res_first, Series(exp_first))
  903. res_last = s.duplicated(keep='last')
  904. tm.assert_series_equal(res_last, Series(exp_last))
  905. res_false = s.duplicated(keep=False)
  906. tm.assert_series_equal(res_false, Series(exp_false))
  907. def test_unique_index(self):
  908. cases = [Index([1, 2, 3]), pd.RangeIndex(0, 3)]
  909. for case in cases:
  910. assert case.is_unique is True
  911. tm.assert_numpy_array_equal(case.duplicated(),
  912. np.array([False, False, False]))
  913. @pytest.mark.parametrize('arr, unique', [
  914. ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
  915. [(0, 0), (0, 1), (1, 0), (1, 1)]),
  916. ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')],
  917. [('b', 'c'), ('a', 'b')]),
  918. ([('a', 1), ('b', 2), ('a', 3), ('a', 1)],
  919. [('a', 1), ('b', 2), ('a', 3)]),
  920. ])
  921. def test_unique_tuples(self, arr, unique):
  922. # https://github.com/pandas-dev/pandas/issues/16519
  923. expected = np.empty(len(unique), dtype=object)
  924. expected[:] = unique
  925. result = pd.unique(arr)
  926. tm.assert_numpy_array_equal(result, expected)
  927. class GroupVarTestMixin(object):
  928. def test_group_var_generic_1d(self):
  929. prng = RandomState(1234)
  930. out = (np.nan * np.ones((5, 1))).astype(self.dtype)
  931. counts = np.zeros(5, dtype='int64')
  932. values = 10 * prng.rand(15, 1).astype(self.dtype)
  933. labels = np.tile(np.arange(5), (3, )).astype('int64')
  934. expected_out = (np.squeeze(values)
  935. .reshape((5, 3), order='F')
  936. .std(axis=1, ddof=1) ** 2)[:, np.newaxis]
  937. expected_counts = counts + 3
  938. self.algo(out, counts, values, labels)
  939. assert np.allclose(out, expected_out, self.rtol)
  940. tm.assert_numpy_array_equal(counts, expected_counts)
  941. def test_group_var_generic_1d_flat_labels(self):
  942. prng = RandomState(1234)
  943. out = (np.nan * np.ones((1, 1))).astype(self.dtype)
  944. counts = np.zeros(1, dtype='int64')
  945. values = 10 * prng.rand(5, 1).astype(self.dtype)
  946. labels = np.zeros(5, dtype='int64')
  947. expected_out = np.array([[values.std(ddof=1) ** 2]])
  948. expected_counts = counts + 5
  949. self.algo(out, counts, values, labels)
  950. assert np.allclose(out, expected_out, self.rtol)
  951. tm.assert_numpy_array_equal(counts, expected_counts)
  952. def test_group_var_generic_2d_all_finite(self):
  953. prng = RandomState(1234)
  954. out = (np.nan * np.ones((5, 2))).astype(self.dtype)
  955. counts = np.zeros(5, dtype='int64')
  956. values = 10 * prng.rand(10, 2).astype(self.dtype)
  957. labels = np.tile(np.arange(5), (2, )).astype('int64')
  958. expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
  959. expected_counts = counts + 2
  960. self.algo(out, counts, values, labels)
  961. assert np.allclose(out, expected_out, self.rtol)
  962. tm.assert_numpy_array_equal(counts, expected_counts)
  963. def test_group_var_generic_2d_some_nan(self):
  964. prng = RandomState(1234)
  965. out = (np.nan * np.ones((5, 2))).astype(self.dtype)
  966. counts = np.zeros(5, dtype='int64')
  967. values = 10 * prng.rand(10, 2).astype(self.dtype)
  968. values[:, 1] = np.nan
  969. labels = np.tile(np.arange(5), (2, )).astype('int64')
  970. expected_out = np.vstack([values[:, 0]
  971. .reshape(5, 2, order='F')
  972. .std(ddof=1, axis=1) ** 2,
  973. np.nan * np.ones(5)]).T.astype(self.dtype)
  974. expected_counts = counts + 2
  975. self.algo(out, counts, values, labels)
  976. tm.assert_almost_equal(out, expected_out, check_less_precise=6)
  977. tm.assert_numpy_array_equal(counts, expected_counts)
  978. def test_group_var_constant(self):
  979. # Regression test from GH 10448.
  980. out = np.array([[np.nan]], dtype=self.dtype)
  981. counts = np.array([0], dtype='int64')
  982. values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
  983. labels = np.zeros(3, dtype='int64')
  984. self.algo(out, counts, values, labels)
  985. assert counts[0] == 3
  986. assert out[0, 0] >= 0
  987. tm.assert_almost_equal(out[0, 0], 0.0)
  988. class TestGroupVarFloat64(GroupVarTestMixin):
  989. __test__ = True
  990. algo = libgroupby.group_var_float64
  991. dtype = np.float64
  992. rtol = 1e-5
  993. def test_group_var_large_inputs(self):
  994. prng = RandomState(1234)
  995. out = np.array([[np.nan]], dtype=self.dtype)
  996. counts = np.array([0], dtype='int64')
  997. values = (prng.rand(10 ** 6) + 10 ** 12).astype(self.dtype)
  998. values.shape = (10 ** 6, 1)
  999. labels = np.zeros(10 ** 6, dtype='int64')
  1000. self.algo(out, counts, values, labels)
  1001. assert counts[0] == 10 ** 6
  1002. tm.assert_almost_equal(out[0, 0], 1.0 / 12, check_less_precise=True)
  1003. class TestGroupVarFloat32(GroupVarTestMixin):
  1004. __test__ = True
  1005. algo = libgroupby.group_var_float32
  1006. dtype = np.float32
  1007. rtol = 1e-2
  1008. class TestHashTable(object):
  1009. def test_lookup_nan(self, writable):
  1010. xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
  1011. # GH 21688 ensure we can deal with readonly memory views
  1012. xs.setflags(write=writable)
  1013. m = ht.Float64HashTable()
  1014. m.map_locations(xs)
  1015. tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
  1016. dtype=np.int64))
  1017. def test_add_signed_zeros(self):
  1018. # GH 21866 inconsistent hash-function for float64
  1019. # default hash-function would lead to different hash-buckets
  1020. # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
  1021. # but this would mean 16GB
  1022. N = 4 # 12 * 10**8 would trigger the error, if you have enough memory
  1023. m = ht.Float64HashTable(N)
  1024. m.set_item(0.0, 0)
  1025. m.set_item(-0.0, 0)
  1026. assert len(m) == 1 # 0.0 and -0.0 are equivalent
  1027. def test_add_different_nans(self):
  1028. # GH 21866 inconsistent hash-function for float64
  1029. # create different nans from bit-patterns:
  1030. NAN1 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000000))[0]
  1031. NAN2 = struct.unpack("d", struct.pack("=Q", 0x7ff8000000000001))[0]
  1032. assert NAN1 != NAN1
  1033. assert NAN2 != NAN2
  1034. # default hash function would lead to different hash-buckets
  1035. # for NAN1 and NAN2 even if there are only 4 buckets:
  1036. m = ht.Float64HashTable()
  1037. m.set_item(NAN1, 0)
  1038. m.set_item(NAN2, 0)
  1039. assert len(m) == 1 # NAN1 and NAN2 are equivalent
  1040. def test_lookup_overflow(self, writable):
  1041. xs = np.array([1, 2, 2**63], dtype=np.uint64)
  1042. # GH 21688 ensure we can deal with readonly memory views
  1043. xs.setflags(write=writable)
  1044. m = ht.UInt64HashTable()
  1045. m.map_locations(xs)
  1046. tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs),
  1047. dtype=np.int64))

    def test_get_unique(self):
        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
        exp = np.array([1, 2, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(s.unique(), exp)

    @pytest.mark.parametrize('nvals', [0, 10])  # resizing to 0 is special case
    @pytest.mark.parametrize('htable, uniques, dtype, safely_resizes', [
        (ht.PyObjectHashTable, ht.ObjectVector, 'object', False),
        (ht.StringHashTable, ht.ObjectVector, 'object', True),
        (ht.Float64HashTable, ht.Float64Vector, 'float64', False),
        (ht.Int64HashTable, ht.Int64Vector, 'int64', False),
        (ht.UInt64HashTable, ht.UInt64Vector, 'uint64', False)])
    def test_vector_resize(self, writable, htable, uniques, dtype,
                           safely_resizes, nvals):
        # Test for memory errors after internal vector
        # reallocations (GH 7157)
        vals = np.array(np.random.randn(1000), dtype=dtype)

        # GH 21688 ensures we can deal with read-only memory views
        vals.setflags(write=writable)

        # initialise instances; cannot initialise in parametrization,
        # as otherwise external views would be held on the array (which is
        # one of the things this test is checking)
        htable = htable()
        uniques = uniques()

        # get_labels may append to uniques
        htable.get_labels(vals[:nvals], uniques, 0, -1)
        # to_array() sets an external_view_exists flag on uniques.
        tmp = uniques.to_array()
        oldshape = tmp.shape

        # subsequent get_labels() calls can no longer append to it
        # (except for StringHashTables + ObjectVector)
        if safely_resizes:
            htable.get_labels(vals, uniques, 0, -1)
        else:
            with pytest.raises(ValueError, match='external reference.*'):
                htable.get_labels(vals, uniques, 0, -1)

        uniques.to_array()  # should not raise here
        assert tmp.shape == oldshape
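
    def test_external_view_resize_sketch(self):
        # Illustrative sketch using plain numpy (no hashtable internals):
        # reallocating a buffer while an external view is held is unsafe,
        # because the view would be left pointing at freed memory.  numpy
        # refuses outright via refcheck; the vectors above instead flag the
        # external view and raise on a later append, as checked above.
        arr = np.arange(10)
        view = arr[:]          # external view onto arr's buffer
        with pytest.raises(ValueError):
            arr.resize(20)     # refcheck detects the outstanding reference
        assert view.base is arr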

    @pytest.mark.parametrize('htable, tm_dtype', [
        (ht.PyObjectHashTable, 'String'),
        (ht.StringHashTable, 'String'),
        (ht.Float64HashTable, 'Float'),
        (ht.Int64HashTable, 'Int'),
        (ht.UInt64HashTable, 'UInt')])
    def test_hashtable_unique(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, 'make' + tm_dtype + 'Index')
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.unique()
        expected_unique = s_duplicated.drop_duplicates(keep='first').values
        result_unique = htable().unique(s_duplicated.values)
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # test return_inverse=True
        # reconstruction can only succeed if the inverse is correct
        result_unique, result_inverse = htable().unique(s_duplicated.values,
                                                        return_inverse=True)
        tm.assert_numpy_array_equal(result_unique, expected_unique)
        reconstr = result_unique[result_inverse]
        tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
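
    def test_unique_inverse_roundtrip_sketch(self):
        # Illustrative sketch of the unique/inverse contract checked above,
        # stated with plain numpy: indexing the unique values with the
        # inverse indices reproduces the original array.  Note np.unique
        # returns sorted uniques, whereas ht.unique keeps the order of
        # first occurrence.
        vals = np.array([3.0, 1.0, 3.0, 2.0, 1.0])
        uniq, inv = np.unique(vals, return_inverse=True)
        tm.assert_numpy_array_equal(uniq[inv], vals)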

    @pytest.mark.parametrize('htable, tm_dtype', [
        (ht.PyObjectHashTable, 'String'),
        (ht.StringHashTable, 'String'),
        (ht.Float64HashTable, 'Float'),
        (ht.Int64HashTable, 'Int'),
        (ht.UInt64HashTable, 'UInt')])
    def test_hashtable_factorize(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, 'make' + tm_dtype + 'Index')
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, pd.NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)
        na_mask = s_duplicated.isna().values

        result_unique, result_inverse = htable().factorize(s_duplicated.values)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.factorize()
        # since factorize removes all NaNs, we do the same here
        expected_unique = s_duplicated.dropna().drop_duplicates().values
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # reconstruction can only succeed if the inverse is correct. Since
        # factorize removes the NaNs, those have to be excluded here as well
        result_reconstruct = result_unique[result_inverse[~na_mask]]
        expected_reconstruct = s_duplicated.dropna().values
        tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)

    @pytest.mark.parametrize('hashtable', [
        ht.PyObjectHashTable, ht.StringHashTable,
        ht.Float64HashTable, ht.Int64HashTable, ht.UInt64HashTable])
    def test_hashtable_large_sizehint(self, hashtable):
        # GH 22729
        size_hint = np.iinfo(np.uint32).max + 1
        tbl = hashtable(size_hint=size_hint)  # noqa


def test_quantile():
    s = Series(np.random.randn(100))

    result = algos.quantile(s, [0, .25, .5, .75, 1.])
    expected = algos.quantile(s.values, [0, .25, .5, .75, 1.])
    tm.assert_almost_equal(result, expected)


def test_unique_label_indices():
    a = np.random.randint(1, 1 << 10, 1 << 15).astype('i8')

    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1]

    tm.assert_numpy_array_equal(left, right, check_dtype=False)

    a[np.random.choice(len(a), 10)] = -1

    left = ht.unique_label_indices(a)
    right = np.unique(a, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(left, right, check_dtype=False)


class TestRank(object):

    @td.skip_if_no_scipy
    def test_scipy_compat(self):
        from scipy.stats import rankdata

        def _check(arr):
            mask = ~np.isfinite(arr)
            arr = arr.copy()
            result = libalgos.rank_1d_float64(arr)
            arr[mask] = np.inf
            exp = rankdata(arr)
            exp[mask] = nan
            assert_almost_equal(result, exp)

        _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan]))
        _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan]))

    def test_basic(self):
        exp = np.array([1, 2], dtype=np.float64)

        for dtype in np.typecodes['AllInteger']:
            s = Series([1, 100], dtype=dtype)
            tm.assert_numpy_array_equal(algos.rank(s), exp)

    def test_uint64_overflow(self):
        exp = np.array([1, 2], dtype=np.float64)

        for dtype in [np.float64, np.uint64]:
            s = Series([1, 2**63], dtype=dtype)
            tm.assert_numpy_array_equal(algos.rank(s), exp)

    def test_too_many_ndims(self):
        arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
        msg = "Array with ndim > 2 are not supported"

        with pytest.raises(TypeError, match=msg):
            algos.rank(arr)

    @pytest.mark.single
    @pytest.mark.parametrize('values', [
        np.arange(2**24 + 1),
        np.arange(2**25 + 2).reshape(2**24 + 1, 2)],
        ids=['1d', '2d'])
    def test_pct_max_many_rows(self, values):
        # GH 18271
        result = algos.rank(values, pct=True).max()
        assert result == 1
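
    def test_pct_many_rows_precision_sketch(self):
        # Illustrative sketch: 2**24 + 1 is the smallest positive integer
        # that cannot be represented exactly in float32, which is presumably
        # why the parametrization above crosses that boundary -- percentage
        # ranks must stay exact even once element counts leave the range of
        # exact float32 integers.
        assert np.float32(2**24) == 2**24
        assert np.float32(2**24 + 1) != 2**24 + 1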


def test_pad_backfill_object_segfault():

    old = np.array([], dtype='O')
    new = np.array([datetime(2010, 12, 31)], dtype='O')

    result = libalgos.pad["object"](old, new)
    expected = np.array([-1], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.pad["object"](new, old)
    expected = np.array([], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.backfill["object"](old, new)
    expected = np.array([-1], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.backfill["object"](new, old)
    expected = np.array([], dtype=np.int64)
    tm.assert_numpy_array_equal(result, expected)


def test_arrmap():
    values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O')
    result = libalgos.arrmap_object(values, lambda x: x in ['foo', 'bar'])
    assert (result.dtype == np.bool_)


class TestTseriesUtil(object):

    def test_combineFunc(self):
        pass

    def test_reindex(self):
        pass

    def test_isna(self):
        pass

    def test_groupby(self):
        pass

    def test_groupby_withnull(self):
        pass

    def test_backfill(self):
        old = Index([1, 5, 10])
        new = Index(lrange(12))

        filler = libalgos.backfill["int64_t"](old.values, new.values)

        expect_filler = np.array([0, 0, 1, 1, 1, 1,
                                  2, 2, 2, 2, 2, -1], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case
        old = Index([1, 4])
        new = Index(lrange(5, 10))
        filler = libalgos.backfill["int64_t"](old.values, new.values)

        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)

    def test_pad(self):
        old = Index([1, 5, 10])
        new = Index(lrange(12))

        filler = libalgos.pad["int64_t"](old.values, new.values)

        expect_filler = np.array([-1, 0, 0, 0, 0, 1,
                                  1, 1, 1, 1, 2, 2], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case
        old = Index([5, 10])
        new = Index(lrange(5))
        filler = libalgos.pad["int64_t"](old.values, new.values)
        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64)
        tm.assert_numpy_array_equal(filler, expect_filler)
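
    def test_pad_backfill_get_indexer_sketch(self):
        # Illustrative sketch: for a unique, monotonic old index the
        # low-level pad/backfill fillers above should agree with the public
        # Index.get_indexer API (dtype not compared, since get_indexer
        # returns platform ints).
        old = Index([1, 5, 10])
        new = Index(lrange(12))

        expected_pad = libalgos.pad["int64_t"](old.values, new.values)
        expected_backfill = libalgos.backfill["int64_t"](old.values,
                                                         new.values)
        tm.assert_numpy_array_equal(old.get_indexer(new, method='pad'),
                                    expected_pad, check_dtype=False)
        tm.assert_numpy_array_equal(old.get_indexer(new, method='backfill'),
                                    expected_backfill, check_dtype=False)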


def test_is_lexsorted():
    failure = [
        np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 dtype='int64'),
        np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
                  17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
                  3, 2, 1, 0,
                  30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
                  17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
                  3, 2, 1, 0,
                  30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
                  17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
                  3, 2, 1, 0,
                  30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
                  17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
                  3, 2, 1, 0],
                 dtype='int64')]

    assert (not libalgos.is_lexsorted(failure))
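

def test_is_lexsorted_definition_sketch():
    # Illustrative sketch of the property being checked above, stated with
    # plain Python tuples rather than the cython routine: a list of label
    # arrays is lexsorted when the per-position key tuples are already in
    # sorted order.  The "failure" case above starts with a descending
    # first key, so it cannot be lexsorted.
    keys = list(zip([3, 3, 2, 2, 1, 0], [30, 29, 30, 29, 30, 30]))
    assert keys != sorted(keys)

    keys = list(zip([0, 0, 1, 1], [0, 1, 0, 1]))
    assert keys == sorted(keys)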


def test_groupsort_indexer():
    a = np.random.randint(0, 1000, 100).astype(np.int64)
    b = np.random.randint(0, 1000, 100).astype(np.int64)

    result = libalgos.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    # np.argsort returns int, groupsort_indexer
    # always returns int64
    expected = np.argsort(a, kind='mergesort')
    expected = expected.astype(np.int64)

    tm.assert_numpy_array_equal(result, expected)

    # compare with lexsort
    # np.lexsort returns int, groupsort_indexer
    # always returns int64
    key = a * 1000 + b
    result = libalgos.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    expected = expected.astype(np.int64)

    tm.assert_numpy_array_equal(result, expected)


def test_infinity_sort():
    # GH 13445
    # numpy's argsort can be unhappy if something is less than
    # itself. Instead, let's give our infinities a self-consistent
    # ordering, but outside the float extended real line.
    Inf = libalgos.Infinity()
    NegInf = libalgos.NegInfinity()

    ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf]

    assert all(Inf >= x for x in ref_nums)
    assert all(Inf > x or x is Inf for x in ref_nums)
    assert Inf >= Inf and Inf == Inf
    assert not Inf < Inf and not Inf > Inf
    assert libalgos.Infinity() == libalgos.Infinity()
    assert not libalgos.Infinity() != libalgos.Infinity()

    assert all(NegInf <= x for x in ref_nums)
    assert all(NegInf < x or x is NegInf for x in ref_nums)
    assert NegInf <= NegInf and NegInf == NegInf
    assert not NegInf < NegInf and not NegInf > NegInf
    assert libalgos.NegInfinity() == libalgos.NegInfinity()
    assert not libalgos.NegInfinity() != libalgos.NegInfinity()

    for perm in permutations(ref_nums):
        assert sorted(perm) == ref_nums

    # smoke tests
    np.array([libalgos.Infinity()] * 32).argsort()
    np.array([libalgos.NegInfinity()] * 32).argsort()


def test_infinity_against_nan():
    Inf = libalgos.Infinity()
    NegInf = libalgos.NegInfinity()

    assert not Inf > np.nan
    assert not Inf >= np.nan
    assert not Inf < np.nan
    assert not Inf <= np.nan
    assert not Inf == np.nan
    assert Inf != np.nan

    assert not NegInf > np.nan
    assert not NegInf >= np.nan
    assert not NegInf < np.nan
    assert not NegInf <= np.nan
    assert not NegInf == np.nan
    assert NegInf != np.nan


def test_ensure_platform_int():
    arr = np.arange(100, dtype=np.intp)
    result = libalgos.ensure_platform_int(arr)
    assert (result is arr)


def test_int64_add_overflow():
    # see gh-14068
    msg = "Overflow in int64 addition"
    m = np.iinfo(np.int64).max
    n = np.iinfo(np.int64).min

    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), m)
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([n, n]), n)
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   arr_mask=np.array([False, True]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   b_mask=np.array([False, True]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   arr_mask=np.array([False, True]),
                                   b_mask=np.array([False, True]))
    with pytest.raises(OverflowError, match=msg):
        with tm.assert_produces_warning(RuntimeWarning):
            algos.checked_add_with_arr(np.array([m, m]),
                                       np.array([np.nan, m]))

    # Check that the nan boolean arrays override whether or not
    # the addition overflows. We don't check the result but just
    # the fact that an OverflowError is not raised.
    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                               arr_mask=np.array([True, True]))
    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                               b_mask=np.array([True, True]))
    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                               arr_mask=np.array([True, False]),
                               b_mask=np.array([False, True]))
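

def test_int64_add_overflow_arithmetic_sketch():
    # Illustrative sketch of the arithmetic behind the checks above, using
    # exact Python integers: adding the int64 max (or min) to itself lands
    # outside the representable int64 range, so a checked addition has to
    # raise rather than silently wrap around.
    m = np.iinfo(np.int64).max   # 2**63 - 1
    n = np.iinfo(np.int64).min   # -2**63
    assert int(m) + int(m) > np.iinfo(np.int64).max
    assert int(n) + int(n) < np.iinfo(np.int64).min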


class TestMode(object):

    def test_no_mode(self):
        exp = Series([], dtype=np.float64)
        tm.assert_series_equal(algos.mode([]), exp)

    def test_mode_single(self):
        # GH 15714
        exp_single = [1]
        data_single = [1]

        exp_multi = [1]
        data_multi = [1, 1]

        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
            s = Series(data_single, dtype=dt)
            exp = Series(exp_single, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

            s = Series(data_multi, dtype=dt)
            exp = Series(exp_multi, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

        exp = Series([1], dtype=np.int)
        tm.assert_series_equal(algos.mode([1]), exp)

        exp = Series(['a', 'b', 'c'], dtype=np.object)
        tm.assert_series_equal(algos.mode(['a', 'b', 'c']), exp)

    def test_number_mode(self):
        exp_single = [1]
        data_single = [1] * 5 + [2] * 3

        exp_multi = [1, 3]
        data_multi = [1] * 5 + [2] * 3 + [3] * 5

        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
            s = Series(data_single, dtype=dt)
            exp = Series(exp_single, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

            s = Series(data_multi, dtype=dt)
            exp = Series(exp_multi, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

    def test_strobj_mode(self):
        exp = ['b']
        data = ['a'] * 2 + ['b'] * 3

        s = Series(data, dtype='c')
        exp = Series(exp, dtype='c')
        tm.assert_series_equal(algos.mode(s), exp)

        exp = ['bar']
        data = ['foo'] * 2 + ['bar'] * 3

        for dt in [str, object]:
            s = Series(data, dtype=dt)
            exp = Series(exp, dtype=dt)
            tm.assert_series_equal(algos.mode(s), exp)

    def test_datelike_mode(self):
        exp = Series(['1900-05-03', '2011-01-03',
                      '2013-01-02'], dtype="M8[ns]")
        s = Series(['2011-01-03', '2013-01-02',
                    '1900-05-03'], dtype='M8[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

        exp = Series(['2011-01-03', '2013-01-02'], dtype='M8[ns]')
        s = Series(['2011-01-03', '2013-01-02', '1900-05-03',
                    '2011-01-03', '2013-01-02'], dtype='M8[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

    def test_timedelta_mode(self):
        exp = Series(['-1 days', '0 days', '1 days'],
                     dtype='timedelta64[ns]')
        s = Series(['1 days', '-1 days', '0 days'],
                   dtype='timedelta64[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

        exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]')
        s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min',
                    '2 min', '2 min'], dtype='timedelta64[ns]')
        tm.assert_series_equal(algos.mode(s), exp)

    def test_mixed_dtype(self):
        exp = Series(['foo'])
        s = Series([1, 'foo', 'foo'])
        tm.assert_series_equal(algos.mode(s), exp)

    def test_uint64_overflow(self):
        exp = Series([2**63], dtype=np.uint64)
        s = Series([1, 2**63, 2**63], dtype=np.uint64)
        tm.assert_series_equal(algos.mode(s), exp)

        exp = Series([1, 2**63], dtype=np.uint64)
        s = Series([1, 2**63], dtype=np.uint64)
        tm.assert_series_equal(algos.mode(s), exp)

    def test_categorical(self):
        c = Categorical([1, 2])
        exp = c
        tm.assert_categorical_equal(algos.mode(c), exp)
        tm.assert_categorical_equal(c.mode(), exp)

        c = Categorical([1, 'a', 'a'])
        exp = Categorical(['a'], categories=[1, 'a'])
        tm.assert_categorical_equal(algos.mode(c), exp)
        tm.assert_categorical_equal(c.mode(), exp)

        c = Categorical([1, 1, 2, 3, 3])
        exp = Categorical([1, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(algos.mode(c), exp)
        tm.assert_categorical_equal(c.mode(), exp)

    def test_index(self):
        idx = Index([1, 2, 3])
        exp = Series([1, 2, 3], dtype=np.int64)
        tm.assert_series_equal(algos.mode(idx), exp)

        idx = Index([1, 'a', 'a'])
        exp = Series(['a'], dtype=object)
        tm.assert_series_equal(algos.mode(idx), exp)

        idx = Index([1, 1, 2, 3, 3])
        exp = Series([1, 3], dtype=np.int64)
        tm.assert_series_equal(algos.mode(idx), exp)

        exp = Series(['2 min', '1 day'], dtype='timedelta64[ns]')
        idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min',
                     '2 min', '2 min'], dtype='timedelta64[ns]')
        tm.assert_series_equal(algos.mode(idx), exp)