test_numeric.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. import decimal
  2. import numpy as np
  3. from numpy import iinfo
  4. import pytest
  5. import pandas as pd
  6. from pandas import to_numeric
  7. from pandas.util import testing as tm
  8. class TestToNumeric(object):
  9. def test_empty(self):
  10. # see gh-16302
  11. s = pd.Series([], dtype=object)
  12. res = to_numeric(s)
  13. expected = pd.Series([], dtype=np.int64)
  14. tm.assert_series_equal(res, expected)
  15. # Original issue example
  16. res = to_numeric(s, errors='coerce', downcast='integer')
  17. expected = pd.Series([], dtype=np.int8)
  18. tm.assert_series_equal(res, expected)
  19. def test_series(self):
  20. s = pd.Series(['1', '-3.14', '7'])
  21. res = to_numeric(s)
  22. expected = pd.Series([1, -3.14, 7])
  23. tm.assert_series_equal(res, expected)
  24. s = pd.Series(['1', '-3.14', 7])
  25. res = to_numeric(s)
  26. tm.assert_series_equal(res, expected)
  27. def test_series_numeric(self):
  28. s = pd.Series([1, 3, 4, 5], index=list('ABCD'), name='XXX')
  29. res = to_numeric(s)
  30. tm.assert_series_equal(res, s)
  31. s = pd.Series([1., 3., 4., 5.], index=list('ABCD'), name='XXX')
  32. res = to_numeric(s)
  33. tm.assert_series_equal(res, s)
  34. # bool is regarded as numeric
  35. s = pd.Series([True, False, True, True],
  36. index=list('ABCD'), name='XXX')
  37. res = to_numeric(s)
  38. tm.assert_series_equal(res, s)
  def test_error(self):
      """Check each ``errors`` mode on a Series holding a bad string."""
      mixed = pd.Series([1, -3.14, 'apple'])

      # errors='raise' names the offending string and its position.
      with pytest.raises(
              ValueError,
              match='Unable to parse string "apple" at position 2'):
          to_numeric(mixed, errors='raise')

      # errors='ignore' is expected to give back the same values.
      ignored = to_numeric(mixed, errors='ignore')
      tm.assert_series_equal(ignored, pd.Series([1, -3.14, 'apple']))

      # errors='coerce' is expected to turn the bad entry into NaN.
      coerced = to_numeric(mixed, errors='coerce')
      tm.assert_series_equal(coerced, pd.Series([1, -3.14, np.nan]))

      # With two bad strings, the first one (position 0) is reported.
      leading_bad = pd.Series(['orange', 1, -3.14, 'apple'])
      with pytest.raises(
              ValueError,
              match='Unable to parse string "orange" at position 0'):
          to_numeric(leading_bad, errors='raise')
  def test_error_seen_bool(self):
      """Check each ``errors`` mode when bools precede a bad string."""
      flags = pd.Series([True, False, 'apple'])

      with pytest.raises(
              ValueError,
              match='Unable to parse string "apple" at position 2'):
          to_numeric(flags, errors='raise')

      untouched = to_numeric(flags, errors='ignore')
      tm.assert_series_equal(untouched, pd.Series([True, False, 'apple']))

      # coerces to float
      as_float = to_numeric(flags, errors='coerce')
      tm.assert_series_equal(as_float, pd.Series([1., 0., np.nan]))
  66. def test_list(self):
  67. s = ['1', '-3.14', '7']
  68. res = to_numeric(s)
  69. expected = np.array([1, -3.14, 7])
  70. tm.assert_numpy_array_equal(res, expected)
  71. def test_list_numeric(self):
  72. s = [1, 3, 4, 5]
  73. res = to_numeric(s)
  74. tm.assert_numpy_array_equal(res, np.array(s, dtype=np.int64))
  75. s = [1., 3., 4., 5.]
  76. res = to_numeric(s)
  77. tm.assert_numpy_array_equal(res, np.array(s))
  78. # bool is regarded as numeric
  79. s = [True, False, True, True]
  80. res = to_numeric(s)
  81. tm.assert_numpy_array_equal(res, np.array(s))
  82. def test_numeric(self):
  83. s = pd.Series([1, -3.14, 7], dtype='O')
  84. res = to_numeric(s)
  85. expected = pd.Series([1, -3.14, 7])
  86. tm.assert_series_equal(res, expected)
  87. s = pd.Series([1, -3.14, 7])
  88. res = to_numeric(s)
  89. tm.assert_series_equal(res, expected)
  90. # GH 14827
  91. df = pd.DataFrame(dict(
  92. a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'],
  93. b=[1.0, 2.0, 3.0, 4.0],
  94. ))
  95. expected = pd.DataFrame(dict(
  96. a=[1.2, 3.14, np.inf, 0.1],
  97. b=[1.0, 2.0, 3.0, 4.0],
  98. ))
  99. # Test to_numeric over one column
  100. df_copy = df.copy()
  101. df_copy['a'] = df_copy['a'].apply(to_numeric)
  102. tm.assert_frame_equal(df_copy, expected)
  103. # Test to_numeric over multiple columns
  104. df_copy = df.copy()
  105. df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric)
  106. tm.assert_frame_equal(df_copy, expected)
  107. def test_numeric_lists_and_arrays(self):
  108. # Test to_numeric with embedded lists and arrays
  109. df = pd.DataFrame(dict(
  110. a=[[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1]
  111. ))
  112. df['a'] = df['a'].apply(to_numeric)
  113. expected = pd.DataFrame(dict(
  114. a=[[3.14, 1.0], 1.6, 0.1],
  115. ))
  116. tm.assert_frame_equal(df, expected)
  117. df = pd.DataFrame(dict(
  118. a=[np.array([decimal.Decimal(3.14), 1.0]), 0.1]
  119. ))
  120. df['a'] = df['a'].apply(to_numeric)
  121. expected = pd.DataFrame(dict(
  122. a=[[3.14, 1.0], 0.1],
  123. ))
  124. tm.assert_frame_equal(df, expected)
  125. def test_all_nan(self):
  126. s = pd.Series(['a', 'b', 'c'])
  127. res = to_numeric(s, errors='coerce')
  128. expected = pd.Series([np.nan, np.nan, np.nan])
  129. tm.assert_series_equal(res, expected)
  @pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
  def test_type_check(self, errors):
      """A DataFrame argument raises TypeError for every ``errors`` mode."""
      # see gh-11776
      frame = pd.DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})

      # For the None case the keyword is omitted so the default applies.
      kwargs = {} if errors is None else {"errors": errors}

      with pytest.raises(TypeError, match="1-d array"):
          to_numeric(frame, **kwargs)
  def test_scalar(self):
      """Scalars convert directly; a bad string follows the ``errors`` mode.

      Numeric scalars and numeric strings compare equal to their numeric
      value after conversion. The unparseable string ``'XX'`` raises
      ``ValueError`` under ``errors='raise'``, is returned as-is under
      ``errors='ignore'`` and becomes NaN under ``errors='coerce'``.
      """
      assert pd.to_numeric(1) == 1
      assert pd.to_numeric(1.1) == 1.1

      assert pd.to_numeric('1') == 1
      assert pd.to_numeric('1.1') == 1.1

      # Pin the message so that an unrelated ValueError cannot satisfy
      # the check.
      with pytest.raises(ValueError, match='Unable to parse string "XX"'):
          to_numeric('XX', errors='raise')

      assert to_numeric('XX', errors='ignore') == 'XX'
      assert np.isnan(to_numeric('XX', errors='coerce'))
  147. def test_numeric_dtypes(self):
  148. idx = pd.Index([1, 2, 3], name='xxx')
  149. res = pd.to_numeric(idx)
  150. tm.assert_index_equal(res, idx)
  151. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  152. tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
  153. res = pd.to_numeric(idx.values)
  154. tm.assert_numpy_array_equal(res, idx.values)
  155. idx = pd.Index([1., np.nan, 3., np.nan], name='xxx')
  156. res = pd.to_numeric(idx)
  157. tm.assert_index_equal(res, idx)
  158. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  159. tm.assert_series_equal(res, pd.Series(idx, name='xxx'))
  160. res = pd.to_numeric(idx.values)
  161. tm.assert_numpy_array_equal(res, idx.values)
  162. def test_str(self):
  163. idx = pd.Index(['1', '2', '3'], name='xxx')
  164. exp = np.array([1, 2, 3], dtype='int64')
  165. res = pd.to_numeric(idx)
  166. tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
  167. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  168. tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
  169. res = pd.to_numeric(idx.values)
  170. tm.assert_numpy_array_equal(res, exp)
  171. idx = pd.Index(['1.5', '2.7', '3.4'], name='xxx')
  172. exp = np.array([1.5, 2.7, 3.4])
  173. res = pd.to_numeric(idx)
  174. tm.assert_index_equal(res, pd.Index(exp, name='xxx'))
  175. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  176. tm.assert_series_equal(res, pd.Series(exp, name='xxx'))
  177. res = pd.to_numeric(idx.values)
  178. tm.assert_numpy_array_equal(res, exp)
  179. def test_datetime_like(self, tz_naive_fixture):
  180. idx = pd.date_range("20130101", periods=3,
  181. tz=tz_naive_fixture, name="xxx")
  182. res = pd.to_numeric(idx)
  183. tm.assert_index_equal(res, pd.Index(idx.asi8, name="xxx"))
  184. res = pd.to_numeric(pd.Series(idx, name="xxx"))
  185. tm.assert_series_equal(res, pd.Series(idx.asi8, name="xxx"))
  186. res = pd.to_numeric(idx.values)
  187. tm.assert_numpy_array_equal(res, idx.asi8)
  188. def test_timedelta(self):
  189. idx = pd.timedelta_range('1 days', periods=3, freq='D', name='xxx')
  190. res = pd.to_numeric(idx)
  191. tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
  192. res = pd.to_numeric(pd.Series(idx, name='xxx'))
  193. tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
  194. res = pd.to_numeric(idx.values)
  195. tm.assert_numpy_array_equal(res, idx.asi8)
  196. def test_period(self):
  197. idx = pd.period_range('2011-01', periods=3, freq='M', name='xxx')
  198. res = pd.to_numeric(idx)
  199. tm.assert_index_equal(res, pd.Index(idx.asi8, name='xxx'))
  200. # TODO: enable when we can support native PeriodDtype
  201. # res = pd.to_numeric(pd.Series(idx, name='xxx'))
  202. # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx'))
  def test_non_hashable(self):
      """A list element in the Series is handled in every ``errors`` mode."""
      # Test for Bug #13324
      nested = pd.Series([[10.0, 2], 1.0, 'apple'])

      coerced = pd.to_numeric(nested, errors='coerce')
      tm.assert_series_equal(coerced, pd.Series([np.nan, 1.0, np.nan]))

      ignored = pd.to_numeric(nested, errors='ignore')
      tm.assert_series_equal(ignored,
                             pd.Series([[10.0, 2], 1.0, 'apple']))

      # No ``errors`` argument is passed here, so the default mode applies.
      with pytest.raises(TypeError, match="Invalid object type"):
          pd.to_numeric(nested)
  @pytest.mark.parametrize("data", [
      ["1", 2, 3],
      [1, 2, 3],
      np.array(["1970-01-02", "1970-01-03",
                "1970-01-04"], dtype="datetime64[D]")
  ])
  def test_downcast_basic(self, data):
      """Check ``downcast`` validation and the basic downcast targets."""
      # see gh-13352
      with pytest.raises(ValueError,
                         match="invalid downcasting method provided"):
          pd.to_numeric(data, downcast="unsigned-integer")

      # Basic function tests.
      int64_expected = np.array([1, 2, 3], dtype=np.int64)
      tm.assert_numpy_array_equal(pd.to_numeric(data), int64_expected)
      tm.assert_numpy_array_equal(pd.to_numeric(data, downcast=None),
                                  int64_expected)

      # Basic dtype support.
      smallest_uint_dtype = np.dtype(np.typecodes["UnsignedInteger"][0])

      # Support below np.float32 is rare and far between.
      smallest_float_dtype = np.dtype(np.float32).char

      targets = (
          ("unsigned", smallest_uint_dtype),
          ("float", smallest_float_dtype),
      )
      for method, target_dtype in targets:
          expected = np.array([1, 2, 3], dtype=target_dtype)
          res = pd.to_numeric(data, downcast=method)
          tm.assert_numpy_array_equal(res, expected)
  @pytest.mark.parametrize("signed_downcast", ["integer", "signed"])
  @pytest.mark.parametrize("data", [
      ["1", 2, 3],
      [1, 2, 3],
      np.array(["1970-01-02", "1970-01-03",
                "1970-01-04"], dtype="datetime64[D]")
  ])
  def test_signed_downcast(self, data, signed_downcast):
      """Both signed downcast spellings give NumPy's first Integer code."""
      # see gh-13352
      smallest_int_dtype = np.dtype(np.typecodes["Integer"][0])

      res = pd.to_numeric(data, downcast=signed_downcast)
      tm.assert_numpy_array_equal(
          res, np.array([1, 2, 3], dtype=smallest_int_dtype))
  254. def test_ignore_downcast_invalid_data(self):
  255. # If we can't successfully cast the given
  256. # data to a numeric dtype, do not bother
  257. # with the downcast parameter.
  258. data = ["foo", 2, 3]
  259. expected = np.array(data, dtype=object)
  260. res = pd.to_numeric(data, errors="ignore",
  261. downcast="unsigned")
  262. tm.assert_numpy_array_equal(res, expected)
  263. def test_ignore_downcast_neg_to_unsigned(self):
  264. # Cannot cast to an unsigned integer
  265. # because we have a negative number.
  266. data = ["-1", 2, 3]
  267. expected = np.array([-1, 2, 3], dtype=np.int64)
  268. res = pd.to_numeric(data, downcast="unsigned")
  269. tm.assert_numpy_array_equal(res, expected)
  @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
  @pytest.mark.parametrize("data,expected", [
      (["1.1", 2, 3],
       np.array([1.1, 2, 3], dtype=np.float64)),
      ([10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
       np.array([10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
                dtype=np.float64)),
  ])
  def test_ignore_downcast_cannot_convert_float(
          self, data, expected, downcast):
      """Non-integral floats stay float64 under an integer downcast."""
      # Cannot cast to an integer (signed or unsigned)
      # because we have a float number.
      converted = pd.to_numeric(data, downcast=downcast)
      tm.assert_numpy_array_equal(converted, expected)
  @pytest.mark.parametrize("downcast,expected_dtype", [
      ("integer", np.int16),
      ("signed", np.int16),
      ("unsigned", np.uint16),
  ])
  def test_downcast_not8bit(self, downcast, expected_dtype):
      """Values from 256 to 258 downcast to a 16-bit integer dtype."""
      # the smallest integer dtype need not be np.(u)int8
      res = pd.to_numeric(["256", 257, 258], downcast=downcast)
      tm.assert_numpy_array_equal(
          res, np.array([256, 257, 258], dtype=expected_dtype))
  @pytest.mark.parametrize("dtype,downcast,min_max", [
      # Exact limits of each dtype.
      ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]),
      ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]),
      ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]),
      ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]),
      ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]),
      ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]),
      ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]),
      ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]),
      # One above the smaller signed dtype's maximum.
      ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]),
      ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]),
      ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]),
      # One below the smaller signed dtype's minimum.
      ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]),
      ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]),
      ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]),
      # One above the smaller unsigned dtype's maximum.
      ("uint16", "unsigned", [iinfo(np.uint8).min,
                              iinfo(np.uint8).max + 1]),
      ("uint32", "unsigned", [iinfo(np.uint16).min,
                              iinfo(np.uint16).max + 1]),
      ("uint64", "unsigned", [iinfo(np.uint32).min,
                              iinfo(np.uint32).max + 1]),
  ])
  def test_downcast_limits(self, dtype, downcast, min_max):
      """The downcast result for each ``min_max`` pair has dtype ``dtype``."""
      # see gh-14404: test the limits of each downcast.
      bounds = pd.Series(min_max)
      downcasted = pd.to_numeric(bounds, downcast=downcast)
      assert downcasted.dtype == dtype
  def test_coerce_uint64_conflict(self):
      """Coercing uint64-sized values alongside bad entries gives floats."""
      # see gh-17007 and gh-17125
      #
      # Still returns float despite the uint64-nan conflict,
      # which would normally force the casting to object.
      frame = pd.DataFrame(
          {"a": [200, 300, "", "NaN", 30000000000000000000]})
      result = to_numeric(frame["a"], errors="coerce")
      expected = pd.Series(
          [200, 300, np.nan, np.nan, 30000000000000000000],
          dtype=float, name="a")
      tm.assert_series_equal(result, expected)

      text = pd.Series(["12345678901234567890", "1234567890", "ITEM"])
      result = to_numeric(text, errors="coerce")
      expected = pd.Series(
          [12345678901234567890, 1234567890, np.nan], dtype=float)
      tm.assert_series_equal(result, expected)

      # For completeness, check against "ignore" and "raise"
      tm.assert_series_equal(to_numeric(text, errors="ignore"), text)

      with pytest.raises(ValueError, match="Unable to parse string"):
          to_numeric(text, errors="raise")