test_combine_concat.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. # coding=utf-8
  2. # pylint: disable-msg=E1101,W0612
  3. from datetime import datetime
  4. import numpy as np
  5. from numpy import nan
  6. import pytest
  7. import pandas as pd
  8. from pandas import DataFrame, DatetimeIndex, Series, compat, date_range
  9. import pandas.util.testing as tm
  10. from pandas.util.testing import assert_frame_equal, assert_series_equal
  11. class TestSeriesCombine(object):
  12. def test_append(self, datetime_series, string_series, object_series):
  13. appendedSeries = string_series.append(object_series)
  14. for idx, value in compat.iteritems(appendedSeries):
  15. if idx in string_series.index:
  16. assert value == string_series[idx]
  17. elif idx in object_series.index:
  18. assert value == object_series[idx]
  19. else:
  20. raise AssertionError("orphaned index!")
  21. msg = "Indexes have overlapping values:"
  22. with pytest.raises(ValueError, match=msg):
  23. datetime_series.append(datetime_series, verify_integrity=True)
  24. def test_append_many(self, datetime_series):
  25. pieces = [datetime_series[:5], datetime_series[5:10],
  26. datetime_series[10:]]
  27. result = pieces[0].append(pieces[1:])
  28. assert_series_equal(result, datetime_series)
  29. def test_append_duplicates(self):
  30. # GH 13677
  31. s1 = pd.Series([1, 2, 3])
  32. s2 = pd.Series([4, 5, 6])
  33. exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2])
  34. tm.assert_series_equal(s1.append(s2), exp)
  35. tm.assert_series_equal(pd.concat([s1, s2]), exp)
  36. # the result must have RangeIndex
  37. exp = pd.Series([1, 2, 3, 4, 5, 6])
  38. tm.assert_series_equal(s1.append(s2, ignore_index=True),
  39. exp, check_index_type=True)
  40. tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True),
  41. exp, check_index_type=True)
  42. msg = 'Indexes have overlapping values:'
  43. with pytest.raises(ValueError, match=msg):
  44. s1.append(s2, verify_integrity=True)
  45. with pytest.raises(ValueError, match=msg):
  46. pd.concat([s1, s2], verify_integrity=True)
  47. def test_combine_scalar(self):
  48. # GH 21248
  49. # Note - combine() with another Series is tested elsewhere because
  50. # it is used when testing operators
  51. s = pd.Series([i * 10 for i in range(5)])
  52. result = s.combine(3, lambda x, y: x + y)
  53. expected = pd.Series([i * 10 + 3 for i in range(5)])
  54. tm.assert_series_equal(result, expected)
  55. result = s.combine(22, lambda x, y: min(x, y))
  56. expected = pd.Series([min(i * 10, 22) for i in range(5)])
  57. tm.assert_series_equal(result, expected)
  58. def test_combine_first(self):
  59. values = tm.makeIntIndex(20).values.astype(float)
  60. series = Series(values, index=tm.makeIntIndex(20))
  61. series_copy = series * 2
  62. series_copy[::2] = np.NaN
  63. # nothing used from the input
  64. combined = series.combine_first(series_copy)
  65. tm.assert_series_equal(combined, series)
  66. # Holes filled from input
  67. combined = series_copy.combine_first(series)
  68. assert np.isfinite(combined).all()
  69. tm.assert_series_equal(combined[::2], series[::2])
  70. tm.assert_series_equal(combined[1::2], series_copy[1::2])
  71. # mixed types
  72. index = tm.makeStringIndex(20)
  73. floats = Series(tm.randn(20), index=index)
  74. strings = Series(tm.makeStringIndex(10), index=index[::2])
  75. combined = strings.combine_first(floats)
  76. tm.assert_series_equal(strings, combined.loc[index[::2]])
  77. tm.assert_series_equal(floats[1::2].astype(object),
  78. combined.loc[index[1::2]])
  79. # corner case
  80. s = Series([1., 2, 3], index=[0, 1, 2])
  81. result = s.combine_first(Series([], index=[]))
  82. assert_series_equal(s, result)
  83. def test_update(self):
  84. s = Series([1.5, nan, 3., 4., nan])
  85. s2 = Series([nan, 3.5, nan, 5.])
  86. s.update(s2)
  87. expected = Series([1.5, 3.5, 3., 5., np.nan])
  88. assert_series_equal(s, expected)
  89. # GH 3217
  90. df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
  91. df['c'] = np.nan
  92. df['c'].update(Series(['foo'], index=[0]))
  93. expected = DataFrame([[1, np.nan, 'foo'], [3, 2., np.nan]],
  94. columns=['a', 'b', 'c'])
  95. assert_frame_equal(df, expected)
  96. @pytest.mark.parametrize('other, dtype, expected', [
  97. # other is int
  98. ([61, 63], 'int32', pd.Series([10, 61, 12], dtype='int32')),
  99. ([61, 63], 'int64', pd.Series([10, 61, 12])),
  100. ([61, 63], float, pd.Series([10., 61., 12.])),
  101. ([61, 63], object, pd.Series([10, 61, 12], dtype=object)),
  102. # other is float, but can be cast to int
  103. ([61., 63.], 'int32', pd.Series([10, 61, 12], dtype='int32')),
  104. ([61., 63.], 'int64', pd.Series([10, 61, 12])),
  105. ([61., 63.], float, pd.Series([10., 61., 12.])),
  106. ([61., 63.], object, pd.Series([10, 61., 12], dtype=object)),
  107. # others is float, cannot be cast to int
  108. ([61.1, 63.1], 'int32', pd.Series([10., 61.1, 12.])),
  109. ([61.1, 63.1], 'int64', pd.Series([10., 61.1, 12.])),
  110. ([61.1, 63.1], float, pd.Series([10., 61.1, 12.])),
  111. ([61.1, 63.1], object, pd.Series([10, 61.1, 12], dtype=object)),
  112. # other is object, cannot be cast
  113. ([(61,), (63,)], 'int32', pd.Series([10, (61,), 12])),
  114. ([(61,), (63,)], 'int64', pd.Series([10, (61,), 12])),
  115. ([(61,), (63,)], float, pd.Series([10., (61,), 12.])),
  116. ([(61,), (63,)], object, pd.Series([10, (61,), 12]))
  117. ])
  118. def test_update_dtypes(self, other, dtype, expected):
  119. s = Series([10, 11, 12], dtype=dtype)
  120. other = Series(other, index=[1, 3])
  121. s.update(other)
  122. assert_series_equal(s, expected)
  123. def test_concat_empty_series_dtypes_roundtrips(self):
  124. # round-tripping with self & like self
  125. dtypes = map(np.dtype, ['float64', 'int8', 'uint8', 'bool', 'm8[ns]',
  126. 'M8[ns]'])
  127. for dtype in dtypes:
  128. assert pd.concat([Series(dtype=dtype)]).dtype == dtype
  129. assert pd.concat([Series(dtype=dtype),
  130. Series(dtype=dtype)]).dtype == dtype
  131. def int_result_type(dtype, dtype2):
  132. typs = {dtype.kind, dtype2.kind}
  133. if not len(typs - {'i', 'u', 'b'}) and (dtype.kind == 'i' or
  134. dtype2.kind == 'i'):
  135. return 'i'
  136. elif not len(typs - {'u', 'b'}) and (dtype.kind == 'u' or
  137. dtype2.kind == 'u'):
  138. return 'u'
  139. return None
  140. def float_result_type(dtype, dtype2):
  141. typs = {dtype.kind, dtype2.kind}
  142. if not len(typs - {'f', 'i', 'u'}) and (dtype.kind == 'f' or
  143. dtype2.kind == 'f'):
  144. return 'f'
  145. return None
  146. def get_result_type(dtype, dtype2):
  147. result = float_result_type(dtype, dtype2)
  148. if result is not None:
  149. return result
  150. result = int_result_type(dtype, dtype2)
  151. if result is not None:
  152. return result
  153. return 'O'
  154. for dtype in dtypes:
  155. for dtype2 in dtypes:
  156. if dtype == dtype2:
  157. continue
  158. expected = get_result_type(dtype, dtype2)
  159. result = pd.concat([Series(dtype=dtype), Series(dtype=dtype2)
  160. ]).dtype
  161. assert result.kind == expected
  162. def test_combine_first_dt_tz_values(self, tz_naive_fixture):
  163. ser1 = pd.Series(pd.DatetimeIndex(['20150101', '20150102', '20150103'],
  164. tz=tz_naive_fixture),
  165. name='ser1')
  166. ser2 = pd.Series(pd.DatetimeIndex(['20160514', '20160515', '20160516'],
  167. tz=tz_naive_fixture),
  168. index=[2, 3, 4], name='ser2')
  169. result = ser1.combine_first(ser2)
  170. exp_vals = pd.DatetimeIndex(['20150101', '20150102', '20150103',
  171. '20160515', '20160516'],
  172. tz=tz_naive_fixture)
  173. exp = pd.Series(exp_vals, name='ser1')
  174. assert_series_equal(exp, result)
  175. def test_concat_empty_series_dtypes(self):
  176. # booleans
  177. assert pd.concat([Series(dtype=np.bool_),
  178. Series(dtype=np.int32)]).dtype == np.int32
  179. assert pd.concat([Series(dtype=np.bool_),
  180. Series(dtype=np.float32)]).dtype == np.object_
  181. # datetime-like
  182. assert pd.concat([Series(dtype='m8[ns]'),
  183. Series(dtype=np.bool)]).dtype == np.object_
  184. assert pd.concat([Series(dtype='m8[ns]'),
  185. Series(dtype=np.int64)]).dtype == np.object_
  186. assert pd.concat([Series(dtype='M8[ns]'),
  187. Series(dtype=np.bool)]).dtype == np.object_
  188. assert pd.concat([Series(dtype='M8[ns]'),
  189. Series(dtype=np.int64)]).dtype == np.object_
  190. assert pd.concat([Series(dtype='M8[ns]'),
  191. Series(dtype=np.bool_),
  192. Series(dtype=np.int64)]).dtype == np.object_
  193. # categorical
  194. assert pd.concat([Series(dtype='category'),
  195. Series(dtype='category')]).dtype == 'category'
  196. # GH 18515
  197. assert pd.concat([Series(np.array([]), dtype='category'),
  198. Series(dtype='float64')]).dtype == 'float64'
  199. assert pd.concat([Series(dtype='category'),
  200. Series(dtype='object')]).dtype == 'object'
  201. # sparse
  202. # TODO: move?
  203. result = pd.concat([Series(dtype='float64').to_sparse(), Series(
  204. dtype='float64').to_sparse()])
  205. assert result.dtype == 'Sparse[float64]'
  206. assert result.ftype == 'float64:sparse'
  207. result = pd.concat([Series(dtype='float64').to_sparse(), Series(
  208. dtype='float64')])
  209. # TODO: release-note: concat sparse dtype
  210. expected = pd.core.sparse.api.SparseDtype(np.float64)
  211. assert result.dtype == expected
  212. assert result.ftype == 'float64:sparse'
  213. result = pd.concat([Series(dtype='float64').to_sparse(), Series(
  214. dtype='object')])
  215. # TODO: release-note: concat sparse dtype
  216. expected = pd.core.sparse.api.SparseDtype('object')
  217. assert result.dtype == expected
  218. assert result.ftype == 'object:sparse'
  219. def test_combine_first_dt64(self):
  220. from pandas.core.tools.datetimes import to_datetime
  221. s0 = to_datetime(Series(["2010", np.NaN]))
  222. s1 = to_datetime(Series([np.NaN, "2011"]))
  223. rs = s0.combine_first(s1)
  224. xp = to_datetime(Series(['2010', '2011']))
  225. assert_series_equal(rs, xp)
  226. s0 = to_datetime(Series(["2010", np.NaN]))
  227. s1 = Series([np.NaN, "2011"])
  228. rs = s0.combine_first(s1)
  229. xp = Series([datetime(2010, 1, 1), '2011'])
  230. assert_series_equal(rs, xp)
  231. class TestTimeseries(object):
  232. def test_append_concat(self):
  233. rng = date_range('5/8/2012 1:45', periods=10, freq='5T')
  234. ts = Series(np.random.randn(len(rng)), rng)
  235. df = DataFrame(np.random.randn(len(rng), 4), index=rng)
  236. result = ts.append(ts)
  237. result_df = df.append(df)
  238. ex_index = DatetimeIndex(np.tile(rng.values, 2))
  239. tm.assert_index_equal(result.index, ex_index)
  240. tm.assert_index_equal(result_df.index, ex_index)
  241. appended = rng.append(rng)
  242. tm.assert_index_equal(appended, ex_index)
  243. appended = rng.append([rng, rng])
  244. ex_index = DatetimeIndex(np.tile(rng.values, 3))
  245. tm.assert_index_equal(appended, ex_index)
  246. # different index names
  247. rng1 = rng.copy()
  248. rng2 = rng.copy()
  249. rng1.name = 'foo'
  250. rng2.name = 'bar'
  251. assert rng1.append(rng1).name == 'foo'
  252. assert rng1.append(rng2).name is None
  253. def test_append_concat_tz(self):
  254. # see gh-2938
  255. rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
  256. tz='US/Eastern')
  257. rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
  258. tz='US/Eastern')
  259. rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
  260. tz='US/Eastern')
  261. ts = Series(np.random.randn(len(rng)), rng)
  262. df = DataFrame(np.random.randn(len(rng), 4), index=rng)
  263. ts2 = Series(np.random.randn(len(rng2)), rng2)
  264. df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
  265. result = ts.append(ts2)
  266. result_df = df.append(df2)
  267. tm.assert_index_equal(result.index, rng3)
  268. tm.assert_index_equal(result_df.index, rng3)
  269. appended = rng.append(rng2)
  270. tm.assert_index_equal(appended, rng3)
  271. def test_append_concat_tz_explicit_pytz(self):
  272. # see gh-2938
  273. from pytz import timezone as timezone
  274. rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
  275. tz=timezone('US/Eastern'))
  276. rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
  277. tz=timezone('US/Eastern'))
  278. rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
  279. tz=timezone('US/Eastern'))
  280. ts = Series(np.random.randn(len(rng)), rng)
  281. df = DataFrame(np.random.randn(len(rng), 4), index=rng)
  282. ts2 = Series(np.random.randn(len(rng2)), rng2)
  283. df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
  284. result = ts.append(ts2)
  285. result_df = df.append(df2)
  286. tm.assert_index_equal(result.index, rng3)
  287. tm.assert_index_equal(result_df.index, rng3)
  288. appended = rng.append(rng2)
  289. tm.assert_index_equal(appended, rng3)
  290. def test_append_concat_tz_dateutil(self):
  291. # see gh-2938
  292. rng = date_range('5/8/2012 1:45', periods=10, freq='5T',
  293. tz='dateutil/US/Eastern')
  294. rng2 = date_range('5/8/2012 2:35', periods=10, freq='5T',
  295. tz='dateutil/US/Eastern')
  296. rng3 = date_range('5/8/2012 1:45', periods=20, freq='5T',
  297. tz='dateutil/US/Eastern')
  298. ts = Series(np.random.randn(len(rng)), rng)
  299. df = DataFrame(np.random.randn(len(rng), 4), index=rng)
  300. ts2 = Series(np.random.randn(len(rng2)), rng2)
  301. df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2)
  302. result = ts.append(ts2)
  303. result_df = df.append(df2)
  304. tm.assert_index_equal(result.index, rng3)
  305. tm.assert_index_equal(result_df.index, rng3)
  306. appended = rng.append(rng2)
  307. tm.assert_index_equal(appended, rng3)