test_strings.py 133 KB


  1. # -*- coding: utf-8 -*-
  2. # pylint: disable-msg=E1101,W0612
  3. from datetime import datetime, timedelta
  4. import re
  5. import numpy as np
  6. from numpy import nan as NA
  7. from numpy.random import randint
  8. import pytest
  9. import pandas.compat as compat
  10. from pandas.compat import PY3, range, u
  11. from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
  12. import pandas.core.strings as strings
  13. import pandas.util.testing as tm
  14. from pandas.util.testing import assert_index_equal, assert_series_equal
  15. def assert_series_or_index_equal(left, right):
  16. if isinstance(left, Series):
  17. assert_series_equal(left, right)
  18. else: # Index
  19. assert_index_equal(left, right)
  20. _any_string_method = [
  21. ('cat', (), {'sep': ','}), # noqa: E241
  22. ('cat', (Series(list('zyx')),), {'sep': ',', # noqa: E241
  23. 'join': 'left'}),
  24. ('center', (10,), {}), # noqa: E241
  25. ('contains', ('a',), {}), # noqa: E241
  26. ('count', ('a',), {}), # noqa: E241
  27. ('decode', ('UTF-8',), {}), # noqa: E241
  28. ('encode', ('UTF-8',), {}), # noqa: E241
  29. ('endswith', ('a',), {}), # noqa: E241
  30. ('extract', ('([a-z]*)',), {'expand': False}), # noqa: E241
  31. ('extract', ('([a-z]*)',), {'expand': True}), # noqa: E241
  32. ('extractall', ('([a-z]*)',), {}), # noqa: E241
  33. ('find', ('a',), {}), # noqa: E241
  34. ('findall', ('a',), {}), # noqa: E241
  35. ('get', (0,), {}), # noqa: E241
  36. # because "index" (and "rindex") fail intentionally
  37. # if the string is not found, search only for empty string
  38. ('index', ('',), {}), # noqa: E241
  39. ('join', (',',), {}), # noqa: E241
  40. ('ljust', (10,), {}), # noqa: E241
  41. ('match', ('a',), {}), # noqa: E241
  42. ('normalize', ('NFC',), {}), # noqa: E241
  43. ('pad', (10,), {}), # noqa: E241
  44. ('partition', (' ',), {'expand': False}), # noqa: E241
  45. ('partition', (' ',), {'expand': True}), # noqa: E241
  46. ('repeat', (3,), {}), # noqa: E241
  47. ('replace', ('a', 'z',), {}), # noqa: E241
  48. ('rfind', ('a',), {}), # noqa: E241
  49. ('rindex', ('',), {}), # noqa: E241
  50. ('rjust', (10,), {}), # noqa: E241
  51. ('rpartition', (' ',), {'expand': False}), # noqa: E241
  52. ('rpartition', (' ',), {'expand': True}), # noqa: E241
  53. ('slice', (0, 1,), {}), # noqa: E241
  54. ('slice_replace', (0, 1, 'z',), {}), # noqa: E241
  55. ('split', (' ',), {'expand': False}), # noqa: E241
  56. ('split', (' ',), {'expand': True}), # noqa: E241
  57. ('startswith', ('a',), {}), # noqa: E241
  58. # translating unicode points of "a" to "d"
  59. ('translate', ({97: 100},), {}), # noqa: E241
  60. ('wrap', (2,), {}), # noqa: E241
  61. ('zfill', (10,), {}) # noqa: E241
  62. ] + list(zip([
  63. # methods without positional arguments: zip with empty tuple and empty dict
  64. 'capitalize', 'cat', 'get_dummies',
  65. 'isalnum', 'isalpha', 'isdecimal',
  66. 'isdigit', 'islower', 'isnumeric',
  67. 'isspace', 'istitle', 'isupper',
  68. 'len', 'lower', 'lstrip', 'partition',
  69. 'rpartition', 'rsplit', 'rstrip',
  70. 'slice', 'slice_replace', 'split',
  71. 'strip', 'swapcase', 'title', 'upper'
  72. ], [()] * 100, [{}] * 100))
  73. ids, _, _ = zip(*_any_string_method) # use method name as fixture-id
  74. # test that the above list captures all methods of StringMethods
  75. missing_methods = {f for f in dir(strings.StringMethods)
  76. if not f.startswith('_')} - set(ids)
  77. assert not missing_methods
  78. @pytest.fixture(params=_any_string_method, ids=ids)
  79. def any_string_method(request):
  80. """
  81. Fixture for all public methods of `StringMethods`
  82. This fixture returns a tuple of the method name and sample arguments
  83. necessary to call the method.
  84. Returns
  85. -------
  86. method_name : str
  87. The name of the method in `StringMethods`
  88. args : tuple
  89. Sample values for the positional arguments
  90. kwargs : dict
  91. Sample values for the keyword arguments
  92. Examples
  93. --------
  94. >>> def test_something(any_string_method):
  95. ... s = pd.Series(['a', 'b', np.nan, 'd'])
  96. ...
  97. ... method_name, args, kwargs = any_string_method
  98. ... method = getattr(s.str, method_name)
  99. ... # will not raise
  100. ... method(*args, **kwargs)
  101. """
  102. return request.param
  103. # subset of the full set from pandas/conftest.py
  104. _any_allowed_skipna_inferred_dtype = [
  105. ('string', ['a', np.nan, 'c']),
  106. ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]),
  107. ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
  108. ('empty', [np.nan, np.nan, np.nan]),
  109. ('empty', []),
  110. ('mixed-integer', ['a', np.nan, 2])
  111. ]
  112. ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id
  113. @pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
  114. def any_allowed_skipna_inferred_dtype(request):
  115. """
  116. Fixture for all (inferred) dtypes allowed in StringMethods.__init__
  117. The covered (inferred) types are:
  118. * 'string'
  119. * 'unicode' (if PY2)
  120. * 'empty'
  121. * 'bytes' (if PY3)
  122. * 'mixed'
  123. * 'mixed-integer'
  124. Returns
  125. -------
  126. inferred_dtype : str
  127. The string for the inferred dtype from _libs.lib.infer_dtype
  128. values : np.ndarray
  129. An array of object dtype that will be inferred to have
  130. `inferred_dtype`
  131. Examples
  132. --------
  133. >>> import pandas._libs.lib as lib
  134. >>>
  135. >>> def test_something(any_allowed_skipna_inferred_dtype):
  136. ... inferred_dtype, values = any_allowed_skipna_inferred_dtype
  137. ... # will pass
  138. ... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
  139. """
  140. inferred_dtype, values = request.param
  141. values = np.array(values, dtype=object) # object dtype to avoid casting
  142. # correctness of inference tested in tests/dtypes/test_inference.py
  143. return inferred_dtype, values
  144. class TestStringMethods(object):
  145. def test_api(self):
  146. # GH 6106, GH 9322
  147. assert Series.str is strings.StringMethods
  148. assert isinstance(Series(['']).str, strings.StringMethods)
  149. @pytest.mark.parametrize('dtype', [object, 'category'])
  150. @pytest.mark.parametrize('box', [Series, Index])
  151. def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype):
  152. # one instance of parametrized fixture
  153. inferred_dtype, values = any_skipna_inferred_dtype
  154. t = box(values, dtype=dtype) # explicit dtype to avoid casting
  155. # TODO: get rid of these xfails
  156. if dtype == 'category' and inferred_dtype in ['period', 'interval']:
  157. pytest.xfail(reason='Conversion to numpy array fails because '
  158. 'the ._values-attribute is not a numpy array for '
  159. 'PeriodArray/IntervalArray; see GH 23553')
  160. if box == Index and inferred_dtype in ['empty', 'bytes']:
  161. pytest.xfail(reason='Raising too restrictively; '
  162. 'solved by GH 23167')
  163. if (box == Index and dtype == object
  164. and inferred_dtype in ['boolean', 'date', 'time']):
  165. pytest.xfail(reason='Inferring incorrectly because of NaNs; '
  166. 'solved by GH 23167')
  167. if (box == Series
  168. and (dtype == object and inferred_dtype not in [
  169. 'string', 'unicode', 'empty',
  170. 'bytes', 'mixed', 'mixed-integer'])
  171. or (dtype == 'category'
  172. and inferred_dtype in ['decimal', 'boolean', 'time'])):
  173. pytest.xfail(reason='Not raising correctly; solved by GH 23167')
  174. types_passing_constructor = ['string', 'unicode', 'empty',
  175. 'bytes', 'mixed', 'mixed-integer']
  176. if inferred_dtype in types_passing_constructor:
  177. # GH 6106
  178. assert isinstance(t.str, strings.StringMethods)
  179. else:
  180. # GH 9184, GH 23011, GH 23163
  181. with pytest.raises(AttributeError, match='Can only use .str '
  182. 'accessor with string values.*'):
  183. t.str
  184. assert not hasattr(t, 'str')
  185. @pytest.mark.parametrize('dtype', [object, 'category'])
  186. @pytest.mark.parametrize('box', [Series, Index])
  187. def test_api_per_method(self, box, dtype,
  188. any_allowed_skipna_inferred_dtype,
  189. any_string_method):
  190. # this test does not check correctness of the different methods,
  191. # just that the methods work on the specified (inferred) dtypes,
  192. # and raise on all others
  193. # one instance of each parametrized fixture
  194. inferred_dtype, values = any_allowed_skipna_inferred_dtype
  195. method_name, args, kwargs = any_string_method
  196. # TODO: get rid of these xfails
  197. if (method_name not in ['encode', 'decode', 'len']
  198. and inferred_dtype == 'bytes'):
  199. pytest.xfail(reason='Not raising for "bytes", see GH 23011;'
  200. 'Also: malformed method names, see GH 23551; '
  201. 'solved by GH 23167')
  202. if (method_name == 'cat'
  203. and inferred_dtype in ['mixed', 'mixed-integer']):
  204. pytest.xfail(reason='Bad error message; should raise better; '
  205. 'solved by GH 23167')
  206. if box == Index and inferred_dtype in ['empty', 'bytes']:
  207. pytest.xfail(reason='Raising too restrictively; '
  208. 'solved by GH 23167')
  209. if (box == Index and dtype == object
  210. and inferred_dtype in ['boolean', 'date', 'time']):
  211. pytest.xfail(reason='Inferring incorrectly because of NaNs; '
  212. 'solved by GH 23167')
  213. t = box(values, dtype=dtype) # explicit dtype to avoid casting
  214. method = getattr(t.str, method_name)
  215. bytes_allowed = method_name in ['encode', 'decode', 'len']
  216. # as of v0.23.4, all methods except 'cat' are very lenient with the
  217. # allowed data types, just returning NaN for entries that error.
  218. # This could be changed with an 'errors'-kwarg to the `str`-accessor,
  219. # see discussion in GH 13877
  220. mixed_allowed = method_name not in ['cat']
  221. allowed_types = (['string', 'unicode', 'empty']
  222. + ['bytes'] * bytes_allowed
  223. + ['mixed', 'mixed-integer'] * mixed_allowed)
  224. if inferred_dtype in allowed_types:
  225. # xref GH 23555, GH 23556
  226. method(*args, **kwargs) # works!
  227. else:
  228. # GH 23011, GH 23163
  229. msg = ('Cannot use .str.{name} with values of inferred dtype '
  230. '{inferred_dtype!r}.'.format(name=method_name,
  231. inferred_dtype=inferred_dtype))
  232. with pytest.raises(TypeError, match=msg):
  233. method(*args, **kwargs)
  234. def test_api_for_categorical(self, any_string_method):
  235. # https://github.com/pandas-dev/pandas/issues/10661
  236. s = Series(list('aabb'))
  237. s = s + " " + s
  238. c = s.astype('category')
  239. assert isinstance(c.str, strings.StringMethods)
  240. method_name, args, kwargs = any_string_method
  241. result = getattr(c.str, method_name)(*args, **kwargs)
  242. expected = getattr(s.str, method_name)(*args, **kwargs)
  243. if isinstance(result, DataFrame):
  244. tm.assert_frame_equal(result, expected)
  245. elif isinstance(result, Series):
  246. tm.assert_series_equal(result, expected)
  247. else:
  248. # str.cat(others=None) returns string, for example
  249. assert result == expected
  250. def test_iter(self):
  251. # GH3638
  252. strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel'
  253. ds = Series(strs)
  254. for s in ds.str:
  255. # iter must yield a Series
  256. assert isinstance(s, Series)
  257. # indices of each yielded Series should be equal to the index of
  258. # the original Series
  259. tm.assert_index_equal(s.index, ds.index)
  260. for el in s:
  261. # each element of the series is either a basestring/str or nan
  262. assert isinstance(el, compat.string_types) or isna(el)
  263. # desired behavior is to iterate until everything would be nan on the
  264. # next iter so make sure the last element of the iterator was 'l' in
  265. # this case since 'wikitravel' is the longest string
  266. assert s.dropna().values.item() == 'l'
  267. def test_iter_empty(self):
  268. ds = Series([], dtype=object)
  269. i, s = 100, 1
  270. for i, s in enumerate(ds.str):
  271. pass
  272. # nothing to iterate over so nothing defined values should remain
  273. # unchanged
  274. assert i == 100
  275. assert s == 1
  276. def test_iter_single_element(self):
  277. ds = Series(['a'])
  278. for i, s in enumerate(ds.str):
  279. pass
  280. assert not i
  281. assert_series_equal(ds, s)
  282. def test_iter_object_try_string(self):
  283. ds = Series([slice(None, randint(10), randint(10, 20)) for _ in range(
  284. 4)])
  285. i, s = 100, 'h'
  286. for i, s in enumerate(ds.str):
  287. pass
  288. assert i == 100
  289. assert s == 'h'
  290. @pytest.mark.parametrize('box', [Series, Index])
  291. @pytest.mark.parametrize('other', [None, Series, Index])
  292. def test_str_cat_name(self, box, other):
  293. # GH 21053
  294. values = ['a', 'b']
  295. if other:
  296. other = other(values)
  297. else:
  298. other = values
  299. result = box(values, name='name').str.cat(other, sep=',', join='left')
  300. assert result.name == 'name'
  301. @pytest.mark.parametrize('box', [Series, Index])
  302. def test_str_cat(self, box):
  303. # test_cat above tests "str_cat" from ndarray;
  304. # here testing "str.cat" from Series/Indext to ndarray/list
  305. s = box(['a', 'a', 'b', 'b', 'c', np.nan])
  306. # single array
  307. result = s.str.cat()
  308. expected = 'aabbc'
  309. assert result == expected
  310. result = s.str.cat(na_rep='-')
  311. expected = 'aabbc-'
  312. assert result == expected
  313. result = s.str.cat(sep='_', na_rep='NA')
  314. expected = 'a_a_b_b_c_NA'
  315. assert result == expected
  316. t = np.array(['a', np.nan, 'b', 'd', 'foo', np.nan], dtype=object)
  317. expected = box(['aa', 'a-', 'bb', 'bd', 'cfoo', '--'])
  318. # Series/Index with array
  319. result = s.str.cat(t, na_rep='-')
  320. assert_series_or_index_equal(result, expected)
  321. # Series/Index with list
  322. result = s.str.cat(list(t), na_rep='-')
  323. assert_series_or_index_equal(result, expected)
  324. # errors for incorrect lengths
  325. rgx = 'All arrays must be same length, except those having an index.*'
  326. z = Series(['1', '2', '3'])
  327. with pytest.raises(ValueError, match=rgx):
  328. s.str.cat(z)
  329. with pytest.raises(ValueError, match=rgx):
  330. s.str.cat(z.values)
  331. with pytest.raises(ValueError, match=rgx):
  332. s.str.cat(list(z))
  333. @pytest.mark.parametrize('box', [Series, Index])
  334. def test_str_cat_raises_intuitive_error(self, box):
  335. # GH 11334
  336. s = box(['a', 'b', 'c', 'd'])
  337. message = "Did you mean to supply a `sep` keyword?"
  338. with pytest.raises(ValueError, match=message):
  339. s.str.cat('|')
  340. with pytest.raises(ValueError, match=message):
  341. s.str.cat(' ')
  342. @pytest.mark.parametrize('sep', ['', None])
  343. @pytest.mark.parametrize('dtype_target', ['object', 'category'])
  344. @pytest.mark.parametrize('dtype_caller', ['object', 'category'])
  345. @pytest.mark.parametrize('box', [Series, Index])
  346. def test_str_cat_categorical(self, box, dtype_caller, dtype_target, sep):
  347. s = Index(['a', 'a', 'b', 'a'], dtype=dtype_caller)
  348. s = s if box == Index else Series(s, index=s)
  349. t = Index(['b', 'a', 'b', 'c'], dtype=dtype_target)
  350. expected = Index(['ab', 'aa', 'bb', 'ac'])
  351. expected = expected if box == Index else Series(expected, index=s)
  352. # Series/Index with unaligned Index
  353. with tm.assert_produces_warning(expected_warning=FutureWarning):
  354. # FutureWarning to switch to alignment by default
  355. result = s.str.cat(t, sep=sep)
  356. assert_series_or_index_equal(result, expected)
  357. # Series/Index with Series having matching Index
  358. t = Series(t, index=s)
  359. result = s.str.cat(t, sep=sep)
  360. assert_series_or_index_equal(result, expected)
  361. # Series/Index with Series.values
  362. result = s.str.cat(t.values, sep=sep)
  363. assert_series_or_index_equal(result, expected)
  364. # Series/Index with Series having different Index
  365. t = Series(t.values, index=t)
  366. with tm.assert_produces_warning(expected_warning=FutureWarning):
  367. # FutureWarning to switch to alignment by default
  368. result = s.str.cat(t, sep=sep)
  369. assert_series_or_index_equal(result, expected)
  370. @pytest.mark.parametrize('box', [Series, Index])
  371. def test_str_cat_mixed_inputs(self, box):
  372. s = Index(['a', 'b', 'c', 'd'])
  373. s = s if box == Index else Series(s, index=s)
  374. t = Series(['A', 'B', 'C', 'D'], index=s.values)
  375. d = concat([t, Series(s, index=s)], axis=1)
  376. expected = Index(['aAa', 'bBb', 'cCc', 'dDd'])
  377. expected = expected if box == Index else Series(expected.values,
  378. index=s.values)
  379. # Series/Index with DataFrame
  380. result = s.str.cat(d)
  381. assert_series_or_index_equal(result, expected)
  382. # Series/Index with two-dimensional ndarray
  383. result = s.str.cat(d.values)
  384. assert_series_or_index_equal(result, expected)
  385. # Series/Index with list of Series
  386. result = s.str.cat([t, s])
  387. assert_series_or_index_equal(result, expected)
  388. # Series/Index with mixed list of Series/array
  389. result = s.str.cat([t, s.values])
  390. assert_series_or_index_equal(result, expected)
  391. # Series/Index with list of list-likes
  392. with tm.assert_produces_warning(expected_warning=FutureWarning):
  393. # nested list-likes will be deprecated
  394. result = s.str.cat([t.values, list(s)])
  395. assert_series_or_index_equal(result, expected)
  396. # Series/Index with list of Series; different indexes
  397. t.index = ['b', 'c', 'd', 'a']
  398. with tm.assert_produces_warning(expected_warning=FutureWarning):
  399. # FutureWarning to switch to alignment by default
  400. result = s.str.cat([t, s])
  401. assert_series_or_index_equal(result, expected)
  402. # Series/Index with mixed list; different indexes
  403. with tm.assert_produces_warning(expected_warning=FutureWarning):
  404. # FutureWarning to switch to alignment by default
  405. result = s.str.cat([t, s.values])
  406. assert_series_or_index_equal(result, expected)
  407. # Series/Index with DataFrame; different indexes
  408. d.index = ['b', 'c', 'd', 'a']
  409. with tm.assert_produces_warning(expected_warning=FutureWarning):
  410. # FutureWarning to switch to alignment by default
  411. result = s.str.cat(d)
  412. assert_series_or_index_equal(result, expected)
  413. # Series/Index with iterator of list-likes
  414. with tm.assert_produces_warning(expected_warning=FutureWarning):
  415. # nested list-likes will be deprecated
  416. result = s.str.cat(iter([t.values, list(s)]))
  417. assert_series_or_index_equal(result, expected)
  418. # errors for incorrect lengths
  419. rgx = 'All arrays must be same length, except those having an index.*'
  420. z = Series(['1', '2', '3'])
  421. e = concat([z, z], axis=1)
  422. # DataFrame
  423. with pytest.raises(ValueError, match=rgx):
  424. s.str.cat(e)
  425. # two-dimensional ndarray
  426. with pytest.raises(ValueError, match=rgx):
  427. s.str.cat(e.values)
  428. # list of Series
  429. with pytest.raises(ValueError, match=rgx):
  430. s.str.cat([z, s])
  431. # list of list-likes
  432. with pytest.raises(ValueError, match=rgx):
  433. s.str.cat([z.values, s.values])
  434. # mixed list of Series/list-like
  435. with pytest.raises(ValueError, match=rgx):
  436. s.str.cat([z.values, s])
  437. # errors for incorrect arguments in list-like
  438. rgx = 'others must be Series, Index, DataFrame,.*'
  439. # make sure None/NaN do not crash checks in _get_series_list
  440. u = Series(['a', np.nan, 'c', None])
  441. # mix of string and Series
  442. with pytest.raises(TypeError, match=rgx):
  443. s.str.cat([u, 'u'])
  444. # DataFrame in list
  445. with pytest.raises(TypeError, match=rgx):
  446. s.str.cat([u, d])
  447. # 2-dim ndarray in list
  448. with pytest.raises(TypeError, match=rgx):
  449. s.str.cat([u, d.values])
  450. # nested lists
  451. with pytest.raises(TypeError, match=rgx):
  452. s.str.cat([u, [u, d]])
  453. # forbidden input type: set
  454. # GH 23009
  455. with pytest.raises(TypeError, match=rgx):
  456. s.str.cat(set(u))
  457. # forbidden input type: set in list
  458. # GH 23009
  459. with pytest.raises(TypeError, match=rgx):
  460. s.str.cat([u, set(u)])
  461. # other forbidden input type, e.g. int
  462. with pytest.raises(TypeError, match=rgx):
  463. s.str.cat(1)
  464. @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right'])
  465. @pytest.mark.parametrize('box', [Series, Index])
  466. def test_str_cat_align_indexed(self, box, join):
  467. # https://github.com/pandas-dev/pandas/issues/18657
  468. s = Series(['a', 'b', 'c', 'd'], index=['a', 'b', 'c', 'd'])
  469. t = Series(['D', 'A', 'E', 'B'], index=['d', 'a', 'e', 'b'])
  470. sa, ta = s.align(t, join=join)
  471. # result after manual alignment of inputs
  472. expected = sa.str.cat(ta, na_rep='-')
  473. if box == Index:
  474. s = Index(s)
  475. sa = Index(sa)
  476. expected = Index(expected)
  477. result = s.str.cat(t, join=join, na_rep='-')
  478. assert_series_or_index_equal(result, expected)
  479. @pytest.mark.parametrize('join', ['left', 'outer', 'inner', 'right'])
  480. def test_str_cat_align_mixed_inputs(self, join):
  481. s = Series(['a', 'b', 'c', 'd'])
  482. t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])
  483. d = concat([t, t], axis=1)
  484. expected_outer = Series(['aaa', 'bbb', 'c--', 'ddd', '-ee'])
  485. expected = expected_outer.loc[s.index.join(t.index, how=join)]
  486. # list of Series
  487. result = s.str.cat([t, t], join=join, na_rep='-')
  488. tm.assert_series_equal(result, expected)
  489. # DataFrame
  490. result = s.str.cat(d, join=join, na_rep='-')
  491. tm.assert_series_equal(result, expected)
  492. # mixed list of indexed/unindexed
  493. u = np.array(['A', 'B', 'C', 'D'])
  494. expected_outer = Series(['aaA', 'bbB', 'c-C', 'ddD', '-e-'])
  495. # joint index of rhs [t, u]; u will be forced have index of s
  496. rhs_idx = t.index & s.index if join == 'inner' else t.index | s.index
  497. expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
  498. result = s.str.cat([t, u], join=join, na_rep='-')
  499. tm.assert_series_equal(result, expected)
  500. with tm.assert_produces_warning(expected_warning=FutureWarning):
  501. # nested list-likes will be deprecated
  502. result = s.str.cat([t, list(u)], join=join, na_rep='-')
  503. tm.assert_series_equal(result, expected)
  504. # errors for incorrect lengths
  505. rgx = r'If `others` contains arrays or lists \(or other list-likes.*'
  506. z = Series(['1', '2', '3']).values
  507. # unindexed object of wrong length
  508. with pytest.raises(ValueError, match=rgx):
  509. s.str.cat(z, join=join)
  510. # unindexed object of wrong length in list
  511. with pytest.raises(ValueError, match=rgx):
  512. s.str.cat([t, z], join=join)
  513. @pytest.mark.parametrize('box', [Series, Index])
  514. @pytest.mark.parametrize('other', [Series, Index])
  515. def test_str_cat_all_na(self, box, other):
  516. # GH 24044
  517. # check that all NaNs in caller / target work
  518. s = Index(['a', 'b', 'c', 'd'])
  519. s = s if box == Index else Series(s, index=s)
  520. t = other([np.nan] * 4, dtype=object)
  521. # add index of s for alignment
  522. t = t if other == Index else Series(t, index=s)
  523. # all-NA target
  524. if box == Series:
  525. expected = Series([np.nan] * 4, index=s.index, dtype=object)
  526. else: # box == Index
  527. expected = Index([np.nan] * 4, dtype=object)
  528. result = s.str.cat(t, join='left')
  529. assert_series_or_index_equal(result, expected)
  530. # all-NA caller (only for Series)
  531. if other == Series:
  532. expected = Series([np.nan] * 4, dtype=object, index=t.index)
  533. result = t.str.cat(s, join='left')
  534. tm.assert_series_equal(result, expected)
  535. def test_str_cat_special_cases(self):
  536. s = Series(['a', 'b', 'c', 'd'])
  537. t = Series(['d', 'a', 'e', 'b'], index=[3, 0, 4, 1])
  538. # iterator of elements with different types
  539. expected = Series(['aaa', 'bbb', 'c-c', 'ddd', '-e-'])
  540. result = s.str.cat(iter([t, s.values]), join='outer', na_rep='-')
  541. tm.assert_series_equal(result, expected)
  542. # right-align with different indexes in others
  543. expected = Series(['aa-', 'd-d'], index=[0, 3])
  544. result = s.str.cat([t.loc[[0]], t.loc[[3]]], join='right', na_rep='-')
  545. tm.assert_series_equal(result, expected)
  546. def test_cat_on_filtered_index(self):
  547. df = DataFrame(index=MultiIndex.from_product(
  548. [[2011, 2012], [1, 2, 3]], names=['year', 'month']))
  549. df = df.reset_index()
  550. df = df[df.month > 1]
  551. str_year = df.year.astype('str')
  552. str_month = df.month.astype('str')
  553. str_both = str_year.str.cat(str_month, sep=' ')
  554. assert str_both.loc[1] == '2011 2'
  555. str_multiple = str_year.str.cat([str_month, str_month], sep=' ')
  556. assert str_multiple.loc[1] == '2011 2 2'
  557. def test_count(self):
  558. values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'],
  559. dtype=np.object_)
  560. result = strings.str_count(values, 'f[o]+')
  561. exp = np.array([1, 2, NA, 4])
  562. tm.assert_numpy_array_equal(result, exp)
  563. result = Series(values).str.count('f[o]+')
  564. exp = Series([1, 2, NA, 4])
  565. assert isinstance(result, Series)
  566. tm.assert_series_equal(result, exp)
  567. # mixed
  568. mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
  569. rs = strings.str_count(mixed, 'a')
  570. xp = np.array([1, NA, 0, NA, NA, 0, NA, NA, NA])
  571. tm.assert_numpy_array_equal(rs, xp)
  572. rs = Series(mixed).str.count('a')
  573. xp = Series([1, NA, 0, NA, NA, 0, NA, NA, NA])
  574. assert isinstance(rs, Series)
  575. tm.assert_series_equal(rs, xp)
  576. # unicode
  577. values = [u('foo'), u('foofoo'), NA, u('foooofooofommmfoo')]
  578. result = strings.str_count(values, 'f[o]+')
  579. exp = np.array([1, 2, NA, 4])
  580. tm.assert_numpy_array_equal(result, exp)
  581. result = Series(values).str.count('f[o]+')
  582. exp = Series([1, 2, NA, 4])
  583. assert isinstance(result, Series)
  584. tm.assert_series_equal(result, exp)
  585. def test_contains(self):
  586. values = np.array(['foo', NA, 'fooommm__foo',
  587. 'mmm_', 'foommm[_]+bar'], dtype=np.object_)
  588. pat = 'mmm[_]+'
  589. result = strings.str_contains(values, pat)
  590. expected = np.array([False, NA, True, True, False], dtype=np.object_)
  591. tm.assert_numpy_array_equal(result, expected)
  592. result = strings.str_contains(values, pat, regex=False)
  593. expected = np.array([False, NA, False, False, True], dtype=np.object_)
  594. tm.assert_numpy_array_equal(result, expected)
  595. values = ['foo', 'xyz', 'fooommm__foo', 'mmm_']
  596. result = strings.str_contains(values, pat)
  597. expected = np.array([False, False, True, True])
  598. assert result.dtype == np.bool_
  599. tm.assert_numpy_array_equal(result, expected)
  600. # case insensitive using regex
  601. values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_']
  602. result = strings.str_contains(values, 'FOO|mmm', case=False)
  603. expected = np.array([True, False, True, True])
  604. tm.assert_numpy_array_equal(result, expected)
  605. # case insensitive without regex
  606. result = strings.str_contains(values, 'foo', regex=False, case=False)
  607. expected = np.array([True, False, True, False])
  608. tm.assert_numpy_array_equal(result, expected)
  609. # mixed
  610. mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
  611. rs = strings.str_contains(mixed, 'o')
  612. xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA],
  613. dtype=np.object_)
  614. tm.assert_numpy_array_equal(rs, xp)
  615. rs = Series(mixed).str.contains('o')
  616. xp = Series([False, NA, False, NA, NA, True, NA, NA, NA])
  617. assert isinstance(rs, Series)
  618. tm.assert_series_equal(rs, xp)
  619. # unicode
  620. values = np.array([u'foo', NA, u'fooommm__foo', u'mmm_'],
  621. dtype=np.object_)
  622. pat = 'mmm[_]+'
  623. result = strings.str_contains(values, pat)
  624. expected = np.array([False, np.nan, True, True], dtype=np.object_)
  625. tm.assert_numpy_array_equal(result, expected)
  626. result = strings.str_contains(values, pat, na=False)
  627. expected = np.array([False, False, True, True])
  628. tm.assert_numpy_array_equal(result, expected)
  629. values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'],
  630. dtype=np.object_)
  631. result = strings.str_contains(values, pat)
  632. expected = np.array([False, False, True, True])
  633. assert result.dtype == np.bool_
  634. tm.assert_numpy_array_equal(result, expected)
  635. def test_contains_for_object_category(self):
  636. # gh 22158
  637. # na for category
  638. values = Series(["a", "b", "c", "a", np.nan], dtype="category")
  639. result = values.str.contains('a', na=True)
  640. expected = Series([True, False, False, True, True])
  641. tm.assert_series_equal(result, expected)
  642. result = values.str.contains('a', na=False)
  643. expected = Series([True, False, False, True, False])
  644. tm.assert_series_equal(result, expected)
  645. # na for objects
  646. values = Series(["a", "b", "c", "a", np.nan])
  647. result = values.str.contains('a', na=True)
  648. expected = Series([True, False, False, True, True])
  649. tm.assert_series_equal(result, expected)
  650. result = values.str.contains('a', na=False)
  651. expected = Series([True, False, False, True, False])
  652. tm.assert_series_equal(result, expected)
  653. def test_startswith(self):
  654. values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
  655. result = values.str.startswith('foo')
  656. exp = Series([False, NA, True, False, False, NA, True])
  657. tm.assert_series_equal(result, exp)
  658. # mixed
  659. mixed = np.array(['a', NA, 'b', True, datetime.today(),
  660. 'foo', None, 1, 2.], dtype=np.object_)
  661. rs = strings.str_startswith(mixed, 'f')
  662. xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA],
  663. dtype=np.object_)
  664. tm.assert_numpy_array_equal(rs, xp)
  665. rs = Series(mixed).str.startswith('f')
  666. assert isinstance(rs, Series)
  667. xp = Series([False, NA, False, NA, NA, True, NA, NA, NA])
  668. tm.assert_series_equal(rs, xp)
  669. # unicode
  670. values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
  671. u('foo')])
  672. result = values.str.startswith('foo')
  673. exp = Series([False, NA, True, False, False, NA, True])
  674. tm.assert_series_equal(result, exp)
  675. result = values.str.startswith('foo', na=True)
  676. tm.assert_series_equal(result, exp.fillna(True).astype(bool))
  677. def test_endswith(self):
  678. values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo'])
  679. result = values.str.endswith('foo')
  680. exp = Series([False, NA, False, False, True, NA, True])
  681. tm.assert_series_equal(result, exp)
  682. # mixed
  683. mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.]
  684. rs = strings.str_endswith(mixed, 'f')
  685. xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA],
  686. dtype=np.object_)
  687. tm.assert_numpy_array_equal(rs, xp)
  688. rs = Series(mixed).str.endswith('f')
  689. xp = Series([False, NA, False, NA, NA, False, NA, NA, NA])
  690. assert isinstance(rs, Series)
  691. tm.assert_series_equal(rs, xp)
  692. # unicode
  693. values = Series([u('om'), NA, u('foo_nom'), u('nom'), u('bar_foo'), NA,
  694. u('foo')])
  695. result = values.str.endswith('foo')
  696. exp = Series([False, NA, False, False, True, NA, True])
  697. tm.assert_series_equal(result, exp)
  698. result = values.str.endswith('foo', na=False)
  699. tm.assert_series_equal(result, exp.fillna(False).astype(bool))
  700. def test_title(self):
  701. values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
  702. result = values.str.title()
  703. exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"])
  704. tm.assert_series_equal(result, exp)
  705. # mixed
  706. mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None,
  707. 1, 2.])
  708. mixed = mixed.str.title()
  709. exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA])
  710. tm.assert_almost_equal(mixed, exp)
  711. # unicode
  712. values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
  713. results = values.str.title()
  714. exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")])
  715. tm.assert_series_equal(results, exp)
  716. def test_lower_upper(self):
  717. values = Series(['om', NA, 'nom', 'nom'])
  718. result = values.str.upper()
  719. exp = Series(['OM', NA, 'NOM', 'NOM'])
  720. tm.assert_series_equal(result, exp)
  721. result = result.str.lower()
  722. tm.assert_series_equal(result, values)
  723. # mixed
  724. mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1,
  725. 2.])
  726. mixed = mixed.str.upper()
  727. rs = Series(mixed).str.lower()
  728. xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
  729. assert isinstance(rs, Series)
  730. tm.assert_series_equal(rs, xp)
  731. # unicode
  732. values = Series([u('om'), NA, u('nom'), u('nom')])
  733. result = values.str.upper()
  734. exp = Series([u('OM'), NA, u('NOM'), u('NOM')])
  735. tm.assert_series_equal(result, exp)
  736. result = result.str.lower()
  737. tm.assert_series_equal(result, values)
  738. def test_capitalize(self):
  739. values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
  740. result = values.str.capitalize()
  741. exp = Series(["Foo", "Bar", NA, "Blah", "Blurg"])
  742. tm.assert_series_equal(result, exp)
  743. # mixed
  744. mixed = Series(["FOO", NA, "bar", True, datetime.today(), "blah", None,
  745. 1, 2.])
  746. mixed = mixed.str.capitalize()
  747. exp = Series(["Foo", NA, "Bar", NA, NA, "Blah", NA, NA, NA])
  748. tm.assert_almost_equal(mixed, exp)
  749. # unicode
  750. values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
  751. results = values.str.capitalize()
  752. exp = Series([u("Foo"), NA, u("Bar"), u("Blurg")])
  753. tm.assert_series_equal(results, exp)
  754. def test_swapcase(self):
  755. values = Series(["FOO", "BAR", NA, "Blah", "blurg"])
  756. result = values.str.swapcase()
  757. exp = Series(["foo", "bar", NA, "bLAH", "BLURG"])
  758. tm.assert_series_equal(result, exp)
  759. # mixed
  760. mixed = Series(["FOO", NA, "bar", True, datetime.today(), "Blah", None,
  761. 1, 2.])
  762. mixed = mixed.str.swapcase()
  763. exp = Series(["foo", NA, "BAR", NA, NA, "bLAH", NA, NA, NA])
  764. tm.assert_almost_equal(mixed, exp)
  765. # unicode
  766. values = Series([u("FOO"), NA, u("bar"), u("Blurg")])
  767. results = values.str.swapcase()
  768. exp = Series([u("foo"), NA, u("BAR"), u("bLURG")])
  769. tm.assert_series_equal(results, exp)
  770. def test_casemethods(self):
  771. values = ['aaa', 'bbb', 'CCC', 'Dddd', 'eEEE']
  772. s = Series(values)
  773. assert s.str.lower().tolist() == [v.lower() for v in values]
  774. assert s.str.upper().tolist() == [v.upper() for v in values]
  775. assert s.str.title().tolist() == [v.title() for v in values]
  776. assert s.str.capitalize().tolist() == [v.capitalize() for v in values]
  777. assert s.str.swapcase().tolist() == [v.swapcase() for v in values]
  778. def test_replace(self):
  779. values = Series(['fooBAD__barBAD', NA])
  780. result = values.str.replace('BAD[_]*', '')
  781. exp = Series(['foobar', NA])
  782. tm.assert_series_equal(result, exp)
  783. result = values.str.replace('BAD[_]*', '', n=1)
  784. exp = Series(['foobarBAD', NA])
  785. tm.assert_series_equal(result, exp)
  786. # mixed
  787. mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
  788. None, 1, 2.])
  789. rs = Series(mixed).str.replace('BAD[_]*', '')
  790. xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
  791. assert isinstance(rs, Series)
  792. tm.assert_almost_equal(rs, xp)
  793. # unicode
  794. values = Series([u('fooBAD__barBAD'), NA])
  795. result = values.str.replace('BAD[_]*', '')
  796. exp = Series([u('foobar'), NA])
  797. tm.assert_series_equal(result, exp)
  798. result = values.str.replace('BAD[_]*', '', n=1)
  799. exp = Series([u('foobarBAD'), NA])
  800. tm.assert_series_equal(result, exp)
  801. # flags + unicode
  802. values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
  803. exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
  804. result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
  805. tm.assert_series_equal(result, exp)
  806. # GH 13438
  807. for klass in (Series, Index):
  808. for repl in (None, 3, {'a': 'b'}):
  809. for data in (['a', 'b', None], ['a', 'b', 'c', 'ad']):
  810. values = klass(data)
  811. pytest.raises(TypeError, values.str.replace, 'a', repl)
  812. def test_replace_callable(self):
  813. # GH 15055
  814. values = Series(['fooBAD__barBAD', NA])
  815. # test with callable
  816. repl = lambda m: m.group(0).swapcase()
  817. result = values.str.replace('[a-z][A-Z]{2}', repl, n=2)
  818. exp = Series(['foObaD__baRbaD', NA])
  819. tm.assert_series_equal(result, exp)
  820. # test with wrong number of arguments, raising an error
  821. if compat.PY2:
  822. p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
  823. else:
  824. p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
  825. r'(?(3)required )positional arguments?')
  826. repl = lambda: None
  827. with pytest.raises(TypeError, match=p_err):
  828. values.str.replace('a', repl)
  829. repl = lambda m, x: None
  830. with pytest.raises(TypeError, match=p_err):
  831. values.str.replace('a', repl)
  832. repl = lambda m, x, y=None: None
  833. with pytest.raises(TypeError, match=p_err):
  834. values.str.replace('a', repl)
  835. # test regex named groups
  836. values = Series(['Foo Bar Baz', NA])
  837. pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
  838. repl = lambda m: m.group('middle').swapcase()
  839. result = values.str.replace(pat, repl)
  840. exp = Series(['bAR', NA])
  841. tm.assert_series_equal(result, exp)
  842. def test_replace_compiled_regex(self):
  843. # GH 15446
  844. values = Series(['fooBAD__barBAD', NA])
  845. # test with compiled regex
  846. pat = re.compile(r'BAD[_]*')
  847. result = values.str.replace(pat, '')
  848. exp = Series(['foobar', NA])
  849. tm.assert_series_equal(result, exp)
  850. # mixed
  851. mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
  852. None, 1, 2.])
  853. rs = Series(mixed).str.replace(pat, '')
  854. xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
  855. assert isinstance(rs, Series)
  856. tm.assert_almost_equal(rs, xp)
  857. # unicode
  858. values = Series([u('fooBAD__barBAD'), NA])
  859. result = values.str.replace(pat, '')
  860. exp = Series([u('foobar'), NA])
  861. tm.assert_series_equal(result, exp)
  862. result = values.str.replace(pat, '', n=1)
  863. exp = Series([u('foobarBAD'), NA])
  864. tm.assert_series_equal(result, exp)
  865. # flags + unicode
  866. values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
  867. exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
  868. pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
  869. result = values.str.replace(pat, ", ")
  870. tm.assert_series_equal(result, exp)
  871. # case and flags provided to str.replace will have no effect
  872. # and will produce warnings
  873. values = Series(['fooBAD__barBAD__bad', NA])
  874. pat = re.compile(r'BAD[_]*')
  875. with pytest.raises(ValueError,
  876. match="case and flags cannot be"):
  877. result = values.str.replace(pat, '', flags=re.IGNORECASE)
  878. with pytest.raises(ValueError,
  879. match="case and flags cannot be"):
  880. result = values.str.replace(pat, '', case=False)
  881. with pytest.raises(ValueError,
  882. match="case and flags cannot be"):
  883. result = values.str.replace(pat, '', case=True)
  884. # test with callable
  885. values = Series(['fooBAD__barBAD', NA])
  886. repl = lambda m: m.group(0).swapcase()
  887. pat = re.compile('[a-z][A-Z]{2}')
  888. result = values.str.replace(pat, repl, n=2)
  889. exp = Series(['foObaD__baRbaD', NA])
  890. tm.assert_series_equal(result, exp)
  891. def test_replace_literal(self):
  892. # GH16808 literal replace (regex=False vs regex=True)
  893. values = Series(['f.o', 'foo', NA])
  894. exp = Series(['bao', 'bao', NA])
  895. result = values.str.replace('f.', 'ba')
  896. tm.assert_series_equal(result, exp)
  897. exp = Series(['bao', 'foo', NA])
  898. result = values.str.replace('f.', 'ba', regex=False)
  899. tm.assert_series_equal(result, exp)
  900. # Cannot do a literal replace if given a callable repl or compiled
  901. # pattern
  902. callable_repl = lambda m: m.group(0).swapcase()
  903. compiled_pat = re.compile('[a-z][A-Z]{2}')
  904. pytest.raises(ValueError, values.str.replace, 'abc', callable_repl,
  905. regex=False)
  906. pytest.raises(ValueError, values.str.replace, compiled_pat, '',
  907. regex=False)
  908. def test_repeat(self):
  909. values = Series(['a', 'b', NA, 'c', NA, 'd'])
  910. result = values.str.repeat(3)
  911. exp = Series(['aaa', 'bbb', NA, 'ccc', NA, 'ddd'])
  912. tm.assert_series_equal(result, exp)
  913. result = values.str.repeat([1, 2, 3, 4, 5, 6])
  914. exp = Series(['a', 'bb', NA, 'cccc', NA, 'dddddd'])
  915. tm.assert_series_equal(result, exp)
  916. # mixed
  917. mixed = Series(['a', NA, 'b', True, datetime.today(), 'foo', None, 1,
  918. 2.])
  919. rs = Series(mixed).str.repeat(3)
  920. xp = Series(['aaa', NA, 'bbb', NA, NA, 'foofoofoo', NA, NA, NA])
  921. assert isinstance(rs, Series)
  922. tm.assert_series_equal(rs, xp)
  923. # unicode
  924. values = Series([u('a'), u('b'), NA, u('c'), NA, u('d')])
  925. result = values.str.repeat(3)
  926. exp = Series([u('aaa'), u('bbb'), NA, u('ccc'), NA, u('ddd')])
  927. tm.assert_series_equal(result, exp)
  928. result = values.str.repeat([1, 2, 3, 4, 5, 6])
  929. exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')])
  930. tm.assert_series_equal(result, exp)
  931. def test_match(self):
  932. # New match behavior introduced in 0.13
  933. values = Series(['fooBAD__barBAD', NA, 'foo'])
  934. result = values.str.match('.*(BAD[_]+).*(BAD)')
  935. exp = Series([True, NA, False])
  936. tm.assert_series_equal(result, exp)
  937. values = Series(['fooBAD__barBAD', NA, 'foo'])
  938. result = values.str.match('.*BAD[_]+.*BAD')
  939. exp = Series([True, NA, False])
  940. tm.assert_series_equal(result, exp)
  941. # mixed
  942. mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
  943. 'foo', None, 1, 2.])
  944. rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
  945. xp = Series([True, NA, True, NA, NA, False, NA, NA, NA])
  946. assert isinstance(rs, Series)
  947. tm.assert_series_equal(rs, xp)
  948. # unicode
  949. values = Series([u('fooBAD__barBAD'), NA, u('foo')])
  950. result = values.str.match('.*(BAD[_]+).*(BAD)')
  951. exp = Series([True, NA, False])
  952. tm.assert_series_equal(result, exp)
  953. # na GH #6609
  954. res = Series(['a', 0, np.nan]).str.match('a', na=False)
  955. exp = Series([True, False, False])
  956. assert_series_equal(exp, res)
  957. res = Series(['a', 0, np.nan]).str.match('a')
  958. exp = Series([True, np.nan, np.nan])
  959. assert_series_equal(exp, res)
  960. def test_extract_expand_None(self):
  961. values = Series(['fooBAD__barBAD', NA, 'foo'])
  962. with pytest.raises(ValueError,
  963. match='expand must be True or False'):
  964. values.str.extract('.*(BAD[_]+).*(BAD)', expand=None)
  965. def test_extract_expand_unspecified(self):
  966. values = Series(['fooBAD__barBAD', NA, 'foo'])
  967. result_unspecified = values.str.extract('.*(BAD[_]+).*')
  968. assert isinstance(result_unspecified, DataFrame)
  969. result_true = values.str.extract('.*(BAD[_]+).*', expand=True)
  970. tm.assert_frame_equal(result_unspecified, result_true)
  971. def test_extract_expand_False(self):
  972. # Contains tests like those in test_match and some others.
  973. values = Series(['fooBAD__barBAD', NA, 'foo'])
  974. er = [NA, NA] # empty row
  975. result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False)
  976. exp = DataFrame([['BAD__', 'BAD'], er, er])
  977. tm.assert_frame_equal(result, exp)
  978. # mixed
  979. mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
  980. 'foo', None, 1, 2.])
  981. rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=False)
  982. exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er, er, er,
  983. er, er])
  984. tm.assert_frame_equal(rs, exp)
  985. # unicode
  986. values = Series([u('fooBAD__barBAD'), NA, u('foo')])
  987. result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=False)
  988. exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
  989. tm.assert_frame_equal(result, exp)
  990. # GH9980
  991. # Index only works with one regex group since
  992. # multi-group would expand to a frame
  993. idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
  994. with pytest.raises(ValueError, match="supported"):
  995. idx.str.extract('([AB])([123])', expand=False)
  996. # these should work for both Series and Index
  997. for klass in [Series, Index]:
  998. # no groups
  999. s_or_idx = klass(['A1', 'B2', 'C3'])
  1000. f = lambda: s_or_idx.str.extract('[ABC][123]', expand=False)
  1001. pytest.raises(ValueError, f)
  1002. # only non-capturing groups
  1003. f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=False)
  1004. pytest.raises(ValueError, f)
  1005. # single group renames series/index properly
  1006. s_or_idx = klass(['A1', 'A2'])
  1007. result = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=False)
  1008. assert result.name == 'uno'
  1009. exp = klass(['A', 'A'], name='uno')
  1010. if klass == Series:
  1011. tm.assert_series_equal(result, exp)
  1012. else:
  1013. tm.assert_index_equal(result, exp)
  1014. s = Series(['A1', 'B2', 'C3'])
  1015. # one group, no matches
  1016. result = s.str.extract('(_)', expand=False)
  1017. exp = Series([NA, NA, NA], dtype=object)
  1018. tm.assert_series_equal(result, exp)
  1019. # two groups, no matches
  1020. result = s.str.extract('(_)(_)', expand=False)
  1021. exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
  1022. tm.assert_frame_equal(result, exp)
  1023. # one group, some matches
  1024. result = s.str.extract('([AB])[123]', expand=False)
  1025. exp = Series(['A', 'B', NA])
  1026. tm.assert_series_equal(result, exp)
  1027. # two groups, some matches
  1028. result = s.str.extract('([AB])([123])', expand=False)
  1029. exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
  1030. tm.assert_frame_equal(result, exp)
  1031. # one named group
  1032. result = s.str.extract('(?P<letter>[AB])', expand=False)
  1033. exp = Series(['A', 'B', NA], name='letter')
  1034. tm.assert_series_equal(result, exp)
  1035. # two named groups
  1036. result = s.str.extract('(?P<letter>[AB])(?P<number>[123])',
  1037. expand=False)
  1038. exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
  1039. columns=['letter', 'number'])
  1040. tm.assert_frame_equal(result, exp)
  1041. # mix named and unnamed groups
  1042. result = s.str.extract('([AB])(?P<number>[123])', expand=False)
  1043. exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
  1044. columns=[0, 'number'])
  1045. tm.assert_frame_equal(result, exp)
  1046. # one normal group, one non-capturing group
  1047. result = s.str.extract('([AB])(?:[123])', expand=False)
  1048. exp = Series(['A', 'B', NA])
  1049. tm.assert_series_equal(result, exp)
  1050. # two normal groups, one non-capturing group
  1051. result = Series(['A11', 'B22', 'C33']).str.extract(
  1052. '([AB])([123])(?:[123])', expand=False)
  1053. exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
  1054. tm.assert_frame_equal(result, exp)
  1055. # one optional group followed by one normal group
  1056. result = Series(['A1', 'B2', '3']).str.extract(
  1057. '(?P<letter>[AB])?(?P<number>[123])', expand=False)
  1058. exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']],
  1059. columns=['letter', 'number'])
  1060. tm.assert_frame_equal(result, exp)
  1061. # one normal group followed by one optional group
  1062. result = Series(['A1', 'B2', 'C']).str.extract(
  1063. '(?P<letter>[ABC])(?P<number>[123])?', expand=False)
  1064. exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]],
  1065. columns=['letter', 'number'])
  1066. tm.assert_frame_equal(result, exp)
  1067. # GH6348
  1068. # not passing index to the extractor
  1069. def check_index(index):
  1070. data = ['A1', 'B2', 'C']
  1071. index = index[:len(data)]
  1072. s = Series(data, index=index)
  1073. result = s.str.extract(r'(\d)', expand=False)
  1074. exp = Series(['1', '2', NA], index=index)
  1075. tm.assert_series_equal(result, exp)
  1076. result = Series(data, index=index).str.extract(
  1077. r'(?P<letter>\D)(?P<number>\d)?', expand=False)
  1078. e_list = [
  1079. ['A', '1'],
  1080. ['B', '2'],
  1081. ['C', NA]
  1082. ]
  1083. exp = DataFrame(e_list, columns=['letter', 'number'], index=index)
  1084. tm.assert_frame_equal(result, exp)
  1085. i_funs = [
  1086. tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
  1087. tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex
  1088. ]
  1089. for index in i_funs:
  1090. check_index(index())
  1091. # single_series_name_is_preserved.
  1092. s = Series(['a3', 'b3', 'c2'], name='bob')
  1093. r = s.str.extract(r'(?P<sue>[a-z])', expand=False)
  1094. e = Series(['a', 'b', 'c'], name='sue')
  1095. tm.assert_series_equal(r, e)
  1096. assert r.name == e.name
  1097. def test_extract_expand_True(self):
  1098. # Contains tests like those in test_match and some others.
  1099. values = Series(['fooBAD__barBAD', NA, 'foo'])
  1100. er = [NA, NA] # empty row
  1101. result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True)
  1102. exp = DataFrame([['BAD__', 'BAD'], er, er])
  1103. tm.assert_frame_equal(result, exp)
  1104. # mixed
  1105. mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
  1106. 'foo', None, 1, 2.])
  1107. rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)', expand=True)
  1108. exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er,
  1109. er, er, er, er])
  1110. tm.assert_frame_equal(rs, exp)
  1111. # unicode
  1112. values = Series([u('fooBAD__barBAD'), NA, u('foo')])
  1113. result = values.str.extract('.*(BAD[_]+).*(BAD)', expand=True)
  1114. exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
  1115. tm.assert_frame_equal(result, exp)
  1116. # these should work for both Series and Index
  1117. for klass in [Series, Index]:
  1118. # no groups
  1119. s_or_idx = klass(['A1', 'B2', 'C3'])
  1120. f = lambda: s_or_idx.str.extract('[ABC][123]', expand=True)
  1121. pytest.raises(ValueError, f)
  1122. # only non-capturing groups
  1123. f = lambda: s_or_idx.str.extract('(?:[AB]).*', expand=True)
  1124. pytest.raises(ValueError, f)
  1125. # single group renames series/index properly
  1126. s_or_idx = klass(['A1', 'A2'])
  1127. result_df = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=True)
  1128. assert isinstance(result_df, DataFrame)
  1129. result_series = result_df['uno']
  1130. assert_series_equal(result_series, Series(['A', 'A'], name='uno'))
  1131. def test_extract_series(self):
  1132. # extract should give the same result whether or not the
  1133. # series has a name.
  1134. for series_name in None, "series_name":
  1135. s = Series(['A1', 'B2', 'C3'], name=series_name)
  1136. # one group, no matches
  1137. result = s.str.extract('(_)', expand=True)
  1138. exp = DataFrame([NA, NA, NA], dtype=object)
  1139. tm.assert_frame_equal(result, exp)
  1140. # two groups, no matches
  1141. result = s.str.extract('(_)(_)', expand=True)
  1142. exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]], dtype=object)
  1143. tm.assert_frame_equal(result, exp)
  1144. # one group, some matches
  1145. result = s.str.extract('([AB])[123]', expand=True)
  1146. exp = DataFrame(['A', 'B', NA])
  1147. tm.assert_frame_equal(result, exp)
  1148. # two groups, some matches
  1149. result = s.str.extract('([AB])([123])', expand=True)
  1150. exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
  1151. tm.assert_frame_equal(result, exp)
  1152. # one named group
  1153. result = s.str.extract('(?P<letter>[AB])', expand=True)
  1154. exp = DataFrame({"letter": ['A', 'B', NA]})
  1155. tm.assert_frame_equal(result, exp)
  1156. # two named groups
  1157. result = s.str.extract(
  1158. '(?P<letter>[AB])(?P<number>[123])',
  1159. expand=True)
  1160. e_list = [
  1161. ['A', '1'],
  1162. ['B', '2'],
  1163. [NA, NA]
  1164. ]
  1165. exp = DataFrame(e_list, columns=['letter', 'number'])
  1166. tm.assert_frame_equal(result, exp)
  1167. # mix named and unnamed groups
  1168. result = s.str.extract('([AB])(?P<number>[123])', expand=True)
  1169. exp = DataFrame(e_list, columns=[0, 'number'])
  1170. tm.assert_frame_equal(result, exp)
  1171. # one normal group, one non-capturing group
  1172. result = s.str.extract('([AB])(?:[123])', expand=True)
  1173. exp = DataFrame(['A', 'B', NA])
  1174. tm.assert_frame_equal(result, exp)
  1175. def test_extract_optional_groups(self):
  1176. # two normal groups, one non-capturing group
  1177. result = Series(['A11', 'B22', 'C33']).str.extract(
  1178. '([AB])([123])(?:[123])', expand=True)
  1179. exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
  1180. tm.assert_frame_equal(result, exp)
  1181. # one optional group followed by one normal group
  1182. result = Series(['A1', 'B2', '3']).str.extract(
  1183. '(?P<letter>[AB])?(?P<number>[123])', expand=True)
  1184. e_list = [
  1185. ['A', '1'],
  1186. ['B', '2'],
  1187. [NA, '3']
  1188. ]
  1189. exp = DataFrame(e_list, columns=['letter', 'number'])
  1190. tm.assert_frame_equal(result, exp)
  1191. # one normal group followed by one optional group
  1192. result = Series(['A1', 'B2', 'C']).str.extract(
  1193. '(?P<letter>[ABC])(?P<number>[123])?', expand=True)
  1194. e_list = [
  1195. ['A', '1'],
  1196. ['B', '2'],
  1197. ['C', NA]
  1198. ]
  1199. exp = DataFrame(e_list, columns=['letter', 'number'])
  1200. tm.assert_frame_equal(result, exp)
  1201. # GH6348
  1202. # not passing index to the extractor
  1203. def check_index(index):
  1204. data = ['A1', 'B2', 'C']
  1205. index = index[:len(data)]
  1206. result = Series(data, index=index).str.extract(
  1207. r'(\d)', expand=True)
  1208. exp = DataFrame(['1', '2', NA], index=index)
  1209. tm.assert_frame_equal(result, exp)
  1210. result = Series(data, index=index).str.extract(
  1211. r'(?P<letter>\D)(?P<number>\d)?', expand=True)
  1212. e_list = [
  1213. ['A', '1'],
  1214. ['B', '2'],
  1215. ['C', NA]
  1216. ]
  1217. exp = DataFrame(e_list, columns=['letter', 'number'], index=index)
  1218. tm.assert_frame_equal(result, exp)
  1219. i_funs = [
  1220. tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
  1221. tm.makeDateIndex, tm.makePeriodIndex, tm.makeRangeIndex
  1222. ]
  1223. for index in i_funs:
  1224. check_index(index())
  1225. def test_extract_single_group_returns_frame(self):
  1226. # GH11386 extract should always return DataFrame, even when
  1227. # there is only one group. Prior to v0.18.0, extract returned
  1228. # Series when there was only one group in the regex.
  1229. s = Series(['a3', 'b3', 'c2'], name='series_name')
  1230. r = s.str.extract(r'(?P<letter>[a-z])', expand=True)
  1231. e = DataFrame({"letter": ['a', 'b', 'c']})
  1232. tm.assert_frame_equal(r, e)
  1233. def test_extractall(self):
  1234. subject_list = [
  1235. 'dave@google.com',
  1236. 'tdhock5@gmail.com',
  1237. 'maudelaperriere@gmail.com',
  1238. 'rob@gmail.com some text steve@gmail.com',
  1239. 'a@b.com some text c@d.com and e@f.com',
  1240. np.nan,
  1241. "",
  1242. ]
  1243. expected_tuples = [
  1244. ("dave", "google", "com"),
  1245. ("tdhock5", "gmail", "com"),
  1246. ("maudelaperriere", "gmail", "com"),
  1247. ("rob", "gmail", "com"), ("steve", "gmail", "com"),
  1248. ("a", "b", "com"), ("c", "d", "com"), ("e", "f", "com"),
  1249. ]
  1250. named_pattern = r"""
  1251. (?P<user>[a-z0-9]+)
  1252. @
  1253. (?P<domain>[a-z]+)
  1254. \.
  1255. (?P<tld>[a-z]{2,4})
  1256. """
  1257. expected_columns = ["user", "domain", "tld"]
  1258. S = Series(subject_list)
  1259. # extractall should return a DataFrame with one row for each
  1260. # match, indexed by the subject from which the match came.
  1261. expected_index = MultiIndex.from_tuples([
  1262. (0, 0),
  1263. (1, 0),
  1264. (2, 0),
  1265. (3, 0),
  1266. (3, 1),
  1267. (4, 0),
  1268. (4, 1),
  1269. (4, 2),
  1270. ], names=(None, "match"))
  1271. expected_df = DataFrame(
  1272. expected_tuples, expected_index, expected_columns)
  1273. computed_df = S.str.extractall(named_pattern, re.VERBOSE)
  1274. tm.assert_frame_equal(computed_df, expected_df)
  1275. # The index of the input Series should be used to construct
  1276. # the index of the output DataFrame:
  1277. series_index = MultiIndex.from_tuples([
  1278. ("single", "Dave"),
  1279. ("single", "Toby"),
  1280. ("single", "Maude"),
  1281. ("multiple", "robAndSteve"),
  1282. ("multiple", "abcdef"),
  1283. ("none", "missing"),
  1284. ("none", "empty"),
  1285. ])
  1286. Si = Series(subject_list, series_index)
  1287. expected_index = MultiIndex.from_tuples([
  1288. ("single", "Dave", 0),
  1289. ("single", "Toby", 0),
  1290. ("single", "Maude", 0),
  1291. ("multiple", "robAndSteve", 0),
  1292. ("multiple", "robAndSteve", 1),
  1293. ("multiple", "abcdef", 0),
  1294. ("multiple", "abcdef", 1),
  1295. ("multiple", "abcdef", 2),
  1296. ], names=(None, None, "match"))
  1297. expected_df = DataFrame(
  1298. expected_tuples, expected_index, expected_columns)
  1299. computed_df = Si.str.extractall(named_pattern, re.VERBOSE)
  1300. tm.assert_frame_equal(computed_df, expected_df)
  1301. # MultiIndexed subject with names.
  1302. Sn = Series(subject_list, series_index)
  1303. Sn.index.names = ("matches", "description")
  1304. expected_index.names = ("matches", "description", "match")
  1305. expected_df = DataFrame(
  1306. expected_tuples, expected_index, expected_columns)
  1307. computed_df = Sn.str.extractall(named_pattern, re.VERBOSE)
  1308. tm.assert_frame_equal(computed_df, expected_df)
  1309. # optional groups.
  1310. subject_list = ['', 'A1', '32']
  1311. named_pattern = '(?P<letter>[AB])?(?P<number>[123])'
  1312. computed_df = Series(subject_list).str.extractall(named_pattern)
  1313. expected_index = MultiIndex.from_tuples([
  1314. (1, 0),
  1315. (2, 0),
  1316. (2, 1),
  1317. ], names=(None, "match"))
  1318. expected_df = DataFrame([
  1319. ('A', '1'),
  1320. (NA, '3'),
  1321. (NA, '2'),
  1322. ], expected_index, columns=['letter', 'number'])
  1323. tm.assert_frame_equal(computed_df, expected_df)
  1324. # only one of two groups has a name.
  1325. pattern = '([AB])?(?P<number>[123])'
  1326. computed_df = Series(subject_list).str.extractall(pattern)
  1327. expected_df = DataFrame([
  1328. ('A', '1'),
  1329. (NA, '3'),
  1330. (NA, '2'),
  1331. ], expected_index, columns=[0, 'number'])
  1332. tm.assert_frame_equal(computed_df, expected_df)
  1333. def test_extractall_single_group(self):
  1334. # extractall(one named group) returns DataFrame with one named
  1335. # column.
  1336. s = Series(['a3', 'b3', 'd4c2'], name='series_name')
  1337. r = s.str.extractall(r'(?P<letter>[a-z])')
  1338. i = MultiIndex.from_tuples([
  1339. (0, 0),
  1340. (1, 0),
  1341. (2, 0),
  1342. (2, 1),
  1343. ], names=(None, "match"))
  1344. e = DataFrame({"letter": ['a', 'b', 'd', 'c']}, i)
  1345. tm.assert_frame_equal(r, e)
  1346. # extractall(one un-named group) returns DataFrame with one
  1347. # un-named column.
  1348. r = s.str.extractall(r'([a-z])')
  1349. e = DataFrame(['a', 'b', 'd', 'c'], i)
  1350. tm.assert_frame_equal(r, e)
  1351. def test_extractall_single_group_with_quantifier(self):
  1352. # extractall(one un-named group with quantifier) returns
  1353. # DataFrame with one un-named column (GH13382).
  1354. s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name')
  1355. r = s.str.extractall(r'([a-z]+)')
  1356. i = MultiIndex.from_tuples([
  1357. (0, 0),
  1358. (1, 0),
  1359. (2, 0),
  1360. (2, 1),
  1361. ], names=(None, "match"))
  1362. e = DataFrame(['ab', 'abc', 'd', 'cd'], i)
  1363. tm.assert_frame_equal(r, e)
  1364. @pytest.mark.parametrize('data, names', [
  1365. ([], (None, )),
  1366. ([], ('i1', )),
  1367. ([], (None, 'i2')),
  1368. ([], ('i1', 'i2')),
  1369. (['a3', 'b3', 'd4c2'], (None, )),
  1370. (['a3', 'b3', 'd4c2'], ('i1', 'i2')),
  1371. (['a3', 'b3', 'd4c2'], (None, 'i2')),
  1372. (['a3', 'b3', 'd4c2'], ('i1', 'i2')),
  1373. ])
  1374. def test_extractall_no_matches(self, data, names):
  1375. # GH19075 extractall with no matches should return a valid MultiIndex
  1376. n = len(data)
  1377. if len(names) == 1:
  1378. i = Index(range(n), name=names[0])
  1379. else:
  1380. a = (tuple([i] * (n - 1)) for i in range(n))
  1381. i = MultiIndex.from_tuples(a, names=names)
  1382. s = Series(data, name='series_name', index=i, dtype='object')
  1383. ei = MultiIndex.from_tuples([], names=(names + ('match',)))
  1384. # one un-named group.
  1385. r = s.str.extractall('(z)')
  1386. e = DataFrame(columns=[0], index=ei)
  1387. tm.assert_frame_equal(r, e)
  1388. # two un-named groups.
  1389. r = s.str.extractall('(z)(z)')
  1390. e = DataFrame(columns=[0, 1], index=ei)
  1391. tm.assert_frame_equal(r, e)
  1392. # one named group.
  1393. r = s.str.extractall('(?P<first>z)')
  1394. e = DataFrame(columns=["first"], index=ei)
  1395. tm.assert_frame_equal(r, e)
  1396. # two named groups.
  1397. r = s.str.extractall('(?P<first>z)(?P<second>z)')
  1398. e = DataFrame(columns=["first", "second"], index=ei)
  1399. tm.assert_frame_equal(r, e)
  1400. # one named, one un-named.
  1401. r = s.str.extractall('(z)(?P<second>z)')
  1402. e = DataFrame(columns=[0, "second"], index=ei)
  1403. tm.assert_frame_equal(r, e)
  1404. def test_extractall_stringindex(self):
  1405. s = Series(["a1a2", "b1", "c1"], name='xxx')
  1406. res = s.str.extractall(r"[ab](?P<digit>\d)")
  1407. exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
  1408. names=[None, 'match'])
  1409. exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
  1410. tm.assert_frame_equal(res, exp)
  1411. # index should return the same result as the default index without name
  1412. # thus index.name doesn't affect to the result
  1413. for idx in [Index(["a1a2", "b1", "c1"]),
  1414. Index(["a1a2", "b1", "c1"], name='xxx')]:
  1415. res = idx.str.extractall(r"[ab](?P<digit>\d)")
  1416. tm.assert_frame_equal(res, exp)
  1417. s = Series(["a1a2", "b1", "c1"], name='s_name',
  1418. index=Index(["XX", "yy", "zz"], name='idx_name'))
  1419. res = s.str.extractall(r"[ab](?P<digit>\d)")
  1420. exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
  1421. names=["idx_name", 'match'])
  1422. exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
  1423. tm.assert_frame_equal(res, exp)
  1424. def test_extractall_errors(self):
  1425. # Does not make sense to use extractall with a regex that has
  1426. # no capture groups. (it returns DataFrame with one column for
  1427. # each capture group)
  1428. s = Series(['a3', 'b3', 'd4c2'], name='series_name')
  1429. with pytest.raises(ValueError, match="no capture groups"):
  1430. s.str.extractall(r'[a-z]')
  1431. def test_extract_index_one_two_groups(self):
  1432. s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"],
  1433. name='series_name')
  1434. r = s.index.str.extract(r'([A-Z])', expand=True)
  1435. e = DataFrame(['A', "B", "D"])
  1436. tm.assert_frame_equal(r, e)
  1437. # Prior to v0.18.0, index.str.extract(regex with one group)
  1438. # returned Index. With more than one group, extract raised an
  1439. # error (GH9980). Now extract always returns DataFrame.
  1440. r = s.index.str.extract(
  1441. r'(?P<letter>[A-Z])(?P<digit>[0-9])', expand=True)
  1442. e_list = [
  1443. ("A", "3"),
  1444. ("B", "3"),
  1445. ("D", "4"),
  1446. ]
  1447. e = DataFrame(e_list, columns=["letter", "digit"])
  1448. tm.assert_frame_equal(r, e)
  1449. def test_extractall_same_as_extract(self):
  1450. s = Series(['a3', 'b3', 'c2'], name='series_name')
  1451. pattern_two_noname = r'([a-z])([0-9])'
  1452. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  1453. has_multi_index = s.str.extractall(pattern_two_noname)
  1454. no_multi_index = has_multi_index.xs(0, level="match")
  1455. tm.assert_frame_equal(extract_two_noname, no_multi_index)
  1456. pattern_two_named = r'(?P<letter>[a-z])(?P<digit>[0-9])'
  1457. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  1458. has_multi_index = s.str.extractall(pattern_two_named)
  1459. no_multi_index = has_multi_index.xs(0, level="match")
  1460. tm.assert_frame_equal(extract_two_named, no_multi_index)
  1461. pattern_one_named = r'(?P<group_name>[a-z])'
  1462. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  1463. has_multi_index = s.str.extractall(pattern_one_named)
  1464. no_multi_index = has_multi_index.xs(0, level="match")
  1465. tm.assert_frame_equal(extract_one_named, no_multi_index)
  1466. pattern_one_noname = r'([a-z])'
  1467. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  1468. has_multi_index = s.str.extractall(pattern_one_noname)
  1469. no_multi_index = has_multi_index.xs(0, level="match")
  1470. tm.assert_frame_equal(extract_one_noname, no_multi_index)
  1471. def test_extractall_same_as_extract_subject_index(self):
  1472. # same as above tests, but s has an MultiIndex.
  1473. i = MultiIndex.from_tuples([
  1474. ("A", "first"),
  1475. ("B", "second"),
  1476. ("C", "third"),
  1477. ], names=("capital", "ordinal"))
  1478. s = Series(['a3', 'b3', 'c2'], i, name='series_name')
  1479. pattern_two_noname = r'([a-z])([0-9])'
  1480. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  1481. has_match_index = s.str.extractall(pattern_two_noname)
  1482. no_match_index = has_match_index.xs(0, level="match")
  1483. tm.assert_frame_equal(extract_two_noname, no_match_index)
  1484. pattern_two_named = r'(?P<letter>[a-z])(?P<digit>[0-9])'
  1485. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  1486. has_match_index = s.str.extractall(pattern_two_named)
  1487. no_match_index = has_match_index.xs(0, level="match")
  1488. tm.assert_frame_equal(extract_two_named, no_match_index)
  1489. pattern_one_named = r'(?P<group_name>[a-z])'
  1490. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  1491. has_match_index = s.str.extractall(pattern_one_named)
  1492. no_match_index = has_match_index.xs(0, level="match")
  1493. tm.assert_frame_equal(extract_one_named, no_match_index)
  1494. pattern_one_noname = r'([a-z])'
  1495. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  1496. has_match_index = s.str.extractall(pattern_one_noname)
  1497. no_match_index = has_match_index.xs(0, level="match")
  1498. tm.assert_frame_equal(extract_one_noname, no_match_index)
  1499. def test_empty_str_methods(self):
  1500. empty_str = empty = Series(dtype=object)
  1501. empty_int = Series(dtype=int)
  1502. empty_bool = Series(dtype=bool)
  1503. empty_bytes = Series(dtype=object)
  1504. # GH7241
  1505. # (extract) on empty series
  1506. tm.assert_series_equal(empty_str, empty.str.cat(empty))
  1507. assert '' == empty.str.cat()
  1508. tm.assert_series_equal(empty_str, empty.str.title())
  1509. tm.assert_series_equal(empty_int, empty.str.count('a'))
  1510. tm.assert_series_equal(empty_bool, empty.str.contains('a'))
  1511. tm.assert_series_equal(empty_bool, empty.str.startswith('a'))
  1512. tm.assert_series_equal(empty_bool, empty.str.endswith('a'))
  1513. tm.assert_series_equal(empty_str, empty.str.lower())
  1514. tm.assert_series_equal(empty_str, empty.str.upper())
  1515. tm.assert_series_equal(empty_str, empty.str.replace('a', 'b'))
  1516. tm.assert_series_equal(empty_str, empty.str.repeat(3))
  1517. tm.assert_series_equal(empty_bool, empty.str.match('^a'))
  1518. tm.assert_frame_equal(
  1519. DataFrame(columns=[0], dtype=str),
  1520. empty.str.extract('()', expand=True))
  1521. tm.assert_frame_equal(
  1522. DataFrame(columns=[0, 1], dtype=str),
  1523. empty.str.extract('()()', expand=True))
  1524. tm.assert_series_equal(
  1525. empty_str,
  1526. empty.str.extract('()', expand=False))
  1527. tm.assert_frame_equal(
  1528. DataFrame(columns=[0, 1], dtype=str),
  1529. empty.str.extract('()()', expand=False))
  1530. tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies())
  1531. tm.assert_series_equal(empty_str, empty_str.str.join(''))
  1532. tm.assert_series_equal(empty_int, empty.str.len())
  1533. tm.assert_series_equal(empty_str, empty_str.str.findall('a'))
  1534. tm.assert_series_equal(empty_int, empty.str.find('a'))
  1535. tm.assert_series_equal(empty_int, empty.str.rfind('a'))
  1536. tm.assert_series_equal(empty_str, empty.str.pad(42))
  1537. tm.assert_series_equal(empty_str, empty.str.center(42))
  1538. tm.assert_series_equal(empty_str, empty.str.split('a'))
  1539. tm.assert_series_equal(empty_str, empty.str.rsplit('a'))
  1540. tm.assert_series_equal(empty_str,
  1541. empty.str.partition('a', expand=False))
  1542. tm.assert_series_equal(empty_str,
  1543. empty.str.rpartition('a', expand=False))
  1544. tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
  1545. tm.assert_series_equal(empty_str, empty.str.slice(step=1))
  1546. tm.assert_series_equal(empty_str, empty.str.strip())
  1547. tm.assert_series_equal(empty_str, empty.str.lstrip())
  1548. tm.assert_series_equal(empty_str, empty.str.rstrip())
  1549. tm.assert_series_equal(empty_str, empty.str.wrap(42))
  1550. tm.assert_series_equal(empty_str, empty.str.get(0))
  1551. tm.assert_series_equal(empty_str, empty_bytes.str.decode('ascii'))
  1552. tm.assert_series_equal(empty_bytes, empty.str.encode('ascii'))
  1553. tm.assert_series_equal(empty_str, empty.str.isalnum())
  1554. tm.assert_series_equal(empty_str, empty.str.isalpha())
  1555. tm.assert_series_equal(empty_str, empty.str.isdigit())
  1556. tm.assert_series_equal(empty_str, empty.str.isspace())
  1557. tm.assert_series_equal(empty_str, empty.str.islower())
  1558. tm.assert_series_equal(empty_str, empty.str.isupper())
  1559. tm.assert_series_equal(empty_str, empty.str.istitle())
  1560. tm.assert_series_equal(empty_str, empty.str.isnumeric())
  1561. tm.assert_series_equal(empty_str, empty.str.isdecimal())
  1562. tm.assert_series_equal(empty_str, empty.str.capitalize())
  1563. tm.assert_series_equal(empty_str, empty.str.swapcase())
  1564. tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
  1565. if compat.PY3:
  1566. table = str.maketrans('a', 'b')
  1567. else:
  1568. import string
  1569. table = string.maketrans('a', 'b')
  1570. tm.assert_series_equal(empty_str, empty.str.translate(table))
  1571. def test_empty_str_methods_to_frame(self):
  1572. empty = Series(dtype=str)
  1573. empty_df = DataFrame([])
  1574. tm.assert_frame_equal(empty_df, empty.str.partition('a'))
  1575. tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))
  1576. def test_ismethods(self):
  1577. values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
  1578. str_s = Series(values)
  1579. alnum_e = [True, True, True, True, True, False, True, True, False,
  1580. False]
  1581. alpha_e = [True, True, True, False, False, False, True, False, False,
  1582. False]
  1583. digit_e = [False, False, False, True, False, False, False, True, False,
  1584. False]
  1585. # TODO: unused
  1586. num_e = [False, False, False, True, False, False, # noqa
  1587. False, True, False, False]
  1588. space_e = [False, False, False, False, False, False, False, False,
  1589. False, True]
  1590. lower_e = [False, True, False, False, False, False, False, False,
  1591. False, False]
  1592. upper_e = [True, False, False, False, True, False, True, False, False,
  1593. False]
  1594. title_e = [True, False, True, False, True, False, False, False, False,
  1595. False]
  1596. tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e))
  1597. tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e))
  1598. tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e))
  1599. tm.assert_series_equal(str_s.str.isspace(), Series(space_e))
  1600. tm.assert_series_equal(str_s.str.islower(), Series(lower_e))
  1601. tm.assert_series_equal(str_s.str.isupper(), Series(upper_e))
  1602. tm.assert_series_equal(str_s.str.istitle(), Series(title_e))
  1603. assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values]
  1604. assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values]
  1605. assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values]
  1606. assert str_s.str.isspace().tolist() == [v.isspace() for v in values]
  1607. assert str_s.str.islower().tolist() == [v.islower() for v in values]
  1608. assert str_s.str.isupper().tolist() == [v.isupper() for v in values]
  1609. assert str_s.str.istitle().tolist() == [v.istitle() for v in values]
  1610. def test_isnumeric(self):
  1611. # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
  1612. # 0x2605: ★ not number
  1613. # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
  1614. # 0xFF13: 3 Em 3
  1615. values = ['A', '3', u'¼', u'★', u'፸', u'3', 'four']
  1616. s = Series(values)
  1617. numeric_e = [False, True, True, False, True, True, False]
  1618. decimal_e = [False, True, False, False, False, True, False]
  1619. tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
  1620. tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
  1621. unicodes = [u'A', u'3', u'¼', u'★', u'፸', u'3', u'four']
  1622. assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes]
  1623. assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes]
  1624. values = ['A', np.nan, u'¼', u'★', np.nan, u'3', 'four']
  1625. s = Series(values)
  1626. numeric_e = [False, np.nan, True, False, np.nan, True, False]
  1627. decimal_e = [False, np.nan, False, False, np.nan, True, False]
  1628. tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e))
  1629. tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e))
  1630. def test_get_dummies(self):
  1631. s = Series(['a|b', 'a|c', np.nan])
  1632. result = s.str.get_dummies('|')
  1633. expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]],
  1634. columns=list('abc'))
  1635. tm.assert_frame_equal(result, expected)
  1636. s = Series(['a;b', 'a', 7])
  1637. result = s.str.get_dummies(';')
  1638. expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]],
  1639. columns=list('7ab'))
  1640. tm.assert_frame_equal(result, expected)
  1641. # GH9980, GH8028
  1642. idx = Index(['a|b', 'a|c', 'b|c'])
  1643. result = idx.str.get_dummies('|')
  1644. expected = MultiIndex.from_tuples([(1, 1, 0), (1, 0, 1),
  1645. (0, 1, 1)], names=('a', 'b', 'c'))
  1646. tm.assert_index_equal(result, expected)
  1647. def test_get_dummies_with_name_dummy(self):
  1648. # GH 12180
  1649. # Dummies named 'name' should work as expected
  1650. s = Series(['a', 'b,name', 'b'])
  1651. result = s.str.get_dummies(',')
  1652. expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]],
  1653. columns=['a', 'b', 'name'])
  1654. tm.assert_frame_equal(result, expected)
  1655. idx = Index(['a|b', 'name|c', 'b|name'])
  1656. result = idx.str.get_dummies('|')
  1657. expected = MultiIndex.from_tuples([(1, 1, 0, 0), (0, 0, 1, 1),
  1658. (0, 1, 0, 1)],
  1659. names=('a', 'b', 'c', 'name'))
  1660. tm.assert_index_equal(result, expected)
  1661. def test_join(self):
  1662. values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
  1663. result = values.str.split('_').str.join('_')
  1664. tm.assert_series_equal(values, result)
  1665. # mixed
  1666. mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
  1667. 'foo', None, 1, 2.])
  1668. rs = Series(mixed).str.split('_').str.join('_')
  1669. xp = Series(['a_b', NA, 'asdf_cas_asdf', NA, NA, 'foo', NA, NA, NA])
  1670. assert isinstance(rs, Series)
  1671. tm.assert_almost_equal(rs, xp)
  1672. # unicode
  1673. values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')])
  1674. result = values.str.split('_').str.join('_')
  1675. tm.assert_series_equal(values, result)
  1676. def test_len(self):
  1677. values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo'])
  1678. result = values.str.len()
  1679. exp = values.map(lambda x: len(x) if notna(x) else NA)
  1680. tm.assert_series_equal(result, exp)
  1681. # mixed
  1682. mixed = Series(['a_b', NA, 'asdf_cas_asdf', True, datetime.today(),
  1683. 'foo', None, 1, 2.])
  1684. rs = Series(mixed).str.len()
  1685. xp = Series([3, NA, 13, NA, NA, 3, NA, NA, NA])
  1686. assert isinstance(rs, Series)
  1687. tm.assert_almost_equal(rs, xp)
  1688. # unicode
  1689. values = Series([u('foo'), u('fooo'), u('fooooo'), np.nan, u(
  1690. 'fooooooo')])
  1691. result = values.str.len()
  1692. exp = values.map(lambda x: len(x) if notna(x) else NA)
  1693. tm.assert_series_equal(result, exp)
  1694. def test_findall(self):
  1695. values = Series(['fooBAD__barBAD', NA, 'foo', 'BAD'])
  1696. result = values.str.findall('BAD[_]*')
  1697. exp = Series([['BAD__', 'BAD'], NA, [], ['BAD']])
  1698. tm.assert_almost_equal(result, exp)
  1699. # mixed
  1700. mixed = Series(['fooBAD__barBAD', NA, 'foo', True, datetime.today(),
  1701. 'BAD', None, 1, 2.])
  1702. rs = Series(mixed).str.findall('BAD[_]*')
  1703. xp = Series([['BAD__', 'BAD'], NA, [], NA, NA, ['BAD'], NA, NA, NA])
  1704. assert isinstance(rs, Series)
  1705. tm.assert_almost_equal(rs, xp)
  1706. # unicode
  1707. values = Series([u('fooBAD__barBAD'), NA, u('foo'), u('BAD')])
  1708. result = values.str.findall('BAD[_]*')
  1709. exp = Series([[u('BAD__'), u('BAD')], NA, [], [u('BAD')]])
  1710. tm.assert_almost_equal(result, exp)
  1711. def test_find(self):
  1712. values = Series(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXXX'])
  1713. result = values.str.find('EF')
  1714. tm.assert_series_equal(result, Series([4, 3, 1, 0, -1]))
  1715. expected = np.array([v.find('EF') for v in values.values],
  1716. dtype=np.int64)
  1717. tm.assert_numpy_array_equal(result.values, expected)
  1718. result = values.str.rfind('EF')
  1719. tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
  1720. expected = np.array([v.rfind('EF') for v in values.values],
  1721. dtype=np.int64)
  1722. tm.assert_numpy_array_equal(result.values, expected)
  1723. result = values.str.find('EF', 3)
  1724. tm.assert_series_equal(result, Series([4, 3, 7, 4, -1]))
  1725. expected = np.array([v.find('EF', 3) for v in values.values],
  1726. dtype=np.int64)
  1727. tm.assert_numpy_array_equal(result.values, expected)
  1728. result = values.str.rfind('EF', 3)
  1729. tm.assert_series_equal(result, Series([4, 5, 7, 4, -1]))
  1730. expected = np.array([v.rfind('EF', 3) for v in values.values],
  1731. dtype=np.int64)
  1732. tm.assert_numpy_array_equal(result.values, expected)
  1733. result = values.str.find('EF', 3, 6)
  1734. tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
  1735. expected = np.array([v.find('EF', 3, 6) for v in values.values],
  1736. dtype=np.int64)
  1737. tm.assert_numpy_array_equal(result.values, expected)
  1738. result = values.str.rfind('EF', 3, 6)
  1739. tm.assert_series_equal(result, Series([4, 3, -1, 4, -1]))
  1740. expected = np.array([v.rfind('EF', 3, 6) for v in values.values],
  1741. dtype=np.int64)
  1742. tm.assert_numpy_array_equal(result.values, expected)
  1743. with pytest.raises(TypeError,
  1744. match="expected a string object, not int"):
  1745. result = values.str.find(0)
  1746. with pytest.raises(TypeError,
  1747. match="expected a string object, not int"):
  1748. result = values.str.rfind(0)
  1749. def test_find_nan(self):
  1750. values = Series(['ABCDEFG', np.nan, 'DEFGHIJEF', np.nan, 'XXXX'])
  1751. result = values.str.find('EF')
  1752. tm.assert_series_equal(result, Series([4, np.nan, 1, np.nan, -1]))
  1753. result = values.str.rfind('EF')
  1754. tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
  1755. result = values.str.find('EF', 3)
  1756. tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
  1757. result = values.str.rfind('EF', 3)
  1758. tm.assert_series_equal(result, Series([4, np.nan, 7, np.nan, -1]))
  1759. result = values.str.find('EF', 3, 6)
  1760. tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
  1761. result = values.str.rfind('EF', 3, 6)
  1762. tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
  1763. def test_index(self):
  1764. def _check(result, expected):
  1765. if isinstance(result, Series):
  1766. tm.assert_series_equal(result, expected)
  1767. else:
  1768. tm.assert_index_equal(result, expected)
  1769. for klass in [Series, Index]:
  1770. s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF'])
  1771. result = s.str.index('EF')
  1772. _check(result, klass([4, 3, 1, 0]))
  1773. expected = np.array([v.index('EF') for v in s.values],
  1774. dtype=np.int64)
  1775. tm.assert_numpy_array_equal(result.values, expected)
  1776. result = s.str.rindex('EF')
  1777. _check(result, klass([4, 5, 7, 4]))
  1778. expected = np.array([v.rindex('EF') for v in s.values],
  1779. dtype=np.int64)
  1780. tm.assert_numpy_array_equal(result.values, expected)
  1781. result = s.str.index('EF', 3)
  1782. _check(result, klass([4, 3, 7, 4]))
  1783. expected = np.array([v.index('EF', 3) for v in s.values],
  1784. dtype=np.int64)
  1785. tm.assert_numpy_array_equal(result.values, expected)
  1786. result = s.str.rindex('EF', 3)
  1787. _check(result, klass([4, 5, 7, 4]))
  1788. expected = np.array([v.rindex('EF', 3) for v in s.values],
  1789. dtype=np.int64)
  1790. tm.assert_numpy_array_equal(result.values, expected)
  1791. result = s.str.index('E', 4, 8)
  1792. _check(result, klass([4, 5, 7, 4]))
  1793. expected = np.array([v.index('E', 4, 8) for v in s.values],
  1794. dtype=np.int64)
  1795. tm.assert_numpy_array_equal(result.values, expected)
  1796. result = s.str.rindex('E', 0, 5)
  1797. _check(result, klass([4, 3, 1, 4]))
  1798. expected = np.array([v.rindex('E', 0, 5) for v in s.values],
  1799. dtype=np.int64)
  1800. tm.assert_numpy_array_equal(result.values, expected)
  1801. with pytest.raises(ValueError, match="substring not found"):
  1802. result = s.str.index('DE')
  1803. msg = "expected a string object, not int"
  1804. with pytest.raises(TypeError, match=msg):
  1805. result = s.str.index(0)
  1806. # test with nan
  1807. s = Series(['abcb', 'ab', 'bcbe', np.nan])
  1808. result = s.str.index('b')
  1809. tm.assert_series_equal(result, Series([1, 1, 0, np.nan]))
  1810. result = s.str.rindex('b')
  1811. tm.assert_series_equal(result, Series([3, 1, 2, np.nan]))
  1812. def test_pad(self):
  1813. values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
  1814. result = values.str.pad(5, side='left')
  1815. exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee'])
  1816. tm.assert_almost_equal(result, exp)
  1817. result = values.str.pad(5, side='right')
  1818. exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee'])
  1819. tm.assert_almost_equal(result, exp)
  1820. result = values.str.pad(5, side='both')
  1821. exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee'])
  1822. tm.assert_almost_equal(result, exp)
  1823. # mixed
  1824. mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
  1825. ])
  1826. rs = Series(mixed).str.pad(5, side='left')
  1827. xp = Series([' a', NA, ' b', NA, NA, ' ee', NA, NA, NA])
  1828. assert isinstance(rs, Series)
  1829. tm.assert_almost_equal(rs, xp)
  1830. mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
  1831. ])
  1832. rs = Series(mixed).str.pad(5, side='right')
  1833. xp = Series(['a ', NA, 'b ', NA, NA, 'ee ', NA, NA, NA])
  1834. assert isinstance(rs, Series)
  1835. tm.assert_almost_equal(rs, xp)
  1836. mixed = Series(['a', NA, 'b', True, datetime.today(), 'ee', None, 1, 2.
  1837. ])
  1838. rs = Series(mixed).str.pad(5, side='both')
  1839. xp = Series([' a ', NA, ' b ', NA, NA, ' ee ', NA, NA, NA])
  1840. assert isinstance(rs, Series)
  1841. tm.assert_almost_equal(rs, xp)
  1842. # unicode
  1843. values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')])
  1844. result = values.str.pad(5, side='left')
  1845. exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')])
  1846. tm.assert_almost_equal(result, exp)
  1847. result = values.str.pad(5, side='right')
  1848. exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')])
  1849. tm.assert_almost_equal(result, exp)
  1850. result = values.str.pad(5, side='both')
  1851. exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')])
  1852. tm.assert_almost_equal(result, exp)
  1853. def test_pad_fillchar(self):
  1854. values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
  1855. result = values.str.pad(5, side='left', fillchar='X')
  1856. exp = Series(['XXXXa', 'XXXXb', NA, 'XXXXc', NA, 'eeeeee'])
  1857. tm.assert_almost_equal(result, exp)
  1858. result = values.str.pad(5, side='right', fillchar='X')
  1859. exp = Series(['aXXXX', 'bXXXX', NA, 'cXXXX', NA, 'eeeeee'])
  1860. tm.assert_almost_equal(result, exp)
  1861. result = values.str.pad(5, side='both', fillchar='X')
  1862. exp = Series(['XXaXX', 'XXbXX', NA, 'XXcXX', NA, 'eeeeee'])
  1863. tm.assert_almost_equal(result, exp)
  1864. msg = "fillchar must be a character, not str"
  1865. with pytest.raises(TypeError, match=msg):
  1866. result = values.str.pad(5, fillchar='XY')
  1867. msg = "fillchar must be a character, not int"
  1868. with pytest.raises(TypeError, match=msg):
  1869. result = values.str.pad(5, fillchar=5)
  1870. @pytest.mark.parametrize("f", ['center', 'ljust', 'rjust', 'zfill', 'pad'])
  1871. def test_pad_width(self, f):
  1872. # see gh-13598
  1873. s = Series(['1', '22', 'a', 'bb'])
  1874. msg = "width must be of integer type, not*"
  1875. with pytest.raises(TypeError, match=msg):
  1876. getattr(s.str, f)('f')
  1877. def test_translate(self):
  1878. def _check(result, expected):
  1879. if isinstance(result, Series):
  1880. tm.assert_series_equal(result, expected)
  1881. else:
  1882. tm.assert_index_equal(result, expected)
  1883. for klass in [Series, Index]:
  1884. s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg'])
  1885. if not compat.PY3:
  1886. import string
  1887. table = string.maketrans('abc', 'cde')
  1888. else:
  1889. table = str.maketrans('abc', 'cde')
  1890. result = s.str.translate(table)
  1891. expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg'])
  1892. _check(result, expected)
  1893. # use of deletechars is python 2 only
  1894. if not compat.PY3:
  1895. result = s.str.translate(table, deletechars='fg')
  1896. expected = klass(['cdede', 'cdee', 'eddd', 'ede'])
  1897. _check(result, expected)
  1898. result = s.str.translate(None, deletechars='fg')
  1899. expected = klass(['abcde', 'abcc', 'cddd', 'cde'])
  1900. _check(result, expected)
  1901. else:
  1902. msg = "deletechars is not a valid argument"
  1903. with pytest.raises(ValueError, match=msg):
  1904. result = s.str.translate(table, deletechars='fg')
  1905. # Series with non-string values
  1906. s = Series(['a', 'b', 'c', 1.2])
  1907. expected = Series(['c', 'd', 'e', np.nan])
  1908. result = s.str.translate(table)
  1909. tm.assert_series_equal(result, expected)
  1910. def test_center_ljust_rjust(self):
  1911. values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])
  1912. result = values.str.center(5)
  1913. exp = Series([' a ', ' b ', NA, ' c ', NA, 'eeeeee'])
  1914. tm.assert_almost_equal(result, exp)
  1915. result = values.str.ljust(5)
  1916. exp = Series(['a ', 'b ', NA, 'c ', NA, 'eeeeee'])
  1917. tm.assert_almost_equal(result, exp)
  1918. result = values.str.rjust(5)
  1919. exp = Series([' a', ' b', NA, ' c', NA, 'eeeeee'])
  1920. tm.assert_almost_equal(result, exp)
  1921. # mixed
  1922. mixed = Series(['a', NA, 'b', True, datetime.today(), 'c', 'eee', None,
  1923. 1, 2.])
  1924. rs = Series(mixed).str.center(5)
  1925. xp = Series([' a ', NA, ' b ', NA, NA, ' c ', ' eee ', NA, NA, NA
  1926. ])
  1927. assert isinstance(rs, Series)
  1928. tm.assert_almost_equal(rs, xp)
  1929. rs = Series(mixed).str.ljust(5)
  1930. xp = Series(['a ', NA, 'b ', NA, NA, 'c ', 'eee ', NA, NA, NA
  1931. ])
  1932. assert isinstance(rs, Series)
  1933. tm.assert_almost_equal(rs, xp)
  1934. rs = Series(mixed).str.rjust(5)
  1935. xp = Series([' a', NA, ' b', NA, NA, ' c', ' eee', NA, NA, NA
  1936. ])
  1937. assert isinstance(rs, Series)
  1938. tm.assert_almost_equal(rs, xp)
  1939. # unicode
  1940. values = Series([u('a'), u('b'), NA, u('c'), NA, u('eeeeee')])
  1941. result = values.str.center(5)
  1942. exp = Series([u(' a '), u(' b '), NA, u(' c '), NA, u('eeeeee')])
  1943. tm.assert_almost_equal(result, exp)
  1944. result = values.str.ljust(5)
  1945. exp = Series([u('a '), u('b '), NA, u('c '), NA, u('eeeeee')])
  1946. tm.assert_almost_equal(result, exp)
  1947. result = values.str.rjust(5)
  1948. exp = Series([u(' a'), u(' b'), NA, u(' c'), NA, u('eeeeee')])
  1949. tm.assert_almost_equal(result, exp)
  1950. def test_center_ljust_rjust_fillchar(self):
  1951. values = Series(['a', 'bb', 'cccc', 'ddddd', 'eeeeee'])
  1952. result = values.str.center(5, fillchar='X')
  1953. expected = Series(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee'])
  1954. tm.assert_series_equal(result, expected)
  1955. expected = np.array([v.center(5, 'X') for v in values.values],
  1956. dtype=np.object_)
  1957. tm.assert_numpy_array_equal(result.values, expected)
  1958. result = values.str.ljust(5, fillchar='X')
  1959. expected = Series(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee'])
  1960. tm.assert_series_equal(result, expected)
  1961. expected = np.array([v.ljust(5, 'X') for v in values.values],
  1962. dtype=np.object_)
  1963. tm.assert_numpy_array_equal(result.values, expected)
  1964. result = values.str.rjust(5, fillchar='X')
  1965. expected = Series(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee'])
  1966. tm.assert_series_equal(result, expected)
  1967. expected = np.array([v.rjust(5, 'X') for v in values.values],
  1968. dtype=np.object_)
  1969. tm.assert_numpy_array_equal(result.values, expected)
  1970. # If fillchar is not a charatter, normal str raises TypeError
  1971. # 'aaa'.ljust(5, 'XY')
  1972. # TypeError: must be char, not str
  1973. template = "fillchar must be a character, not {dtype}"
  1974. with pytest.raises(TypeError, match=template.format(dtype="str")):
  1975. values.str.center(5, fillchar='XY')
  1976. with pytest.raises(TypeError, match=template.format(dtype="str")):
  1977. values.str.ljust(5, fillchar='XY')
  1978. with pytest.raises(TypeError, match=template.format(dtype="str")):
  1979. values.str.rjust(5, fillchar='XY')
  1980. with pytest.raises(TypeError, match=template.format(dtype="int")):
  1981. values.str.center(5, fillchar=1)
  1982. with pytest.raises(TypeError, match=template.format(dtype="int")):
  1983. values.str.ljust(5, fillchar=1)
  1984. with pytest.raises(TypeError, match=template.format(dtype="int")):
  1985. values.str.rjust(5, fillchar=1)
  1986. def test_zfill(self):
  1987. values = Series(['1', '22', 'aaa', '333', '45678'])
  1988. result = values.str.zfill(5)
  1989. expected = Series(['00001', '00022', '00aaa', '00333', '45678'])
  1990. tm.assert_series_equal(result, expected)
  1991. expected = np.array([v.zfill(5) for v in values.values],
  1992. dtype=np.object_)
  1993. tm.assert_numpy_array_equal(result.values, expected)
  1994. result = values.str.zfill(3)
  1995. expected = Series(['001', '022', 'aaa', '333', '45678'])
  1996. tm.assert_series_equal(result, expected)
  1997. expected = np.array([v.zfill(3) for v in values.values],
  1998. dtype=np.object_)
  1999. tm.assert_numpy_array_equal(result.values, expected)
  2000. values = Series(['1', np.nan, 'aaa', np.nan, '45678'])
  2001. result = values.str.zfill(5)
  2002. expected = Series(['00001', np.nan, '00aaa', np.nan, '45678'])
  2003. tm.assert_series_equal(result, expected)
  2004. def test_split(self):
  2005. values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
  2006. result = values.str.split('_')
  2007. exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
  2008. tm.assert_series_equal(result, exp)
  2009. # more than one char
  2010. values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
  2011. result = values.str.split('__')
  2012. tm.assert_series_equal(result, exp)
  2013. result = values.str.split('__', expand=False)
  2014. tm.assert_series_equal(result, exp)
  2015. # mixed
  2016. mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1,
  2017. 2.])
  2018. result = mixed.str.split('_')
  2019. exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA
  2020. ])
  2021. assert isinstance(result, Series)
  2022. tm.assert_almost_equal(result, exp)
  2023. result = mixed.str.split('_', expand=False)
  2024. assert isinstance(result, Series)
  2025. tm.assert_almost_equal(result, exp)
  2026. # unicode
  2027. values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
  2028. result = values.str.split('_')
  2029. exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
  2030. [u('f'), u('g'), u('h')]])
  2031. tm.assert_series_equal(result, exp)
  2032. result = values.str.split('_', expand=False)
  2033. tm.assert_series_equal(result, exp)
  2034. # regex split
  2035. values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
  2036. result = values.str.split('[,_]')
  2037. exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
  2038. [u('f'), u('g'), u('h')]])
  2039. tm.assert_series_equal(result, exp)
  2040. def test_rsplit(self):
  2041. values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
  2042. result = values.str.rsplit('_')
  2043. exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
  2044. tm.assert_series_equal(result, exp)
  2045. # more than one char
  2046. values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
  2047. result = values.str.rsplit('__')
  2048. tm.assert_series_equal(result, exp)
  2049. result = values.str.rsplit('__', expand=False)
  2050. tm.assert_series_equal(result, exp)
  2051. # mixed
  2052. mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1,
  2053. 2.])
  2054. result = mixed.str.rsplit('_')
  2055. exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA
  2056. ])
  2057. assert isinstance(result, Series)
  2058. tm.assert_almost_equal(result, exp)
  2059. result = mixed.str.rsplit('_', expand=False)
  2060. assert isinstance(result, Series)
  2061. tm.assert_almost_equal(result, exp)
  2062. # unicode
  2063. values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
  2064. result = values.str.rsplit('_')
  2065. exp = Series([[u('a'), u('b'), u('c')], [u('c'), u('d'), u('e')], NA,
  2066. [u('f'), u('g'), u('h')]])
  2067. tm.assert_series_equal(result, exp)
  2068. result = values.str.rsplit('_', expand=False)
  2069. tm.assert_series_equal(result, exp)
  2070. # regex split is not supported by rsplit
  2071. values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
  2072. result = values.str.rsplit('[,_]')
  2073. exp = Series([[u('a,b_c')], [u('c_d,e')], NA, [u('f,g,h')]])
  2074. tm.assert_series_equal(result, exp)
  2075. # setting max number of splits, make sure it's from reverse
  2076. values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
  2077. result = values.str.rsplit('_', n=1)
  2078. exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']])
  2079. tm.assert_series_equal(result, exp)
  2080. def test_split_blank_string(self):
  2081. # expand blank split GH 20067
  2082. values = Series([''], name='test')
  2083. result = values.str.split(expand=True)
  2084. exp = DataFrame([[]])
  2085. tm.assert_frame_equal(result, exp)
  2086. values = Series(['a b c', 'a b', '', ' '], name='test')
  2087. result = values.str.split(expand=True)
  2088. exp = DataFrame([['a', 'b', 'c'], ['a', 'b', np.nan],
  2089. [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]])
  2090. tm.assert_frame_equal(result, exp)
  2091. def test_split_noargs(self):
  2092. # #1859
  2093. s = Series(['Wes McKinney', 'Travis Oliphant'])
  2094. result = s.str.split()
  2095. expected = ['Travis', 'Oliphant']
  2096. assert result[1] == expected
  2097. result = s.str.rsplit()
  2098. assert result[1] == expected
  2099. def test_split_maxsplit(self):
  2100. # re.split 0, str.split -1
  2101. s = Series(['bd asdf jfg', 'kjasdflqw asdfnfk'])
  2102. result = s.str.split(n=-1)
  2103. xp = s.str.split()
  2104. tm.assert_series_equal(result, xp)
  2105. result = s.str.split(n=0)
  2106. tm.assert_series_equal(result, xp)
  2107. xp = s.str.split('asdf')
  2108. result = s.str.split('asdf', n=0)
  2109. tm.assert_series_equal(result, xp)
  2110. result = s.str.split('asdf', n=-1)
  2111. tm.assert_series_equal(result, xp)
  2112. def test_split_no_pat_with_nonzero_n(self):
  2113. s = Series(['split once', 'split once too!'])
  2114. result = s.str.split(n=1)
  2115. expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']})
  2116. tm.assert_series_equal(expected, result, check_index_type=False)
  def test_split_to_dataframe(self):
      """``Series.str.split(expand=True)`` spreads the pieces over columns."""
      # separator absent: one column holding the untouched strings
      ser = Series(['nosplit', 'alsonosplit'])
      frame = ser.str.split('_', expand=True)
      tm.assert_frame_equal(
          frame, DataFrame({0: Series(['nosplit', 'alsonosplit'])}))

      # same number of pieces in every row
      ser = Series(['some_equal_splits', 'with_no_nans'])
      frame = ser.str.split('_', expand=True)
      wanted = DataFrame({0: ['some', 'with'],
                          1: ['equal', 'no'],
                          2: ['splits', 'nans']})
      tm.assert_frame_equal(frame, wanted)

      # rows with fewer pieces are padded with NA
      ser = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
      frame = ser.str.split('_', expand=True)
      wanted = DataFrame({0: ['some', 'one'],
                          1: ['unequal', 'of'],
                          2: ['splits', 'these'],
                          3: [NA, 'things'],
                          4: [NA, 'is'],
                          5: [NA, 'not']})
      tm.assert_frame_equal(frame, wanted)

      # the index of the input is kept
      ser = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
      frame = ser.str.split('_', expand=True)
      wanted = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
                         index=['preserve', 'me'])
      tm.assert_frame_equal(frame, wanted)

      # expand only accepts booleans
      with pytest.raises(ValueError, match="expand must be"):
          ser.str.split('_', expand="not_a_boolean")
  def test_split_to_multiindex_expand(self):
      """``Index.str.split(expand=True)`` with missing values in the index."""
      # https://github.com/pandas-dev/pandas/issues/23677

      # separator absent: the flat index comes back unchanged
      idx = Index(['nosplit', 'alsonosplit', np.nan])
      out = idx.str.split('_', expand=True)
      tm.assert_index_equal(out, idx)
      assert out.nlevels == 1

      # equal number of pieces: three levels, missing rows stay missing
      idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None])
      out = idx.str.split('_', expand=True)
      wanted = MultiIndex.from_tuples([('some', 'equal', 'splits'),
                                       ('with', 'no', 'nans'),
                                       [np.nan, np.nan, np.nan],
                                       [None, None, None]])
      tm.assert_index_equal(out, wanted)
      assert out.nlevels == 3

      # unequal number of pieces: short rows are padded with NA
      idx = Index(['some_unequal_splits',
                   'one_of_these_things_is_not',
                   np.nan, None])
      out = idx.str.split('_', expand=True)
      wanted = MultiIndex.from_tuples([
          ('some', 'unequal', 'splits', NA, NA, NA),
          ('one', 'of', 'these', 'things', 'is', 'not'),
          (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
          (None, None, None, None, None, None),
      ])
      tm.assert_index_equal(out, wanted)
      assert out.nlevels == 6

      # expand only accepts booleans
      with pytest.raises(ValueError, match="expand must be"):
          idx.str.split('_', expand="not_a_boolean")
  2175. def test_rsplit_to_dataframe_expand(self):
  2176. s = Series(['nosplit', 'alsonosplit'])
  2177. result = s.str.rsplit('_', expand=True)
  2178. exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
  2179. tm.assert_frame_equal(result, exp)
  2180. s = Series(['some_equal_splits', 'with_no_nans'])
  2181. result = s.str.rsplit('_', expand=True)
  2182. exp = DataFrame({0: ['some', 'with'],
  2183. 1: ['equal', 'no'],
  2184. 2: ['splits', 'nans']})
  2185. tm.assert_frame_equal(result, exp)
  2186. result = s.str.rsplit('_', expand=True, n=2)
  2187. exp = DataFrame({0: ['some', 'with'],
  2188. 1: ['equal', 'no'],
  2189. 2: ['splits', 'nans']})
  2190. tm.assert_frame_equal(result, exp)
  2191. result = s.str.rsplit('_', expand=True, n=1)
  2192. exp = DataFrame({0: ['some_equal', 'with_no'], 1: ['splits', 'nans']})
  2193. tm.assert_frame_equal(result, exp)
  2194. s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
  2195. result = s.str.rsplit('_', expand=True)
  2196. exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
  2197. index=['preserve', 'me'])
  2198. tm.assert_frame_equal(result, exp)
  2199. def test_rsplit_to_multiindex_expand(self):
  2200. idx = Index(['nosplit', 'alsonosplit'])
  2201. result = idx.str.rsplit('_', expand=True)
  2202. exp = idx
  2203. tm.assert_index_equal(result, exp)
  2204. assert result.nlevels == 1
  2205. idx = Index(['some_equal_splits', 'with_no_nans'])
  2206. result = idx.str.rsplit('_', expand=True)
  2207. exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), (
  2208. 'with', 'no', 'nans')])
  2209. tm.assert_index_equal(result, exp)
  2210. assert result.nlevels == 3
  2211. idx = Index(['some_equal_splits', 'with_no_nans'])
  2212. result = idx.str.rsplit('_', expand=True, n=1)
  2213. exp = MultiIndex.from_tuples([('some_equal', 'splits'),
  2214. ('with_no', 'nans')])
  2215. tm.assert_index_equal(result, exp)
  2216. assert result.nlevels == 2
  2217. def test_split_nan_expand(self):
  2218. # gh-18450
  2219. s = Series(["foo,bar,baz", NA])
  2220. result = s.str.split(",", expand=True)
  2221. exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]])
  2222. tm.assert_frame_equal(result, exp)
  2223. # check that these are actually np.nan and not None
  2224. # TODO see GH 18463
  2225. # tm.assert_frame_equal does not differentiate
  2226. assert all(np.isnan(x) for x in result.iloc[1])
  2227. def test_split_with_name(self):
  2228. # GH 12617
  2229. # should preserve name
  2230. s = Series(['a,b', 'c,d'], name='xxx')
  2231. res = s.str.split(',')
  2232. exp = Series([['a', 'b'], ['c', 'd']], name='xxx')
  2233. tm.assert_series_equal(res, exp)
  2234. res = s.str.split(',', expand=True)
  2235. exp = DataFrame([['a', 'b'], ['c', 'd']])
  2236. tm.assert_frame_equal(res, exp)
  2237. idx = Index(['a,b', 'c,d'], name='xxx')
  2238. res = idx.str.split(',')
  2239. exp = Index([['a', 'b'], ['c', 'd']], name='xxx')
  2240. assert res.nlevels == 1
  2241. tm.assert_index_equal(res, exp)
  2242. res = idx.str.split(',', expand=True)
  2243. exp = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')])
  2244. assert res.nlevels == 2
  2245. tm.assert_index_equal(res, exp)
  2246. def test_partition_series(self):
  2247. # https://github.com/pandas-dev/pandas/issues/23558
  2248. values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
  2249. result = values.str.partition('_', expand=False)
  2250. exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA,
  2251. ('f', '_', 'g_h'), None])
  2252. tm.assert_series_equal(result, exp)
  2253. result = values.str.rpartition('_', expand=False)
  2254. exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA,
  2255. ('f_g', '_', 'h'), None])
  2256. tm.assert_series_equal(result, exp)
  2257. # more than one char
  2258. values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None])
  2259. result = values.str.partition('__', expand=False)
  2260. exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA,
  2261. ('f', '__', 'g__h'), None])
  2262. tm.assert_series_equal(result, exp)
  2263. result = values.str.rpartition('__', expand=False)
  2264. exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA,
  2265. ('f__g', '__', 'h'), None])
  2266. tm.assert_series_equal(result, exp)
  2267. # None
  2268. values = Series(['a b c', 'c d e', NA, 'f g h', None])
  2269. result = values.str.partition(expand=False)
  2270. exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA,
  2271. ('f', ' ', 'g h'), None])
  2272. tm.assert_series_equal(result, exp)
  2273. result = values.str.rpartition(expand=False)
  2274. exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA,
  2275. ('f g', ' ', 'h'), None])
  2276. tm.assert_series_equal(result, exp)
  2277. # Not split
  2278. values = Series(['abc', 'cde', NA, 'fgh', None])
  2279. result = values.str.partition('_', expand=False)
  2280. exp = Series([('abc', '', ''), ('cde', '', ''), NA,
  2281. ('fgh', '', ''), None])
  2282. tm.assert_series_equal(result, exp)
  2283. result = values.str.rpartition('_', expand=False)
  2284. exp = Series([('', '', 'abc'), ('', '', 'cde'), NA,
  2285. ('', '', 'fgh'), None])
  2286. tm.assert_series_equal(result, exp)
  2287. # unicode
  2288. values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h'])
  2289. result = values.str.partition('_', expand=False)
  2290. exp = Series([(u'a', u'_', u'b_c'), (u'c', u'_', u'd_e'),
  2291. NA, (u'f', u'_', u'g_h')])
  2292. tm.assert_series_equal(result, exp)
  2293. result = values.str.rpartition('_', expand=False)
  2294. exp = Series([(u'a_b', u'_', u'c'), (u'c_d', u'_', u'e'),
  2295. NA, (u'f_g', u'_', u'h')])
  2296. tm.assert_series_equal(result, exp)
  2297. # compare to standard lib
  2298. values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
  2299. result = values.str.partition('_', expand=False).tolist()
  2300. assert result == [v.partition('_') for v in values]
  2301. result = values.str.rpartition('_', expand=False).tolist()
  2302. assert result == [v.rpartition('_') for v in values]
  2303. def test_partition_index(self):
  2304. # https://github.com/pandas-dev/pandas/issues/23558
  2305. values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None])
  2306. result = values.str.partition('_', expand=False)
  2307. exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'),
  2308. ('f', '_', 'g_h'), np.nan, None]))
  2309. tm.assert_index_equal(result, exp)
  2310. assert result.nlevels == 1
  2311. result = values.str.rpartition('_', expand=False)
  2312. exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'),
  2313. ('f_g', '_', 'h'), np.nan, None]))
  2314. tm.assert_index_equal(result, exp)
  2315. assert result.nlevels == 1
  2316. result = values.str.partition('_')
  2317. exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'),
  2318. ('f', '_', 'g_h'), (np.nan, np.nan, np.nan),
  2319. (None, None, None)])
  2320. tm.assert_index_equal(result, exp)
  2321. assert isinstance(result, MultiIndex)
  2322. assert result.nlevels == 3
  2323. result = values.str.rpartition('_')
  2324. exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'),
  2325. ('f_g', '_', 'h'), (np.nan, np.nan, np.nan),
  2326. (None, None, None)])
  2327. tm.assert_index_equal(result, exp)
  2328. assert isinstance(result, MultiIndex)
  2329. assert result.nlevels == 3
  2330. def test_partition_to_dataframe(self):
  2331. # https://github.com/pandas-dev/pandas/issues/23558
  2332. values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
  2333. result = values.str.partition('_')
  2334. exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
  2335. 1: ['_', '_', np.nan, '_', None],
  2336. 2: ['b_c', 'd_e', np.nan, 'g_h', None]})
  2337. tm.assert_frame_equal(result, exp)
  2338. result = values.str.rpartition('_')
  2339. exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
  2340. 1: ['_', '_', np.nan, '_', None],
  2341. 2: ['c', 'e', np.nan, 'h', None]})
  2342. tm.assert_frame_equal(result, exp)
  2343. values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
  2344. result = values.str.partition('_', expand=True)
  2345. exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
  2346. 1: ['_', '_', np.nan, '_', None],
  2347. 2: ['b_c', 'd_e', np.nan, 'g_h', None]})
  2348. tm.assert_frame_equal(result, exp)
  2349. result = values.str.rpartition('_', expand=True)
  2350. exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
  2351. 1: ['_', '_', np.nan, '_', None],
  2352. 2: ['c', 'e', np.nan, 'h', None]})
  2353. tm.assert_frame_equal(result, exp)
  2354. def test_partition_with_name(self):
  2355. # GH 12617
  2356. s = Series(['a,b', 'c,d'], name='xxx')
  2357. res = s.str.partition(',')
  2358. exp = DataFrame({0: ['a', 'c'], 1: [',', ','], 2: ['b', 'd']})
  2359. tm.assert_frame_equal(res, exp)
  2360. # should preserve name
  2361. res = s.str.partition(',', expand=False)
  2362. exp = Series([('a', ',', 'b'), ('c', ',', 'd')], name='xxx')
  2363. tm.assert_series_equal(res, exp)
  2364. idx = Index(['a,b', 'c,d'], name='xxx')
  2365. res = idx.str.partition(',')
  2366. exp = MultiIndex.from_tuples([('a', ',', 'b'), ('c', ',', 'd')])
  2367. assert res.nlevels == 3
  2368. tm.assert_index_equal(res, exp)
  2369. # should preserve name
  2370. res = idx.str.partition(',', expand=False)
  2371. exp = Index(np.array([('a', ',', 'b'), ('c', ',', 'd')]), name='xxx')
  2372. assert res.nlevels == 1
  2373. tm.assert_index_equal(res, exp)
  2374. def test_partition_deprecation(self):
  2375. # GH 22676; depr kwarg "pat" in favor of "sep"
  2376. values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
  2377. # str.partition
  2378. # using sep -> no warning
  2379. expected = values.str.partition(sep='_')
  2380. with tm.assert_produces_warning(FutureWarning):
  2381. result = values.str.partition(pat='_')
  2382. tm.assert_frame_equal(result, expected)
  2383. # str.rpartition
  2384. # using sep -> no warning
  2385. expected = values.str.rpartition(sep='_')
  2386. with tm.assert_produces_warning(FutureWarning):
  2387. result = values.str.rpartition(pat='_')
  2388. tm.assert_frame_equal(result, expected)
  2389. def test_pipe_failures(self):
  2390. # #2119
  2391. s = Series(['A|B|C'])
  2392. result = s.str.split('|')
  2393. exp = Series([['A', 'B', 'C']])
  2394. tm.assert_series_equal(result, exp)
  2395. result = s.str.replace('|', ' ')
  2396. exp = Series(['A B C'])
  2397. tm.assert_series_equal(result, exp)
  2398. def test_slice(self):
  2399. values = Series(['aafootwo', 'aabartwo', NA, 'aabazqux'])
  2400. result = values.str.slice(2, 5)
  2401. exp = Series(['foo', 'bar', NA, 'baz'])
  2402. tm.assert_series_equal(result, exp)
  2403. for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2),
  2404. (3, 0, -1)]:
  2405. try:
  2406. result = values.str.slice(start, stop, step)
  2407. expected = Series([s[start:stop:step] if not isna(s) else NA
  2408. for s in values])
  2409. tm.assert_series_equal(result, expected)
  2410. except IndexError:
  2411. print('failed on %s:%s:%s' % (start, stop, step))
  2412. raise
  2413. # mixed
  2414. mixed = Series(['aafootwo', NA, 'aabartwo', True, datetime.today(),
  2415. None, 1, 2.])
  2416. rs = Series(mixed).str.slice(2, 5)
  2417. xp = Series(['foo', NA, 'bar', NA, NA, NA, NA, NA])
  2418. assert isinstance(rs, Series)
  2419. tm.assert_almost_equal(rs, xp)
  2420. rs = Series(mixed).str.slice(2, 5, -1)
  2421. xp = Series(['oof', NA, 'rab', NA, NA, NA, NA, NA])
  2422. # unicode
  2423. values = Series([u('aafootwo'), u('aabartwo'), NA, u('aabazqux')])
  2424. result = values.str.slice(2, 5)
  2425. exp = Series([u('foo'), u('bar'), NA, u('baz')])
  2426. tm.assert_series_equal(result, exp)
  2427. result = values.str.slice(0, -1, 2)
  2428. exp = Series([u('afow'), u('abrw'), NA, u('abzu')])
  2429. tm.assert_series_equal(result, exp)
  2430. def test_slice_replace(self):
  2431. values = Series(['short', 'a bit longer', 'evenlongerthanthat', '', NA
  2432. ])
  2433. exp = Series(['shrt', 'a it longer', 'evnlongerthanthat', '', NA])
  2434. result = values.str.slice_replace(2, 3)
  2435. tm.assert_series_equal(result, exp)
  2436. exp = Series(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z', NA])
  2437. result = values.str.slice_replace(2, 3, 'z')
  2438. tm.assert_series_equal(result, exp)
  2439. exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA
  2440. ])
  2441. result = values.str.slice_replace(2, 2, 'z')
  2442. tm.assert_series_equal(result, exp)
  2443. exp = Series(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z', NA
  2444. ])
  2445. result = values.str.slice_replace(2, 1, 'z')
  2446. tm.assert_series_equal(result, exp)
  2447. exp = Series(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z', NA])
  2448. result = values.str.slice_replace(-1, None, 'z')
  2449. tm.assert_series_equal(result, exp)
  2450. exp = Series(['zrt', 'zer', 'zat', 'z', NA])
  2451. result = values.str.slice_replace(None, -2, 'z')
  2452. tm.assert_series_equal(result, exp)
  2453. exp = Series(['shortz', 'a bit znger', 'evenlozerthanthat', 'z', NA])
  2454. result = values.str.slice_replace(6, 8, 'z')
  2455. tm.assert_series_equal(result, exp)
  2456. exp = Series(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z', NA])
  2457. result = values.str.slice_replace(-10, 3, 'z')
  2458. tm.assert_series_equal(result, exp)
  2459. def test_strip_lstrip_rstrip(self):
  2460. values = Series([' aa ', ' bb \n', NA, 'cc '])
  2461. result = values.str.strip()
  2462. exp = Series(['aa', 'bb', NA, 'cc'])
  2463. tm.assert_series_equal(result, exp)
  2464. result = values.str.lstrip()
  2465. exp = Series(['aa ', 'bb \n', NA, 'cc '])
  2466. tm.assert_series_equal(result, exp)
  2467. result = values.str.rstrip()
  2468. exp = Series([' aa', ' bb', NA, 'cc'])
  2469. tm.assert_series_equal(result, exp)
  2470. def test_strip_lstrip_rstrip_mixed(self):
  2471. # mixed
  2472. mixed = Series([' aa ', NA, ' bb \t\n', True, datetime.today(), None,
  2473. 1, 2.])
  2474. rs = Series(mixed).str.strip()
  2475. xp = Series(['aa', NA, 'bb', NA, NA, NA, NA, NA])
  2476. assert isinstance(rs, Series)
  2477. tm.assert_almost_equal(rs, xp)
  2478. rs = Series(mixed).str.lstrip()
  2479. xp = Series(['aa ', NA, 'bb \t\n', NA, NA, NA, NA, NA])
  2480. assert isinstance(rs, Series)
  2481. tm.assert_almost_equal(rs, xp)
  2482. rs = Series(mixed).str.rstrip()
  2483. xp = Series([' aa', NA, ' bb', NA, NA, NA, NA, NA])
  2484. assert isinstance(rs, Series)
  2485. tm.assert_almost_equal(rs, xp)
  2486. def test_strip_lstrip_rstrip_unicode(self):
  2487. # unicode
  2488. values = Series([u(' aa '), u(' bb \n'), NA, u('cc ')])
  2489. result = values.str.strip()
  2490. exp = Series([u('aa'), u('bb'), NA, u('cc')])
  2491. tm.assert_series_equal(result, exp)
  2492. result = values.str.lstrip()
  2493. exp = Series([u('aa '), u('bb \n'), NA, u('cc ')])
  2494. tm.assert_series_equal(result, exp)
  2495. result = values.str.rstrip()
  2496. exp = Series([u(' aa'), u(' bb'), NA, u('cc')])
  2497. tm.assert_series_equal(result, exp)
  2498. def test_strip_lstrip_rstrip_args(self):
  2499. values = Series(['xxABCxx', 'xx BNSD', 'LDFJH xx'])
  2500. rs = values.str.strip('x')
  2501. xp = Series(['ABC', ' BNSD', 'LDFJH '])
  2502. assert_series_equal(rs, xp)
  2503. rs = values.str.lstrip('x')
  2504. xp = Series(['ABCxx', ' BNSD', 'LDFJH xx'])
  2505. assert_series_equal(rs, xp)
  2506. rs = values.str.rstrip('x')
  2507. xp = Series(['xxABC', 'xx BNSD', 'LDFJH '])
  2508. assert_series_equal(rs, xp)
  2509. def test_strip_lstrip_rstrip_args_unicode(self):
  2510. values = Series([u('xxABCxx'), u('xx BNSD'), u('LDFJH xx')])
  2511. rs = values.str.strip(u('x'))
  2512. xp = Series(['ABC', ' BNSD', 'LDFJH '])
  2513. assert_series_equal(rs, xp)
  2514. rs = values.str.lstrip(u('x'))
  2515. xp = Series(['ABCxx', ' BNSD', 'LDFJH xx'])
  2516. assert_series_equal(rs, xp)
  2517. rs = values.str.rstrip(u('x'))
  2518. xp = Series(['xxABC', 'xx BNSD', 'LDFJH '])
  2519. assert_series_equal(rs, xp)
  2520. def test_wrap(self):
  2521. # test values are: two words less than width, two words equal to width,
  2522. # two words greater than width, one word less than width, one word
  2523. # equal to width, one word greater than width, multiple tokens with
  2524. # trailing whitespace equal to width
  2525. values = Series([u('hello world'), u('hello world!'), u(
  2526. 'hello world!!'), u('abcdefabcde'), u('abcdefabcdef'), u(
  2527. 'abcdefabcdefa'), u('ab ab ab ab '), u('ab ab ab ab a'), u(
  2528. '\t')])
  2529. # expected values
  2530. xp = Series([u('hello world'), u('hello world!'), u('hello\nworld!!'),
  2531. u('abcdefabcde'), u('abcdefabcdef'), u('abcdefabcdef\na'),
  2532. u('ab ab ab ab'), u('ab ab ab ab\na'), u('')])
  2533. rs = values.str.wrap(12, break_long_words=True)
  2534. assert_series_equal(rs, xp)
  2535. # test with pre and post whitespace (non-unicode), NaN, and non-ascii
  2536. # Unicode
  2537. values = Series([' pre ', np.nan, u('\xac\u20ac\U00008000 abadcafe')
  2538. ])
  2539. xp = Series([' pre', NA, u('\xac\u20ac\U00008000 ab\nadcafe')])
  2540. rs = values.str.wrap(6)
  2541. assert_series_equal(rs, xp)
  2542. def test_get(self):
  2543. values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
  2544. result = values.str.split('_').str.get(1)
  2545. expected = Series(['b', 'd', np.nan, 'g'])
  2546. tm.assert_series_equal(result, expected)
  2547. # mixed
  2548. mixed = Series(['a_b_c', NA, 'c_d_e', True, datetime.today(), None, 1,
  2549. 2.])
  2550. rs = Series(mixed).str.split('_').str.get(1)
  2551. xp = Series(['b', NA, 'd', NA, NA, NA, NA, NA])
  2552. assert isinstance(rs, Series)
  2553. tm.assert_almost_equal(rs, xp)
  2554. # unicode
  2555. values = Series([u('a_b_c'), u('c_d_e'), np.nan, u('f_g_h')])
  2556. result = values.str.split('_').str.get(1)
  2557. expected = Series([u('b'), u('d'), np.nan, u('g')])
  2558. tm.assert_series_equal(result, expected)
  2559. # bounds testing
  2560. values = Series(['1_2_3_4_5', '6_7_8_9_10', '11_12'])
  2561. # positive index
  2562. result = values.str.split('_').str.get(2)
  2563. expected = Series(['3', '8', np.nan])
  2564. tm.assert_series_equal(result, expected)
  2565. # negative index
  2566. result = values.str.split('_').str.get(-3)
  2567. expected = Series(['3', '8', np.nan])
  2568. tm.assert_series_equal(result, expected)
  2569. def test_get_complex(self):
  2570. # GH 20671, getting value not in dict raising `KeyError`
  2571. values = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3},
  2572. {1: 'a', 2: 'b', 3: 'c'}])
  2573. result = values.str.get(1)
  2574. expected = Series([2, 2, np.nan, 'a'])
  2575. tm.assert_series_equal(result, expected)
  2576. result = values.str.get(-1)
  2577. expected = Series([3, 3, np.nan, np.nan])
  2578. tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize('to_type', [tuple, list, np.array])
  def test_get_complex_nested(self, to_type):
      """``str.get`` on a container nested inside the same container type."""
      inner = [1, 2]
      nested = Series([to_type([to_type(inner)])])

      # position 0 holds the inner container
      tm.assert_series_equal(nested.str.get(0), Series([to_type(inner)]))

      # the outer container has a single element, so position 1 is missing
      tm.assert_series_equal(nested.str.get(1), Series([np.nan]))
  2588. def test_contains_moar(self):
  2589. # PR #1179
  2590. s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA,
  2591. 'CABA', 'dog', 'cat'])
  2592. result = s.str.contains('a')
  2593. expected = Series([False, False, False, True, True, False, np.nan,
  2594. False, False, True])
  2595. assert_series_equal(result, expected)
  2596. result = s.str.contains('a', case=False)
  2597. expected = Series([True, False, False, True, True, False, np.nan, True,
  2598. False, True])
  2599. assert_series_equal(result, expected)
  2600. result = s.str.contains('Aa')
  2601. expected = Series([False, False, False, True, False, False, np.nan,
  2602. False, False, False])
  2603. assert_series_equal(result, expected)
  2604. result = s.str.contains('ba')
  2605. expected = Series([False, False, False, True, False, False, np.nan,
  2606. False, False, False])
  2607. assert_series_equal(result, expected)
  2608. result = s.str.contains('ba', case=False)
  2609. expected = Series([False, False, False, True, True, False, np.nan,
  2610. True, False, False])
  2611. assert_series_equal(result, expected)
  2612. def test_contains_nan(self):
  2613. # PR #14171
  2614. s = Series([np.nan, np.nan, np.nan], dtype=np.object_)
  2615. result = s.str.contains('foo', na=False)
  2616. expected = Series([False, False, False], dtype=np.bool_)
  2617. assert_series_equal(result, expected)
  2618. result = s.str.contains('foo', na=True)
  2619. expected = Series([True, True, True], dtype=np.bool_)
  2620. assert_series_equal(result, expected)
  2621. result = s.str.contains('foo', na="foo")
  2622. expected = Series(["foo", "foo", "foo"], dtype=np.object_)
  2623. assert_series_equal(result, expected)
  2624. result = s.str.contains('foo')
  2625. expected = Series([np.nan, np.nan, np.nan], dtype=np.object_)
  2626. assert_series_equal(result, expected)
  2627. def test_replace_moar(self):
  2628. # PR #1179
  2629. s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA',
  2630. 'dog', 'cat'])
  2631. result = s.str.replace('A', 'YYY')
  2632. expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA,
  2633. 'CYYYBYYY', 'dog', 'cat'])
  2634. assert_series_equal(result, expected)
  2635. result = s.str.replace('A', 'YYY', case=False)
  2636. expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA,
  2637. 'CYYYBYYY', 'dog', 'cYYYt'])
  2638. assert_series_equal(result, expected)
  2639. result = s.str.replace('^.a|dog', 'XX-XX ', case=False)
  2640. expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA,
  2641. 'XX-XX BA', 'XX-XX ', 'XX-XX t'])
  2642. assert_series_equal(result, expected)
  2643. def test_string_slice_get_syntax(self):
  2644. s = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', NA, 'CYYYBYYY',
  2645. 'dog', 'cYYYt'])
  2646. result = s.str[0]
  2647. expected = s.str.get(0)
  2648. assert_series_equal(result, expected)
  2649. result = s.str[:3]
  2650. expected = s.str.slice(stop=3)
  2651. assert_series_equal(result, expected)
  2652. result = s.str[2::-1]
  2653. expected = s.str.slice(start=2, step=-1)
  2654. assert_series_equal(result, expected)
  2655. def test_string_slice_out_of_bounds(self):
  2656. s = Series([(1, 2), (1, ), (3, 4, 5)])
  2657. result = s.str[1]
  2658. expected = Series([2, np.nan, 4])
  2659. assert_series_equal(result, expected)
  2660. s = Series(['foo', 'b', 'ba'])
  2661. result = s.str[1]
  2662. expected = Series(['o', np.nan, 'a'])
  2663. assert_series_equal(result, expected)
  2664. def test_match_findall_flags(self):
  2665. data = {'Dave': 'dave@google.com',
  2666. 'Steve': 'steve@gmail.com',
  2667. 'Rob': 'rob@gmail.com',
  2668. 'Wes': np.nan}
  2669. data = Series(data)
  2670. pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
  2671. result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
  2672. assert result.iloc[0].tolist() == ['dave', 'google', 'com']
  2673. result = data.str.match(pat, flags=re.IGNORECASE)
  2674. assert result[0]
  2675. result = data.str.findall(pat, flags=re.IGNORECASE)
  2676. assert result[0][0] == ('dave', 'google', 'com')
  2677. result = data.str.count(pat, flags=re.IGNORECASE)
  2678. assert result[0] == 1
  2679. with tm.assert_produces_warning(UserWarning):
  2680. result = data.str.contains(pat, flags=re.IGNORECASE)
  2681. assert result[0]
  2682. def test_encode_decode(self):
  2683. base = Series([u('a'), u('b'), u('a\xe4')])
  2684. series = base.str.encode('utf-8')
  2685. f = lambda x: x.decode('utf-8')
  2686. result = series.str.decode('utf-8')
  2687. exp = series.map(f)
  2688. tm.assert_series_equal(result, exp)
  def test_encode_decode_errors(self):
      """The ``errors`` argument of ``str.encode``/``str.decode``."""
      # encoding: the default error handling raises, 'ignore' does not
      text = Series([u('a'), u('b'), u('a\x9d')])
      with pytest.raises(UnicodeEncodeError):
          text.str.encode('cp1252')

      lenient = text.str.encode('cp1252', 'ignore')
      by_hand = text.map(lambda s: s.encode('cp1252', 'ignore'))
      tm.assert_series_equal(lenient, by_hand)

      # decoding: same pattern in the other direction
      raw = Series([b'a', b'b', b'a\x9d'])
      with pytest.raises(UnicodeDecodeError):
          raw.str.decode('cp1252')

      lenient = raw.str.decode('cp1252', 'ignore')
      by_hand = raw.map(lambda b: b.decode('cp1252', 'ignore'))
      tm.assert_series_equal(lenient, by_hand)
  def test_normalize(self):
      """Check ``str.normalize`` on a Series and on an Index.

      The inputs contain compatibility characters (full-width Latin letters
      and digits, half-width katakana).  NFKC folds them into their plain
      counterparts while NFC leaves them unchanged, so the two forms give
      different answers and an implementation that does nothing cannot
      pass.  The characters are spelled as unicode escapes so that tools
      which normalize the source text cannot turn the inputs into their
      already-normalized forms.
      """
      fullwidth_abc = u'\uff21\uff22\uff23'    # full-width "ABC"
      fullwidth_123 = u'\uff11\uff12\uff13'    # full-width "123"
      halfwidth_kana = u'\uff71\uff72\uff74'   # half-width katakana a-i-e
      katakana = u'\u30a2\u30a4\u30a8'         # regular katakana a-i-e

      values = ['ABC', fullwidth_abc, fullwidth_123, np.nan, halfwidth_kana]
      s = Series(values, index=['a', 'b', 'c', 'd', 'e'])

      # NFKC replaces compatibility characters; NaN passes through
      normed = [u'ABC', u'ABC', u'123', np.nan, katakana]
      expected = Series(normed, index=['a', 'b', 'c', 'd', 'e'])
      result = s.str.normalize('NFKC')
      tm.assert_series_equal(result, expected)

      # NFC performs no compatibility mapping: the input must survive
      expected = Series([u'ABC', fullwidth_abc, fullwidth_123, np.nan,
                         halfwidth_kana],
                        index=['a', 'b', 'c', 'd', 'e'])
      result = s.str.normalize('NFC')
      tm.assert_series_equal(result, expected)

      with pytest.raises(ValueError, match="invalid normalization form"):
          s.str.normalize('xxx')

      s = Index([fullwidth_abc, fullwidth_123, halfwidth_kana])
      expected = Index([u'ABC', u'123', katakana])
      result = s.str.normalize('NFKC')
      tm.assert_index_equal(result, expected)
  def test_index_str_accessor_visibility(self):
      """``.str`` exists for string-like values and raises otherwise.

      For every accepted input the accessor must be a ``StringMethods``
      instance on both Series and Index, and the Index must report the
      listed ``inferred_type``.  Numeric, datetime and timedelta data, as
      well as a MultiIndex, must raise ``AttributeError`` instead.
      """
      from pandas.core.strings import StringMethods

      # Only the pure-text inputs are inferred differently on Python 2 and
      # Python 3; the remaining cases are shared.
      if not compat.PY3:
          text_cases = [(['a', 'b'], 'string'),
                        (['a', u('b')], 'mixed'),
                        ([u('a'), u('b')], 'unicode')]
      else:
          text_cases = [(['a', 'b'], 'string'),
                        (['a', u('b')], 'string'),
                        ([u('a'), u('b')], 'string')]
      cases = text_cases + [(['a', 'b', 1], 'mixed-integer'),
                            (['a', 'b', 1.3], 'mixed'),
                            (['a', 'b', 1.3, 1], 'mixed-integer'),
                            (['aa', datetime(2011, 1, 1)], 'mixed')]

      # This loop used to appear twice, word for word; one pass suffices.
      for values, tp in cases:
          idx = Index(values)
          assert isinstance(Series(values).str, StringMethods)
          assert isinstance(idx.str, StringMethods)
          assert idx.inferred_type == tp

      cases = [([1, np.nan], 'floating'),
               ([datetime(2011, 1, 1)], 'datetime64'),
               ([timedelta(1)], 'timedelta64')]
      message = 'Can only use .str accessor with string values'
      for values, tp in cases:
          idx = Index(values)
          with pytest.raises(AttributeError, match=message):
              Series(values).str
          with pytest.raises(AttributeError, match=message):
              idx.str
          assert idx.inferred_type == tp

      # MultiIndex has mixed dtype, but not allow to use accessor
      idx = MultiIndex.from_tuples([('a', 'b'), ('a', 'b')])
      assert idx.inferred_type == 'mixed'
      message = 'Can only use .str accessor with Index, not MultiIndex'
      with pytest.raises(AttributeError, match=message):
          idx.str
  def test_str_accessor_no_new_attributes(self):
      """Setting an unknown attribute on ``.str`` raises AttributeError."""
      # https://github.com/pandas-dev/pandas/issues/10673
      letters = Series(list('aabbcde'))
      expected_message = "You cannot add any new attribute"
      with pytest.raises(AttributeError, match=expected_message):
          letters.str.xlabel = "a"
  def test_method_on_bytes(self):
      """``str.cat`` on byte strings: TypeError on Python 3, joins on 2."""
      left = Series(np.array(list('abc'), 'S1').astype(object))
      right = Series(np.array(list('def'), 'S1').astype(object))

      if not compat.PY3:
          # Python 2 branch: the elements are concatenated pairwise
          joined = left.str.cat(right)
          wanted = Series(np.array(
              ['ad', 'be', 'cf'], 'S2').astype(object))
          tm.assert_series_equal(joined, wanted)
      else:
          with pytest.raises(TypeError):
              left.str.cat(right)