strings.py

# -*- coding: utf-8 -*-
import codecs
import re
import textwrap
import warnings

import numpy as np

import pandas._libs.lib as lib
import pandas._libs.ops as libops
import pandas.compat as compat
from pandas.compat import zip
from pandas.util._decorators import Appender, deprecate_kwarg

from pandas.core.dtypes.common import (
    ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
    is_list_like, is_object_dtype, is_re, is_scalar, is_string_like)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

from pandas.core.algorithms import take_1d
from pandas.core.base import NoNewAttributesMixin
import pandas.core.common as com

_cpython_optimized_encoders = (
    "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
)
_cpython_optimized_decoders = _cpython_optimized_encoders + (
    "utf-16", "utf-32"
)

_shared_docs = dict()


def cat_core(list_of_columns, sep):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep
    """
    list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
    list_with_sep[::2] = list_of_columns
    return np.sum(list_with_sep, axis=0)
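
# Illustrative sketch, not part of the original module: cat_core builds a
# list that interleaves the separator between the column arrays
# ([col0, sep, col1, sep, col2, ...]) and then sums it, which for
# object-dtype string arrays is element-wise concatenation. Assuming two
# small object arrays without NaNs, as the docstring requires:
#
# >>> cat_core([np.array(['a', 'b'], dtype=object),
# ...           np.array(['x', 'y'], dtype=object)], '-')
# array(['a-x', 'b-y'], dtype=object)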


def _na_map(f, arr, na_result=np.nan, dtype=object):
    # should really _check_ for NA
    return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)


def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
    if not len(arr):
        return np.ndarray(0, dtype=dtype)

    if isinstance(arr, ABCSeries):
        arr = arr.values
    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)
    if na_mask:
        mask = isna(arr)
        try:
            convert = not all(mask)
            result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
        except (TypeError, AttributeError) as e:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            if compat.PY2:
                p_err = r'takes (no|(exactly|at (least|most)) ?\d+) arguments?'
            else:
                p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
                         r'(?(3)required )positional arguments?')

            if len(e.args) >= 1 and re.search(p_err, e.args[0]):
                raise e

            def g(x):
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return _map(g, arr, dtype=dtype)
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result
    else:
        return lib.map_infer(arr, f)
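
# Note, not part of the original module: with na_mask=True (the _na_map path
# used throughout this file), `f` is applied only to the non-missing entries
# via lib.map_infer_mask; positions that are NaN in the input stay NaN in the
# output unless an explicit `na_value` is given, in which case np.putmask
# fills them with that value.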


def str_count(arr, pat, flags=0):
    """
    Count occurrences of pattern in each string of the Series/Index.

    This function is used to count the number of times a particular regex
    pattern is repeated in each of the string elements of the
    :class:`~pandas.Series`.

    Parameters
    ----------
    pat : str
        Valid regular expression.
    flags : int, default 0, meaning no flags
        Flags for the `re` module. For a complete list, `see here
        <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
    **kwargs
        For compatibility with other string methods. Not used.

    Returns
    -------
    counts : Series or Index
        Same type as the calling object containing the integer counts.

    See Also
    --------
    re : Standard library module for regular expressions.
    str.count : Standard library version, without regular expression support.

    Notes
    -----
    Some characters need to be escaped when passing in `pat`.
    e.g. ``'$'`` has a special meaning in regex and must be escaped when
    finding this literal character.

    Examples
    --------
    >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
    >>> s.str.count('a')
    0    0.0
    1    0.0
    2    2.0
    3    2.0
    4    NaN
    5    0.0
    6    1.0
    dtype: float64

    Escape ``'$'`` to find the literal dollar sign.

    >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
    >>> s.str.count('\\$')
    0    1
    1    0
    2    1
    3    2
    4    2
    5    0
    dtype: int64

    This is also available on Index

    >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
    Int64Index([0, 0, 2, 1], dtype='int64')
    """
    regex = re.compile(pat, flags=flags)
    f = lambda x: len(regex.findall(x))
    return _na_map(f, arr, dtype=int)


def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
    """
    Test if pattern or regex is contained within a string of a Series or Index.

    Return boolean Series or Index based on whether a given pattern or regex is
    contained within a string of a Series or Index.

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : default NaN
        Fill value for missing values.
    regex : bool, default True
        If True, assumes the pat is a regular expression.

        If False, treats the pat as a literal string.

    Returns
    -------
    Series or Index of boolean values
        A Series or Index of boolean values indicating whether the
        given pattern is contained within the string of each element
        of the Series or Index.

    See Also
    --------
    match : Analogous, but stricter, relying on re.match instead of re.search.
    Series.str.startswith : Test if the start of each string element matches a
        pattern.
    Series.str.endswith : Same as startswith, but tests the end of string.

    Examples
    --------
    Returning a Series of booleans using only a literal pattern.

    >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
    >>> s1.str.contains('og', regex=False)
    0    False
    1     True
    2    False
    3    False
    4      NaN
    dtype: object

    Returning an Index of booleans using only a literal pattern.

    >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
    >>> ind.str.contains('23', regex=False)
    Index([False, False, False, True, nan], dtype='object')

    Specifying case sensitivity using `case`.

    >>> s1.str.contains('oG', case=True, regex=True)
    0    False
    1    False
    2    False
    3    False
    4      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN` replaces NaN values
    with `False`. If the Series or Index does not contain NaN values,
    the resultant dtype will be `bool`; otherwise, an `object` dtype.

    >>> s1.str.contains('og', na=False, regex=True)
    0    False
    1     True
    2    False
    3    False
    4    False
    dtype: bool

    Returning 'house' or 'dog' when either expression occurs in a string.

    >>> s1.str.contains('house|dog', regex=True)
    0    False
    1     True
    2     True
    3    False
    4      NaN
    dtype: object

    Ignoring case sensitivity using `flags` with regex.

    >>> import re
    >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
    0    False
    1    False
    2     True
    3    False
    4      NaN
    dtype: object

    Returning any digit using regular expression.

    >>> s1.str.contains('\\d', regex=True)
    0    False
    1    False
    2    False
    3     True
    4      NaN
    dtype: object

    Ensure `pat` is not a literal pattern when `regex` is set to True.
    Note in the following example one might expect only `s2[1]` and `s2[3]` to
    return `True`. However, '.0' as a regex matches any character
    followed by a 0.

    >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
    >>> s2.str.contains('.0', regex=True)
    0     True
    1     True
    2    False
    3     True
    4    False
    dtype: bool
    """
    if regex:
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        if regex.groups > 0:
            warnings.warn("This pattern has match groups. To actually get the"
                          " groups, use str.extract.", UserWarning,
                          stacklevel=3)

        f = lambda x: bool(regex.search(x))
    else:
        if case:
            f = lambda x: pat in x
        else:
            upper_pat = pat.upper()
            f = lambda x: upper_pat in x
            uppered = _na_map(lambda x: x.upper(), arr)
            return _na_map(f, uppered, na, dtype=bool)
    return _na_map(f, arr, na, dtype=bool)


def str_startswith(arr, pat, na=np.nan):
    """
    Test if the start of each string element matches a pattern.

    Equivalent to :meth:`str.startswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the start of each string element.

    See Also
    --------
    str.startswith : Python standard library string method.
    Series.str.endswith : Same as startswith, but tests the end of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
    >>> s
    0     bat
    1    Bear
    2     cat
    3     NaN
    dtype: object

    >>> s.str.startswith('b')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.startswith('b', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.startswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_endswith(arr, pat, na=np.nan):
    """
    Test if the end of each string element matches a pattern.

    Equivalent to :meth:`str.endswith`.

    Parameters
    ----------
    pat : str
        Character sequence. Regular expressions are not accepted.
    na : object, default NaN
        Object shown if element tested is not a string.

    Returns
    -------
    Series or Index of bool
        A Series of booleans indicating whether the given pattern matches
        the end of each string element.

    See Also
    --------
    str.endswith : Python standard library string method.
    Series.str.startswith : Same as endswith, but tests the start of string.
    Series.str.contains : Tests if string element contains a pattern.

    Examples
    --------
    >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
    >>> s
    0     bat
    1    bear
    2     caT
    3     NaN
    dtype: object

    >>> s.str.endswith('t')
    0     True
    1    False
    2    False
    3      NaN
    dtype: object

    Specifying `na` to be `False` instead of `NaN`.

    >>> s.str.endswith('t', na=False)
    0     True
    1    False
    2    False
    3    False
    dtype: bool
    """
    f = lambda x: x.endswith(pat)
    return _na_map(f, arr, na, dtype=bool)


def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
    r"""
    Replace occurrences of pattern/regex in the Series/Index with
    some other string. Equivalent to :meth:`str.replace` or
    :func:`re.sub`.

    Parameters
    ----------
    pat : string or compiled regex
        String can be a character sequence or regular expression.

        .. versionadded:: 0.20.0
            `pat` also accepts a compiled regex.

    repl : string or callable
        Replacement string or a callable. The callable is passed the regex
        match object and must return a replacement string to be used.
        See :func:`re.sub`.

        .. versionadded:: 0.20.0
            `repl` also accepts a callable.

    n : int, default -1 (all)
        Number of replacements to make from start
    case : boolean, default None
        - If True, case sensitive (the default if `pat` is a string)
        - Set to False for case insensitive
        - Cannot be set if `pat` is a compiled regex
    flags : int, default 0 (no flags)
        - re module flags, e.g. re.IGNORECASE
        - Cannot be set if `pat` is a compiled regex
    regex : boolean, default True
        - If True, assumes the passed-in pattern is a regular expression.
        - If False, treats the pattern as a literal string
        - Cannot be set to False if `pat` is a compiled regex or `repl` is
          a callable.

        .. versionadded:: 0.23.0

    Returns
    -------
    Series or Index of object
        A copy of the object with all matching occurrences of `pat` replaced by
        `repl`.

    Raises
    ------
    ValueError
        * if `regex` is False and `repl` is a callable or `pat` is a compiled
          regex
        * if `pat` is a compiled regex and `case` or `flags` is set

    Notes
    -----
    When `pat` is a compiled regex, all flags should be included in the
    compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
    regex will raise an error.

    Examples
    --------
    When `pat` is a string and `regex` is True (the default), the given `pat`
    is compiled as a regex. When `repl` is a string, it replaces matching
    regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
    left as is:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
    0    bao
    1    baz
    2    NaN
    dtype: object

    When `pat` is a string and `regex` is False, every `pat` is replaced with
    `repl` as with :meth:`str.replace`:

    >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
    0    bao
    1    fuz
    2    NaN
    dtype: object

    When `repl` is a callable, it is called on every `pat` using
    :func:`re.sub`. The callable should expect one positional argument
    (a regex object) and return a string.

    To get the idea:

    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
    0    <_sre.SRE_Match object; span=(0, 1), match='f'>oo
    1    <_sre.SRE_Match object; span=(0, 1), match='f'>uz
    2                                                  NaN
    dtype: object

    Reverse every lowercase alphabetic word:

    >>> repl = lambda m: m.group(0)[::-1]
    >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
    0    oof 123
    1    rab zab
    2        NaN
    dtype: object

    Using regex groups (extract second group and swap case):

    >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
    >>> repl = lambda m: m.group('two').swapcase()
    >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
    0    tWO
    1    bAR
    dtype: object

    Using a compiled regex with flags

    >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
    >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
    0    foo
    1    bar
    2    NaN
    dtype: object
    """

    # Check whether repl is valid (GH 13438, GH 15055)
    if not (is_string_like(repl) or callable(repl)):
        raise TypeError("repl must be a string or callable")

    is_compiled_re = is_re(pat)
    if regex:
        if is_compiled_re:
            if (case is not None) or (flags != 0):
                raise ValueError("case and flags cannot be set"
                                 " when pat is a compiled regex")
        else:
            # not a compiled regex
            # set default case
            if case is None:
                case = True

            # add case flag, if provided
            if case is False:
                flags |= re.IGNORECASE
        if is_compiled_re or len(pat) > 1 or flags or callable(repl):
            n = n if n >= 0 else 0
            compiled = re.compile(pat, flags=flags)
            f = lambda x: compiled.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)
    else:
        if is_compiled_re:
            raise ValueError("Cannot use a compiled regex as replacement "
                             "pattern with regex=False")
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when "
                             "regex=False")
        f = lambda x: x.replace(pat, repl, n)

    return _na_map(f, arr)
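
# Illustrative note, not part of the original module: with regex=True, a
# single-character pattern with no flags and a plain-string `repl` skips
# re.compile and falls back to str.replace, while longer patterns go through
# re.sub. The two calls below therefore take different branches but produce
# the same result:
#
# >>> pd.Series(['ab', 'ba']).str.replace('a', 'X')    # str.replace branch
# >>> pd.Series(['ab', 'ba']).str.replace('a+', 'X')   # re.sub branch
# 0    Xb
# 1    bX
# dtype: object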


def str_repeat(arr, repeats):
    """
    Duplicate each string in the Series or Index.

    Parameters
    ----------
    repeats : int or sequence of int
        Same value for all (int) or different value per (sequence).

    Returns
    -------
    Series or Index of object
        Series or Index of repeated string objects specified by
        input parameter repeats.

    Examples
    --------
    >>> s = pd.Series(['a', 'b', 'c'])
    >>> s
    0    a
    1    b
    2    c

    Single int repeats string in Series

    >>> s.str.repeat(repeats=2)
    0    aa
    1    bb
    2    cc

    Sequence of int repeats corresponding string in Series

    >>> s.str.repeat(repeats=[1, 2, 3])
    0      a
    1     bb
    2    ccc
    """
    if is_scalar(repeats):
        def rep(x):
            try:
                return compat.binary_type.__mul__(x, repeats)
            except TypeError:
                return compat.text_type.__mul__(x, repeats)

        return _na_map(rep, arr)
    else:
        def rep(x, r):
            try:
                return compat.binary_type.__mul__(x, r)
            except TypeError:
                return compat.text_type.__mul__(x, r)

        repeats = np.asarray(repeats, dtype=object)
        result = libops.vec_binop(com.values_from_object(arr), repeats, rep)
        return result


def str_match(arr, pat, case=True, flags=0, na=np.nan):
    """
    Determine if each string matches a regular expression.

    Parameters
    ----------
    pat : string
        Character sequence or regular expression
    case : boolean, default True
        If True, case sensitive
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE
    na : default NaN, fill value for missing values

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    contains : Analogous, but less strict, relying on re.search instead of
        re.match.
    extract : Extract matched groups.
    """
    if not case:
        flags |= re.IGNORECASE

    regex = re.compile(pat, flags=flags)

    dtype = bool
    f = lambda x: bool(regex.match(x))

    return _na_map(f, arr, na, dtype=dtype)
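
# Illustrative sketch, not part of the original module: unlike `contains`,
# `match` goes through re.match and so only succeeds when the pattern matches
# at the start of each string.
#
# >>> s = pd.Series(['cat', 'concatenate', np.nan])
# >>> s.str.match('cat')
# 0     True
# 1    False
# 2      NaN
# dtype: object
# >>> s.str.contains('cat')
# 0    True
# 1    True
# 2     NaN
# dtype: object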


def _get_single_group_name(rx):
    try:
        return list(rx.groupindex.keys()).pop()
    except IndexError:
        return None


def _groups_or_na_fun(regex):
    """Used in both extract_noexpand and extract_frame"""
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, compat.string_types):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if item is None else item for item in m.groups()]
        else:
            return empty_row
    return f


def _str_extract_noexpand(arr, pat, flags=0):
    """
    Find groups in each string in the Series using passed regular
    expression. This function is called from
    str_extract(expand=False), and can return Series, DataFrame, or
    Index.
    """
    from pandas import DataFrame, Index

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)

    if regex.groups == 1:
        result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
        name = _get_single_group_name(regex)
    else:
        if isinstance(arr, Index):
            raise ValueError("only one regex group is supported with Index")
        name = None
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            result = DataFrame(
                [groups_or_na(val) for val in arr],
                columns=columns,
                index=arr.index,
                dtype=object)
    return result, name


def _str_extract_frame(arr, pat, flags=0):
    """
    For each subject string in the Series, extract groups from the
    first match of regular expression pat. This function is called from
    str_extract(expand=True), and always returns a DataFrame.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    groups_or_na = _groups_or_na_fun(regex)
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]

    if len(arr) == 0:
        return DataFrame(columns=columns, dtype=object)
    try:
        result_index = arr.index
    except AttributeError:
        result_index = None
    return DataFrame(
        [groups_or_na(val) for val in arr],
        columns=columns,
        index=result_index,
        dtype=object)


def str_extract(arr, pat, flags=0, expand=True):
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : string
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

        .. versionadded:: 0.18.0

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    A pattern with two groups will return a DataFrame with two columns.
    Non-matches will be NaN.

    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> s.str.extract(r'([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    A pattern with one group will return a DataFrame with one column
    if expand=True.

    >>> s.str.extract(r'[ab](\d)', expand=True)
         0
    0    1
    1    2
    2  NaN

    A pattern with one group will return a Series if expand=False.

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")
    if expand:
        return _str_extract_frame(arr._orig, pat, flags=flags)
    else:
        result, name = _str_extract_noexpand(arr._parent, pat, flags=flags)
        return arr._wrap_result(result, name=name, expand=expand)


def str_extractall(arr, pat, flags=0):
    r"""
    For each subject string in the Series, extract groups from all
    matches of regular expression pat. When each subject string in the
    Series has exactly one match, extractall(pat).xs(0, level='match')
    is the same as extract(pat).

    .. versionadded:: 0.18.0

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        A ``re`` module flag, for example ``re.IGNORECASE``. These allow
        you to modify regular expression matching for things like case,
        spaces, etc. Multiple flags can be combined with the bitwise OR
        operator, for example ``re.IGNORECASE | re.MULTILINE``.

    Returns
    -------
    DataFrame
        A ``DataFrame`` with one row for each match, and one column for each
        group. Its rows have a ``MultiIndex`` with first levels that come from
        the subject ``Series``. The last level is named 'match' and indexes the
        matches in each item of the ``Series``. Any capture group names in
        regular expression pat will be used for column names; otherwise capture
        group numbers will be used.

    See Also
    --------
    extract : Returns first match only (not all matches).

    Examples
    --------
    A pattern with one group will return a DataFrame with one column.
    Indices with no matches will not appear in the result.

    >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
    >>> s.str.extractall(r"[ab](\d)")
             0
      match
    A 0      1
      1      2
    B 0      1

    Capture group names are used for column names of the result.

    >>> s.str.extractall(r"[ab](?P<digit>\d)")
            digit
      match
    A 0         1
      1         2
    B 0         1

    A pattern with two groups will return a DataFrame with two columns.

    >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1

    Optional groups that do not match are NaN in the result.

    >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
            letter digit
      match
    A 0          a     1
      1          a     2
    B 0          b     1
    C 0        NaN     1
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndexClass):
        arr = arr.to_series().reset_index(drop=True)

    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
    columns = [names.get(1 + i, i) for i in range(regex.groups)]
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.iteritems():
        if isinstance(subject, compat.string_types):

            if not is_mi:
                subject_key = (subject_key, )

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, compat.string_types):
                    match_tuple = (match_tuple,)
                na_tuple = [np.NaN if group == "" else group
                            for group in match_tuple]
                match_list.append(na_tuple)
                result_key = tuple(subject_key + (match_i, ))
                index_list.append(result_key)

    from pandas import MultiIndex
    index = MultiIndex.from_tuples(
        index_list, names=arr.index.names + ["match"])

    result = arr._constructor_expanddim(match_list, index=index,
                                        columns=columns)
    return result


def str_get_dummies(arr, sep='|'):
    """
    Split each string in the Series by sep and return a frame of
    dummy/indicator variables.

    Parameters
    ----------
    sep : string, default "|"
        String to split on.

    Returns
    -------
    dummies : DataFrame

    See Also
    --------
    get_dummies

    Examples
    --------
    >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  1  0  0
    2  1  0  1

    >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
       a  b  c
    0  1  1  0
    1  0  0  0
    2  1  0  1
    """
    arr = arr.fillna('')
    try:
        arr = sep + arr + sep
    except TypeError:
        arr = sep + arr.astype(str) + sep

    tags = set()
    for ts in arr.str.split(sep):
        tags.update(ts)
    tags = sorted(tags - {""})

    dummies = np.empty((len(arr), len(tags)), dtype=np.int64)

    for i, t in enumerate(tags):
        pat = sep + t + sep
        dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
    return dummies, tags
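
# Note, not part of the original module: str_get_dummies returns the raw
# (values, column labels) pair; the str accessor wrapper is what assembles
# these into the DataFrame shown in the docstring. For ['a|b', 'a', 'a|c']
# with sep='|' the discovered tags are ['a', 'b', 'c'] and the indicator
# matrix is [[1, 1, 0], [1, 0, 0], [1, 0, 1]].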


def str_join(arr, sep):
    """
    Join lists contained as elements in the Series/Index with passed delimiter.

    If the elements of a Series are lists themselves, join the content of these
    lists using the delimiter passed to the function.
    This function is an equivalent to :meth:`str.join`.

    Parameters
    ----------
    sep : str
        Delimiter to use between list entries.

    Returns
    -------
    Series/Index: object
        The list entries concatenated by intervening occurrences of the
        delimiter.

    Raises
    ------
    AttributeError
        If the supplied Series contains neither strings nor lists.

    See Also
    --------
    str.join : Standard library version of this method.
    Series.str.split : Split strings around given separator/delimiter.

    Notes
    -----
    If any of the list items is not a string object, the result of the join
    will be `NaN`.

    Examples
    --------
    Example with a list that contains non-string elements.

    >>> s = pd.Series([['lion', 'elephant', 'zebra'],
    ...                [1.1, 2.2, 3.3],
    ...                ['cat', np.nan, 'dog'],
    ...                ['cow', 4.5, 'goat'],
    ...                ['duck', ['swan', 'fish'], 'guppy']])
    >>> s
    0        [lion, elephant, zebra]
    1                [1.1, 2.2, 3.3]
    2                [cat, nan, dog]
    3               [cow, 4.5, goat]
    4    [duck, [swan, fish], guppy]
    dtype: object

    Join all lists using a '-'. The lists containing object(s) of types other
    than str will produce a NaN.

    >>> s.str.join('-')
    0    lion-elephant-zebra
    1                    NaN
    2                    NaN
    3                    NaN
    4                    NaN
    dtype: object
    """
    return _na_map(sep.join, arr)


def str_findall(arr, pat, flags=0):
    """
    Find all occurrences of pattern or regular expression in the Series/Index.

    Equivalent to applying :func:`re.findall` to all the elements in the
    Series/Index.

    Parameters
    ----------
    pat : string
        Pattern or regular expression.
    flags : int, default 0
        ``re`` module flags, e.g. `re.IGNORECASE` (default is 0, which means
        no flags).

    Returns
    -------
    Series/Index of lists of strings
        All non-overlapping matches of pattern or regular expression in each
        string of this Series/Index.

    See Also
    --------
    count : Count occurrences of pattern or regular expression in each string
        of the Series/Index.
    extractall : For each string in the Series, extract groups from all matches
        of regular expression and return a DataFrame with one row for each
        match and one column for each group.
    re.findall : The equivalent ``re`` function to all non-overlapping matches
        of pattern or regular expression in string, as a list of strings.

    Examples
    --------
    >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])

    The search for the pattern 'Monkey' returns one match:

    >>> s.str.findall('Monkey')
    0          []
    1    [Monkey]
    2          []
    dtype: object

    On the other hand, the search for the pattern 'MONKEY' doesn't return any
    match:

    >>> s.str.findall('MONKEY')
    0    []
    1    []
    2    []
    dtype: object

    Flags can be added to the pattern or regular expression. For instance,
    to find the pattern 'MONKEY' ignoring the case:

    >>> import re
    >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
    0          []
    1    [Monkey]
    2          []
    dtype: object

    When the pattern matches more than one string in the Series, all matches
    are returned:

    >>> s.str.findall('on')
    0    [on]
    1    [on]
    2      []
    dtype: object

    Regular expressions are supported too. For instance, the search for all the
    strings ending with the word 'on' is shown next:

    >>> s.str.findall('on$')
    0    [on]
    1      []
    2      []
    dtype: object

    If the pattern is found more than once in the same string, then a list of
    multiple strings is returned:

    >>> s.str.findall('b')
    0        []
    1        []
    2    [b, b]
    dtype: object
    """
    regex = re.compile(pat, flags=flags)
    return _na_map(regex.findall, arr)


def str_find(arr, sub, start=0, end=None, side='left'):
    """
    Return indexes in each string in the Series/Index where the
    substring is fully contained between [start:end]. Return -1 on failure.

    Parameters
    ----------
    sub : str
        Substring being searched
    start : int
        Left edge index
    end : int
        Right edge index
    side : {'left', 'right'}, default 'left'
        Specifies a starting side, equivalent to ``find`` or ``rfind``

    Returns
    -------
    found : Series/Index of integer values
    """
    if not isinstance(sub, compat.string_types):
        msg = 'expected a string object, not {0}'
        raise TypeError(msg.format(type(sub).__name__))

    if side == 'left':
        method = 'find'
    elif side == 'right':
        method = 'rfind'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)

    return _na_map(f, arr, dtype=int)


def str_index(arr, sub, start=0, end=None, side='left'):
    if not isinstance(sub, compat.string_types):
        msg = 'expected a string object, not {0}'
        raise TypeError(msg.format(type(sub).__name__))

    if side == 'left':
        method = 'index'
    elif side == 'right':
        method = 'rindex'
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    if end is None:
        f = lambda x: getattr(x, method)(sub, start)
    else:
        f = lambda x: getattr(x, method)(sub, start, end)

    return _na_map(f, arr, dtype=int)
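
# Note, not part of the original module: str_index mirrors str_find above but
# dispatches to str.index / str.rindex, which raise ValueError rather than
# returning -1 when the substring is not found, e.g.
#
# >>> pd.Series(['abc']).str.index('z')
# ValueError: substring not found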


def str_pad(arr, width, side='left', fillchar=' '):
    """
    Pad strings in the Series/Index up to width.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with character defined in `fillchar`.
    side : {'left', 'right', 'both'}, default 'left'
        Side from which to fill resulting string.
    fillchar : str, default ' '
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series or Index of object
        Returns Series or Index with minimum number of char in object.

    See Also
    --------
    Series.str.rjust : Fills the left side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='left')``.
    Series.str.ljust : Fills the right side of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='right')``.
    Series.str.center : Fills both sides of strings with an arbitrary
        character. Equivalent to ``Series.str.pad(side='both')``.
    Series.str.zfill : Pad strings in the Series/Index by prepending '0'
        character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.

    Examples
    --------
    >>> s = pd.Series(["caribou", "tiger"])
    >>> s
    0    caribou
    1      tiger
    dtype: object

    >>> s.str.pad(width=10)
    0       caribou
    1         tiger
    dtype: object

    >>> s.str.pad(width=10, side='right', fillchar='-')
    0    caribou---
    1    tiger-----
    dtype: object

    >>> s.str.pad(width=10, side='both', fillchar='-')
    0    -caribou--
    1    --tiger---
    dtype: object
    """
    if not isinstance(fillchar, compat.string_types):
        msg = 'fillchar must be a character, not {0}'
        raise TypeError(msg.format(type(fillchar).__name__))

    if len(fillchar) != 1:
        raise TypeError('fillchar must be a character, not str')

    if not is_integer(width):
        msg = 'width must be of integer type, not {0}'
        raise TypeError(msg.format(type(width).__name__))

    if side == 'left':
        f = lambda x: x.rjust(width, fillchar)
    elif side == 'right':
        f = lambda x: x.ljust(width, fillchar)
    elif side == 'both':
        f = lambda x: x.center(width, fillchar)
    else:  # pragma: no cover
        raise ValueError('Invalid side')

    return _na_map(f, arr)


def str_split(arr, pat=None, n=None):

    if pat is None:
        if n is None or n == 0:
            n = -1
        f = lambda x: x.split(pat, n)
    else:
        if len(pat) == 1:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            if n is None or n == -1:
                n = 0
            regex = re.compile(pat)
            f = lambda x: regex.split(x, maxsplit=n)
    res = _na_map(f, arr)
    return res
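
# Illustrative note, not part of the original module: str_split treats a
# single-character pattern as a literal separator (str.split), while longer
# patterns go through re.split; note the different meaning of `n` in the two
# branches (-1 vs 0 for "no limit").
#
# >>> pd.Series(['a b  c']).str.split(' ')      # literal: empty strings kept
# 0    [a, b, , c]
# dtype: object
# >>> pd.Series(['a b  c']).str.split(r'\s+')   # regex: runs of spaces collapse
# 0    [a, b, c]
# dtype: object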


def str_rsplit(arr, pat=None, n=None):

    if n is None or n == 0:
        n = -1
    f = lambda x: x.rsplit(pat, n)
    res = _na_map(f, arr)
    return res


def str_slice(arr, start=None, stop=None, step=None):
    """
    Slice substrings from each element in the Series or Index.

    Parameters
    ----------
    start : int, optional
        Start position for slice operation.
    stop : int, optional
        Stop position for slice operation.
    step : int, optional
        Step size for slice operation.

    Returns
    -------
    Series or Index of object
        Series or Index from sliced substring from original string object.

    See Also
    --------
    Series.str.slice_replace : Replace a slice with a string.
    Series.str.get : Return element at position.
        Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
        being the position.

    Examples
    --------
    >>> s = pd.Series(["koala", "fox", "chameleon"])
    >>> s
    0        koala
    1          fox
    2    chameleon
    dtype: object

    >>> s.str.slice(start=1)
    0        oala
    1          ox
    2    hameleon
    dtype: object

    >>> s.str.slice(stop=2)
    0    ko
    1    fo
    2    ch
    dtype: object

    >>> s.str.slice(step=2)
    0      kaa
    1       fx
    2    caeen
    dtype: object

    >>> s.str.slice(start=0, stop=5, step=3)
    0    kl
    1     f
    2    cm
    dtype: object

    Equivalent behaviour to:

    >>> s.str[0:5:3]
    0    kl
    1     f
    2    cm
    dtype: object
    """
    obj = slice(start, stop, step)
    f = lambda x: x[obj]
    return _na_map(f, arr)


def str_slice_replace(arr, start=None, stop=None, repl=None):
    """
    Replace a positional slice of a string with another value.

    Parameters
    ----------
    start : int, optional
        Left index position to use for the slice. If not specified (None),
        the slice is unbounded on the left, i.e. slice from the start
        of the string.
    stop : int, optional
        Right index position to use for the slice. If not specified (None),
        the slice is unbounded on the right, i.e. slice until the
        end of the string.
    repl : str, optional
        String for replacement. If not specified (None), the sliced region
        is replaced with an empty string.

    Returns
    -------
    replaced : Series or Index
        Same type as the original object.

    See Also
    --------
    Series.str.slice : Just slicing without replacement.

    Examples
    --------
    >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
    >>> s
    0        a
    1       ab
    2      abc
    3     abdc
    4    abcde
    dtype: object

    Specify just `start`, meaning replace `start` until the end of the
    string with `repl`.

    >>> s.str.slice_replace(1, repl='X')
    0    aX
    1    aX
    2    aX
    3    aX
    4    aX
    dtype: object

    Specify just `stop`, meaning the start of the string to `stop` is replaced
    with `repl`, and the rest of the string is included.

    >>> s.str.slice_replace(stop=2, repl='X')
    0       X
    1       X
    2      Xc
    3     Xdc
    4    Xcde
    dtype: object

    Specify `start` and `stop`, meaning the slice from `start` to `stop` is
    replaced with `repl`. Everything before or after `start` and `stop` is
    included as is.

    >>> s.str.slice_replace(start=1, stop=3, repl='X')
    0      aX
    1      aX
    2      aX
    3     aXc
    4    aXde
    dtype: object
    """
    if repl is None:
        repl = ''

    def f(x):
        if x[start:stop] == '':
            local_stop = start
        else:
            local_stop = stop
        y = ''
        if start is not None:
            y += x[:start]
        y += repl
        if stop is not None:
            y += x[local_stop:]
        return y

    return _na_map(f, arr)
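
# Note, not part of the original module: the `local_stop` adjustment in f()
# handles the case where `x[start:stop]` selects nothing (for example when
# `start` is not before `stop`, or lies past the end of the string). The
# replacement is then inserted at `start` and the characters from `start`
# onward are kept, e.g. for 'abc' with start=2, stop=1, repl='X' the result
# is 'abXc' rather than 'abXbc'.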


def str_strip(arr, to_strip=None, side='both'):
    """
    Strip whitespace (including newlines) from each string in the
    Series/Index.

    Parameters
    ----------
    to_strip : str or unicode
    side : {'left', 'right', 'both'}, default 'both'

    Returns
    -------
    stripped : Series/Index of objects
    """
    if side == 'both':
        f = lambda x: x.strip(to_strip)
    elif side == 'left':
        f = lambda x: x.lstrip(to_strip)
    elif side == 'right':
        f = lambda x: x.rstrip(to_strip)
    else:  # pragma: no cover
        raise ValueError('Invalid side')
    return _na_map(f, arr)
  1244. def str_wrap(arr, width, **kwargs):
  1245. r"""
  1246. Wrap long strings in the Series/Index to be formatted in
  1247. paragraphs with length less than a given width.
  1248. This method has the same keyword parameters and defaults as
  1249. :class:`textwrap.TextWrapper`.
  1250. Parameters
  1251. ----------
  1252. width : int
  1253. Maximum line-width
  1254. expand_tabs : bool, optional
  1255. If true, tab characters will be expanded to spaces (default: True)
  1256. replace_whitespace : bool, optional
  1257. If true, each whitespace character (as defined by string.whitespace)
  1258. remaining after tab expansion will be replaced by a single space
  1259. (default: True)
  1260. drop_whitespace : bool, optional
  1261. If true, whitespace that, after wrapping, happens to end up at the
  1262. beginning or end of a line is dropped (default: True)
  1263. break_long_words : bool, optional
  1264. If true, then words longer than width will be broken in order to ensure
  1265. that no lines are longer than width. If it is false, long words will
  1266. not be broken, and some lines may be longer than width. (default: True)
  1267. break_on_hyphens : bool, optional
  1268. If true, wrapping will occur preferably on whitespace and right after
  1269. hyphens in compound words, as it is customary in English. If false,
  1270. only whitespaces will be considered as potentially good places for line
  1271. breaks, but you need to set break_long_words to false if you want truly
  1272. insecable words. (default: True)
  1273. Returns
  1274. -------
  1275. wrapped : Series/Index of objects
  1276. Notes
  1277. -----
  1278. Internally, this method uses a :class:`textwrap.TextWrapper` instance with
  1279. default settings. To achieve behavior matching R's stringr library str_wrap
  1280. function, use the arguments:
  1281. - expand_tabs = False
  1282. - replace_whitespace = True
  1283. - drop_whitespace = True
  1284. - break_long_words = False
  1285. - break_on_hyphens = False
  1286. Examples
  1287. --------
  1288. >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
  1289. >>> s.str.wrap(12)
0             line to be\nwrapped
1    another line\nto be\nwrapped
dtype: object
"""
  1293. kwargs['width'] = width
  1294. tw = textwrap.TextWrapper(**kwargs)
  1295. return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
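# Illustrative sketch (not from the original source; helper name is made up):
# the Notes above list the TextWrapper settings that mimic R's
# stringr::str_wrap; this shows them applied to one string.
def _demo_stringr_wrap(text, width):
    """Wrap a single string with the stringr-like settings from the Notes."""
    import textwrap
    tw = textwrap.TextWrapper(width=width,
                              expand_tabs=False,
                              replace_whitespace=True,
                              drop_whitespace=True,
                              break_long_words=False,
                              break_on_hyphens=False)
    return '\n'.join(tw.wrap(text))

# e.g. _demo_stringr_wrap('line to be wrapped', 12) -> 'line to be\nwrapped'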
  1296. def str_translate(arr, table, deletechars=None):
  1297. """
  1298. Map all characters in the string through the given mapping table.
  1299. Equivalent to standard :meth:`str.translate`. Note that the optional
  1300. argument deletechars is only valid if you are using python 2. For python 3,
  1301. character deletion should be specified via the table argument.
  1302. Parameters
  1303. ----------
  1304. table : dict (python 3), str or None (python 2)
  1305. In python 3, table is a mapping of Unicode ordinals to Unicode
  1306. ordinals, strings, or None. Unmapped characters are left untouched.
  1307. Characters mapped to None are deleted. :meth:`str.maketrans` is a
  1308. helper function for making translation tables.
  1309. In python 2, table is either a string of length 256 or None. If the
  1310. table argument is None, no translation is applied and the operation
  1311. simply removes the characters in deletechars. :func:`string.maketrans`
  1312. is a helper function for making translation tables.
  1313. deletechars : str, optional (python 2)
  1314. A string of characters to delete. This argument is only valid
  1315. in python 2.
  1316. Returns
  1317. -------
  1318. translated : Series/Index of objects
  1319. """
  1320. if deletechars is None:
  1321. f = lambda x: x.translate(table)
  1322. else:
  1323. if compat.PY3:
  1324. raise ValueError("deletechars is not a valid argument for "
  1325. "str.translate in python 3. You should simply "
  1326. "specify character deletions in the table "
  1327. "argument")
  1328. f = lambda x: x.translate(table, deletechars)
  1329. return _na_map(f, arr)
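# Illustrative sketch (not from the original source; helper name is made up):
# on Python 3 the `table` argument is a mapping such as the one produced by
# str.maketrans; mapping a character to None deletes it.
def _demo_translate(x):
    """Translate one plain string: map 'a' -> 'X' and delete 'b'."""
    table = str.maketrans({'a': 'X', 'b': None})
    return x.translate(table)

# e.g. _demo_translate('abcab') -> 'XcX'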
  1330. def str_get(arr, i):
  1331. """
  1332. Extract element from each component at specified position.
  1333. Extract element from lists, tuples, or strings in each element in the
  1334. Series/Index.
  1335. Parameters
  1336. ----------
  1337. i : int
  1338. Position of element to extract.
  1339. Returns
  1340. -------
  1341. items : Series/Index of objects
  1342. Examples
  1343. --------
>>> s = pd.Series(["String",
...                (1, 2, 3),
...                ["a", "b", "c"],
...                123, -456,
...                {1:"Hello", "2":"World"}])
  1349. >>> s
  1350. 0 String
  1351. 1 (1, 2, 3)
  1352. 2 [a, b, c]
  1353. 3 123
  1354. 4 -456
  1355. 5 {1: 'Hello', '2': 'World'}
  1356. dtype: object
  1357. >>> s.str.get(1)
  1358. 0 t
  1359. 1 2
  1360. 2 b
  1361. 3 NaN
  1362. 4 NaN
  1363. 5 Hello
  1364. dtype: object
  1365. >>> s.str.get(-1)
  1366. 0 g
  1367. 1 3
  1368. 2 c
  1369. 3 NaN
  1370. 4 NaN
  1371. 5 NaN
  1372. dtype: object
  1373. """
  1374. def f(x):
  1375. if isinstance(x, dict):
  1376. return x.get(i)
  1377. elif len(x) > i >= -len(x):
  1378. return x[i]
  1379. return np.nan
  1380. return _na_map(f, arr)
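# Illustrative sketch (not from the original source; helper name is made up):
# the element-wise logic above, shown on plain Python objects.
def _demo_get(x, i):
    """Return x[i] with the same dict/bounds handling as str_get."""
    import numpy as np
    if isinstance(x, dict):
        return x.get(i)
    elif len(x) > i >= -len(x):
        return x[i]
    return np.nan

# e.g. _demo_get('String', 1)     -> 't'
#      _demo_get((1, 2, 3), -1)   -> 3
#      _demo_get([2, 3], 5)       -> nan   (out of bounds)
#      _demo_get({1: 'Hello'}, 1) -> 'Hello'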
  1381. def str_decode(arr, encoding, errors="strict"):
  1382. """
  1383. Decode character string in the Series/Index using indicated encoding.
  1384. Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
  1385. python3.
  1386. Parameters
  1387. ----------
  1388. encoding : str
  1389. errors : str, optional
  1390. Returns
  1391. -------
  1392. decoded : Series/Index of objects
  1393. """
  1394. if encoding in _cpython_optimized_decoders:
  1395. # CPython optimized implementation
  1396. f = lambda x: x.decode(encoding, errors)
  1397. else:
  1398. decoder = codecs.getdecoder(encoding)
  1399. f = lambda x: decoder(x, errors)[0]
  1400. return _na_map(f, arr)
  1401. def str_encode(arr, encoding, errors="strict"):
  1402. """
  1403. Encode character string in the Series/Index using indicated encoding.
  1404. Equivalent to :meth:`str.encode`.
  1405. Parameters
  1406. ----------
  1407. encoding : str
  1408. errors : str, optional
  1409. Returns
  1410. -------
  1411. encoded : Series/Index of objects
  1412. """
  1413. if encoding in _cpython_optimized_encoders:
  1414. # CPython optimized implementation
  1415. f = lambda x: x.encode(encoding, errors)
  1416. else:
  1417. encoder = codecs.getencoder(encoding)
  1418. f = lambda x: encoder(x, errors)[0]
  1419. return _na_map(f, arr)
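# Illustrative sketch (not from the original source; helper name is made up):
# for encodings outside the CPython-optimized set, the code above goes through
# codecs.getencoder/codecs.getdecoder, which return (result, length) tuples.
def _demo_codecs_roundtrip(text, encoding='utf-16', errors='strict'):
    """Encode and decode one string via the codecs fallback path."""
    import codecs
    encoder = codecs.getencoder(encoding)
    decoder = codecs.getdecoder(encoding)
    encoded = encoder(text, errors)[0]   # bytes
    return decoder(encoded, errors)[0]   # back to str

# e.g. _demo_codecs_roundtrip('naïve') -> 'naïve'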
  1420. def _noarg_wrapper(f, docstring=None, **kargs):
  1421. def wrapper(self):
  1422. result = _na_map(f, self._parent, **kargs)
  1423. return self._wrap_result(result)
  1424. wrapper.__name__ = f.__name__
  1425. if docstring is not None:
  1426. wrapper.__doc__ = docstring
  1427. else:
  1428. raise ValueError('Provide docstring')
  1429. return wrapper
  1430. def _pat_wrapper(f, flags=False, na=False, **kwargs):
  1431. def wrapper1(self, pat):
  1432. result = f(self._parent, pat)
  1433. return self._wrap_result(result)
  1434. def wrapper2(self, pat, flags=0, **kwargs):
  1435. result = f(self._parent, pat, flags=flags, **kwargs)
  1436. return self._wrap_result(result)
  1437. def wrapper3(self, pat, na=np.nan):
  1438. result = f(self._parent, pat, na=na)
  1439. return self._wrap_result(result)
  1440. wrapper = wrapper3 if na else wrapper2 if flags else wrapper1
  1441. wrapper.__name__ = f.__name__
  1442. if f.__doc__:
  1443. wrapper.__doc__ = f.__doc__
  1444. return wrapper
  1445. def copy(source):
  1446. "Copy a docstring from another source function (if present)"
  1447. def do_copy(target):
  1448. if source.__doc__:
  1449. target.__doc__ = source.__doc__
  1450. return target
  1451. return do_copy
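# Illustrative sketch (not from the original source; names are made up):
# how the `copy` decorator above is used to reuse a docstring.
def _demo_source():
    """Docstring that should be shared."""

@copy(_demo_source)
def _demo_target():
    pass

# _demo_target.__doc__ == "Docstring that should be shared."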
  1452. class StringMethods(NoNewAttributesMixin):
  1453. """
  1454. Vectorized string functions for Series and Index. NAs stay NA unless
  1455. handled otherwise by a particular method. Patterned after Python's string
  1456. methods, with some inspiration from R's stringr package.
  1457. Examples
  1458. --------
  1459. >>> s.str.split('_')
  1460. >>> s.str.replace('_', '')
  1461. """
  1462. def __init__(self, data):
  1463. self._validate(data)
  1464. self._is_categorical = is_categorical_dtype(data)
  1465. # .values.categories works for both Series/Index
  1466. self._parent = data.values.categories if self._is_categorical else data
  1467. # save orig to blow up categoricals to the right type
  1468. self._orig = data
  1469. self._freeze()
  1470. @staticmethod
  1471. def _validate(data):
  1472. from pandas.core.index import Index
  1473. if (isinstance(data, ABCSeries) and
  1474. not ((is_categorical_dtype(data.dtype) and
  1475. is_object_dtype(data.values.categories)) or
  1476. (is_object_dtype(data.dtype)))):
# it's neither a string series nor a categorical series with
# strings inside the categories.
# this really should exclude all series with any non-string values
# (instead of testing for object dtype), but that isn't practical
# for performance reasons until we have a str dtype (GH 9343)
  1482. raise AttributeError("Can only use .str accessor with string "
  1483. "values, which use np.object_ dtype in "
  1484. "pandas")
  1485. elif isinstance(data, Index):
  1486. # can't use ABCIndex to exclude non-str
  1487. # see src/inference.pyx which can contain string values
  1488. allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
  1489. if is_categorical_dtype(data.dtype):
  1490. inf_type = data.categories.inferred_type
  1491. else:
  1492. inf_type = data.inferred_type
  1493. if inf_type not in allowed_types:
  1494. message = ("Can only use .str accessor with string values "
  1495. "(i.e. inferred_type is 'string', 'unicode' or "
  1496. "'mixed')")
  1497. raise AttributeError(message)
  1498. if data.nlevels > 1:
  1499. message = ("Can only use .str accessor with Index, not "
  1500. "MultiIndex")
  1501. raise AttributeError(message)
  1502. def __getitem__(self, key):
  1503. if isinstance(key, slice):
  1504. return self.slice(start=key.start, stop=key.stop, step=key.step)
  1505. else:
  1506. return self.get(key)
  1507. def __iter__(self):
  1508. i = 0
  1509. g = self.get(i)
  1510. while g.notna().any():
  1511. yield g
  1512. i += 1
  1513. g = self.get(i)
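# Illustrative note (not from the original source): iterating a StringMethods
# object yields self.get(0), self.get(1), ... and stops once a position is NA
# for every element. For example (sketch, assuming an object-dtype Series):
#
#   >>> s = pd.Series(['ab', 'c'])
#   >>> [list(chunk) for chunk in s.str]
#   [['a', 'c'], ['b', nan]]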
  1514. def _wrap_result(self, result, use_codes=True,
  1515. name=None, expand=None, fill_value=np.nan):
  1516. from pandas import Index, Series, MultiIndex
  1517. # for category, we do the stuff on the categories, so blow it up
  1518. # to the full series again
  1519. # But for some operations, we have to do the stuff on the full values,
  1520. # so make it possible to skip this step as the method already did this
  1521. # before the transformation...
  1522. if use_codes and self._is_categorical:
  1523. # if self._orig is a CategoricalIndex, there is no .cat-accessor
  1524. result = take_1d(result, Series(self._orig, copy=False).cat.codes,
  1525. fill_value=fill_value)
  1526. if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'):
  1527. return result
  1528. assert result.ndim < 3
  1529. if expand is None:
  1530. # infer from ndim if expand is not specified
  1531. expand = False if result.ndim == 1 else True
  1532. elif expand is True and not isinstance(self._orig, Index):
  1533. # required when expand=True is explicitly specified
  1534. # not needed when inferred
  1535. def cons_row(x):
  1536. if is_list_like(x):
  1537. return x
  1538. else:
  1539. return [x]
  1540. result = [cons_row(x) for x in result]
  1541. if result:
  1542. # propagate nan values to match longest sequence (GH 18450)
  1543. max_len = max(len(x) for x in result)
  1544. result = [x * max_len if len(x) == 0 or x[0] is np.nan
  1545. else x for x in result]
  1546. if not isinstance(expand, bool):
  1547. raise ValueError("expand must be True or False")
  1548. if expand is False:
# if expand is False, result should have the same name
# as the original, unless otherwise specified
  1551. if name is None:
  1552. name = getattr(result, 'name', None)
  1553. if name is None:
  1554. # do not use logical or, _orig may be a DataFrame
  1555. # which has "name" column
  1556. name = self._orig.name
  1557. # Wait until we are sure result is a Series or Index before
  1558. # checking attributes (GH 12180)
  1559. if isinstance(self._orig, Index):
  1560. # if result is a boolean np.array, return the np.array
  1561. # instead of wrapping it into a boolean Index (GH 8875)
  1562. if is_bool_dtype(result):
  1563. return result
  1564. if expand:
  1565. result = list(result)
  1566. out = MultiIndex.from_tuples(result, names=name)
  1567. if out.nlevels == 1:
  1568. # We had all tuples of length-one, which are
  1569. # better represented as a regular Index.
  1570. out = out.get_level_values(0)
  1571. return out
  1572. else:
  1573. return Index(result, name=name)
  1574. else:
  1575. index = self._orig.index
  1576. if expand:
  1577. cons = self._orig._constructor_expanddim
  1578. return cons(result, columns=name, index=index)
  1579. else:
  1580. # Must be a Series
  1581. cons = self._orig._constructor
  1582. return cons(result, name=name, index=index)
  1583. def _get_series_list(self, others, ignore_index=False):
  1584. """
  1585. Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
  1586. into a list of Series (elements without an index must match the length
  1587. of the calling Series/Index).
  1588. Parameters
  1589. ----------
  1590. others : Series, Index, DataFrame, np.ndarray, list-like or list-like
  1591. of objects that are Series, Index or np.ndarray (1-dim)
  1592. ignore_index : boolean, default False
  1593. Determines whether to forcefully align others with index of caller
  1594. Returns
  1595. -------
  1596. tuple : (others transformed into list of Series,
  1597. boolean whether FutureWarning should be raised)
  1598. """
  1599. # Once str.cat defaults to alignment, this function can be simplified;
  1600. # will not need `ignore_index` and the second boolean output anymore
  1601. from pandas import Index, Series, DataFrame
  1602. # self._orig is either Series or Index
  1603. idx = self._orig if isinstance(self._orig, Index) else self._orig.index
err_msg = ('others must be Series, Index, DataFrame, np.ndarray or '
  1605. 'list-like (either containing only strings or containing '
  1606. 'only objects of type Series/Index/list-like/np.ndarray)')
  1607. # Generally speaking, all objects without an index inherit the index
  1608. # `idx` of the calling Series/Index - i.e. must have matching length.
  1609. # Objects with an index (i.e. Series/Index/DataFrame) keep their own
  1610. # index, *unless* ignore_index is set to True.
  1611. if isinstance(others, Series):
  1612. warn = not others.index.equals(idx)
  1613. # only reconstruct Series when absolutely necessary
  1614. los = [Series(others.values, index=idx)
  1615. if ignore_index and warn else others]
  1616. return (los, warn)
  1617. elif isinstance(others, Index):
  1618. warn = not others.equals(idx)
  1619. los = [Series(others.values,
  1620. index=(idx if ignore_index else others))]
  1621. return (los, warn)
  1622. elif isinstance(others, DataFrame):
  1623. warn = not others.index.equals(idx)
  1624. if ignore_index and warn:
  1625. # without copy, this could change "others"
  1626. # that was passed to str.cat
  1627. others = others.copy()
  1628. others.index = idx
  1629. return ([others[x] for x in others], warn)
  1630. elif isinstance(others, np.ndarray) and others.ndim == 2:
  1631. others = DataFrame(others, index=idx)
  1632. return ([others[x] for x in others], False)
  1633. elif is_list_like(others, allow_sets=False):
  1634. others = list(others) # ensure iterators do not get read twice etc
  1635. # in case of list-like `others`, all elements must be
  1636. # either one-dimensional list-likes or scalars
  1637. if all(is_list_like(x, allow_sets=False) for x in others):
  1638. los = []
  1639. join_warn = False
  1640. depr_warn = False
  1641. # iterate through list and append list of series for each
  1642. # element (which we check to be one-dimensional and non-nested)
  1643. while others:
  1644. nxt = others.pop(0) # nxt is guaranteed list-like by above
  1645. # GH 21950 - DeprecationWarning
  1646. # only allowing Series/Index/np.ndarray[1-dim] will greatly
# simplify this function post-deprecation.
  1648. if not (isinstance(nxt, (Series, Index)) or
  1649. (isinstance(nxt, np.ndarray) and nxt.ndim == 1)):
  1650. depr_warn = True
  1651. if not isinstance(nxt, (DataFrame, Series,
  1652. Index, np.ndarray)):
  1653. # safety for non-persistent list-likes (e.g. iterators)
  1654. # do not map indexed/typed objects; info needed below
  1655. nxt = list(nxt)
  1656. # known types for which we can avoid deep inspection
  1657. no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1)
  1658. or isinstance(nxt, (Series, Index)))
  1659. # nested list-likes are forbidden:
  1660. # -> elements of nxt must not be list-like
  1661. is_legal = ((no_deep and nxt.dtype == object)
  1662. or all(not is_list_like(x) for x in nxt))
  1663. # DataFrame is false positive of is_legal
  1664. # because "x in df" returns column names
  1665. if not is_legal or isinstance(nxt, DataFrame):
  1666. raise TypeError(err_msg)
  1667. nxt, wnx = self._get_series_list(nxt,
  1668. ignore_index=ignore_index)
  1669. los = los + nxt
  1670. join_warn = join_warn or wnx
  1671. if depr_warn:
  1672. warnings.warn('list-likes other than Series, Index, or '
  1673. 'np.ndarray WITHIN another list-like are '
  1674. 'deprecated and will be removed in a future '
  1675. 'version.', FutureWarning, stacklevel=3)
  1676. return (los, join_warn)
  1677. elif all(not is_list_like(x) for x in others):
  1678. return ([Series(others, index=idx)], False)
  1679. raise TypeError(err_msg)
  1680. def cat(self, others=None, sep=None, na_rep=None, join=None):
  1681. """
  1682. Concatenate strings in the Series/Index with given separator.
  1683. If `others` is specified, this function concatenates the Series/Index
  1684. and elements of `others` element-wise.
  1685. If `others` is not passed, then all values in the Series/Index are
  1686. concatenated into a single string with a given `sep`.
  1687. Parameters
  1688. ----------
others : Series, Index, DataFrame, np.ndarray or list-like
  1690. Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
  1691. other list-likes of strings must have the same length as the
  1692. calling Series/Index, with the exception of indexed objects (i.e.
  1693. Series/Index/DataFrame) if `join` is not None.
  1694. If others is a list-like that contains a combination of Series,
  1695. Index or np.ndarray (1-dim), then all elements will be unpacked and
  1696. must satisfy the above criteria individually.
  1697. If others is None, the method returns the concatenation of all
  1698. strings in the calling Series/Index.
  1699. sep : str, default ''
  1700. The separator between the different elements/columns. By default
  1701. the empty string `''` is used.
  1702. na_rep : str or None, default None
  1703. Representation that is inserted for all missing values:
  1704. - If `na_rep` is None, and `others` is None, missing values in the
  1705. Series/Index are omitted from the result.
  1706. - If `na_rep` is None, and `others` is not None, a row containing a
  1707. missing value in any of the columns (before concatenation) will
  1708. have a missing value in the result.
  1709. join : {'left', 'right', 'outer', 'inner'}, default None
  1710. Determines the join-style between the calling Series/Index and any
  1711. Series/Index/DataFrame in `others` (objects without an index need
  1712. to match the length of the calling Series/Index). If None,
  1713. alignment is disabled, but this option will be removed in a future
  1714. version of pandas and replaced with a default of `'left'`. To
  1715. disable alignment, use `.values` on any Series/Index/DataFrame in
  1716. `others`.
  1717. .. versionadded:: 0.23.0
  1718. Returns
  1719. -------
  1720. concat : str or Series/Index of objects
  1721. If `others` is None, `str` is returned, otherwise a `Series/Index`
  1722. (same type as caller) of objects is returned.
  1723. See Also
  1724. --------
  1725. split : Split each string in the Series/Index.
  1726. join : Join lists contained as elements in the Series/Index.
  1727. Examples
  1728. --------
  1729. When not passing `others`, all values are concatenated into a single
  1730. string:
  1731. >>> s = pd.Series(['a', 'b', np.nan, 'd'])
  1732. >>> s.str.cat(sep=' ')
  1733. 'a b d'
  1734. By default, NA values in the Series are ignored. Using `na_rep`, they
  1735. can be given a representation:
  1736. >>> s.str.cat(sep=' ', na_rep='?')
  1737. 'a b ? d'
  1738. If `others` is specified, corresponding values are concatenated with
  1739. the separator. Result will be a Series of strings.
  1740. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
  1741. 0 a,A
  1742. 1 b,B
  1743. 2 NaN
  1744. 3 d,D
  1745. dtype: object
  1746. Missing values will remain missing in the result, but can again be
  1747. represented using `na_rep`
  1748. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
  1749. 0 a,A
  1750. 1 b,B
  1751. 2 -,C
  1752. 3 d,D
  1753. dtype: object
  1754. If `sep` is not specified, the values are concatenated without
  1755. separation.
  1756. >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
  1757. 0 aA
  1758. 1 bB
  1759. 2 -C
  1760. 3 dD
  1761. dtype: object
  1762. Series with different indexes can be aligned before concatenation. The
  1763. `join`-keyword works as in other methods.
  1764. >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
  1765. >>> s.str.cat(t, join='left', na_rep='-')
  1766. 0 aa
  1767. 1 b-
  1768. 2 -c
  1769. 3 dd
  1770. dtype: object
  1771. >>>
  1772. >>> s.str.cat(t, join='outer', na_rep='-')
  1773. 0 aa
  1774. 1 b-
  1775. 2 -c
  1776. 3 dd
  1777. 4 -e
  1778. dtype: object
  1779. >>>
  1780. >>> s.str.cat(t, join='inner', na_rep='-')
  1781. 0 aa
  1782. 2 -c
  1783. 3 dd
  1784. dtype: object
  1785. >>>
  1786. >>> s.str.cat(t, join='right', na_rep='-')
  1787. 3 dd
  1788. 0 aa
  1789. 4 -e
  1790. 2 -c
  1791. dtype: object
  1792. For more examples, see :ref:`here <text.concatenate>`.
  1793. """
  1794. from pandas import Index, Series, concat
  1795. if isinstance(others, compat.string_types):
  1796. raise ValueError("Did you mean to supply a `sep` keyword?")
  1797. if sep is None:
  1798. sep = ''
  1799. if isinstance(self._orig, Index):
  1800. data = Series(self._orig, index=self._orig)
  1801. else: # Series
  1802. data = self._orig
  1803. # concatenate Series/Index with itself if no "others"
  1804. if others is None:
  1805. data = ensure_object(data)
  1806. na_mask = isna(data)
  1807. if na_rep is None and na_mask.any():
  1808. data = data[~na_mask]
  1809. elif na_rep is not None and na_mask.any():
  1810. data = np.where(na_mask, na_rep, data)
  1811. return sep.join(data)
  1812. try:
  1813. # turn anything in "others" into lists of Series
  1814. others, warn = self._get_series_list(others,
  1815. ignore_index=(join is None))
  1816. except ValueError: # do not catch TypeError raised by _get_series_list
  1817. if join is None:
  1818. raise ValueError('All arrays must be same length, except '
  1819. 'those having an index if `join` is not None')
  1820. else:
  1821. raise ValueError('If `others` contains arrays or lists (or '
  1822. 'other list-likes without an index), these '
  1823. 'must all be of the same length as the '
  1824. 'calling Series/Index.')
  1825. if join is None and warn:
  1826. warnings.warn("A future version of pandas will perform index "
  1827. "alignment when `others` is a Series/Index/"
  1828. "DataFrame (or a list-like containing one). To "
  1829. "disable alignment (the behavior before v.0.23) and "
  1830. "silence this warning, use `.values` on any Series/"
  1831. "Index/DataFrame in `others`. To enable alignment "
  1832. "and silence this warning, pass `join='left'|"
  1833. "'outer'|'inner'|'right'`. The future default will "
  1834. "be `join='left'`.", FutureWarning, stacklevel=2)
  1835. # if join is None, _get_series_list already force-aligned indexes
  1836. join = 'left' if join is None else join
  1837. # align if required
  1838. if any(not data.index.equals(x.index) for x in others):
  1839. # Need to add keys for uniqueness in case of duplicate columns
  1840. others = concat(others, axis=1,
  1841. join=(join if join == 'inner' else 'outer'),
  1842. keys=range(len(others)), sort=False, copy=False)
  1843. data, others = data.align(others, join=join)
  1844. others = [others[x] for x in others] # again list of Series
  1845. all_cols = [ensure_object(x) for x in [data] + others]
  1846. na_masks = np.array([isna(x) for x in all_cols])
  1847. union_mask = np.logical_or.reduce(na_masks, axis=0)
  1848. if na_rep is None and union_mask.any():
  1849. # no na_rep means NaNs for all rows where any column has a NaN
  1850. # only necessary if there are actually any NaNs
  1851. result = np.empty(len(data), dtype=object)
  1852. np.putmask(result, union_mask, np.nan)
  1853. not_masked = ~union_mask
  1854. result[not_masked] = cat_core([x[not_masked] for x in all_cols],
  1855. sep)
  1856. elif na_rep is not None and union_mask.any():
  1857. # fill NaNs with na_rep in case there are actually any NaNs
  1858. all_cols = [np.where(nm, na_rep, col)
  1859. for nm, col in zip(na_masks, all_cols)]
  1860. result = cat_core(all_cols, sep)
  1861. else:
  1862. # no NaNs - can just concatenate
  1863. result = cat_core(all_cols, sep)
  1864. if isinstance(self._orig, Index):
  1865. # add dtype for case that result is all-NA
  1866. result = Index(result, dtype=object, name=self._orig.name)
  1867. else: # Series
  1868. result = Series(result, dtype=object, index=data.index,
  1869. name=self._orig.name)
  1870. return result
  1871. _shared_docs['str_split'] = ("""
  1872. Split strings around given separator/delimiter.
  1873. Splits the string in the Series/Index from the %(side)s,
  1874. at the specified delimiter string. Equivalent to :meth:`str.%(method)s`.
  1875. Parameters
  1876. ----------
  1877. pat : str, optional
  1878. String or regular expression to split on.
  1879. If not specified, split on whitespace.
  1880. n : int, default -1 (all)
  1881. Limit number of splits in output.
  1882. ``None``, 0 and -1 will be interpreted as return all splits.
  1883. expand : bool, default False
Expand the split strings into separate columns.
  1885. * If ``True``, return DataFrame/MultiIndex expanding dimensionality.
  1886. * If ``False``, return Series/Index, containing lists of strings.
  1887. Returns
  1888. -------
  1889. Series, Index, DataFrame or MultiIndex
  1890. Type matches caller unless ``expand=True`` (see Notes).
  1891. See Also
  1892. --------
  1893. Series.str.split : Split strings around given separator/delimiter.
Series.str.rsplit : Split strings around given separator/delimiter,
  1895. starting from the right.
  1896. Series.str.join : Join lists contained as elements in the Series/Index
  1897. with passed delimiter.
  1898. str.split : Standard library version for split.
  1899. str.rsplit : Standard library version for rsplit.
  1900. Notes
  1901. -----
  1902. The handling of the `n` keyword depends on the number of found splits:
  1903. - If found splits > `n`, make first `n` splits only
  1904. - If found splits <= `n`, make all splits
  1905. - If for a certain row the number of found splits < `n`,
  1906. append `None` for padding up to `n` if ``expand=True``
  1907. If using ``expand=True``, Series and Index callers return DataFrame and
  1908. MultiIndex objects, respectively.
  1909. Examples
  1910. --------
>>> s = pd.Series(["this is a regular sentence",
...                "https://docs.python.org/3/tutorial/index.html", np.nan])
  1913. In the default setting, the string is split by whitespace.
  1914. >>> s.str.split()
  1915. 0 [this, is, a, regular, sentence]
  1916. 1 [https://docs.python.org/3/tutorial/index.html]
  1917. 2 NaN
  1918. dtype: object
  1919. Without the `n` parameter, the outputs of `rsplit` and `split`
  1920. are identical.
  1921. >>> s.str.rsplit()
  1922. 0 [this, is, a, regular, sentence]
  1923. 1 [https://docs.python.org/3/tutorial/index.html]
  1924. 2 NaN
  1925. dtype: object
  1926. The `n` parameter can be used to limit the number of splits on the
  1927. delimiter. The outputs of `split` and `rsplit` are different.
  1928. >>> s.str.split(n=2)
  1929. 0 [this, is, a regular sentence]
  1930. 1 [https://docs.python.org/3/tutorial/index.html]
  1931. 2 NaN
  1932. dtype: object
  1933. >>> s.str.rsplit(n=2)
  1934. 0 [this is a, regular, sentence]
  1935. 1 [https://docs.python.org/3/tutorial/index.html]
  1936. 2 NaN
  1937. dtype: object
  1938. The `pat` parameter can be used to split by other characters.
  1939. >>> s.str.split(pat = "/")
  1940. 0 [this is a regular sentence]
  1941. 1 [https:, , docs.python.org, 3, tutorial, index...
  1942. 2 NaN
  1943. dtype: object
  1944. When using ``expand=True``, the split elements will expand out into
  1945. separate columns. If NaN is present, it is propagated throughout
  1946. the columns during the split.
>>> s.str.split(expand=True)
                                               0     1     2        3         4
0                                           this    is     a  regular  sentence
1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
2                                            NaN   NaN   NaN      NaN       NaN
  1956. For slightly more complex use cases like splitting the html document name
  1957. from a url, a combination of parameter settings can be used.
>>> s.str.rsplit("/", n=1, expand=True)
                                    0           1
0          this is a regular sentence        None
1  https://docs.python.org/3/tutorial  index.html
2                                 NaN         NaN
  1963. """)
  1964. @Appender(_shared_docs['str_split'] % {
  1965. 'side': 'beginning',
  1966. 'method': 'split'})
  1967. def split(self, pat=None, n=-1, expand=False):
  1968. result = str_split(self._parent, pat, n=n)
  1969. return self._wrap_result(result, expand=expand)
  1970. @Appender(_shared_docs['str_split'] % {
  1971. 'side': 'end',
  1972. 'method': 'rsplit'})
  1973. def rsplit(self, pat=None, n=-1, expand=False):
  1974. result = str_rsplit(self._parent, pat, n=n)
  1975. return self._wrap_result(result, expand=expand)
  1976. _shared_docs['str_partition'] = ("""
  1977. Split the string at the %(side)s occurrence of `sep`.
  1978. This method splits the string at the %(side)s occurrence of `sep`,
  1979. and returns 3 elements containing the part before the separator,
  1980. the separator itself, and the part after the separator.
  1981. If the separator is not found, return %(return)s.
  1982. Parameters
  1983. ----------
  1984. sep : str, default whitespace
  1985. String to split on.
  1986. pat : str, default whitespace
  1987. .. deprecated:: 0.24.0
  1988. Use ``sep`` instead
  1989. expand : bool, default True
  1990. If True, return DataFrame/MultiIndex expanding dimensionality.
  1991. If False, return Series/Index.
  1992. Returns
  1993. -------
  1994. DataFrame/MultiIndex or Series/Index of objects
  1995. See Also
  1996. --------
  1997. %(also)s
  1998. Series.str.split : Split strings around given separators.
  1999. str.partition : Standard library version.
  2000. Examples
  2001. --------
  2002. >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
  2003. >>> s
  2004. 0 Linda van der Berg
  2005. 1 George Pitt-Rivers
  2006. dtype: object
  2007. >>> s.str.partition()
        0  1             2
0   Linda     van der Berg
1  George      Pitt-Rivers
  2011. To partition by the last space instead of the first one:
  2012. >>> s.str.rpartition()
               0  1            2
0  Linda van der            Berg
1         George     Pitt-Rivers
  2016. To partition by something different than a space:
  2017. >>> s.str.partition('-')
                    0  1       2
0  Linda van der Berg
1         George Pitt  -  Rivers
To return a Series containing tuples instead of a DataFrame:
  2022. >>> s.str.partition('-', expand=False)
  2023. 0 (Linda van der Berg, , )
  2024. 1 (George Pitt, -, Rivers)
  2025. dtype: object
  2026. Also available on indices:
  2027. >>> idx = pd.Index(['X 123', 'Y 999'])
  2028. >>> idx
  2029. Index(['X 123', 'Y 999'], dtype='object')
  2030. Which will create a MultiIndex:
  2031. >>> idx.str.partition()
  2032. MultiIndex(levels=[['X', 'Y'], [' '], ['123', '999']],
  2033. codes=[[0, 1], [0, 0], [0, 1]])
  2034. Or an index with tuples with ``expand=False``:
  2035. >>> idx.str.partition(expand=False)
  2036. Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
  2037. """)
  2038. @Appender(_shared_docs['str_partition'] % {
  2039. 'side': 'first',
  2040. 'return': '3 elements containing the string itself, followed by two '
  2041. 'empty strings',
  2042. 'also': 'rpartition : Split the string at the last occurrence of '
  2043. '`sep`.'
  2044. })
  2045. @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
  2046. def partition(self, sep=' ', expand=True):
  2047. f = lambda x: x.partition(sep)
  2048. result = _na_map(f, self._parent)
  2049. return self._wrap_result(result, expand=expand)
  2050. @Appender(_shared_docs['str_partition'] % {
  2051. 'side': 'last',
  2052. 'return': '3 elements containing two empty strings, followed by the '
  2053. 'string itself',
  2054. 'also': 'partition : Split the string at the first occurrence of '
  2055. '`sep`.'
  2056. })
  2057. @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
  2058. def rpartition(self, sep=' ', expand=True):
  2059. f = lambda x: x.rpartition(sep)
  2060. result = _na_map(f, self._parent)
  2061. return self._wrap_result(result, expand=expand)
  2062. @copy(str_get)
  2063. def get(self, i):
  2064. result = str_get(self._parent, i)
  2065. return self._wrap_result(result)
  2066. @copy(str_join)
  2067. def join(self, sep):
  2068. result = str_join(self._parent, sep)
  2069. return self._wrap_result(result)
  2070. @copy(str_contains)
  2071. def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
  2072. result = str_contains(self._parent, pat, case=case, flags=flags, na=na,
  2073. regex=regex)
  2074. return self._wrap_result(result, fill_value=na)
  2075. @copy(str_match)
  2076. def match(self, pat, case=True, flags=0, na=np.nan):
  2077. result = str_match(self._parent, pat, case=case, flags=flags, na=na)
  2078. return self._wrap_result(result, fill_value=na)
  2079. @copy(str_replace)
  2080. def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
  2081. result = str_replace(self._parent, pat, repl, n=n, case=case,
  2082. flags=flags, regex=regex)
  2083. return self._wrap_result(result)
  2084. @copy(str_repeat)
  2085. def repeat(self, repeats):
  2086. result = str_repeat(self._parent, repeats)
  2087. return self._wrap_result(result)
  2088. @copy(str_pad)
  2089. def pad(self, width, side='left', fillchar=' '):
  2090. result = str_pad(self._parent, width, side=side, fillchar=fillchar)
  2091. return self._wrap_result(result)
  2092. _shared_docs['str_pad'] = ("""
  2093. Filling %(side)s side of strings in the Series/Index with an
  2094. additional character. Equivalent to :meth:`str.%(method)s`.
  2095. Parameters
  2096. ----------
  2097. width : int
  2098. Minimum width of resulting string; additional characters will be filled
  2099. with ``fillchar``
  2100. fillchar : str
  2101. Additional character for filling, default is whitespace
  2102. Returns
  2103. -------
  2104. filled : Series/Index of objects
  2105. """)
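# Illustrative note (not from the original source): a quick sketch of the
# padding helpers documented above, assuming a plain object-dtype Series.
#
#   >>> s = pd.Series(['caribou'])
#   >>> s.str.center(11, fillchar='-')
#   0    --caribou--
#   dtype: object
#   >>> s.str.ljust(9, fillchar='.')
#   0    caribou..
#   dtype: object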
  2106. @Appender(_shared_docs['str_pad'] % dict(side='left and right',
  2107. method='center'))
  2108. def center(self, width, fillchar=' '):
  2109. return self.pad(width, side='both', fillchar=fillchar)
  2110. @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust'))
  2111. def ljust(self, width, fillchar=' '):
  2112. return self.pad(width, side='right', fillchar=fillchar)
  2113. @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust'))
  2114. def rjust(self, width, fillchar=' '):
  2115. return self.pad(width, side='left', fillchar=fillchar)
  2116. def zfill(self, width):
  2117. """
  2118. Pad strings in the Series/Index by prepending '0' characters.
  2119. Strings in the Series/Index are padded with '0' characters on the
  2120. left of the string to reach a total string length `width`. Strings
  2121. in the Series/Index with length greater or equal to `width` are
  2122. unchanged.
  2123. Parameters
  2124. ----------
  2125. width : int
Minimum length of resulting string; strings with length less
than `width` will be prepended with '0' characters.
  2128. Returns
  2129. -------
  2130. Series/Index of objects
  2131. See Also
  2132. --------
  2133. Series.str.rjust : Fills the left side of strings with an arbitrary
  2134. character.
  2135. Series.str.ljust : Fills the right side of strings with an arbitrary
  2136. character.
  2137. Series.str.pad : Fills the specified sides of strings with an arbitrary
  2138. character.
Series.str.center : Fills both sides of strings with an arbitrary
  2140. character.
  2141. Notes
  2142. -----
  2143. Differs from :meth:`str.zfill` which has special handling
  2144. for '+'/'-' in the string.
  2145. Examples
  2146. --------
  2147. >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
  2148. >>> s
  2149. 0 -1
  2150. 1 1
  2151. 2 1000
  2152. 3 10
  2153. 4 NaN
  2154. dtype: object
  2155. Note that ``10`` and ``NaN`` are not strings, therefore they are
  2156. converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
regular character and the zero is added to the left of it
(:meth:`str.zfill` would instead keep the sign leftmost and pad after
it, giving ``'-01'``). ``1000`` remains unchanged as it is longer
than `width`.
  2160. >>> s.str.zfill(3)
  2161. 0 0-1
  2162. 1 001
  2163. 2 1000
  2164. 3 NaN
  2165. 4 NaN
  2166. dtype: object
  2167. """
  2168. result = str_pad(self._parent, width, side='left', fillchar='0')
  2169. return self._wrap_result(result)
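# Illustrative note (not from the original source): unlike the built-in, which
# keeps a leading sign in place ('-1'.zfill(3) == '-01'), the call above is a
# plain left-pad with '0', so '-1' becomes '0-1' as shown in the docstring.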
  2170. @copy(str_slice)
  2171. def slice(self, start=None, stop=None, step=None):
  2172. result = str_slice(self._parent, start, stop, step)
  2173. return self._wrap_result(result)
  2174. @copy(str_slice_replace)
  2175. def slice_replace(self, start=None, stop=None, repl=None):
  2176. result = str_slice_replace(self._parent, start, stop, repl)
  2177. return self._wrap_result(result)
  2178. @copy(str_decode)
  2179. def decode(self, encoding, errors="strict"):
  2180. result = str_decode(self._parent, encoding, errors)
  2181. return self._wrap_result(result)
  2182. @copy(str_encode)
  2183. def encode(self, encoding, errors="strict"):
  2184. result = str_encode(self._parent, encoding, errors)
  2185. return self._wrap_result(result)
  2186. _shared_docs['str_strip'] = (r"""
  2187. Remove leading and trailing characters.
Strip whitespace (including newlines) or a set of specified characters
  2189. from each string in the Series/Index from %(side)s.
  2190. Equivalent to :meth:`str.%(method)s`.
  2191. Parameters
  2192. ----------
  2193. to_strip : str or None, default None
  2194. Specifying the set of characters to be removed.
  2195. All combinations of this set of characters will be stripped.
If None then whitespace is removed.
  2197. Returns
  2198. -------
  2199. Series/Index of objects
  2200. See Also
  2201. --------
  2202. Series.str.strip : Remove leading and trailing characters in Series/Index.
  2203. Series.str.lstrip : Remove leading characters in Series/Index.
  2204. Series.str.rstrip : Remove trailing characters in Series/Index.
  2205. Examples
  2206. --------
  2207. >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan])
  2208. >>> s
  2209. 0 1. Ant.
  2210. 1 2. Bee!\n
  2211. 2 3. Cat?\t
  2212. 3 NaN
  2213. dtype: object
  2214. >>> s.str.strip()
  2215. 0 1. Ant.
  2216. 1 2. Bee!
  2217. 2 3. Cat?
  2218. 3 NaN
  2219. dtype: object
  2220. >>> s.str.lstrip('123.')
  2221. 0 Ant.
  2222. 1 Bee!\n
  2223. 2 Cat?\t
  2224. 3 NaN
  2225. dtype: object
  2226. >>> s.str.rstrip('.!? \n\t')
  2227. 0 1. Ant
  2228. 1 2. Bee
  2229. 2 3. Cat
  2230. 3 NaN
  2231. dtype: object
  2232. >>> s.str.strip('123.!? \n\t')
  2233. 0 Ant
  2234. 1 Bee
  2235. 2 Cat
  2236. 3 NaN
  2237. dtype: object
  2238. """)
  2239. @Appender(_shared_docs['str_strip'] % dict(side='left and right sides',
  2240. method='strip'))
  2241. def strip(self, to_strip=None):
  2242. result = str_strip(self._parent, to_strip, side='both')
  2243. return self._wrap_result(result)
  2244. @Appender(_shared_docs['str_strip'] % dict(side='left side',
  2245. method='lstrip'))
  2246. def lstrip(self, to_strip=None):
  2247. result = str_strip(self._parent, to_strip, side='left')
  2248. return self._wrap_result(result)
  2249. @Appender(_shared_docs['str_strip'] % dict(side='right side',
  2250. method='rstrip'))
  2251. def rstrip(self, to_strip=None):
  2252. result = str_strip(self._parent, to_strip, side='right')
  2253. return self._wrap_result(result)
  2254. @copy(str_wrap)
  2255. def wrap(self, width, **kwargs):
  2256. result = str_wrap(self._parent, width, **kwargs)
  2257. return self._wrap_result(result)
  2258. @copy(str_get_dummies)
  2259. def get_dummies(self, sep='|'):
  2260. # we need to cast to Series of strings as only that has all
  2261. # methods available for making the dummies...
  2262. data = self._orig.astype(str) if self._is_categorical else self._parent
  2263. result, name = str_get_dummies(data, sep)
  2264. return self._wrap_result(result, use_codes=(not self._is_categorical),
  2265. name=name, expand=True)
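# Illustrative note (not from the original source): str_get_dummies splits each
# string on `sep` and returns one indicator column per distinct token, e.g.
#
#   >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
#      a  b  c
#   0  1  1  0
#   1  1  0  0
#   2  1  0  1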
  2266. @copy(str_translate)
  2267. def translate(self, table, deletechars=None):
  2268. result = str_translate(self._parent, table, deletechars)
  2269. return self._wrap_result(result)
  2270. count = _pat_wrapper(str_count, flags=True)
  2271. startswith = _pat_wrapper(str_startswith, na=True)
  2272. endswith = _pat_wrapper(str_endswith, na=True)
  2273. findall = _pat_wrapper(str_findall, flags=True)
  2274. @copy(str_extract)
  2275. def extract(self, pat, flags=0, expand=True):
  2276. return str_extract(self, pat, flags=flags, expand=expand)
  2277. @copy(str_extractall)
  2278. def extractall(self, pat, flags=0):
  2279. return str_extractall(self._orig, pat, flags=flags)
  2280. _shared_docs['find'] = ("""
Return %(side)s indexes in each string in the Series/Index
  2282. where the substring is fully contained between [start:end].
  2283. Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`.
  2284. Parameters
  2285. ----------
  2286. sub : str
  2287. Substring being searched
  2288. start : int
  2289. Left edge index
  2290. end : int
  2291. Right edge index
  2292. Returns
  2293. -------
  2294. found : Series/Index of integer values
  2295. See Also
  2296. --------
  2297. %(also)s
  2298. """)
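# Illustrative note (not from the original source): find and rfind only differ
# in which match of `sub` is reported, e.g.
#
#   >>> s = pd.Series(['hayneedlehay'])
#   >>> s.str.find('hay')
#   0    0
#   dtype: int64
#   >>> s.str.rfind('hay')
#   0    9
#   dtype: int64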
  2299. @Appender(_shared_docs['find'] %
  2300. dict(side='lowest', method='find',
also='rfind : Return highest indexes in each string.'))
  2302. def find(self, sub, start=0, end=None):
  2303. result = str_find(self._parent, sub, start=start, end=end, side='left')
  2304. return self._wrap_result(result)
  2305. @Appender(_shared_docs['find'] %
  2306. dict(side='highest', method='rfind',
also='find : Return lowest indexes in each string.'))
  2308. def rfind(self, sub, start=0, end=None):
  2309. result = str_find(self._parent, sub,
  2310. start=start, end=end, side='right')
  2311. return self._wrap_result(result)
  2312. def normalize(self, form):
  2313. """
  2314. Return the Unicode normal form for the strings in the Series/Index.
For more information on the forms, see
:func:`unicodedata.normalize`.
  2317. Parameters
  2318. ----------
  2319. form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
  2320. Unicode form
  2321. Returns
  2322. -------
  2323. normalized : Series/Index of objects
  2324. """
  2325. import unicodedata
  2326. f = lambda x: unicodedata.normalize(form, compat.u_safe(x))
  2327. result = _na_map(f, self._parent)
  2328. return self._wrap_result(result)
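# Illustrative note (not from the original source): the heavy lifting is done
# by unicodedata.normalize, e.g. compatibility normalization folds ligatures:
#
#   >>> import unicodedata
#   >>> unicodedata.normalize('NFKC', '\ufb01') == 'fi'
#   True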
  2329. _shared_docs['index'] = ("""
Return %(side)s indexes in each string where the substring is
  2331. fully contained between [start:end]. This is the same as
  2332. ``str.%(similar)s`` except instead of returning -1, it raises a ValueError
  2333. when the substring is not found. Equivalent to standard ``str.%(method)s``.
  2334. Parameters
  2335. ----------
  2336. sub : str
  2337. Substring being searched
  2338. start : int
  2339. Left edge index
  2340. end : int
  2341. Right edge index
  2342. Returns
  2343. -------
  2344. found : Series/Index of objects
  2345. See Also
  2346. --------
  2347. %(also)s
  2348. """)
  2349. @Appender(_shared_docs['index'] %
  2350. dict(side='lowest', similar='find', method='index',
also='rindex : Return highest indexes in each string.'))
  2352. def index(self, sub, start=0, end=None):
  2353. result = str_index(self._parent, sub,
  2354. start=start, end=end, side='left')
  2355. return self._wrap_result(result)
  2356. @Appender(_shared_docs['index'] %
  2357. dict(side='highest', similar='rfind', method='rindex',
also='index : Return lowest indexes in each string.'))
  2359. def rindex(self, sub, start=0, end=None):
  2360. result = str_index(self._parent, sub,
  2361. start=start, end=end, side='right')
  2362. return self._wrap_result(result)
  2363. _shared_docs['len'] = ("""
  2364. Computes the length of each element in the Series/Index. The element may be
  2365. a sequence (such as a string, tuple or list) or a collection
  2366. (such as a dictionary).
  2367. Returns
  2368. -------
  2369. Series or Index of int
  2370. A Series or Index of integer values indicating the length of each
  2371. element in the Series or Index.
  2372. See Also
  2373. --------
len : Python built-in function returning the length of an object.
  2375. Series.size : Returns the length of the Series.
  2376. Examples
  2377. --------
  2378. Returns the length (number of characters) in a string. Returns the
  2379. number of entries for dictionaries, lists or tuples.
  2380. >>> s = pd.Series(['dog',
  2381. ... '',
  2382. ... 5,
  2383. ... {'foo' : 'bar'},
  2384. ... [2, 3, 5, 7],
  2385. ... ('one', 'two', 'three')])
  2386. >>> s
  2387. 0 dog
  2388. 1
  2389. 2 5
  2390. 3 {'foo': 'bar'}
  2391. 4 [2, 3, 5, 7]
  2392. 5 (one, two, three)
  2393. dtype: object
  2394. >>> s.str.len()
  2395. 0 3.0
  2396. 1 0.0
  2397. 2 NaN
  2398. 3 1.0
  2399. 4 4.0
  2400. 5 3.0
  2401. dtype: float64
  2402. """)
  2403. len = _noarg_wrapper(len, docstring=_shared_docs['len'], dtype=int)
  2404. _shared_docs['casemethods'] = ("""
  2405. Convert strings in the Series/Index to %(type)s.
  2406. Equivalent to :meth:`str.%(method)s`.
  2407. Returns
  2408. -------
  2409. Series/Index of objects
  2410. See Also
  2411. --------
  2412. Series.str.lower : Converts all characters to lowercase.
  2413. Series.str.upper : Converts all characters to uppercase.
  2414. Series.str.title : Converts first character of each word to uppercase and
  2415. remaining to lowercase.
  2416. Series.str.capitalize : Converts first character to uppercase and
  2417. remaining to lowercase.
  2418. Series.str.swapcase : Converts uppercase to lowercase and lowercase to
  2419. uppercase.
  2420. Examples
  2421. --------
  2422. >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
  2423. >>> s
  2424. 0 lower
  2425. 1 CAPITALS
  2426. 2 this is a sentence
  2427. 3 SwApCaSe
  2428. dtype: object
  2429. >>> s.str.lower()
  2430. 0 lower
  2431. 1 capitals
  2432. 2 this is a sentence
  2433. 3 swapcase
  2434. dtype: object
  2435. >>> s.str.upper()
  2436. 0 LOWER
  2437. 1 CAPITALS
  2438. 2 THIS IS A SENTENCE
  2439. 3 SWAPCASE
  2440. dtype: object
  2441. >>> s.str.title()
  2442. 0 Lower
  2443. 1 Capitals
  2444. 2 This Is A Sentence
  2445. 3 Swapcase
  2446. dtype: object
  2447. >>> s.str.capitalize()
  2448. 0 Lower
  2449. 1 Capitals
  2450. 2 This is a sentence
  2451. 3 Swapcase
  2452. dtype: object
  2453. >>> s.str.swapcase()
  2454. 0 LOWER
  2455. 1 capitals
  2456. 2 THIS IS A SENTENCE
  2457. 3 sWaPcAsE
  2458. dtype: object
  2459. """)
  2460. _shared_docs['lower'] = dict(type='lowercase', method='lower')
  2461. _shared_docs['upper'] = dict(type='uppercase', method='upper')
  2462. _shared_docs['title'] = dict(type='titlecase', method='title')
  2463. _shared_docs['capitalize'] = dict(type='be capitalized',
  2464. method='capitalize')
  2465. _shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase')
  2466. lower = _noarg_wrapper(lambda x: x.lower(),
  2467. docstring=_shared_docs['casemethods'] %
  2468. _shared_docs['lower'])
  2469. upper = _noarg_wrapper(lambda x: x.upper(),
  2470. docstring=_shared_docs['casemethods'] %
  2471. _shared_docs['upper'])
  2472. title = _noarg_wrapper(lambda x: x.title(),
  2473. docstring=_shared_docs['casemethods'] %
  2474. _shared_docs['title'])
  2475. capitalize = _noarg_wrapper(lambda x: x.capitalize(),
  2476. docstring=_shared_docs['casemethods'] %
  2477. _shared_docs['capitalize'])
  2478. swapcase = _noarg_wrapper(lambda x: x.swapcase(),
  2479. docstring=_shared_docs['casemethods'] %
  2480. _shared_docs['swapcase'])
  2481. _shared_docs['ismethods'] = ("""
  2482. Check whether all characters in each string are %(type)s.
  2483. This is equivalent to running the Python string method
  2484. :meth:`str.%(method)s` for each element of the Series/Index. If a string
  2485. has zero characters, ``False`` is returned for that check.
  2486. Returns
  2487. -------
  2488. Series or Index of bool
  2489. Series or Index of boolean values with the same length as the original
  2490. Series/Index.
  2491. See Also
  2492. --------
  2493. Series.str.isalpha : Check whether all characters are alphabetic.
  2494. Series.str.isnumeric : Check whether all characters are numeric.
  2495. Series.str.isalnum : Check whether all characters are alphanumeric.
  2496. Series.str.isdigit : Check whether all characters are digits.
  2497. Series.str.isdecimal : Check whether all characters are decimal.
  2498. Series.str.isspace : Check whether all characters are whitespace.
  2499. Series.str.islower : Check whether all characters are lowercase.
  2500. Series.str.isupper : Check whether all characters are uppercase.
  2501. Series.str.istitle : Check whether all characters are titlecase.
  2502. Examples
  2503. --------
  2504. **Checks for Alphabetic and Numeric Characters**
  2505. >>> s1 = pd.Series(['one', 'one1', '1', ''])
  2506. >>> s1.str.isalpha()
  2507. 0 True
  2508. 1 False
  2509. 2 False
  2510. 3 False
  2511. dtype: bool
  2512. >>> s1.str.isnumeric()
  2513. 0 False
  2514. 1 False
  2515. 2 True
  2516. 3 False
  2517. dtype: bool
  2518. >>> s1.str.isalnum()
  2519. 0 True
  2520. 1 True
  2521. 2 True
  2522. 3 False
  2523. dtype: bool
Note that a string containing letters or digits mixed with any additional
punctuation or whitespace will evaluate to ``False`` for an alphanumeric check.
  2526. >>> s2 = pd.Series(['A B', '1.5', '3,000'])
  2527. >>> s2.str.isalnum()
  2528. 0 False
  2529. 1 False
  2530. 2 False
  2531. dtype: bool
  2532. **More Detailed Checks for Numeric Characters**
  2533. There are several different but overlapping sets of numeric characters that
  2534. can be checked for.
  2535. >>> s3 = pd.Series(['23', '³', '⅕', ''])
  2536. The ``s3.str.isdecimal`` method checks for characters used to form numbers
  2537. in base 10.
  2538. >>> s3.str.isdecimal()
  2539. 0 True
  2540. 1 False
  2541. 2 False
  2542. 3 False
  2543. dtype: bool
The ``s3.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
includes special digits, like superscripted and subscripted digits in
Unicode.
  2547. >>> s3.str.isdigit()
  2548. 0 True
  2549. 1 True
  2550. 2 False
  2551. 3 False
  2552. dtype: bool
The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
includes other characters that can represent quantities such as Unicode
fractions.
  2556. >>> s3.str.isnumeric()
  2557. 0 True
  2558. 1 True
  2559. 2 True
  2560. 3 False
  2561. dtype: bool
  2562. **Checks for Whitespace**
  2563. >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
  2564. >>> s4.str.isspace()
  2565. 0 True
  2566. 1 True
  2567. 2 False
  2568. dtype: bool
  2569. **Checks for Character Case**
  2570. >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
  2571. >>> s5.str.islower()
  2572. 0 True
  2573. 1 False
  2574. 2 False
  2575. 3 False
  2576. dtype: bool
  2577. >>> s5.str.isupper()
  2578. 0 False
  2579. 1 False
  2580. 2 True
  2581. 3 False
  2582. dtype: bool
The ``s5.str.istitle`` method checks whether all words are in title
case (only the first letter of each word is capitalized). Words are
assumed to be any sequence of non-numeric characters separated by
whitespace characters.
  2587. >>> s5.str.istitle()
  2588. 0 False
  2589. 1 True
  2590. 2 False
  2591. 3 False
  2592. dtype: bool
  2593. """)
  2594. _shared_docs['isalnum'] = dict(type='alphanumeric', method='isalnum')
  2595. _shared_docs['isalpha'] = dict(type='alphabetic', method='isalpha')
  2596. _shared_docs['isdigit'] = dict(type='digits', method='isdigit')
  2597. _shared_docs['isspace'] = dict(type='whitespace', method='isspace')
  2598. _shared_docs['islower'] = dict(type='lowercase', method='islower')
  2599. _shared_docs['isupper'] = dict(type='uppercase', method='isupper')
  2600. _shared_docs['istitle'] = dict(type='titlecase', method='istitle')
  2601. _shared_docs['isnumeric'] = dict(type='numeric', method='isnumeric')
  2602. _shared_docs['isdecimal'] = dict(type='decimal', method='isdecimal')
  2603. isalnum = _noarg_wrapper(lambda x: x.isalnum(),
  2604. docstring=_shared_docs['ismethods'] %
  2605. _shared_docs['isalnum'])
  2606. isalpha = _noarg_wrapper(lambda x: x.isalpha(),
  2607. docstring=_shared_docs['ismethods'] %
  2608. _shared_docs['isalpha'])
  2609. isdigit = _noarg_wrapper(lambda x: x.isdigit(),
  2610. docstring=_shared_docs['ismethods'] %
  2611. _shared_docs['isdigit'])
  2612. isspace = _noarg_wrapper(lambda x: x.isspace(),
  2613. docstring=_shared_docs['ismethods'] %
  2614. _shared_docs['isspace'])
  2615. islower = _noarg_wrapper(lambda x: x.islower(),
  2616. docstring=_shared_docs['ismethods'] %
  2617. _shared_docs['islower'])
  2618. isupper = _noarg_wrapper(lambda x: x.isupper(),
  2619. docstring=_shared_docs['ismethods'] %
  2620. _shared_docs['isupper'])
  2621. istitle = _noarg_wrapper(lambda x: x.istitle(),
  2622. docstring=_shared_docs['ismethods'] %
  2623. _shared_docs['istitle'])
  2624. isnumeric = _noarg_wrapper(lambda x: compat.u_safe(x).isnumeric(),
  2625. docstring=_shared_docs['ismethods'] %
  2626. _shared_docs['isnumeric'])
  2627. isdecimal = _noarg_wrapper(lambda x: compat.u_safe(x).isdecimal(),
  2628. docstring=_shared_docs['ismethods'] %
  2629. _shared_docs['isdecimal'])
  2630. @classmethod
  2631. def _make_accessor(cls, data):
  2632. cls._validate(data)
  2633. return cls(data)