from datetime import ( datetime, timedelta, ) import numpy as np import pytest from pandas.compat import ( pa_version_under2p0, pa_version_under4p0, ) from pandas.errors import PerformanceWarning from pandas import ( DataFrame, Index, MultiIndex, Series, isna, ) import pandas._testing as tm @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) def test_startswith_endswith_non_str_patterns(pattern): # GH3485 ser = Series(["foo", "bar"]) msg = f"expected a string or tuple, not {type(pattern).__name__}" with pytest.raises(TypeError, match=msg): ser.str.startswith(pattern) with pytest.raises(TypeError, match=msg): ser.str.endswith(pattern) def assert_series_or_index_equal(left, right): if isinstance(left, Series): tm.assert_series_equal(left, right) else: # Index tm.assert_index_equal(left, right) def test_iter(): # GH3638 strs = "google", "wikimedia", "wikipedia", "wikitravel" ser = Series(strs) with tm.assert_produces_warning(FutureWarning): for s in ser.str: # iter must yield a Series assert isinstance(s, Series) # indices of each yielded Series should be equal to the index of # the original Series tm.assert_index_equal(s.index, ser.index) for el in s: # each element of the series is either a basestring/str or nan assert isinstance(el, str) or isna(el) # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in # this case since 'wikitravel' is the longest string assert s.dropna().values.item() == "l" def test_iter_empty(any_string_dtype): ser = Series([], dtype=any_string_dtype) i, s = 100, 1 with tm.assert_produces_warning(FutureWarning): for i, s in enumerate(ser.str): pass # nothing to iterate over so nothing defined values should remain # unchanged assert i == 100 assert s == 1 def test_iter_single_element(any_string_dtype): ser = Series(["a"], dtype=any_string_dtype) with tm.assert_produces_warning(FutureWarning): for i, s in enumerate(ser.str): pass assert not i tm.assert_series_equal(ser, s) def test_iter_object_try_string(): ser = Series( [ slice(None, np.random.randint(10), np.random.randint(10, 20)) for _ in range(4) ] ) i, s = 100, "h" with tm.assert_produces_warning(FutureWarning): for i, s in enumerate(ser.str): pass assert i == 100 assert s == "h" # test integer/float dtypes (inferred by constructor) and mixed def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_count_mixed_object(): ser = Series( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) result = ser.str.count("a") expected = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) def test_repeat(any_string_dtype): ser = Series(["a", "b", np.nan, "c", np.nan, "d"], dtype=any_string_dtype) result = ser.str.repeat(3) expected = Series( ["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"], dtype=any_string_dtype ) tm.assert_series_equal(result, expected) result = ser.str.repeat([1, 2, 3, 4, 5, 6]) expected = Series( ["a", "bb", np.nan, "cccc", np.nan, "dddddd"], dtype=any_string_dtype ) tm.assert_series_equal(result, expected) def test_repeat_mixed_object(): ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) result = ser.str.repeat(3) expected = Series( ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan] ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("arg, repeat", [[None, 4], ["b", None]]) def test_repeat_with_null(any_string_dtype, arg, repeat): # GH: 31632 ser = Series(["a", arg], dtype=any_string_dtype) result = ser.str.repeat([3, repeat]) expected = Series(["aaa", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) if any_string_dtype == "object": empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: empty_int = Series(dtype="Int64") empty_bool = Series(dtype="boolean") empty_object = Series(dtype=object) empty_bytes = Series(dtype=object) empty_df = DataFrame() # GH7241 # (extract) on empty series tm.assert_series_equal(empty_str, empty.str.cat(empty)) assert "" == empty.str.cat() tm.assert_series_equal(empty_str, empty.str.title()) tm.assert_series_equal(empty_int, empty.str.count("a")) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_bool, empty.str.contains("a")) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_bool, empty.str.startswith("a")) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_bool, empty.str.endswith("a")) tm.assert_series_equal(empty_str, empty.str.lower()) tm.assert_series_equal(empty_str, empty.str.upper()) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_str, empty.str.replace("a", "b")) tm.assert_series_equal(empty_str, empty.str.repeat(3)) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_bool, empty.str.match("^a")) tm.assert_frame_equal( DataFrame(columns=[0], dtype=any_string_dtype), empty.str.extract("()", expand=True), ) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=any_string_dtype), empty.str.extract("()()", expand=True), ) tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False)) tm.assert_frame_equal( DataFrame(columns=[0, 1], dtype=any_string_dtype), empty.str.extract("()()", expand=False), ) tm.assert_frame_equal(empty_df, empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join("")) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_int, empty.str.len()) tm.assert_series_equal(empty_object, empty_str.str.findall("a")) tm.assert_series_equal(empty_int, empty.str.find("a")) tm.assert_series_equal(empty_int, empty.str.rfind("a")) tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) tm.assert_series_equal(empty_object, empty.str.split("a")) tm.assert_series_equal(empty_object, empty.str.rsplit("a")) tm.assert_series_equal(empty_object, empty.str.partition("a", expand=False)) tm.assert_frame_equal(empty_df, empty.str.partition("a")) tm.assert_series_equal(empty_object, empty.str.rpartition("a", expand=False)) tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_str, empty.str.strip()) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_str, empty.str.lstrip()) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) tm.assert_series_equal(empty_bool, empty.str.isalpha()) tm.assert_series_equal(empty_bool, empty.str.isdigit()) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under2p0, ): tm.assert_series_equal(empty_bool, empty.str.isspace()) tm.assert_series_equal(empty_bool, empty.str.islower()) tm.assert_series_equal(empty_bool, empty.str.isupper()) tm.assert_series_equal(empty_bool, empty.str.istitle()) tm.assert_series_equal(empty_bool, empty.str.isnumeric()) tm.assert_series_equal(empty_bool, empty.str.isdecimal()) tm.assert_series_equal(empty_str, empty.str.capitalize()) tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize("NFC")) table = str.maketrans("a", "b") tm.assert_series_equal(empty_str, empty.str.translate(table)) @pytest.mark.parametrize( "method, expected", [ ("isalnum", [True, True, True, True, True, False, True, True, False, False]), ("isalpha", [True, True, True, False, False, False, True, False, False, False]), ( "isdigit", [False, False, False, True, False, False, False, True, False, False], ), ( "isnumeric", [False, False, False, True, False, False, False, True, False, False], ), ( "isspace", [False, False, False, False, False, False, False, False, False, True], ), ( "islower", [False, True, False, False, False, False, False, False, False, False], ), ( "isupper", [True, False, False, False, True, False, True, False, False, False], ), ( "istitle", [True, False, True, False, True, False, False, False, False, False], ), ], ) def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) expected_dtype = "bool" if any_string_dtype == "object" else "boolean" expected = Series(expected, dtype=expected_dtype) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under2p0 and method == "isspace", ): result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) # compare with standard library expected = [getattr(item, method)() for item in ser] assert list(result) == expected @pytest.mark.parametrize( "method, expected", [ ("isnumeric", [False, True, True, False, True, True, False]), ("isdecimal", [False, True, False, False, False, True, False]), ], ) def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER # 0x2605: ★ not number # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 ser = Series(["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype) expected_dtype = "bool" if any_string_dtype == "object" else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) # compare with standard library expected = [getattr(item, method)() for item in ser] assert list(result) == expected @pytest.mark.parametrize( "method, expected", [ ("isnumeric", [False, np.nan, True, False, np.nan, True, False]), ("isdecimal", [False, np.nan, False, False, np.nan, True, False]), ], ) def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] ser = Series(values, dtype=any_string_dtype) expected_dtype = "object" if any_string_dtype == "object" else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) def test_spilt_join_roundtrip(any_string_dtype): ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = ser.str.split("_").str.join("_") expected = ser.astype(object) tm.assert_series_equal(result, expected) def test_spilt_join_roundtrip_mixed_object(): ser = Series( ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] ) result = ser.str.split("_").str.join("_") expected = Series( ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", np.nan, np.nan, np.nan] ) tm.assert_series_equal(result, expected) def test_len(any_string_dtype): ser = Series( ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"], dtype=any_string_dtype, ) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = ser.str.len() expected_dtype = "float64" if any_string_dtype == "object" else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_len_mixed(): ser = Series( ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] ) result = ser.str.len() expected = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "method,sub,start,end,expected", [ ("index", "EF", None, None, [4, 3, 1, 0]), ("rindex", "EF", None, None, [4, 5, 7, 4]), ("index", "EF", 3, None, [4, 3, 7, 4]), ("rindex", "EF", 3, None, [4, 5, 7, 4]), ("index", "E", 4, 8, [4, 5, 7, 4]), ("rindex", "E", 0, 5, [4, 3, 1, 4]), ], ) def test_index(method, sub, start, end, index_or_series, any_string_dtype, expected): obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) if index_or_series is Series: tm.assert_series_equal(result, expected) else: tm.assert_index_equal(result, expected) # compare with standard library expected = [getattr(item, method)(sub, start, end) for item in obj] assert list(result) == expected def test_index_not_found_raises(index_or_series, any_string_dtype): obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) with pytest.raises(ValueError, match="substring not found"): obj.str.index("DE") @pytest.mark.parametrize("method", ["index", "rindex"]) def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): obj = index_or_series([], dtype=any_string_dtype) msg = "expected a string object, not int" with pytest.raises(TypeError, match=msg): getattr(obj.str, method)(0) @pytest.mark.parametrize( "method, exp", [ ["index", [1, 1, 0]], ["rindex", [3, 1, 2]], ], ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_pipe_failures(any_string_dtype): # #2119 ser = Series(["A|B|C"], dtype=any_string_dtype) result = ser.str.split("|") expected = Series([["A", "B", "C"]], dtype=object) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = ser.str.replace("|", " ", regex=False) expected = Series(["A B C"], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "start, stop, step, expected", [ (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], ) def test_slice(start, stop, step, expected, any_string_dtype): ser = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"], dtype=any_string_dtype) result = ser.str.slice(start, stop, step) expected = Series(expected, dtype=any_string_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "start, stop, step, expected", [ (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]), (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]), ], ) def test_slice_mixed_object(start, stop, step, expected): ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]) result = ser.str.slice(start, stop, step) expected = Series(expected) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "start,stop,repl,expected", [ (2, 3, None, ["shrt", "a it longer", "evnlongerthanthat", "", np.nan]), (2, 3, "z", ["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]), (2, 2, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]), (2, 1, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]), (-1, None, "z", ["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]), (None, -2, "z", ["zrt", "zer", "zat", "z", np.nan]), (6, 8, "z", ["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]), (-10, 3, "z", ["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]), ], ) def test_slice_replace(start, stop, repl, expected, any_string_dtype): ser = Series( ["short", "a bit longer", "evenlongerthanthat", "", np.nan], dtype=any_string_dtype, ) expected = Series(expected, dtype=any_string_dtype) result = ser.str.slice_replace(start, stop, repl) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "method, exp", [ ["strip", ["aa", "bb", np.nan, "cc"]], ["lstrip", ["aa ", "bb \n", np.nan, "cc "]], ["rstrip", [" aa", " bb", np.nan, "cc"]], ], ) def test_strip_lstrip_rstrip(any_string_dtype, method, exp): ser = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = getattr(ser.str, method)() expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "method, exp", [ ["strip", ["aa", np.nan, "bb"]], ["lstrip", ["aa ", np.nan, "bb \t\n"]], ["rstrip", [" aa", np.nan, " bb"]], ], ) def test_strip_lstrip_rstrip_mixed_object(method, exp): ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) result = getattr(ser.str, method)() expected = Series(exp + [np.nan, np.nan, np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "method, exp", [ ["strip", ["ABC", " BNSD", "LDFJH "]], ["lstrip", ["ABCxx", " BNSD", "LDFJH xx"]], ["rstrip", ["xxABC", "xx BNSD", "LDFJH "]], ], ) def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp): ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype) with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" and pa_version_under4p0, ): result = getattr(ser.str, method)("x") expected = Series(exp, dtype=any_string_dtype) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "prefix, expected", [("a", ["b", " b c", "bc"]), ("ab", ["", "a b c", "bc"])] ) def test_removeprefix(any_string_dtype, prefix, expected): ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype) result = ser.str.removeprefix(prefix) ser_expected = Series(expected, dtype=any_string_dtype) tm.assert_series_equal(result, ser_expected) @pytest.mark.parametrize( "suffix, expected", [("c", ["ab", "a b ", "b"]), ("bc", ["ab", "a b c", ""])] ) def test_removesuffix(any_string_dtype, suffix, expected): ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype) result = ser.str.removesuffix(suffix) ser_expected = Series(expected, dtype=any_string_dtype) tm.assert_series_equal(result, ser_expected) def test_string_slice_get_syntax(any_string_dtype): ser = Series( ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"], dtype=any_string_dtype, ) result = ser.str[0] expected = ser.str.get(0) tm.assert_series_equal(result, expected) result = ser.str[:3] expected = ser.str.slice(stop=3) tm.assert_series_equal(result, expected) result = ser.str[2::-1] expected = ser.str.slice(start=2, step=-1) tm.assert_series_equal(result, expected) def test_string_slice_out_of_bounds_nested(): ser = Series([(1, 2), (1,), (3, 4, 5)]) result = ser.str[1] expected = Series([2, np.nan, 4]) tm.assert_series_equal(result, expected) def test_string_slice_out_of_bounds(any_string_dtype): ser = Series(["foo", "b", "ba"], dtype=any_string_dtype) result = ser.str[1] expected = Series(["o", np.nan, "a"], dtype=any_string_dtype) tm.assert_series_equal(result, expected) def test_encode_decode(any_string_dtype): ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8") result = ser.str.decode("utf-8") expected = ser.map(lambda x: x.decode("utf-8")) tm.assert_series_equal(result, expected) def test_encode_errors_kwarg(any_string_dtype): ser = Series(["a", "b", "a\x9d"], dtype=any_string_dtype) msg = ( r"'charmap' codec can't encode character '\\x9d' in position 1: " "character maps to " ) with pytest.raises(UnicodeEncodeError, match=msg): ser.str.encode("cp1252") result = ser.str.encode("cp1252", "ignore") expected = ser.map(lambda x: x.encode("cp1252", "ignore")) tm.assert_series_equal(result, expected) def test_decode_errors_kwarg(): ser = Series([b"a", b"b", b"a\x9d"]) msg = ( "'charmap' codec can't decode byte 0x9d in position 1: " "character maps to " ) with pytest.raises(UnicodeDecodeError, match=msg): ser.str.decode("cp1252") result = ser.str.decode("cp1252", "ignore") expected = ser.map(lambda x: x.decode("cp1252", "ignore")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "form, expected", [ ("NFKC", ["ABC", "ABC", "123", np.nan, "アイエ"]), ("NFC", ["ABC", "ABC", "123", np.nan, "アイエ"]), ], ) def test_normalize(form, expected, any_string_dtype): ser = Series( ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"], dtype=any_string_dtype, ) expected = Series(expected, index=["a", "b", "c", "d", "e"], dtype=any_string_dtype) result = ser.str.normalize(form) tm.assert_series_equal(result, expected) def test_normalize_bad_arg_raises(any_string_dtype): ser = Series( ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"], dtype=any_string_dtype, ) with pytest.raises(ValueError, match="invalid normalization form"): ser.str.normalize("xxx") def test_normalize_index(): idx = Index(["ABC", "123", "アイエ"]) expected = Index(["ABC", "123", "アイエ"]) result = idx.str.normalize("NFKC") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( "values,inferred_type", [ (["a", "b"], "string"), (["a", "b", 1], "mixed-integer"), (["a", "b", 1.3], "mixed"), (["a", "b", 1.3, 1], "mixed-integer"), (["aa", datetime(2011, 1, 1)], "mixed"), ], ) def test_index_str_accessor_visibility(values, inferred_type, index_or_series): from pandas.core.strings import StringMethods obj = index_or_series(values) if index_or_series is Index: assert obj.inferred_type == inferred_type assert isinstance(obj.str, StringMethods) @pytest.mark.parametrize( "values,inferred_type", [ ([1, np.nan], "floating"), ([datetime(2011, 1, 1)], "datetime64"), ([timedelta(1)], "timedelta64"), ], ) def test_index_str_accessor_non_string_values_raises( values, inferred_type, index_or_series ): obj = index_or_series(values) if index_or_series is Index: assert obj.inferred_type == inferred_type msg = "Can only use .str accessor with string values" with pytest.raises(AttributeError, match=msg): obj.str def test_index_str_accessor_multiindex_raises(): # MultiIndex has mixed dtype, but not allow to use accessor idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) assert idx.inferred_type == "mixed" msg = "Can only use .str accessor with Index, not MultiIndex" with pytest.raises(AttributeError, match=msg): idx.str def test_str_accessor_no_new_attributes(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/10673 ser = Series(list("aabbcde"), dtype=any_string_dtype) with pytest.raises(AttributeError, match="You cannot add any new attribute"): ser.str.xlabel = "a" def test_cat_on_bytes_raises(): lhs = Series(np.array(list("abc"), "S1").astype(object)) rhs = Series(np.array(list("def"), "S1").astype(object)) msg = "Cannot use .str.cat with values of inferred dtype 'bytes'" with pytest.raises(TypeError, match=msg): lhs.str.cat(rhs) def test_str_accessor_in_apply_func(): # https://github.com/pandas-dev/pandas/issues/38979 df = DataFrame(zip("abc", "def")) expected = Series(["A/D", "B/E", "C/F"]) result = df.apply(lambda f: "/".join(f.str.upper()), axis=1) tm.assert_series_equal(result, expected) def test_zfill(): # https://github.com/pandas-dev/pandas/issues/20868 value = Series(["-1", "1", "1000", 10, np.nan]) expected = Series(["-01", "001", "1000", np.nan, np.nan]) tm.assert_series_equal(value.str.zfill(3), expected) value = Series(["-2", "+5"]) expected = Series(["-0002", "+0005"]) tm.assert_series_equal(value.str.zfill(5), expected) def test_zfill_with_non_integer_argument(): value = Series(["-2", "+5"]) wid = "a" msg = f"width must be of integer type, not {type(wid).__name__}" with pytest.raises(TypeError, match=msg): value.str.zfill(wid) def test_zfill_with_leading_sign(): value = Series(["-cat", "-1", "+dog"]) expected = Series(["-0cat", "-0001", "+0dog"]) tm.assert_series_equal(value.str.zfill(5), expected) def test_get_with_dict_label(): # GH47911 s = Series( [ {"name": "Hello", "value": "World"}, {"name": "Goodbye", "value": "Planet"}, {"value": "Sea"}, ] ) result = s.str.get("name") expected = Series(["Hello", "Goodbye", None]) tm.assert_series_equal(result, expected) result = s.str.get("value") expected = Series(["World", "Planet", "Sea"]) tm.assert_series_equal(result, expected)