from itertools import product

import numpy as np
import pytest

from pandas._libs import hashtable

from pandas import (
    DatetimeIndex,
    MultiIndex,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize("names", [None, ["first", "second"]])
def test_unique(names):
    mi = MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]], names=names)

    res = mi.unique()
    exp = MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]], names=mi.names)
    tm.assert_index_equal(res, exp)

    mi = MultiIndex.from_arrays([list("aaaa"), list("abab")], names=names)
    res = mi.unique()
    exp = MultiIndex.from_arrays([list("aa"), list("ab")], names=mi.names)
    tm.assert_index_equal(res, exp)

    mi = MultiIndex.from_arrays([list("aaaa"), list("aaaa")], names=names)
    res = mi.unique()
    exp = MultiIndex.from_arrays([["a"], ["a"]], names=mi.names)
    tm.assert_index_equal(res, exp)

    # GH #20568 - empty MI
    mi = MultiIndex.from_arrays([[], []], names=names)
    res = mi.unique()
    tm.assert_index_equal(mi, res)


def test_unique_datetimelike():
    idx1 = DatetimeIndex(
        ["2015-01-01", "2015-01-01", "2015-01-01", "2015-01-01", "NaT", "NaT"]
    )
    idx2 = DatetimeIndex(
        ["2015-01-01", "2015-01-01", "2015-01-02", "2015-01-02", "NaT", "2015-01-01"],
        tz="Asia/Tokyo",
    )
    result = MultiIndex.from_arrays([idx1, idx2]).unique()

    eidx1 = DatetimeIndex(["2015-01-01", "2015-01-01", "NaT", "NaT"])
    eidx2 = DatetimeIndex(
        ["2015-01-01", "2015-01-02", "NaT", "2015-01-01"], tz="Asia/Tokyo"
    )
    exp = MultiIndex.from_arrays([eidx1, eidx2])
    tm.assert_index_equal(result, exp)


@pytest.mark.parametrize("level", [0, "first", 1, "second"])
def test_unique_level(idx, level):
    # GH #17896 - with level= argument
    result = idx.unique(level=level)
    expected = idx.get_level_values(level).unique()
    tm.assert_index_equal(result, expected)

    # With already unique level
    mi = MultiIndex.from_arrays(
        [[1, 3, 2, 4], [1, 3, 2, 5]], names=["first", "second"]
    )
    result = mi.unique(level=level)
    expected = mi.get_level_values(level)
    tm.assert_index_equal(result, expected)

    # With empty MI
    mi = MultiIndex.from_arrays([[], []], names=["first", "second"])
    result = mi.unique(level=level)
    expected = mi.get_level_values(level)
    tm.assert_index_equal(result, expected)


def test_duplicate_multiindex_codes():
    # GH 17464
    # Make sure that a MultiIndex with duplicate levels throws a ValueError
    msg = r"Level values must be unique: \[[A', ]+\] on level 0"
    with pytest.raises(ValueError, match=msg):
        mi = MultiIndex([["A"] * 10, range(10)], [[0] * 10, range(10)])

    # And that using set_levels with duplicate levels fails
    mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
    msg = r"Level values must be unique: \[[AB', ]+\] on level 0"
    with pytest.raises(ValueError, match=msg):
        with tm.assert_produces_warning(FutureWarning):
            mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True)


@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]])
def test_duplicate_level_names(names):
    # GH18872, GH19029
    mi = MultiIndex.from_product([[0, 1]] * 3, names=names)
    assert mi.names == names

    # With .rename()
    mi = MultiIndex.from_product([[0, 1]] * 3)
    mi = mi.rename(names)
    assert mi.names == names

    # With .rename(., level=)
    mi.rename(names[1], level=1, inplace=True)
    mi = mi.rename([names[0], names[2]], level=[0, 2])
    assert mi.names == names


def test_duplicate_meta_data():
    # GH 10115
    mi = MultiIndex(
        levels=[[0, 1], [0, 1, 2]],
        codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]],
    )

    for idx in [
        mi,
        mi.set_names([None, None]),
        mi.set_names([None, "Num"]),
        mi.set_names(["Upper", "Num"]),
    ]:
        assert idx.has_duplicates
        assert idx.drop_duplicates().names == idx.names


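# Note for the NaN cases below: a code of -1 in ``MultiIndex.codes`` is the
# sentinel pandas uses for a missing value in that level, so e.g.
# codes=[[-1, 0], [-1, 0]] yields an index whose first entry is (nan, nan).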
def test_has_duplicates(idx, idx_dup):
    # see fixtures
    assert idx.is_unique is True
    assert idx.has_duplicates is False
    assert idx_dup.is_unique is False
    assert idx_dup.has_duplicates is True

    mi = MultiIndex(
        levels=[[0, 1], [0, 1, 2]],
        codes=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]],
    )
    assert mi.is_unique is False
    assert mi.has_duplicates is True

    # single instance of NaN
    mi_nan = MultiIndex(
        levels=[["a", "b"], [0, 1]], codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]]
    )
    assert mi_nan.is_unique is True
    assert mi_nan.has_duplicates is False

    # multiple instances of NaN
    mi_nan_dup = MultiIndex(
        levels=[["a", "b"], [0, 1]], codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]]
    )
    assert mi_nan_dup.is_unique is False
    assert mi_nan_dup.has_duplicates is True


def test_has_duplicates_from_tuples():
    # GH 9075
    t = [
        ("x", "out", "z", 5, "y", "in", "z", 169),
        ("x", "out", "z", 7, "y", "in", "z", 119),
        ("x", "out", "z", 9, "y", "in", "z", 135),
        ("x", "out", "z", 13, "y", "in", "z", 145),
        ("x", "out", "z", 14, "y", "in", "z", 158),
        ("x", "out", "z", 16, "y", "in", "z", 122),
        ("x", "out", "z", 17, "y", "in", "z", 160),
        ("x", "out", "z", 18, "y", "in", "z", 180),
        ("x", "out", "z", 20, "y", "in", "z", 143),
        ("x", "out", "z", 21, "y", "in", "z", 128),
        ("x", "out", "z", 22, "y", "in", "z", 129),
        ("x", "out", "z", 25, "y", "in", "z", 111),
        ("x", "out", "z", 28, "y", "in", "z", 114),
        ("x", "out", "z", 29, "y", "in", "z", 121),
        ("x", "out", "z", 31, "y", "in", "z", 126),
        ("x", "out", "z", 32, "y", "in", "z", 155),
        ("x", "out", "z", 33, "y", "in", "z", 123),
        ("x", "out", "z", 12, "y", "in", "z", 144),
    ]

    mi = MultiIndex.from_tuples(t)
    assert not mi.has_duplicates


@pytest.mark.parametrize("nlevels", [4, 8])
@pytest.mark.parametrize("with_nulls", [True, False])
def test_has_duplicates_overflow(nlevels, with_nulls):
    # handle int64 overflow if possible
    # no overflow with 4
    # overflow possible with 8
    codes = np.tile(np.arange(500), 2)
    level = np.arange(500)

    if with_nulls:  # inject some null values
        codes[500] = -1  # common nan value
        codes = [codes.copy() for i in range(nlevels)]
        for i in range(nlevels):
            codes[i][500 + i - nlevels // 2] = -1

        codes += [np.array([-1, 1]).repeat(500)]
    else:
        codes = [codes] * nlevels + [np.arange(2).repeat(500)]

    levels = [level] * nlevels + [[0, 1]]

    # no dups
    mi = MultiIndex(levels=levels, codes=codes)
    assert not mi.has_duplicates

    # with a dup
    if with_nulls:

        def f(a):
            return np.insert(a, 1000, a[0])

        codes = list(map(f, codes))
        mi = MultiIndex(levels=levels, codes=codes)
    else:
        values = mi.values.tolist()
        mi = MultiIndex.from_tuples(values + [values[0]])

    assert mi.has_duplicates


@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", np.array([False, False, False, True, True, False])),
        ("last", np.array([False, True, True, False, False, False])),
        (False, np.array([False, True, True, True, True, False])),
    ],
)
def test_duplicated(idx_dup, keep, expected):
    result = idx_dup.duplicated(keep=keep)
    tm.assert_numpy_array_equal(result, expected)


@pytest.mark.arm_slow
def test_duplicated_large(keep):
    # GH 9125
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    codes = [np.random.choice(n, k * n) for lev in levels]
    mi = MultiIndex(levels=levels, codes=codes)

    result = mi.duplicated(keep=keep)
    expected = hashtable.duplicated(mi.values, keep=keep)
    tm.assert_numpy_array_equal(result, expected)


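# The test below first checks two-row indexes that differ only in whether the
# first level repeats (GH5873), then enumerates every unique combination of
# level codes (including the -1 missing-value code) and verifies that
# ``duplicated`` reports no duplicates for any of them.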
def test_duplicated2():
    # TODO: more informative test name
    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype="bool"))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            codes = product(range(-1, n), range(-1, m))
            mi = MultiIndex(
                levels=[list("abcde")[:n], list("WXYZ")[:m]],
                codes=np.random.permutation(list(codes)).T,
            )
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            tm.assert_numpy_array_equal(
                mi.duplicated(), np.zeros(len(mi), dtype="bool")
            )


def test_duplicated_drop_duplicates():
    # GH#4060
    idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))

    expected = np.array([False, False, False, True, False, False], dtype=bool)
    duplicated = idx.duplicated()
    tm.assert_numpy_array_equal(duplicated, expected)
    assert duplicated.dtype == bool
    expected = MultiIndex.from_arrays(([1, 2, 3, 2, 3], [1, 1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(), expected)

    expected = np.array([True, False, False, False, False, False])
    duplicated = idx.duplicated(keep="last")
    tm.assert_numpy_array_equal(duplicated, expected)
    assert duplicated.dtype == bool
    expected = MultiIndex.from_arrays(([2, 3, 1, 2, 3], [1, 1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(keep="last"), expected)

    expected = np.array([True, False, False, True, False, False])
    duplicated = idx.duplicated(keep=False)
    tm.assert_numpy_array_equal(duplicated, expected)
    assert duplicated.dtype == bool
    expected = MultiIndex.from_arrays(([2, 3, 2, 3], [1, 1, 2, 2]))
    tm.assert_index_equal(idx.drop_duplicates(keep=False), expected)


@pytest.mark.parametrize(
    "dtype",
    [
        np.complex64,
        np.complex128,
    ],
)
def test_duplicated_series_complex_numbers(dtype):
    # GH 17927
    expected = Series(
        [False, False, False, True, False, False, False, True, False, True],
        dtype=bool,
    )
    result = Series(
        [
            np.nan + np.nan * 1j,
            0,
            1j,
            1j,
            1,
            1 + 1j,
            1 + 2j,
            1 + 1j,
            np.nan,
            np.nan + np.nan * 1j,
        ],
        dtype=dtype,
    ).duplicated()
    tm.assert_series_equal(result, expected)


def test_multi_drop_duplicates_pos_args_deprecation():
    # GH#41485
    idx = MultiIndex.from_arrays([[1, 2, 3, 1], [1, 2, 3, 1]])
    msg = (
        "In a future version of pandas all arguments of "
        "MultiIndex.drop_duplicates will be keyword-only"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = idx.drop_duplicates("last")
    expected = MultiIndex.from_arrays([[2, 3, 1], [2, 3, 1]])
    tm.assert_index_equal(expected, result)