import numpy as np import pytest from pandas.compat import pa_version_under2p0 from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import is_datetime64tz_dtype import pandas as pd import pandas._testing as tm from pandas.core.api import NumericIndex from pandas.tests.base.common import allow_na_ops def test_unique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): result = obj.unique() # dict.fromkeys preserves the order unique_values = list(dict.fromkeys(obj.values)) if isinstance(obj, pd.MultiIndex): expected = pd.MultiIndex.from_tuples(unique_values) expected.names = obj.names tm.assert_index_equal(result, expected, exact=True) elif isinstance(obj, pd.Index) and obj._is_backward_compat_public_numeric_index: expected = NumericIndex(unique_values, dtype=obj.dtype) tm.assert_index_equal(result, expected, exact=True) elif isinstance(obj, pd.Index): expected = pd.Index(unique_values, dtype=obj.dtype) if is_datetime64tz_dtype(obj.dtype): expected = expected.normalize() tm.assert_index_equal(result, expected, exact=True) else: expected = np.array(unique_values) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("null_obj", [np.nan, None]) def test_unique_null(null_obj, index_or_series_obj): obj = index_or_series_obj if not allow_na_ops(obj): pytest.skip("type doesn't allow for NA operations") elif len(obj) < 1: pytest.skip("Test doesn't make sense on empty data") elif isinstance(obj, pd.MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") values = obj._values values[0:2] = null_obj klass = type(obj) repeated_values = np.repeat(values, range(1, len(values) + 1)) obj = klass(repeated_values, dtype=obj.dtype) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): result = obj.unique() unique_values_raw = dict.fromkeys(obj.values) # because np.nan == np.nan is False, but None == None is True # np.nan would be duplicated, whereas None wouldn't unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)] unique_values = [null_obj] + unique_values_not_null if isinstance(obj, pd.Index) and obj._is_backward_compat_public_numeric_index: expected = NumericIndex(unique_values, dtype=obj.dtype) tm.assert_index_equal(result, expected, exact=True) elif isinstance(obj, pd.Index): expected = pd.Index(unique_values, dtype=obj.dtype) if is_datetime64tz_dtype(obj.dtype): result = result.normalize() expected = expected.normalize() tm.assert_index_equal(result, expected, exact=True) else: expected = np.array(unique_values, dtype=obj.dtype) tm.assert_numpy_array_equal(result, expected) def test_nunique(index_or_series_obj): obj = index_or_series_obj obj = np.repeat(obj, range(1, len(obj) + 1)) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): expected = len(obj.unique()) assert obj.nunique(dropna=False) == expected @pytest.mark.parametrize("null_obj", [np.nan, None]) def test_nunique_null(null_obj, index_or_series_obj): obj = index_or_series_obj if not allow_na_ops(obj): pytest.skip("type doesn't allow for NA operations") elif isinstance(obj, pd.MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") values = obj._values values[0:2] = null_obj klass = type(obj) repeated_values = np.repeat(values, range(1, len(values) + 1)) obj = klass(repeated_values, dtype=obj.dtype) if isinstance(obj, pd.CategoricalIndex): assert obj.nunique() == len(obj.categories) assert obj.nunique(dropna=False) == len(obj.categories) + 1 else: with tm.maybe_produces_warning( PerformanceWarning, pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): num_unique_values = len(obj.unique()) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): assert obj.nunique() == max(0, num_unique_values - 1) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]", ): assert obj.nunique(dropna=False) == max(0, num_unique_values) @pytest.mark.single_cpu @pytest.mark.xfail( reason="Flaky in the CI. Remove once CI has a single build: GH 44584", strict=False ) def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji obj = index_or_series([uval] * 2) result = obj.unique() if isinstance(obj, pd.Index): expected = pd.Index(["\ud83d"], dtype=object) tm.assert_index_equal(result, expected, exact=True) else: expected = np.array(["\ud83d"], dtype=object) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dropna", [True, False]) def test_nunique_dropna(dropna): # GH37566 ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT]) res = ser.nunique(dropna) assert res == 1 if dropna else 5