from datetime import timedelta from decimal import Decimal import inspect import re from dateutil.tz import tzlocal import numpy as np import pytest from pandas._libs import lib from pandas.compat import is_platform_windows import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd from pandas import ( Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna, notna, to_datetime, to_timedelta, ) import pandas._testing as tm import pandas.core.algorithms as algorithms import pandas.core.nanops as nanops def assert_stat_op_calc( opname, alternative, frame, has_skipna=True, check_dtype=True, check_dates=False, rtol=1e-5, atol=1e-8, skipna_alternative=None, ): """ Check that operator opname works as advertised on frame Parameters ---------- opname : str Name of the operator to test on frame alternative : function Function that opname is tested against; i.e. "frame.opname()" should equal "alternative(frame)". frame : DataFrame The object that the tests are executed on has_skipna : bool, default True Whether the method "opname" has the kwarg "skip_na" check_dtype : bool, default True Whether the dtypes of the result of "frame.opname()" and "alternative(frame)" should be checked. check_dates : bool, default false Whether opname should be tested on a Datetime Series rtol : float, default 1e-5 Relative tolerance. atol : float, default 1e-8 Absolute tolerance. skipna_alternative : function, default None NaN-safe version of alternative """ warn = FutureWarning if opname == "mad" else None f = getattr(frame, opname) if check_dates: expected_warning = FutureWarning if opname in ["mean", "median"] else None df = DataFrame({"b": date_range("1/1/2001", periods=2)}) with tm.assert_produces_warning(expected_warning): result = getattr(df, opname)() assert isinstance(result, Series) df["a"] = range(len(df)) with tm.assert_produces_warning(expected_warning): result = getattr(df, opname)() assert isinstance(result, Series) assert len(result) if has_skipna: def wrapper(x): return alternative(x.values) skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) tm.assert_series_equal( result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol ) tm.assert_series_equal( result1, frame.apply(wrapper, axis=1), rtol=rtol, atol=atol, ) else: skipna_wrapper = alternative with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): result0 = f(axis=0) result1 = f(axis=1) tm.assert_series_equal( result0, frame.apply(skipna_wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol, ) if opname in ["sum", "prod"]: expected = frame.apply(skipna_wrapper, axis=1) tm.assert_series_equal( result1, expected, check_dtype=False, rtol=rtol, atol=atol ) # check dtypes if check_dtype: lcd_dtype = frame.values.dtype assert lcd_dtype == result0.dtype assert lcd_dtype == result1.dtype # bad axis with tm.assert_produces_warning(warn, match="The 'mad' method is deprecated"): with pytest.raises(ValueError, match="No axis named 2"): f(axis=2) # all NA case if has_skipna: all_na = frame * np.NaN with tm.assert_produces_warning( warn, match="The 'mad' method is deprecated", raise_on_extra_warnings=False ): r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) if opname in ["sum", "prod"]: unit = 1 if opname == "prod" else 0 # result for empty sum/prod expected = Series(unit, index=r0.index, dtype=r0.dtype) tm.assert_series_equal(r0, expected) expected = Series(unit, index=r1.index, dtype=r1.dtype) tm.assert_series_equal(r1, expected) class TestDataFrameAnalytics: # --------------------------------------------------------------------- # Reductions @pytest.mark.filterwarnings("ignore:Dropping of nuisance:FutureWarning") @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", [ "count", "sum", "mean", "product", "median", "min", "max", "nunique", "mad", "var", "std", "sem", pytest.param("skew", marks=td.skip_if_no_scipy), pytest.param("kurt", marks=td.skip_if_no_scipy), ], ) def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): warn = FutureWarning if opname == "mad" else None with tm.assert_produces_warning( warn, match="The 'mad' method is deprecated", raise_on_extra_warnings=False ): getattr(float_string_frame, opname)(axis=axis) if opname not in ("nunique", "mad"): getattr(float_string_frame, opname)(axis=axis, numeric_only=True) @pytest.mark.filterwarnings("ignore:Dropping of nuisance:FutureWarning") @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "opname", [ "count", "sum", "mean", "product", "median", "min", "max", "var", "std", "sem", pytest.param("skew", marks=td.skip_if_no_scipy), pytest.param("kurt", marks=td.skip_if_no_scipy), ], ) def test_stat_op_api_float_frame(self, float_frame, axis, opname): getattr(float_frame, opname)(axis=axis, numeric_only=False) def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame): def count(s): return notna(s).sum() def nunique(s): return len(algorithms.unique1d(s.dropna())) def mad(x): return np.abs(x - x.mean()).mean() def var(x): return np.var(x, ddof=1) def std(x): return np.std(x, ddof=1) def sem(x): return np.std(x, ddof=1) / np.sqrt(len(x)) assert_stat_op_calc( "nunique", nunique, float_frame_with_na, has_skipna=False, check_dtype=False, check_dates=True, ) # GH#32571 check_less_precise is needed on apparently-random # py37-npdev builds and OSX-PY36-min_version builds # mixed types (with upcasting happening) assert_stat_op_calc( "sum", np.sum, mixed_float_frame.astype("float32"), check_dtype=False, rtol=1e-3, ) assert_stat_op_calc( "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum ) assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) assert_stat_op_calc( "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod ) assert_stat_op_calc("mad", mad, float_frame_with_na) assert_stat_op_calc("var", var, float_frame_with_na) assert_stat_op_calc("std", std, float_frame_with_na) assert_stat_op_calc("sem", sem, float_frame_with_na) assert_stat_op_calc( "count", count, float_frame_with_na, has_skipna=False, check_dtype=False, check_dates=True, ) @td.skip_if_no_scipy def test_stat_op_calc_skew_kurtosis(self, float_frame_with_na): def skewness(x): from scipy.stats import skew if len(x) < 3: return np.nan return skew(x, bias=False) def kurt(x): from scipy.stats import kurtosis if len(x) < 4: return np.nan return kurtosis(x, bias=False) assert_stat_op_calc("skew", skewness, float_frame_with_na) assert_stat_op_calc("kurt", kurt, float_frame_with_na) # TODO: Ensure warning isn't emitted in the first place # ignore mean of empty slice and all-NaN @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_median(self, float_frame_with_na, int_frame): def wrapper(x): if isna(x).any(): return np.nan return np.median(x) assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True) assert_stat_op_calc( "median", wrapper, int_frame, check_dtype=False, check_dates=True ) @pytest.mark.parametrize( "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"] ) @pytest.mark.parametrize( "df", [ DataFrame( { "a": [ -0.00049987540199591344, -0.0016467257772919831, 0.00067695870775883013, ], "b": [-0, -0, 0.0], "c": [ 0.00031111847529610595, 0.0014902627951905339, -0.00094099200035979691, ], }, index=["foo", "bar", "baz"], dtype="O", ), DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object), ], ) def test_stat_operators_attempt_obj_array(self, method, df): # GH#676 assert df.values.dtype == np.object_ result = getattr(df, method)(1) expected = getattr(df.astype("f8"), method)(1) if method in ["sum", "prod"]: tm.assert_series_equal(result, expected) @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 df = DataFrame( { "int": [1, 2, 3, 4], "float": [1.0, 2.0, 3.0, 4.0], "str": ["a", "b", "c", "d"], } ) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = getattr(df, op)() assert len(result) == 2 with pd.option_context("use_bottleneck", False): with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = getattr(df, op)() assert len(result) == 2 def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( { "bool_data": [True, True, False, False, False], "int_data": [10, 20, 30, 40, 50], "string_data": ["a", "b", "c", "d", "e"], } ) df.reindex(columns=["bool_data", "int_data", "string_data"]) test = df.sum(axis=0) tm.assert_numpy_array_equal( test.values, np.array([2, 150, "abcde"], dtype=object) ) alt = df.T.sum(axis=1) tm.assert_series_equal(test, alt) def test_nunique(self): df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]}) tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2})) tm.assert_series_equal( df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) ) tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) tm.assert_series_equal( df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) ) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_mixed_datetime_numeric(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 df = DataFrame({"A": [1, 1], "B": [Timestamp("2000", tz=tz)] * 2}) with tm.assert_produces_warning(FutureWarning): result = df.mean() expected = Series([1.0], index=["A"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) def test_mean_excludes_datetimes(self, tz): # https://github.com/pandas-dev/pandas/issues/24752 # Our long-term desired behavior is unclear, but the behavior in # 0.24.0rc1 was buggy. df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2}) with tm.assert_produces_warning(FutureWarning): result = df.mean() expected = Series(dtype=np.float64) tm.assert_series_equal(result, expected) def test_mean_mixed_string_decimal(self): # GH 11670 # possible bug when calculating mean of DataFrame? d = [ {"A": 2, "B": None, "C": Decimal("628.00")}, {"A": 1, "B": None, "C": Decimal("383.00")}, {"A": 3, "B": None, "C": Decimal("651.00")}, {"A": 2, "B": None, "C": Decimal("575.00")}, {"A": 4, "B": None, "C": Decimal("1114.00")}, {"A": 1, "B": "TEST", "C": Decimal("241.00")}, {"A": 2, "B": None, "C": Decimal("572.00")}, {"A": 4, "B": None, "C": Decimal("609.00")}, {"A": 3, "B": None, "C": Decimal("820.00")}, {"A": 5, "B": None, "C": Decimal("1223.00")}, ] df = DataFrame(d) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = df.mean() expected = Series([2.7, 681.6], index=["A", "C"]) tm.assert_series_equal(result, expected) def test_var_std(self, datetime_frame): result = datetime_frame.std(ddof=4) expected = datetime_frame.apply(lambda x: x.std(ddof=4)) tm.assert_almost_equal(result, expected) result = datetime_frame.var(ddof=4) expected = datetime_frame.apply(lambda x: x.var(ddof=4)) tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() with pd.option_context("use_bottleneck", False): result = nanops.nanvar(arr, axis=0) assert not (result < 0).any() @pytest.mark.parametrize("meth", ["sem", "var", "std"]) def test_numeric_only_flag(self, meth): # GH 9201 df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a number in str format df1.loc[0, "foo"] = "100" df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) # set one entry to a non-number str df2.loc[0, "foo"] = "a" result = getattr(df1, meth)(axis=1, numeric_only=True) expected = getattr(df1[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) result = getattr(df2, meth)(axis=1, numeric_only=True) expected = getattr(df2[["bar", "baz"]], meth)(axis=1) tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside msg = r"unsupported operand type\(s\) for -: 'float' and 'str'" with pytest.raises(TypeError, match=msg): getattr(df1, meth)(axis=1, numeric_only=False) msg = "could not convert string to float: 'a'" with pytest.raises(TypeError, match=msg): getattr(df2, meth)(axis=1, numeric_only=False) def test_sem(self, datetime_frame): result = datetime_frame.sem(ddof=4) expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x))) tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nansem(arr, axis=0) assert not (result < 0).any() with pd.option_context("use_bottleneck", False): result = nanops.nansem(arr, axis=0) assert not (result < 0).any() @td.skip_if_no_scipy def test_kurt(self): index = MultiIndex( levels=[["bar"], ["one", "two", "three"], [0, 1]], codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], ) df = DataFrame(np.random.randn(6, 3), index=index) kurt = df.kurt() with tm.assert_produces_warning(FutureWarning): kurt2 = df.kurt(level=0).xs("bar") tm.assert_series_equal(kurt, kurt2, check_names=False) assert kurt.name is None assert kurt2.name == "bar" @pytest.mark.parametrize( "dropna, expected", [ ( True, { "A": [12], "B": [10.0], "C": [1.0], "D": ["a"], "E": Categorical(["a"], categories=["a"]), "F": to_datetime(["2000-1-2"]), "G": to_timedelta(["1 days"]), }, ), ( False, { "A": [12], "B": [10.0], "C": [np.nan], "D": np.array([np.nan], dtype=object), "E": Categorical([np.nan], categories=["a"]), "F": [pd.NaT], "G": to_timedelta([pd.NaT]), }, ), ( True, { "H": [8, 9, np.nan, np.nan], "I": [8, 9, np.nan, np.nan], "J": [1, np.nan, np.nan, np.nan], "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]), "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]), "M": to_timedelta(["1 days", "nan", "nan", "nan"]), "N": [0, 1, 2, 3], }, ), ( False, { "H": [8, 9, np.nan, np.nan], "I": [8, 9, np.nan, np.nan], "J": [1, np.nan, np.nan, np.nan], "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]), "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), "M": to_timedelta(["nan", "1 days", "nan", "nan"]), "N": [0, 1, 2, 3], }, ), ], ) def test_mode_dropna(self, dropna, expected): df = DataFrame( { "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], "D": [np.nan, np.nan, "a", np.nan], "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), "H": [8, 8, 9, 9], "I": [9, 9, 8, 8], "J": [1, 1, np.nan, np.nan], "K": Categorical(["a", np.nan, "a", np.nan]), "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]), "M": to_timedelta(["1 days", "nan", "1 days", "nan"]), "N": np.arange(4, dtype="int64"), } ) result = df[sorted(expected.keys())].mode(dropna=dropna) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) def test_mode_sortwarning(self): # Check for the warning that is raised when the mode # results cannot be sorted df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) with tm.assert_produces_warning(UserWarning): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) tm.assert_frame_equal(result, expected) def test_mode_empty_df(self): df = DataFrame([], columns=["a", "b"]) result = df.mode() expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=int)) tm.assert_frame_equal(result, expected) def test_operators_timedelta64(self): df = DataFrame( { "A": date_range("2012-1-1", periods=3, freq="D"), "B": date_range("2012-1-2", periods=3, freq="D"), "C": Timestamp("20120101") - timedelta(minutes=5, seconds=5), } ) diffs = DataFrame({"A": df["A"] - df["C"], "B": df["A"] - df["B"]}) # min result = diffs.min() assert result[0] == diffs.loc[0, "A"] assert result[1] == diffs.loc[0, "B"] result = diffs.min(axis=1) assert (result == diffs.loc[0, "B"]).all() # max result = diffs.max() assert result[0] == diffs.loc[2, "A"] assert result[1] == diffs.loc[2, "B"] result = diffs.max(axis=1) assert (result == diffs["A"]).all() # abs result = diffs.abs() result2 = abs(diffs) expected = DataFrame({"A": df["A"] - df["C"], "B": df["B"] - df["A"]}) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) # mixed frame mixed = diffs.copy() mixed["C"] = "foo" mixed["D"] = 1 mixed["E"] = 1.0 mixed["F"] = Timestamp("20130101") # results in an object array result = mixed.min() expected = Series( [ pd.Timedelta(timedelta(seconds=5 * 60 + 5)), pd.Timedelta(timedelta(days=-1)), "foo", 1, 1.0, Timestamp("20130101"), ], index=mixed.columns, ) tm.assert_series_equal(result, expected) # excludes numeric with tm.assert_produces_warning(FutureWarning, match="Select only valid"): result = mixed.min(axis=1) expected = Series([1, 1, 1.0], index=[0, 1, 2]) tm.assert_series_equal(result, expected) # works when only those columns are selected result = mixed[["A", "B"]].min(1) expected = Series([timedelta(days=-1)] * 3) tm.assert_series_equal(result, expected) result = mixed[["A", "B"]].min() expected = Series( [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"] ) tm.assert_series_equal(result, expected) # GH 3106 df = DataFrame( { "time": date_range("20130102", periods=5), "time2": date_range("20130105", periods=5), } ) df["off1"] = df["time2"] - df["time"] assert df["off1"].dtype == "timedelta64[ns]" df["off2"] = df["time"] - df["time2"] df._consolidate_inplace() assert df["off1"].dtype == "timedelta64[ns]" assert df["off2"].dtype == "timedelta64[ns]" def test_std_timedelta64_skipna_false(self): # GH#37392 tdi = pd.timedelta_range("1 Day", periods=10) df = DataFrame({"A": tdi, "B": tdi}, copy=True) df.iloc[-2, -1] = pd.NaT result = df.std(skipna=False) expected = Series( [df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]" ) tm.assert_series_equal(result, expected) result = df.std(axis=1, skipna=False) expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)]) tm.assert_series_equal(result, expected) def test_sum_corner(self): empty_frame = DataFrame() axis0 = empty_frame.sum(0) axis1 = empty_frame.sum(1) assert isinstance(axis0, Series) assert isinstance(axis1, Series) assert len(axis0) == 0 assert len(axis1) == 0 @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) @pytest.mark.parametrize("numeric_only", [None, True, False]) def test_sum_prod_nanops(self, method, unit, numeric_only): idx = ["a", "b", "c"] df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}) # The default result = getattr(df, method)(numeric_only=numeric_only) expected = Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) # min_count=1 result = getattr(df, method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, unit, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count=0 result = getattr(df, method)(numeric_only=numeric_only, min_count=0) expected = Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, np.nan, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count > 1 df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) result = getattr(df, method)(numeric_only=numeric_only, min_count=5) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) result = getattr(df, method)(numeric_only=numeric_only, min_count=6) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) def test_sum_nanops_timedelta(self): # prod isn't defined on timedeltas idx = ["a", "b", "c"] df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]}) df2 = df.apply(to_timedelta) # 0 by default result = df2.sum() expected = Series([0, 0, 0], dtype="m8[ns]", index=idx) tm.assert_series_equal(result, expected) # min_count=0 result = df2.sum(min_count=0) tm.assert_series_equal(result, expected) # min_count=1 result = df2.sum(min_count=1) expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx) tm.assert_series_equal(result, expected) def test_sum_nanops_min_count(self): # https://github.com/pandas-dev/pandas/issues/39738 df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) result = df.sum(min_count=10) expected = Series([np.nan, np.nan], index=["x", "y"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"]) @pytest.mark.parametrize( "kwargs, expected_result", [ ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.NaN]), ({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]), ({"axis": 1, "skipna": False}, [3.2, 5.3, np.NaN]), ], ) def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result): # GH#46947 df = DataFrame({"a": [1.0, 2.3, 4.4], "b": [2.2, 3, np.nan]}, dtype=float_type) result = df.sum(**kwargs) expected = Series(expected_result).astype(float_type) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"]) @pytest.mark.parametrize( "kwargs, expected_result", [ ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.NaN]), ({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]), ({"axis": 1, "skipna": False}, [2.0, 4.0, np.NaN]), ], ) def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result): # GH#46947 df = DataFrame( {"a": [1.0, 2.0, 4.4], "b": [2.0, 2.0, np.nan]}, dtype=float_type ) result = df.prod(**kwargs) expected = Series(expected_result).astype(float_type) tm.assert_series_equal(result, expected) def test_sum_object(self, float_frame): values = float_frame.values.astype(int) frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns) deltas = frame * timedelta(1) deltas.sum() def test_sum_bool(self, float_frame): # ensure this works, bug report bools = np.isnan(float_frame) bools.sum(1) bools.sum(0) def test_sum_mixed_datetime(self): # GH#30886 df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex( [2, 3, 4] ) with tm.assert_produces_warning(FutureWarning, match="Select only valid"): result = df.sum() expected = Series({"B": 7.0}) tm.assert_series_equal(result, expected) def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data with tm.assert_produces_warning(FutureWarning, match="Select only valid"): the_mean = float_string_frame.mean(axis=0) the_sum = float_string_frame.sum(axis=0, numeric_only=True) tm.assert_index_equal(the_sum.index, the_mean.index) assert len(the_mean.index) < len(float_string_frame.columns) # xs sum mixed type, just want to know it works... with tm.assert_produces_warning(FutureWarning, match="Select only valid"): the_mean = float_string_frame.mean(axis=1) the_sum = float_string_frame.sum(axis=1, numeric_only=True) tm.assert_index_equal(the_sum.index, the_mean.index) # take mean of boolean column float_frame["bool"] = float_frame["A"] > 0 means = float_frame.mean(0) assert means["bool"] == float_frame["bool"].values.mean() def test_mean_datetimelike(self): # GH#24757 check that datetimelike are excluded by default, handled # correctly with numeric_only=True df = DataFrame( { "A": np.arange(3), "B": date_range("2016-01-01", periods=3), "C": pd.timedelta_range("1D", periods=3), "D": pd.period_range("2016", periods=3, freq="A"), } ) result = df.mean(numeric_only=True) expected = Series({"A": 1.0}) tm.assert_series_equal(result, expected) with tm.assert_produces_warning(FutureWarning): # in the future datetime columns will be included result = df.mean() expected = Series({"A": 1.0, "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) def test_mean_datetimelike_numeric_only_false(self): df = DataFrame( { "A": np.arange(3), "B": date_range("2016-01-01", periods=3), "C": pd.timedelta_range("1D", periods=3), } ) # datetime(tz) and timedelta work result = df.mean(numeric_only=False) expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]}) tm.assert_series_equal(result, expected) # mean of period is not allowed df["D"] = pd.period_range("2016", periods=3, freq="A") with pytest.raises(TypeError, match="mean is not implemented for Period"): df.mean(numeric_only=False) def test_mean_extensionarray_numeric_only_true(self): # https://github.com/pandas-dev/pandas/issues/33256 arr = np.random.randint(1000, size=(10, 5)) df = DataFrame(arr, dtype="Int64") result = df.mean(numeric_only=True) expected = DataFrame(arr).mean() tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): # don't blow up with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): float_string_frame.std(1) float_string_frame.var(1) float_string_frame.mean(1) float_string_frame.skew(1) def test_sum_bools(self): df = DataFrame(index=range(1), columns=range(10)) bools = isna(df) assert bools.sum(axis=1)[0] == 10 # ---------------------------------------------------------------------- # Index of max / min @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): frame = float_frame frame.iloc[5:10] = np.nan frame.iloc[15:20, -2:] = np.nan for df in [frame, int_frame]: result = df.idxmin(axis=axis, skipna=skipna) expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) if numeric_only: result = df.idxmin(numeric_only=numeric_only) expected = Series([2, 1], index=["a", "b"]) tm.assert_series_equal(result, expected) else: with pytest.raises(TypeError, match="not allowed for this dtype"): df.idxmin(numeric_only=numeric_only) def test_idxmin_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame frame.iloc[5:10] = np.nan frame.iloc[15:20, -2:] = np.nan for df in [frame, int_frame]: result = df.idxmax(axis=axis, skipna=skipna) expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) if numeric_only: result = df.idxmax(numeric_only=numeric_only) expected = Series([1, 0], index=["a", "b"]) tm.assert_series_equal(result, expected) else: with pytest.raises(TypeError, match="not allowed for this dtype"): df.idxmin(numeric_only=numeric_only) def test_idxmax_axis_2(self, float_frame): frame = float_frame msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.idxmax(axis=2) def test_idxmax_mixed_dtype(self): # don't cast to object, which would raise in nanops dti = date_range("2016-01-01", periods=3) # Copying dti is needed for ArrayManager otherwise when we set # df.loc[0, 3] = pd.NaT below it edits dti df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)}) result = df.idxmax() expected = Series([1, 0, 2], index=[1, 2, 3]) tm.assert_series_equal(result, expected) result = df.idxmin() expected = Series([0, 2, 0], index=[1, 2, 3]) tm.assert_series_equal(result, expected) # with NaTs df.loc[0, 3] = pd.NaT result = df.idxmax() expected = Series([1, 0, 2], index=[1, 2, 3]) tm.assert_series_equal(result, expected) result = df.idxmin() expected = Series([0, 2, 1], index=[1, 2, 3]) tm.assert_series_equal(result, expected) # with multi-column dt64 block df[4] = dti[::-1] df._consolidate_inplace() result = df.idxmax() expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4]) tm.assert_series_equal(result, expected) result = df.idxmin() expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "op, expected_value", [("idxmax", [0, 4]), ("idxmin", [0, 5])], ) def test_idxmax_idxmin_convert_dtypes(self, op, expected_value): # GH 40346 df = DataFrame( { "ID": [100, 100, 100, 200, 200, 200], "value": [0, 0, 0, 1, 2, 0], }, dtype="Int64", ) df = df.groupby("ID") result = getattr(df, op)() expected = DataFrame( {"value": expected_value}, index=Index([100, 200], name="ID", dtype="Int64"), ) tm.assert_frame_equal(result, expected) def test_idxmax_dt64_multicolumn_axis1(self): dti = date_range("2016-01-01", periods=3) df = DataFrame({3: dti, 4: dti[::-1]}, copy=True) df.iloc[0, 0] = pd.NaT df._consolidate_inplace() result = df.idxmax(axis=1) expected = Series([4, 3, 3]) tm.assert_series_equal(result, expected) result = df.idxmin(axis=1) expected = Series([4, 3, 4]) tm.assert_series_equal(result, expected) # ---------------------------------------------------------------------- # Logical reductions @pytest.mark.parametrize("opname", ["any", "all"]) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_only", [False, True]) def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame): # make sure op works on mixed-type frame mixed = float_string_frame mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5 getattr(mixed, opname)(axis=axis, bool_only=bool_only) @pytest.mark.parametrize("opname", ["any", "all"]) @pytest.mark.parametrize("axis", [0, 1]) def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na): getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False) @pytest.mark.parametrize("opname", ["any", "all"]) def test_any_all_bool_frame(self, opname, bool_frame_with_na): # GH#12863: numpy gives back non-boolean data for object type # so fill NaNs to compare with pandas behavior frame = bool_frame_with_na.fillna(True) alternative = getattr(np, opname) f = getattr(frame, opname) def skipna_wrapper(x): nona = x.dropna().values return alternative(nona) def wrapper(x): return alternative(x.values) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) tm.assert_series_equal(result0, frame.apply(wrapper)) tm.assert_series_equal(result1, frame.apply(wrapper, axis=1)) result0 = f(axis=0) result1 = f(axis=1) tm.assert_series_equal(result0, frame.apply(skipna_wrapper)) tm.assert_series_equal( result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False ) # bad axis with pytest.raises(ValueError, match="No axis named 2"): f(axis=2) # all NA case all_na = frame * np.NaN r0 = getattr(all_na, opname)(axis=0) r1 = getattr(all_na, opname)(axis=1) if opname == "any": assert not r0.any() assert not r1.any() else: assert r0.all() assert r1.all() def test_any_all_extra(self): df = DataFrame( { "A": [True, False, False], "B": [True, True, False], "C": [True, True, True], }, index=["a", "b", "c"], ) result = df[["A", "B"]].any(axis=1) expected = Series([True, True, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) result = df[["A", "B"]].any(axis=1, bool_only=True) tm.assert_series_equal(result, expected) result = df.all(1) expected = Series([True, False, False], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) result = df.all(1, bool_only=True) tm.assert_series_equal(result, expected) # Axis is None result = df.all(axis=None).item() assert result is False result = df.any(axis=None).item() assert result is True result = df[["C"]].all(axis=None).item() assert result is True @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): # GH#35450 df = DataFrame( data=[ [1, np.nan, np.nan, True], [np.nan, 2, np.nan, True], [np.nan, np.nan, np.nan, True], [np.nan, np.nan, "5", np.nan], ] ) result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) expected = Series([True, True, True, True]) tm.assert_series_equal(result, expected) def test_any_datetime(self): # GH 23070 float_data = [1, np.nan, 3, np.nan] datetime_data = [ Timestamp("1960-02-15"), Timestamp("1960-02-16"), pd.NaT, pd.NaT, ] df = DataFrame({"A": float_data, "B": datetime_data}) result = df.any(axis=1) expected = Series([True, True, True, False]) tm.assert_series_equal(result, expected) def test_any_all_bool_only(self): # GH 25101 df = DataFrame( {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} ) result = df.all(bool_only=True) expected = Series(dtype=np.bool_) tm.assert_series_equal(result, expected) df = DataFrame( { "col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None], "col4": [False, False, True], } ) result = df.all(bool_only=True) expected = Series({"col4": False}) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "func, data, expected", [ (np.any, {}, False), (np.all, {}, True), (np.any, {"A": []}, False), (np.all, {"A": []}, True), (np.any, {"A": [False, False]}, False), (np.all, {"A": [False, False]}, False), (np.any, {"A": [True, False]}, True), (np.all, {"A": [True, False]}, False), (np.any, {"A": [True, True]}, True), (np.all, {"A": [True, True]}, True), (np.any, {"A": [False], "B": [False]}, False), (np.all, {"A": [False], "B": [False]}, False), (np.any, {"A": [False, False], "B": [False, True]}, True), (np.all, {"A": [False, False], "B": [False, True]}, False), # other types (np.all, {"A": Series([0.0, 1.0], dtype="float")}, False), (np.any, {"A": Series([0.0, 1.0], dtype="float")}, True), (np.all, {"A": Series([0, 1], dtype=int)}, False), (np.any, {"A": Series([0, 1], dtype=int)}, True), pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False), pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False), pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True), pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True), pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False), pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True), # np.all on Categorical raises, so the reduction drops the # column, so all is being done on an empty Series, so is True (np.all, {"A": Series([0, 1], dtype="category")}, True), (np.any, {"A": Series([0, 1], dtype="category")}, False), (np.all, {"A": Series([1, 2], dtype="category")}, True), (np.any, {"A": Series([1, 2], dtype="category")}, False), # Mix GH#21484 pytest.param( np.all, { "A": Series([10, 20], dtype="M8[ns]"), "B": Series([10, 20], dtype="m8[ns]"), }, True, ), ], ) def test_any_all_np_func(self, func, data, expected): # GH 19976 data = DataFrame(data) warn = None if any(is_categorical_dtype(x) for x in data.dtypes): warn = FutureWarning with tm.assert_produces_warning( warn, match="Select only valid columns", check_stacklevel=False ): result = func(data) assert isinstance(result, np.bool_) assert result.item() is expected # method version with tm.assert_produces_warning( warn, match="Select only valid columns", check_stacklevel=False ): result = getattr(DataFrame(data), func.__name__)(axis=None) assert isinstance(result, np.bool_) assert result.item() is expected def test_any_all_object(self): # GH 19976 result = np.all(DataFrame(columns=["a", "b"])).item() assert result is True result = np.any(DataFrame(columns=["a", "b"])).item() assert result is False def test_any_all_object_bool_only(self): msg = "object-dtype columns with all-bool values" df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object) df._consolidate_inplace() df["C"] = Series([True, True]) # Categorical of bools is _not_ considered booly df["D"] = df["C"].astype("category") # The underlying bug is in DataFrame._get_bool_data, so we check # that while we're here with tm.assert_produces_warning(FutureWarning, match=msg): res = df._get_bool_data() expected = df[["B", "C"]] tm.assert_frame_equal(res, expected) with tm.assert_produces_warning(FutureWarning, match=msg): res = df.all(bool_only=True, axis=0) expected = Series([False, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series with tm.assert_produces_warning(FutureWarning, match=msg): res = df[["B", "C"]].all(bool_only=True, axis=0) tm.assert_series_equal(res, expected) with tm.assert_produces_warning(FutureWarning, match=msg): assert not df.all(bool_only=True, axis=None) with tm.assert_produces_warning(FutureWarning, match=msg): res = df.any(bool_only=True, axis=0) expected = Series([True, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series with tm.assert_produces_warning(FutureWarning, match=msg): res = df[["B", "C"]].any(bool_only=True, axis=0) tm.assert_series_equal(res, expected) with tm.assert_produces_warning(FutureWarning, match=msg): assert df.any(bool_only=True, axis=None) @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_level_axis_none_raises(self, method): df = DataFrame( {"A": 1}, index=MultiIndex.from_product( [["A", "B"], ["a", "b"]], names=["out", "in"] ), ) xpr = "Must specify 'axis' when aggregating by level." with pytest.raises(ValueError, match=xpr): with tm.assert_produces_warning(FutureWarning): getattr(df, method)(axis=None, level="out") # --------------------------------------------------------------------- # Unsorted def test_series_broadcasting(self): # smoke test for numpy warnings # GH 16378, GH 16306 df = DataFrame([1.0, 1.0, 1.0]) df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]}) s = Series([1, 1, 1]) s_nan = Series([np.nan, np.nan, 1]) with tm.assert_produces_warning(None): df_nan.clip(lower=s, axis=0) for op in ["lt", "le", "gt", "ge", "eq", "ne"]: getattr(df, op)(s_nan, axis=0) class TestDataFrameReductions: def test_min_max_dt64_with_NaT(self): # Both NaT and Timestamp are in DataFrame. df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]}) res = df.min() exp = Series([Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) res = df.max() exp = Series([Timestamp("2012-05-01")], index=["foo"]) tm.assert_series_equal(res, exp) # GH12941, only NaTs are in DataFrame. df = DataFrame({"foo": [pd.NaT, pd.NaT]}) res = df.min() exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) res = df.max() exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture): # GH#36907 tz = tz_naive_fixture if isinstance(tz, tzlocal) and is_platform_windows(): pytest.skip( "GH#37659 OSError raised within tzlocal bc Windows " "chokes in times before 1970-01-01" ) df = DataFrame( { "a": [ Timestamp("2020-01-01 08:00:00", tz=tz), Timestamp("1920-02-01 09:00:00", tz=tz), ], "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], } ) res = df.min(axis=1, skipna=False) expected = Series([df.loc[0, "a"], pd.NaT]) assert expected.dtype == df["a"].dtype tm.assert_series_equal(res, expected) res = df.max(axis=1, skipna=False) expected = Series([df.loc[0, "b"], pd.NaT]) assert expected.dtype == df["a"].dtype tm.assert_series_equal(res, expected) def test_min_max_dt64_api_consistency_with_NaT(self): # Calling the following sum functions returned an error for dataframes but # returned NaT for series. These tests check that the API is consistent in # min/max calls on empty Series/DataFrames. See GH:33704 for more # information df = DataFrame({"x": to_datetime([])}) expected_dt_series = Series(to_datetime([])) # check axis 0 assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT) assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT) # check axis 1 tm.assert_series_equal(df.min(axis=1), expected_dt_series) tm.assert_series_equal(df.max(axis=1), expected_dt_series) def test_min_max_dt64_api_consistency_empty_df(self): # check DataFrame/Series api consistency when calling min/max on an empty # DataFrame/Series. df = DataFrame({"x": []}) expected_float_series = Series([], dtype=float) # check axis 0 assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) # check axis 1 tm.assert_series_equal(df.min(axis=1), expected_float_series) tm.assert_series_equal(df.min(axis=1), expected_float_series) @pytest.mark.parametrize( "initial", ["2018-10-08 13:36:45+00:00", "2018-10-08 13:36:45+03:00"], # Non-UTC timezone ) @pytest.mark.parametrize("method", ["min", "max"]) def test_preserve_timezone(self, initial: str, method): # GH 28552 initial_dt = to_datetime(initial) expected = Series([initial_dt]) df = DataFrame([expected]) result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) def test_frame_any_all_with_level(self): df = DataFrame( {"data": [False, False, True, False, True, False, True]}, index=[ ["one", "one", "two", "one", "two", "two", "two"], [0, 1, 0, 2, 1, 2, 3], ], ) with tm.assert_produces_warning(FutureWarning, match="Using the level"): result = df.any(level=0) ex = DataFrame({"data": [False, True]}, index=["one", "two"]) tm.assert_frame_equal(result, ex) with tm.assert_produces_warning(FutureWarning, match="Using the level"): result = df.all(level=0) ex = DataFrame({"data": [False, False]}, index=["one", "two"]) tm.assert_frame_equal(result, ex) def test_frame_any_with_timedelta(self): # GH#17667 df = DataFrame( { "a": Series([0, 0]), "t": Series([to_timedelta(0, "s"), to_timedelta(1, "ms")]), } ) result = df.any(axis=0) expected = Series(data=[False, True], index=["a", "t"]) tm.assert_series_equal(result, expected) result = df.any(axis=1) expected = Series(data=[False, True]) tm.assert_series_equal(result, expected) def test_reductions_deprecation_skipna_none(self, frame_or_series): # GH#44580 obj = frame_or_series([1, 2, 3]) with tm.assert_produces_warning( FutureWarning, match="skipna", raise_on_extra_warnings=False ): obj.mad(skipna=None) def test_reductions_deprecation_level_argument( self, frame_or_series, reduction_functions ): # GH#39983 obj = frame_or_series( [1, 2, 3], index=MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]]) ) with tm.assert_produces_warning(FutureWarning, match="level"): getattr(obj, reduction_functions)(level=0) def test_reductions_skipna_none_raises( self, request, frame_or_series, reduction_functions ): if reduction_functions == "count": request.node.add_marker( pytest.mark.xfail(reason="Count does not accept skipna") ) elif reduction_functions == "mad": pytest.skip("Mad is deprecated: GH#11787") obj = frame_or_series([1, 2, 3]) msg = 'For argument "skipna" expected type bool, received type NoneType.' with pytest.raises(ValueError, match=msg): getattr(obj, reduction_functions)(skipna=None) class TestNuisanceColumns: @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_categorical_dtype_nuisance_column(self, method): # GH#36076 DataFrame should match Series behavior ser = Series([0, 1], dtype="category", name="A") df = ser.to_frame() # Double-check the Series behavior is to raise with pytest.raises(TypeError, match="does not support reduction"): getattr(ser, method)() with pytest.raises(TypeError, match="does not support reduction"): getattr(np, method)(ser) with pytest.raises(TypeError, match="does not support reduction"): getattr(df, method)(bool_only=False) # With bool_only=None, operating on this column raises and is ignored, # so we expect an empty result. with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = getattr(df, method)(bool_only=None) expected = Series([], index=Index([]), dtype=bool) tm.assert_series_equal(result, expected) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns", check_stacklevel=False ): result = getattr(np, method)(df, axis=0) tm.assert_series_equal(result, expected) def test_median_categorical_dtype_nuisance_column(self): # GH#21020 DataFrame.median should match Series.median df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])}) ser = df["A"] # Double-check the Series behavior is to raise with pytest.raises(TypeError, match="does not support reduction"): ser.median() with pytest.raises(TypeError, match="does not support reduction"): df.median(numeric_only=False) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = df.median() expected = Series([], index=Index([]), dtype=np.float64) tm.assert_series_equal(result, expected) # same thing, but with an additional non-categorical column df["B"] = df["A"].astype(int) with pytest.raises(TypeError, match="does not support reduction"): df.median(numeric_only=False) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = df.median() expected = Series([2.0], index=["B"]) tm.assert_series_equal(result, expected) # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead # of expected.values @pytest.mark.parametrize("method", ["min", "max"]) def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): # GH#28949 DataFrame.min should behave like Series.min cat = Categorical(["a", "b", "c", "b"], ordered=False) ser = Series(cat) df = ser.to_frame("A") # Double-check the Series behavior with pytest.raises(TypeError, match="is not ordered for operation"): getattr(ser, method)() with pytest.raises(TypeError, match="is not ordered for operation"): getattr(np, method)(ser) with pytest.raises(TypeError, match="is not ordered for operation"): getattr(df, method)(numeric_only=False) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = getattr(df, method)() expected = Series([], index=Index([]), dtype=np.float64) tm.assert_series_equal(result, expected) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns", check_stacklevel=False ): result = getattr(np, method)(df) tm.assert_series_equal(result, expected) # same thing, but with an additional non-categorical column df["B"] = df["A"].astype(object) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = getattr(df, method)() if method == "min": expected = Series(["a"], index=["B"]) else: expected = Series(["c"], index=["B"]) tm.assert_series_equal(result, expected) with tm.assert_produces_warning( FutureWarning, match="Select only valid columns", check_stacklevel=False ): result = getattr(np, method)(df) tm.assert_series_equal(result, expected) def test_reduction_object_block_splits_nuisance_columns(self): # GH#37827 df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) # We should only exclude "B", not "A" with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = df.mean() expected = Series([1.0], index=["A"]) tm.assert_series_equal(result, expected) # Same behavior but heterogeneous dtype df["C"] = df["A"].astype(int) + 4 with tm.assert_produces_warning( FutureWarning, match="Select only valid columns" ): result = df.mean() expected = Series([1.0, 5.0], index=["A", "C"]) tm.assert_series_equal(result, expected) def test_sum_timedelta64_skipna_false(): # GH#17235 arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) arr[-1, -1] = "Nat" df = DataFrame(arr) result = df.sum(skipna=False) expected = Series([pd.Timedelta(seconds=12), pd.NaT]) tm.assert_series_equal(result, expected) result = df.sum(axis=0, skipna=False) tm.assert_series_equal(result, expected) result = df.sum(axis=1, skipna=False) expected = Series( [ pd.Timedelta(seconds=1), pd.Timedelta(seconds=5), pd.Timedelta(seconds=9), pd.NaT, ] ) tm.assert_series_equal(result, expected) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) df = df.astype({"b": "Int64"}) result = df.sum() expected = Series(["a", 1], index=["a", "b"]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("numeric_only", [True, False, None]) @pytest.mark.parametrize("method", ["min", "max"]) def test_minmax_extensionarray(method, numeric_only): # https://github.com/pandas-dev/pandas/issues/32651 int64_info = np.iinfo("int64") ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype()) df = DataFrame({"Int64": ser}) result = getattr(df, method)(numeric_only=numeric_only) expected = Series( [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") ) tm.assert_series_equal(result, expected) def test_mad_nullable_integer(any_signed_int_ea_dtype): # GH#33036 df = DataFrame(np.random.randn(100, 4).astype(np.int64)) df2 = df.astype(any_signed_int_ea_dtype) with tm.assert_produces_warning( FutureWarning, match="The 'mad' method is deprecated" ): result = df2.mad() expected = df.mad() tm.assert_series_equal(result, expected) with tm.assert_produces_warning( FutureWarning, match="The 'mad' method is deprecated" ): result = df2.mad(axis=1) expected = df.mad(axis=1) tm.assert_series_equal(result, expected) # case with NAs present df2.iloc[::2, 1] = pd.NA with tm.assert_produces_warning( FutureWarning, match="The 'mad' method is deprecated" ): result = df2.mad() expected = df.mad() expected[1] = df.iloc[1::2, 1].mad() tm.assert_series_equal(result, expected) with tm.assert_produces_warning( FutureWarning, match="The 'mad' method is deprecated" ): result = df2.mad(axis=1) expected = df.mad(axis=1) expected[::2] = df.T.loc[[0, 2, 3], ::2].mad() tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="GH#42895 caused by lack of 2D EA") def test_mad_nullable_integer_all_na(any_signed_int_ea_dtype): # GH#33036 df = DataFrame(np.random.randn(100, 4).astype(np.int64)) df2 = df.astype(any_signed_int_ea_dtype) # case with all-NA row/column msg = "will attempt to set the values inplace instead" with tm.assert_produces_warning(FutureWarning, match=msg): df2.iloc[:, 1] = pd.NA # FIXME(GH#44199): this doesn't operate in-place df2.iloc[:, 1] = pd.array([pd.NA] * len(df2), dtype=any_signed_int_ea_dtype) with tm.assert_produces_warning( FutureWarning, match="The 'mad' method is deprecated" ): result = df2.mad() expected = df.mad() expected[1] = pd.NA expected = expected.astype("Float64") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("meth", ["max", "min", "sum", "mean", "median"]) def test_groupby_regular_arithmetic_equivalent(meth): # GH#40660 df = DataFrame( {"a": [pd.Timedelta(hours=6), pd.Timedelta(hours=7)], "b": [12.1, 13.3]} ) expected = df.copy() with tm.assert_produces_warning(FutureWarning): result = getattr(df, meth)(level=0) tm.assert_frame_equal(result, expected) result = getattr(df.groupby(level=0), meth)(numeric_only=False) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT]) def test_frame_mixed_numeric_object_with_timestamp(ts_value): # GH 13912 df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}) with tm.assert_produces_warning( FutureWarning, match="The default value of numeric_only" ): result = df.sum() expected = Series([1, 1.1, "foo"], index=list("abc")) tm.assert_series_equal(result, expected) def test_prod_sum_min_count_mixed_object(): # https://github.com/pandas-dev/pandas/issues/41074 df = DataFrame([1, "a", True]) result = df.prod(axis=0, min_count=1, numeric_only=False) expected = Series(["a"]) tm.assert_series_equal(result, expected) msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") with pytest.raises(TypeError, match=msg): df.sum(axis=0, min_count=1, numeric_only=False) @pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"]) def test_reduction_axis_none_deprecation(method): # GH#21597 deprecate axis=None defaulting to axis=0 so that we can change it # to reducing over all axes. df = DataFrame(np.random.randn(4, 4)) meth = getattr(df, method) msg = f"scalar {method} over the entire DataFrame" with tm.assert_produces_warning(FutureWarning, match=msg): res = meth(axis=None) with tm.assert_produces_warning(None): expected = meth() tm.assert_series_equal(res, expected) tm.assert_series_equal(res, meth(axis=0)) @pytest.mark.parametrize( "kernel", [ "corr", "corrwith", "count", "cov", "idxmax", "idxmin", "kurt", "kurt", "max", "mean", "median", "min", "mode", "prod", "prod", "quantile", "sem", "skew", "std", "sum", "var", ], ) def test_numeric_only_deprecation(kernel): # GH#46852 df = DataFrame({"a": [1, 2, 3], "b": object}) args = (df,) if kernel == "corrwith" else () signature = inspect.signature(getattr(DataFrame, kernel)) default = signature.parameters["numeric_only"].default assert default is not True if kernel in ("idxmax", "idxmin"): # kernels that default to numeric_only=False and fail on nuisance columns assert default is False with pytest.raises(TypeError, match="not allowed for this dtype"): getattr(df, kernel)(*args) else: if default is None or default is lib.no_default: expected = getattr(df[["a"]], kernel)(*args) warn = FutureWarning else: # default must be False and works on any nuisance columns expected = getattr(df, kernel)(*args) if kernel == "mode": assert "b" in expected.columns else: assert "b" in expected.index warn = None msg = f"The default value of numeric_only in DataFrame.{kernel}" with tm.assert_produces_warning(warn, match=msg): result = getattr(df, kernel)(*args) tm.assert_equal(result, expected)