1027 lines
36 KiB
Python
1027 lines
36 KiB
Python
"""Tests for determining what is being grouped and for retrieving groups."""
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
CategoricalIndex,
|
|
DataFrame,
|
|
Index,
|
|
MultiIndex,
|
|
Series,
|
|
Timestamp,
|
|
date_range,
|
|
)
|
|
import pandas._testing as tm
|
|
from pandas.core.api import (
|
|
Float64Index,
|
|
Int64Index,
|
|
)
|
|
from pandas.core.groupby.grouper import Grouping
|
|
|
|
# selection
|
|
# --------------------------------
|
|
|
|
|
|
class TestSelection:
    """Selecting columns from a GroupBy via ``[]`` and attribute access."""

    def test_select_bad_cols(self):
        """Unknown columns raise a KeyError that names only the missing ones."""
        frame = DataFrame([[1, 2]], columns=["A", "B"])
        by_a = frame.groupby("A")

        missing_c = "\"Columns not found: 'C'\""
        with pytest.raises(KeyError, match=missing_c):
            by_a[["C"]]

        # "A" is a valid column, so the message must not mention it.  The
        # pattern needs rethinking if the message text is ever changed.
        no_a_anywhere = "^[^A]+$"
        with pytest.raises(KeyError, match=no_a_anywhere):
            by_a[["A", "C"]]

    def test_groupby_duplicated_column_errormsg(self):
        """Grouping by a duplicated column label raises ValueError (GH7511)."""
        rows = [range(4), range(2, 6), range(0, 8, 2)]
        frame = DataFrame(data=rows, columns=["A", "B", "A", "C"])

        not_one_dim = "Grouper for 'A' not 1-dimensional"
        for bad_keys in ("A", ["A", "B"]):
            with pytest.raises(ValueError, match=not_one_dim):
                frame.groupby(bad_keys)

        # "B" is unique, so grouping by it succeeds; check the column layout
        counted = frame.groupby("B").count()
        assert counted.columns.nlevels == 1
        assert counted.columns.size == 3

    def test_column_select_via_attr(self, df):
        """Attribute access selects a column; a column named "mean" is harmless."""
        via_attr = df.groupby("A").C.sum()
        via_item = df.groupby("A")["C"].sum()
        tm.assert_series_equal(via_attr, via_item)

        # add a column whose name collides with a GroupBy method
        df["mean"] = 1.5
        warning_text = "The default value of numeric_only"
        with tm.assert_produces_warning(FutureWarning, match=warning_text):
            from_method = df.groupby("A").mean()
            from_agg = df.groupby("A").agg(np.mean)
        tm.assert_frame_equal(from_method, from_agg)

    def test_getitem_list_of_columns(self):
        """A list of labels and an Index slice select the same columns."""
        data = {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
            "E": np.random.randn(8),
        }
        frame = DataFrame(data)

        from_labels = frame.groupby("A")[["C", "D"]].mean()
        from_index = frame.groupby("A")[frame.columns[2:4]].mean()
        expected = frame.loc[:, ["A", "C", "D"]].groupby("A").mean()

        for outcome in (from_labels, from_index):
            tm.assert_frame_equal(outcome, expected)

    def test_getitem_numeric_column_names(self):
        """Integer column labels can be selected from a GroupBy (GH #13731)."""
        data = {
            0: list("abcd") * 2,
            2: np.random.randn(8),
            4: np.random.randn(8),
            6: np.random.randn(8),
        }
        frame = DataFrame(data)

        from_index = frame.groupby(0)[frame.columns[1:3]].mean()
        from_labels = frame.groupby(0)[[2, 4]].mean()
        expected = frame.loc[:, [0, 2, 4]].groupby(0).mean()

        for outcome in (from_index, from_labels):
            tm.assert_frame_equal(outcome, expected)

        # per GH 23566 this should raise a FutureWarning
        with tm.assert_produces_warning(FutureWarning):
            frame.groupby(0)[2, 4].mean()

    def test_getitem_single_list_of_columns(self, df):
        """Indexing with a bare tuple of labels emits a FutureWarning."""
        # per GH 23566 this should raise a FutureWarning
        with tm.assert_produces_warning(FutureWarning):
            df.groupby("A")["C", "D"].mean()

    def test_getitem_single_column(self):
        """Selecting one label yields the matching column of the frame result."""
        data = {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
            "E": np.random.randn(8),
        }
        frame = DataFrame(data)

        result = frame.groupby("A")["C"].mean()

        # take the only value column of the equivalent frame-level result
        expected = frame.loc[:, ["A", "C"]].groupby("A").mean().iloc[:, 0]

        tm.assert_series_equal(result, expected)

    def test_indices_grouped_by_tuple_with_lambda(self):
        """Grouping by a tuple column or by a callable gives equal ``indices``."""
        # GH 36158
        pairs = ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))
        frame = DataFrame({"Tuples": pairs})

        by_column = frame.groupby("Tuples")
        by_callable = frame.groupby(lambda pos: frame.iloc[pos, 0])

        expected = by_column.indices
        result = by_callable.indices

        tm.assert_dict_equal(result, expected)
|
|
|
|
|
|
# grouping
|
|
# --------------------------------
|
|
|
|
|
|
class TestGrouping:
    """Tests for how group keys (columns, levels, Groupers, mappings) resolve."""

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeFloatIndex,
            tm.makeStringIndex,
            tm.makeIntIndex,
            tm.makeDateIndex,
            tm.makePeriodIndex,
        ],
    )
    def test_grouper_index_types(self, index):
        """An identity ``apply`` does not raise for several index types."""
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))

        df.index = index(len(df))
        df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)

        # same check with the index in reverse order
        df.index = list(reversed(df.index.tolist()))
        df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):
        """``pd.Grouper`` with both ``level`` and ``freq`` matches ``key`` + ``freq``."""
        # GH 7885
        # with level and freq specified in a pd.Grouper
        from datetime import (
            date,
            timedelta,
        )

        # Read the clock once: two separate date.today() calls could straddle
        # midnight and produce 16 dates, which would not match the 225
        # (15 x 15) values generated below.
        today = date.today()
        d0 = today - timedelta(days=14)
        dates = date_range(d0, today)
        date_index = MultiIndex.from_product([dates, dates], names=["foo", "bar"])
        df = DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = (
            df.reset_index()
            .groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")])
            .sum()
        )
        # reset index changes columns dtype to object
        expected.columns = Index([0], dtype="int64")

        result = df.groupby(
            [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby(
            [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):
        """``pd.Grouper`` on a key or level matches the equivalent plain groupby."""
        # GH 8795
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(pd.Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        g = df.groupby(pd.Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # apply(sum) also sums the key column, so add it to the expectation
        result = g.apply(lambda x: x.sum())
        expected["A"] = [0, 2, 4]
        expected = expected.loc[:, ["A", "B"]]
        tm.assert_frame_equal(result, expected)

        # GH14334
        # pd.Grouper(key=...) may be passed in a list
        df = DataFrame(
            {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
        )
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([pd.Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", pd.Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([pd.Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        s = Series(
            np.arange(8, dtype="int64"),
            index=MultiIndex.from_product(
                [list("ab"), range(2), date_range("20130101", periods=2)],
                names=["one", "two", "three"],
            ),
        )
        result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
        expected = Series(
            [28],
            index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="M", name="three"),
        )
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(pd.Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        """Grouping by a column plus an index level matches reset_index + columns."""
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = MultiIndex.from_tuples(
            [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
        )
        idx.names = ["outer", "inner"]
        df_multi = DataFrame(
            {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
            index=idx,
        )
        msg = "The default value of numeric_only"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean()
            expected = df_multi.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean()
            expected = df_multi.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index("outer")
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean()
            expected = df_single.reset_index().groupby(["B", "inner"]).mean()
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean()
            expected = df_single.reset_index().groupby(["inner", "B"]).mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        """Grouping by level names equals grouping by the same names as columns."""
        # GH9344, GH9049
        idx_names = ["x", "y"]
        idx = MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
        df = DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        # without casting, by_columns.columns is object-dtype
        by_columns.columns = by_columns.columns.astype(np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        """Grouping on a CategoricalIndex level sums per category on either axis."""
        # GH18432, adapted for GH25871
        columns = ["A", "B", "A", "B"]
        categories = ["B", "A"]
        data = np.array(
            [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
        )
        cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
        expected_columns = CategoricalIndex(
            categories, categories=categories, ordered=True
        )
        expected = DataFrame(data=expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        tm.assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):
        """A plain level Grouper combined with a monthly level Grouper bins rows."""
        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame(
            {"A": 1},
            index=MultiIndex.from_product(
                [list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
            ),
        )
        result = df.groupby(
            [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")]
        ).sum()
        # 80 days from 2013-01-01 cover 31 + 28 + 21 days of Jan/Feb/Mar
        expected = DataFrame(
            {"A": [31, 28, 21, 31, 28, 21]},
            index=MultiIndex.from_product(
                [list("ab"), date_range("20130101", freq="M", periods=3)],
                names=["one", "two"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        """Iterating the internal grouper yields the group keys."""
        assert sorted(df.groupby("A").grouper) == ["bar", "foo"]

    def test_empty_groups(self, df):
        """Grouping a frame by an empty list raises ValueError."""
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        """An existing internal grouper can be passed back to ``groupby``."""
        grouped = df.groupby("A")

        msg = "The default value of numeric_only"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            result = df.groupby(grouped.grouper).mean()
            expected = grouped.mean()
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        """A dict maps index labels to group keys."""
        # GH #679
        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        expected = s.groupby([0, 0, 1, 1]).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)

    @pytest.mark.parametrize(
        "index",
        [
            [0, 1, 2, 3],
            ["a", "b", "c", "d"],
            [Timestamp(2021, 7, 28 + i) for i in range(4)],
        ],
    )
    def test_groupby_series_named_with_tuple(self, frame_or_series, index):
        """A grouping Series whose name is a tuple keeps that name on the result."""
        # GH 42731
        obj = frame_or_series([1, 2, 3, 4], index=index)
        groups = Series([1, 0, 1, 0], index=index, name=("a", "a"))
        result = obj.groupby(groups).last()
        expected = frame_or_series([4, 3])
        expected.index.name = ("a", "a")
        tm.assert_equal(result, expected)

    def test_groupby_grouper_f_sanity_checked(self):
        """A key function returning the wrong number of labels is rejected."""
        dates = date_range("01-Jan-2013", periods=12, freq="MS")
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        """Constructing a Grouping from a two-column frame raises ValueError."""
        msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[["A", "A"]])

    def test_multiindex_passthru(self):
        """Grouping columns by every MultiIndex level returns the frame unchanged."""
        # GH 7997
        # regression from 0.14.1
        df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        result = df.groupby(axis=1, level=[0, 1]).first()
        tm.assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        """Negative level numbers address MultiIndex levels from the end."""
        # GH 13901
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level="second").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level="first").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe.sort_index()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, "first"]).sum()
        expected = mframe.groupby(level=["second", "first"]).sum()
        tm.assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        """Dict-style ``agg`` on an integer-labelled column does not raise."""
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({"Q": np.mean})

    def test_multiindex_columns_empty_level(self):
        """A column whose second level is "" can be grouped by first level or tuple."""
        lst = [["count", "values"], ["to filter", ""]]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, "A"]], columns=midx)

        grouped = df.groupby("to filter").groups
        assert grouped["A"] == [0]

        grouped = df.groupby([("to filter", "")]).groups
        assert grouped["A"] == [0]

        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        assert result == expected

        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        """A tuple key and a one-element list holding it give the same groups."""
        # GH 17979
        df = DataFrame(
            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
            columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
        )
        expected = df.groupby([("b", 1)]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        # NOTE(review): in the two sections below ``result`` is still computed
        # from ``df`` rather than from ``df2`` / ``df3`` -- confirm whether the
        # bare-tuple key was meant to be exercised on the new frames.
        df2 = DataFrame(
            df.values,
            columns=MultiIndex.from_arrays(
                [["a", "b", "b", "c"], ["d", "d", "e", "e"]]
            ),
        )
        expected = df2.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
        expected = df3.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level(self, sort, mframe, df):
        """Grouping by level number or name matches grouping by the level values."""
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()

        # grouping by raw values carries no name, so set it on the expectation
        expected0.index.name = "first"
        expected1.index.name = "second"

        assert result0.index.name == "first"
        assert result1.index.name == "second"

        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level="first", sort=sort).sum()
        result1 = frame.groupby(level="second", sort=sort).sum()
        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)

        # axis=1

        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        tm.assert_frame_equal(result0, expected0.T)
        tm.assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self, axis):
        """A level name must match the name of the axis being grouped."""
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index(
            "exp"
        )
        if axis in (1, "columns"):
            df = df.T
        # the real name is accepted ...
        df.groupby(level="exp", axis=axis)
        # ... while an unknown one is rejected
        msg = f"level name foo is not the name of the {df._get_axis_name(axis)}"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level="foo", axis=axis)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level_with_nas(self, sort):
        """Rows whose level code is -1 (missing) are left out of the level sums."""
        # GH 17537
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 22.0], index=[0, 1])
        tm.assert_series_equal(result, expected)

        # same layout, but the fifth row has a missing first-level value
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 18.0], index=[0.0, 1.0])
        tm.assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        """``groupby`` without ``by`` or ``level`` raises TypeError."""
        # PR8618 and issue 8015
        frame = mframe

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize(
        "sort,labels",
        [
            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
        ],
    )
    def test_level_preserve_order(self, sort, labels, mframe):
        """Group codes for a level grouping depend on the ``sort`` flag."""
        # GH 17537
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_grouping_labels(self, mframe):
        """Grouping by level values yields the expected integer codes."""
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_list_grouper_with_nat(self):
        """A frequency Grouper handles a NaT key, alone or wrapped in a list."""
        # GH 14715
        df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")})
        df.iloc[-1] = pd.NaT
        grouper = pd.Grouper(key="date", freq="AS")

        # Grouper in a list grouping
        result = df.groupby([grouper])
        expected = {Timestamp("2011-01-01"): Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {Timestamp("2011-01-01"): 365}
        tm.assert_dict_equal(result.groups, expected)

    @pytest.mark.parametrize(
        "func,expected",
        [
            (
                "transform",
                Series(name=2, dtype=np.float64, index=Index([])),
            ),
            (
                "agg",
                Series(name=2, dtype=np.float64, index=Float64Index([], name=1)),
            ),
            (
                "apply",
                Series(name=2, dtype=np.float64, index=Float64Index([], name=1)),
            ),
        ],
    )
    def test_evaluate_with_empty_groups(self, func, expected):
        """transform/agg/apply with an identity function work on an empty frame."""
        # 26208
        # test transform'ing empty groups
        # (not testing other agg fns, because they return
        # different index objects)
        df = DataFrame({1: [], 2: []})
        g = df.groupby(1, group_keys=False)
        result = getattr(g[2], func)(lambda x: x)
        tm.assert_series_equal(result, expected)

    def test_groupby_empty(self):
        """Grouping an empty Series by an empty list gives an empty result."""
        # https://github.com/pandas-dev/pandas/issues/27190
        s = Series([], name="name", dtype="float64")
        gr = s.groupby([])

        result = gr.mean()
        tm.assert_series_equal(result, s)

        # check group properties
        assert len(gr.grouper.groupings) == 1
        tm.assert_numpy_array_equal(
            gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp))
        )

        tm.assert_numpy_array_equal(
            gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp))
        )

        assert gr.grouper.group_info[2] == 0

        # check name
        assert s.groupby(s).grouper.names == ["name"]

    def test_groupby_level_index_value_all_na(self):
        """When one index level is entirely NaN, the level groupby is empty."""
        # issue 20519
        df = DataFrame(
            [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
        ).set_index(["A", "B"])
        result = df.groupby(level=["A", "B"]).sum()
        expected = DataFrame(
            data=[],
            index=MultiIndex(
                levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
                codes=[[], []],
                names=["A", "B"],
            ),
            columns=["C"],
            dtype="int64",
        )
        tm.assert_frame_equal(result, expected)

    def test_groupby_multiindex_level_empty(self):
        """Grouping an empty MultiIndex frame by a level name gives an empty frame."""
        # https://github.com/pandas-dev/pandas/issues/31670
        df = DataFrame(
            [[123, "a", 1.0], [123, "b", 2.0]], columns=["id", "category", "value"]
        )
        df = df.set_index(["id", "category"])
        # no value is negative, so this selection has no rows
        empty = df[df.value < 0]
        result = empty.groupby("id").sum()
        expected = DataFrame(
            dtype="float64", columns=["value"], index=Int64Index([], name="id")
        )
        tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
# get_group
|
|
# --------------------------------
|
|
|
|
|
|
class TestGetGroup:
|
|
def test_get_group(self):
|
|
# GH 5267
|
|
# be datelike friendly
|
|
df = DataFrame(
|
|
{
|
|
"DATE": pd.to_datetime(
|
|
[
|
|
"10-Oct-2013",
|
|
"10-Oct-2013",
|
|
"10-Oct-2013",
|
|
"11-Oct-2013",
|
|
"11-Oct-2013",
|
|
"11-Oct-2013",
|
|
]
|
|
),
|
|
"label": ["foo", "foo", "bar", "foo", "foo", "bar"],
|
|
"VAL": [1, 2, 3, 4, 5, 6],
|
|
}
|
|
)
|
|
|
|
g = df.groupby("DATE")
|
|
key = list(g.groups)[0]
|
|
result1 = g.get_group(key)
|
|
result2 = g.get_group(Timestamp(key).to_pydatetime())
|
|
result3 = g.get_group(str(Timestamp(key)))
|
|
tm.assert_frame_equal(result1, result2)
|
|
tm.assert_frame_equal(result1, result3)
|
|
|
|
g = df.groupby(["DATE", "label"])
|
|
|
|
key = list(g.groups)[0]
|
|
result1 = g.get_group(key)
|
|
result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
|
|
result3 = g.get_group((str(Timestamp(key[0])), key[1]))
|
|
tm.assert_frame_equal(result1, result2)
|
|
tm.assert_frame_equal(result1, result3)
|
|
|
|
# must pass a same-length tuple with multiple keys
|
|
msg = "must supply a tuple to get_group with multiple grouping keys"
|
|
with pytest.raises(ValueError, match=msg):
|
|
g.get_group("foo")
|
|
with pytest.raises(ValueError, match=msg):
|
|
g.get_group("foo")
|
|
msg = "must supply a same-length tuple to get_group with multiple grouping keys"
|
|
with pytest.raises(ValueError, match=msg):
|
|
g.get_group(("foo", "bar", "baz"))
|
|
|
|
def test_get_group_empty_bins(self, observed):
|
|
|
|
d = DataFrame([3, 1, 7, 6])
|
|
bins = [0, 5, 10, 15]
|
|
g = d.groupby(pd.cut(d[0], bins), observed=observed)
|
|
|
|
# TODO: should prob allow a str of Interval work as well
|
|
# IOW '(0, 5]'
|
|
result = g.get_group(pd.Interval(0, 5))
|
|
expected = DataFrame([3, 1], index=[0, 1])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
msg = r"Interval\(10, 15, closed='right'\)"
|
|
with pytest.raises(KeyError, match=msg):
|
|
g.get_group(pd.Interval(10, 15))
|
|
|
|
def test_get_group_grouped_by_tuple(self):
|
|
# GH 8121
|
|
df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T
|
|
gr = df.groupby("ids")
|
|
expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2])
|
|
result = gr.get_group((1,))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"])
|
|
df = DataFrame({"ids": [(x,) for x in dt]})
|
|
gr = df.groupby("ids")
|
|
result = gr.get_group(("2010-01-01",))
|
|
expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_get_group_grouped_by_tuple_with_lambda(self):
|
|
# GH 36158
|
|
df = DataFrame(
|
|
{"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))}
|
|
)
|
|
|
|
gb = df.groupby("Tuples")
|
|
gb_lambda = df.groupby(lambda x: df.iloc[x, 0])
|
|
|
|
expected = gb.get_group(list(gb.groups.keys())[0])
|
|
result = gb_lambda.get_group(list(gb_lambda.groups.keys())[0])
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_groupby_with_empty(self):
|
|
index = pd.DatetimeIndex(())
|
|
data = ()
|
|
series = Series(data, index, dtype=object)
|
|
grouper = pd.Grouper(freq="D")
|
|
grouped = series.groupby(grouper)
|
|
assert next(iter(grouped), None) is None
|
|
|
|
def test_groupby_with_single_column(self):
|
|
df = DataFrame({"a": list("abssbab")})
|
|
tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
|
|
# GH 13530
|
|
exp = DataFrame(index=Index(["a", "b", "s"], name="a"))
|
|
tm.assert_frame_equal(df.groupby("a").count(), exp)
|
|
tm.assert_frame_equal(df.groupby("a").sum(), exp)
|
|
tm.assert_frame_equal(df.groupby("a").nth(1), exp)
|
|
|
|
def test_gb_key_len_equal_axis_len(self):
|
|
# GH16843
|
|
# test ensures that index and column keys are recognized correctly
|
|
# when number of keys equals axis length of groupby
|
|
df = DataFrame(
|
|
[["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]],
|
|
columns=["first", "second", "third", "one"],
|
|
)
|
|
df = df.set_index(["first", "second"])
|
|
df = df.groupby(["first", "second", "third"]).size()
|
|
assert df.loc[("foo", "bar", "B")] == 2
|
|
assert df.loc[("foo", "baz", "C")] == 1
|
|
|
|
|
|
# groups & iteration
|
|
# --------------------------------
|
|
|
|
|
|
class TestIteration:
|
|
def test_groups(self, df):
|
|
grouped = df.groupby(["A"])
|
|
groups = grouped.groups
|
|
assert groups is grouped.groups # caching works
|
|
|
|
for k, v in grouped.groups.items():
|
|
assert (df.loc[v]["A"] == k).all()
|
|
|
|
grouped = df.groupby(["A", "B"])
|
|
groups = grouped.groups
|
|
assert groups is grouped.groups # caching works
|
|
|
|
for k, v in grouped.groups.items():
|
|
assert (df.loc[v]["A"] == k[0]).all()
|
|
assert (df.loc[v]["B"] == k[1]).all()
|
|
|
|
def test_grouping_is_iterable(self, tsframe):
|
|
# this code path isn't used anywhere else
|
|
# not sure it's useful
|
|
grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])
|
|
|
|
# test it works
|
|
for g in grouped.grouper.groupings[0]:
|
|
pass
|
|
|
|
def test_multi_iter(self):
|
|
s = Series(np.arange(6))
|
|
k1 = np.array(["a", "a", "a", "b", "b", "b"])
|
|
k2 = np.array(["1", "2", "1", "2", "1", "2"])
|
|
|
|
grouped = s.groupby([k1, k2])
|
|
|
|
iterated = list(grouped)
|
|
expected = [
|
|
("a", "1", s[[0, 2]]),
|
|
("a", "2", s[[1]]),
|
|
("b", "1", s[[4]]),
|
|
("b", "2", s[[3, 5]]),
|
|
]
|
|
for i, ((one, two), three) in enumerate(iterated):
|
|
e1, e2, e3 = expected[i]
|
|
assert e1 == one
|
|
assert e2 == two
|
|
tm.assert_series_equal(three, e3)
|
|
|
|
def test_multi_iter_frame(self, three_group):
|
|
k1 = np.array(["b", "b", "b", "a", "a", "a"])
|
|
k2 = np.array(["1", "2", "1", "2", "1", "2"])
|
|
df = DataFrame(
|
|
{"v1": np.random.randn(6), "v2": np.random.randn(6), "k1": k1, "k2": k2},
|
|
index=["one", "two", "three", "four", "five", "six"],
|
|
)
|
|
|
|
grouped = df.groupby(["k1", "k2"])
|
|
|
|
# things get sorted!
|
|
iterated = list(grouped)
|
|
idx = df.index
|
|
expected = [
|
|
("a", "1", df.loc[idx[[4]]]),
|
|
("a", "2", df.loc[idx[[3, 5]]]),
|
|
("b", "1", df.loc[idx[[0, 2]]]),
|
|
("b", "2", df.loc[idx[[1]]]),
|
|
]
|
|
for i, ((one, two), three) in enumerate(iterated):
|
|
e1, e2, e3 = expected[i]
|
|
assert e1 == one
|
|
assert e2 == two
|
|
tm.assert_frame_equal(three, e3)
|
|
|
|
# don't iterate through groups with no data
|
|
df["k1"] = np.array(["b", "b", "b", "a", "a", "a"])
|
|
df["k2"] = np.array(["1", "1", "1", "2", "2", "2"])
|
|
grouped = df.groupby(["k1", "k2"])
|
|
groups = {key: gp for key, gp in grouped}
|
|
assert len(groups) == 2
|
|
|
|
# axis = 1
|
|
three_levels = three_group.groupby(["A", "B", "C"]).mean()
|
|
grouped = three_levels.T.groupby(axis=1, level=(1, 2))
|
|
for key, group in grouped:
|
|
pass
|
|
|
|
def test_dictify(self, df):
|
|
dict(iter(df.groupby("A")))
|
|
dict(iter(df.groupby(["A", "B"])))
|
|
dict(iter(df["C"].groupby(df["A"])))
|
|
dict(iter(df["C"].groupby([df["A"], df["B"]])))
|
|
dict(iter(df.groupby("A")["C"]))
|
|
dict(iter(df.groupby(["A", "B"])["C"]))
|
|
|
|
def test_groupby_with_small_elem(self):
|
|
# GH 8542
|
|
# length=2
|
|
df = DataFrame(
|
|
{"event": ["start", "start"], "change": [1234, 5678]},
|
|
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]),
|
|
)
|
|
grouped = df.groupby([pd.Grouper(freq="M"), "event"])
|
|
assert len(grouped.groups) == 2
|
|
assert grouped.ngroups == 2
|
|
assert (Timestamp("2014-09-30"), "start") in grouped.groups
|
|
assert (Timestamp("2013-10-31"), "start") in grouped.groups
|
|
|
|
res = grouped.get_group((Timestamp("2014-09-30"), "start"))
|
|
tm.assert_frame_equal(res, df.iloc[[0], :])
|
|
res = grouped.get_group((Timestamp("2013-10-31"), "start"))
|
|
tm.assert_frame_equal(res, df.iloc[[1], :])
|
|
|
|
df = DataFrame(
|
|
{"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
|
|
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]),
|
|
)
|
|
grouped = df.groupby([pd.Grouper(freq="M"), "event"])
|
|
assert len(grouped.groups) == 2
|
|
assert grouped.ngroups == 2
|
|
assert (Timestamp("2014-09-30"), "start") in grouped.groups
|
|
assert (Timestamp("2013-10-31"), "start") in grouped.groups
|
|
|
|
res = grouped.get_group((Timestamp("2014-09-30"), "start"))
|
|
tm.assert_frame_equal(res, df.iloc[[0, 2], :])
|
|
res = grouped.get_group((Timestamp("2013-10-31"), "start"))
|
|
tm.assert_frame_equal(res, df.iloc[[1], :])
|
|
|
|
# length=3
|
|
df = DataFrame(
|
|
{"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
|
|
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]),
|
|
)
|
|
grouped = df.groupby([pd.Grouper(freq="M"), "event"])
|
|
assert len(grouped.groups) == 3
|
|
assert grouped.ngroups == 3
|
|
assert (Timestamp("2014-09-30"), "start") in grouped.groups
|
|
assert (Timestamp("2013-10-31"), "start") in grouped.groups
|
|
assert (Timestamp("2014-08-31"), "start") in grouped.groups
|
|
|
|
res = grouped.get_group((Timestamp("2014-09-30"), "start"))
|
|
tm.assert_frame_equal(res, df.iloc[[0], :])
|
|
res = grouped.get_group((Timestamp("2013-10-31"), "start"))
|
|
tm.assert_frame_equal(res, df.iloc[[1], :])
|
|
res = grouped.get_group((Timestamp("2014-08-31"), "start"))
|
|
tm.assert_frame_equal(res, df.iloc[[2], :])
|
|
|
|
def test_grouping_string_repr(self):
|
|
# GH 13394
|
|
mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
|
|
df = DataFrame([[1, 2, 3]], columns=mi)
|
|
gr = df.groupby(df[("A", "a")])
|
|
|
|
result = gr.grouper.groupings[0].__repr__()
|
|
expected = "Grouping(('A', 'a'))"
|
|
assert result == expected
|