194 lines
5.6 KiB
Python
194 lines
5.6 KiB
Python
|
"""
|
||
|
these are systematically testing all of the args to value_counts
|
||
|
with different size combinations. This is to ensure stability of the sorting
|
||
|
and proper parameter handling
|
||
|
"""
|
||
|
|
||
|
from itertools import product
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas import (
|
||
|
Categorical,
|
||
|
CategoricalIndex,
|
||
|
DataFrame,
|
||
|
Grouper,
|
||
|
MultiIndex,
|
||
|
Series,
|
||
|
date_range,
|
||
|
to_datetime,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
def tests_value_counts_index_names_category_column():
|
||
|
# GH44324 Missing name of index category column
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"gender": ["female"],
|
||
|
"country": ["US"],
|
||
|
}
|
||
|
)
|
||
|
df["gender"] = df["gender"].astype("category")
|
||
|
result = df.groupby("country")["gender"].value_counts()
|
||
|
|
||
|
# Construct expected, very specific multiindex
|
||
|
df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
|
||
|
df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
|
||
|
mi_expected = MultiIndex.from_frame(df_mi_expected)
|
||
|
expected = Series([1], index=mi_expected, name="gender")
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
# our starting frame
def seed_df(seed_nans, n, m):
    """Return the starting frame: ``n`` rows with a letter, a date and an int.

    Column "1st" draws from "abcd", "2nd" from ten fixed days starting
    2015-08-24, and "3rd" from the integers 1..m.  When ``seed_nans`` is true,
    NaNs are written into all three columns at fixed strides; the explicit
    seed keeps the output deterministic across calls.
    """
    np.random.seed(1234)
    days = date_range("2015-08-24", periods=10)

    # dict-literal order fixes the RNG draw order: choice, choice, randint
    data = {
        "1st": np.random.choice(list("abcd"), n),
        "2nd": np.random.choice(days, n),
        "3rd": np.random.randint(1, m + 1, n),
    }
    frame = DataFrame(data)

    if seed_nans:
        # same positions every call, thanks to the fixed seed above
        for start, step, col in [
            (1, 11, "1st"),
            (3, 17, "2nd"),
            (7, 19, "3rd"),
            (8, 19, "3rd"),
            (9, 19, "3rd"),
        ]:
            frame.loc[start::step, col] = np.nan

    return frame
|
||
|
|
||
|
|
||
|
# create input df, keys, and the bins
binned = []
ids = []
for seed_nans in [True, False]:
    for n, m in product((100, 1000), (5, 20)):
        df = seed_df(seed_nans, n, m)
        # no-bins case plus one even-width edge array covering column "3rd"
        bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
        keys = "1st", "2nd", ["1st", "2nd"]
        for k, b in product(keys, bins):
            binned.append((df, k, b, n, m))
            ids.append(f"{k}-{n}-{m}")
|
||
|
|
||
|
|
||
|
@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
    df, keys, bins, n, m, isort, normalize, sort, ascending, dropna
):
    """groupby value_counts must agree with apply(Series.value_counts)."""

    def rebuild_index(obj):
        # Rebuild as a fresh MultiIndex so the levels compare cleanly (xref GH9212)
        levels = [obj.index.get_level_values(i) for i in range(obj.index.nlevels)]
        obj.index = MultiIndex.from_arrays(levels, names=obj.index.names)
        return obj

    kwargs = {
        "normalize": normalize,
        "sort": sort,
        "ascending": ascending,
        "dropna": dropna,
        "bins": bins,
    }

    left = df.groupby(keys, sort=isort)["3rd"].value_counts(**kwargs)

    right = df.groupby(keys, sort=isort)["3rd"].apply(Series.value_counts, **kwargs)
    right.index.names = right.index.names[:-1] + ["3rd"]

    # have to sort on index because of unstable sort on values
    left = rebuild_index(left)
    right = rebuild_index(right)
    tm.assert_series_equal(left.sort_index(), right.sort_index())
|
||
|
|
||
|
|
||
|
def test_series_groupby_value_counts_with_grouper():
|
||
|
# GH28479
|
||
|
df = DataFrame(
|
||
|
{
|
||
|
"Timestamp": [
|
||
|
1565083561,
|
||
|
1565083561 + 86400,
|
||
|
1565083561 + 86500,
|
||
|
1565083561 + 86400 * 2,
|
||
|
1565083561 + 86400 * 3,
|
||
|
1565083561 + 86500 * 3,
|
||
|
1565083561 + 86400 * 4,
|
||
|
],
|
||
|
"Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
|
||
|
}
|
||
|
).drop([3])
|
||
|
|
||
|
df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
|
||
|
dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
|
||
|
|
||
|
# have to sort on index because of unstable sort on values xref GH9212
|
||
|
result = dfg["Food"].value_counts().sort_index()
|
||
|
expected = dfg["Food"].apply(Series.value_counts).sort_index()
|
||
|
expected.index.names = result.index.names
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_empty(columns):
    # GH39172: an all-empty frame must yield an empty, well-formed Series
    df = DataFrame(columns=columns)
    dfg = df.groupby(columns[:-1])

    result = dfg[columns[-1]].value_counts()

    # empty counts, but the MultiIndex must still carry every column name
    empty_levels = [[] for _ in columns]
    expected = Series([], name=columns[-1], dtype=result.dtype)
    expected.index = MultiIndex.from_arrays(empty_levels, names=columns)

    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_one_row(columns):
    # GH42618: a single-row frame must not drop the count of its only group
    df = DataFrame(data=[range(len(columns))], columns=columns)
    grouped = df.groupby(columns[:-1])

    result = grouped[columns[-1]].value_counts()
    expected = df.value_counts().rename(columns[-1])

    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_series_groupby_value_counts_on_categorical():
|
||
|
# GH38672
|
||
|
|
||
|
s = Series(Categorical(["a"], categories=["a", "b"]))
|
||
|
result = s.groupby([0]).value_counts()
|
||
|
|
||
|
expected = Series(
|
||
|
data=[1, 0],
|
||
|
index=MultiIndex.from_arrays(
|
||
|
[
|
||
|
[0, 0],
|
||
|
CategoricalIndex(
|
||
|
["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
|
||
|
),
|
||
|
]
|
||
|
),
|
||
|
)
|
||
|
|
||
|
# Expected:
|
||
|
# 0 a 1
|
||
|
# b 0
|
||
|
# dtype: int64
|
||
|
|
||
|
tm.assert_series_equal(result, expected)
|