145 lines
4.8 KiB
Python
145 lines
4.8 KiB
Python
|
import pytest
|
||
|
|
||
|
from pandas import (
|
||
|
DataFrame,
|
||
|
Index,
|
||
|
Series,
|
||
|
)
|
||
|
import pandas._testing as tm
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
|
||
|
def test_groupby_sample_balanced_groups_shape(n, frac):
|
||
|
values = [1] * 10 + [2] * 10
|
||
|
df = DataFrame({"a": values, "b": values})
|
||
|
|
||
|
result = df.groupby("a").sample(n=n, frac=frac)
|
||
|
values = [1] * 2 + [2] * 2
|
||
|
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.groupby("a")["b"].sample(n=n, frac=frac)
|
||
|
expected = Series(values, name="b", index=result.index)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_sample_unbalanced_groups_shape():
|
||
|
values = [1] * 10 + [2] * 20
|
||
|
df = DataFrame({"a": values, "b": values})
|
||
|
|
||
|
result = df.groupby("a").sample(n=5)
|
||
|
values = [1] * 5 + [2] * 5
|
||
|
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.groupby("a")["b"].sample(n=5)
|
||
|
expected = Series(values, name="b", index=result.index)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_sample_index_value_spans_groups():
|
||
|
values = [1] * 3 + [2] * 3
|
||
|
df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])
|
||
|
|
||
|
result = df.groupby("a").sample(n=2)
|
||
|
values = [1] * 2 + [2] * 2
|
||
|
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.groupby("a")["b"].sample(n=2)
|
||
|
expected = Series(values, name="b", index=result.index)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_sample_n_and_frac_raises():
|
||
|
df = DataFrame({"a": [1, 2], "b": [1, 2]})
|
||
|
msg = "Please enter a value for `frac` OR `n`, not both"
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("a").sample(n=1, frac=1.0)
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("a")["b"].sample(n=1, frac=1.0)
|
||
|
|
||
|
|
||
|
def test_groupby_sample_frac_gt_one_without_replacement_raises():
|
||
|
df = DataFrame({"a": [1, 2], "b": [1, 2]})
|
||
|
msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("a").sample(frac=1.5, replace=False)
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("a")["b"].sample(frac=1.5, replace=False)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("n", [-1, 1.5])
|
||
|
def test_groupby_sample_invalid_n_raises(n):
|
||
|
df = DataFrame({"a": [1, 2], "b": [1, 2]})
|
||
|
|
||
|
if n < 0:
|
||
|
msg = "A negative number of rows requested. Please provide `n` >= 0."
|
||
|
else:
|
||
|
msg = "Only integers accepted as `n` values"
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("a").sample(n=n)
|
||
|
|
||
|
with pytest.raises(ValueError, match=msg):
|
||
|
df.groupby("a")["b"].sample(n=n)
|
||
|
|
||
|
|
||
|
def test_groupby_sample_oversample():
|
||
|
values = [1] * 10 + [2] * 10
|
||
|
df = DataFrame({"a": values, "b": values})
|
||
|
|
||
|
result = df.groupby("a").sample(frac=2.0, replace=True)
|
||
|
values = [1] * 20 + [2] * 20
|
||
|
expected = DataFrame({"a": values, "b": values}, index=result.index)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
|
||
|
expected = Series(values, name="b", index=result.index)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_sample_without_n_or_frac():
|
||
|
values = [1] * 10 + [2] * 10
|
||
|
df = DataFrame({"a": values, "b": values})
|
||
|
|
||
|
result = df.groupby("a").sample(n=None, frac=None)
|
||
|
expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.groupby("a")["b"].sample(n=None, frac=None)
|
||
|
expected = Series([1, 2], name="b", index=result.index)
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize(
|
||
|
"index, expected_index",
|
||
|
[(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
|
||
|
)
|
||
|
def test_groupby_sample_with_weights(index, expected_index):
|
||
|
# GH 39927 - tests for integer index needed
|
||
|
values = [1] * 2 + [2] * 2
|
||
|
df = DataFrame({"a": values, "b": values}, index=Index(index))
|
||
|
|
||
|
result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
|
||
|
expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
|
||
|
expected = Series(values, name="b", index=Index(expected_index))
|
||
|
tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_groupby_sample_with_selections():
|
||
|
# GH 39928
|
||
|
values = [1] * 10 + [2] * 10
|
||
|
df = DataFrame({"a": values, "b": values, "c": values})
|
||
|
|
||
|
result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
|
||
|
expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
|
||
|
tm.assert_frame_equal(result, expected)
|