aoc-2022/venv/Lib/site-packages/pandas/tests/frame/methods/test_reindex.py

1226 lines
43 KiB
Python

from datetime import (
datetime,
timedelta,
)
import inspect
import numpy as np
import pytest
from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Index,
MultiIndex,
Series,
date_range,
isna,
)
import pandas._testing as tm
from pandas.api.types import CategoricalDtype as CDT
import pandas.core.common as com
class TestReindexSetIndex:
# Tests that check both reindex and set_index
def test_dti_set_index_reindex_datetimeindex(self):
# GH#6631
df = DataFrame(np.random.random(6))
idx1 = date_range("2011/01/01", periods=6, freq="M", tz="US/Eastern")
idx2 = date_range("2013", periods=6, freq="A", tz="Asia/Tokyo")
df = df.set_index(idx1)
tm.assert_index_equal(df.index, idx1)
df = df.reindex(idx2)
tm.assert_index_equal(df.index, idx2)
def test_dti_set_index_reindex_freq_with_tz(self):
# GH#11314 with tz
index = date_range(
datetime(2015, 10, 1), datetime(2015, 10, 1, 23), freq="H", tz="US/Eastern"
)
df = DataFrame(np.random.randn(24, 1), columns=["a"], index=index)
new_index = date_range(
datetime(2015, 10, 2), datetime(2015, 10, 2, 23), freq="H", tz="US/Eastern"
)
result = df.set_index(new_index)
assert result.index.freq == index.freq
def test_set_reset_index_intervalindex(self):
df = DataFrame({"A": range(10)})
ser = pd.cut(df.A, 5)
df["B"] = ser
df = df.set_index("B")
df = df.reset_index()
def test_setitem_reset_index_dtypes(self):
# GH 22060
df = DataFrame(columns=["a", "b", "c"]).astype(
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64}
)
df1 = df.set_index(["a"])
df1["d"] = []
result = df1.reset_index()
expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype(
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64}
)
tm.assert_frame_equal(result, expected)
df2 = df.set_index(["a", "b"])
df2["d"] = []
result = df2.reset_index()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"timezone, year, month, day, hour",
[["America/Chicago", 2013, 11, 3, 1], ["America/Santiago", 2021, 4, 3, 23]],
)
def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour):
# see gh-40817
test_timezone = gettz(timezone)
transition_1 = pd.Timestamp(
year=year,
month=month,
day=day,
hour=hour,
minute=0,
fold=0,
tzinfo=test_timezone,
)
transition_2 = pd.Timestamp(
year=year,
month=month,
day=day,
hour=hour,
minute=0,
fold=1,
tzinfo=test_timezone,
)
df = (
DataFrame({"index": [transition_1, transition_2], "vals": ["a", "b"]})
.set_index("index")
.reindex(["1", "2"])
)
tm.assert_frame_equal(
df,
DataFrame({"index": ["1", "2"], "vals": [None, None]}).set_index("index"),
)
class TestDataFrameSelectReindex:
# These are specific reindex-based tests; other indexing tests should go in
# test_indexing
def test_reindex_copies(self):
# based on asv time_reindex_axis1
N = 10
df = DataFrame(np.random.randn(N * 10, N))
cols = np.arange(N)
np.random.shuffle(cols)
result = df.reindex(columns=cols, copy=True)
assert not np.shares_memory(result[0]._values, df[0]._values)
# pass both columns and index
result2 = df.reindex(columns=cols, index=df.index, copy=True)
assert not np.shares_memory(result2[0]._values, df[0]._values)
@td.skip_array_manager_not_yet_implemented
def test_reindex_date_fill_value(self):
# passing date to dt64 is deprecated
arr = date_range("2016-01-01", periods=6).values.reshape(3, 2)
df = DataFrame(arr, columns=["A", "B"], index=range(3))
ts = df.iloc[0, 0]
fv = ts.date()
with tm.assert_produces_warning(FutureWarning):
res = df.reindex(index=range(4), columns=["A", "B", "C"], fill_value=fv)
expected = DataFrame(
{"A": df["A"].tolist() + [ts], "B": df["B"].tolist() + [ts], "C": [ts] * 4}
)
tm.assert_frame_equal(res, expected)
# only reindexing rows
with tm.assert_produces_warning(FutureWarning):
res = df.reindex(index=range(4), fill_value=fv)
tm.assert_frame_equal(res, expected[["A", "B"]])
# same with a datetime-castable str
res = df.reindex(
index=range(4), columns=["A", "B", "C"], fill_value="2016-01-01"
)
tm.assert_frame_equal(res, expected)
def test_reindex_with_multi_index(self):
# https://github.com/pandas-dev/pandas/issues/29896
# tests for reindexing a multi-indexed DataFrame with a new MultiIndex
#
# confirms that we can reindex a multi-indexed DataFrame with a new
# MultiIndex object correctly when using no filling, backfilling, and
# padding
#
# The DataFrame, `df`, used in this test is:
# c
# a b
# -1 0 A
# 1 B
# 2 C
# 3 D
# 4 E
# 5 F
# 6 G
# 0 0 A
# 1 B
# 2 C
# 3 D
# 4 E
# 5 F
# 6 G
# 1 0 A
# 1 B
# 2 C
# 3 D
# 4 E
# 5 F
# 6 G
#
# and the other MultiIndex, `new_multi_index`, is:
# 0: 0 0.5
# 1: 2.0
# 2: 5.0
# 3: 5.8
df = DataFrame(
{
"a": [-1] * 7 + [0] * 7 + [1] * 7,
"b": list(range(7)) * 3,
"c": ["A", "B", "C", "D", "E", "F", "G"] * 3,
}
).set_index(["a", "b"])
new_index = [0.5, 2.0, 5.0, 5.8]
new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"])
# reindexing w/o a `method` value
reindexed = df.reindex(new_multi_index)
expected = DataFrame(
{"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]}
).set_index(["a", "b"])
tm.assert_frame_equal(expected, reindexed)
# reindexing with backfilling
expected = DataFrame(
{"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]}
).set_index(["a", "b"])
reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill")
tm.assert_frame_equal(expected, reindexed_with_backfilling)
reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill")
tm.assert_frame_equal(expected, reindexed_with_backfilling)
# reindexing with padding
expected = DataFrame(
{"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]}
).set_index(["a", "b"])
reindexed_with_padding = df.reindex(new_multi_index, method="pad")
tm.assert_frame_equal(expected, reindexed_with_padding)
reindexed_with_padding = df.reindex(new_multi_index, method="ffill")
tm.assert_frame_equal(expected, reindexed_with_padding)
@pytest.mark.parametrize(
"method,expected_values",
[
("nearest", [0, 1, 1, 2]),
("pad", [np.nan, 0, 1, 1]),
("backfill", [0, 1, 2, 2]),
],
)
def test_reindex_methods(self, method, expected_values):
df = DataFrame({"x": list(range(5))})
target = np.array([-0.1, 0.9, 1.1, 1.5])
expected = DataFrame({"x": expected_values}, index=target)
actual = df.reindex(target, method=method)
tm.assert_frame_equal(expected, actual)
actual = df.reindex(target, method=method, tolerance=1)
tm.assert_frame_equal(expected, actual)
actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1])
tm.assert_frame_equal(expected, actual)
e2 = expected[::-1]
actual = df.reindex(target[::-1], method=method)
tm.assert_frame_equal(e2, actual)
new_order = [3, 0, 2, 1]
e2 = expected.iloc[new_order]
actual = df.reindex(target[new_order], method=method)
tm.assert_frame_equal(e2, actual)
switched_method = (
"pad" if method == "backfill" else "backfill" if method == "pad" else method
)
actual = df[::-1].reindex(target, method=switched_method)
tm.assert_frame_equal(expected, actual)
def test_reindex_methods_nearest_special(self):
df = DataFrame({"x": list(range(5))})
target = np.array([-0.1, 0.9, 1.1, 1.5])
expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target)
actual = df.reindex(target, method="nearest", tolerance=0.2)
tm.assert_frame_equal(expected, actual)
expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target)
actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1])
tm.assert_frame_equal(expected, actual)
def test_reindex_nearest_tz(self, tz_aware_fixture):
# GH26683
tz = tz_aware_fixture
idx = date_range("2019-01-01", periods=5, tz=tz)
df = DataFrame({"x": list(range(5))}, index=idx)
expected = df.head(3)
actual = df.reindex(idx[:3], method="nearest")
tm.assert_frame_equal(expected, actual)
def test_reindex_nearest_tz_empty_frame(self):
# https://github.com/pandas-dev/pandas/issues/31964
dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"])
df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"]))
expected = DataFrame(index=dti)
result = df.reindex(dti, method="nearest")
tm.assert_frame_equal(result, expected)
def test_reindex_frame_add_nat(self):
rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s")
df = DataFrame({"A": np.random.randn(len(rng)), "B": rng})
result = df.reindex(range(15))
assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]"))
mask = com.isna(result)["B"]
assert mask[-5:].all()
assert not mask[:-5].any()
@pytest.mark.parametrize(
"method, exp_values",
[("ffill", [0, 1, 2, 3]), ("bfill", [1.0, 2.0, 3.0, np.nan])],
)
def test_reindex_frame_tz_ffill_bfill(self, frame_or_series, method, exp_values):
# GH#38566
obj = frame_or_series(
[0, 1, 2, 3],
index=date_range("2020-01-01 00:00:00", periods=4, freq="H", tz="UTC"),
)
new_index = date_range("2020-01-01 00:01:00", periods=4, freq="H", tz="UTC")
result = obj.reindex(new_index, method=method, tolerance=pd.Timedelta("1 hour"))
expected = frame_or_series(exp_values, index=new_index)
tm.assert_equal(result, expected)
def test_reindex_limit(self):
# GH 28631
data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]]
exp_data = [
["A", "A", "A"],
["B", "B", "B"],
["C", "C", "C"],
["D", "D", "D"],
["D", "D", "D"],
[np.nan, np.nan, np.nan],
]
df = DataFrame(data)
result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1)
expected = DataFrame(exp_data)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"idx, check_index_type",
[
[["C", "B", "A"], True],
[["F", "C", "A", "D"], True],
[["A"], True],
[["A", "B", "C"], True],
[["C", "A", "B"], True],
[["C", "B"], True],
[["C", "A"], True],
[["A", "B"], True],
[["B", "A", "C"], True],
# reindex by these causes different MultiIndex levels
[["D", "F"], False],
[["A", "C", "B"], False],
],
)
def test_reindex_level_verify_first_level(self, idx, check_index_type):
df = DataFrame(
{
"jim": list("B" * 4 + "A" * 2 + "C" * 3),
"joe": list("abcdeabcd")[::-1],
"jolie": [10, 20, 30] * 3,
"joline": np.random.randint(0, 1000, 9),
}
)
icol = ["jim", "joe", "jolie"]
def f(val):
return np.nonzero((df["jim"] == val).to_numpy())[0]
i = np.concatenate(list(map(f, idx)))
left = df.set_index(icol).reindex(idx, level="jim")
right = df.iloc[i].set_index(icol)
tm.assert_frame_equal(left, right, check_index_type=check_index_type)
@pytest.mark.parametrize(
"idx",
[
("mid",),
("mid", "btm"),
("mid", "btm", "top"),
("mid",),
("mid", "top"),
("mid", "top", "btm"),
("btm",),
("btm", "mid"),
("btm", "mid", "top"),
("btm",),
("btm", "top"),
("btm", "top", "mid"),
("top",),
("top", "mid"),
("top", "mid", "btm"),
("top",),
("top", "btm"),
("top", "btm", "mid"),
],
)
def test_reindex_level_verify_first_level_repeats(self, idx):
df = DataFrame(
{
"jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7,
"joe": ["3rd"] * 2
+ ["1st"] * 3
+ ["2nd"] * 3
+ ["1st"] * 2
+ ["3rd"] * 3
+ ["1st"] * 2
+ ["3rd"] * 3
+ ["2nd"] * 2,
# this needs to be jointly unique with jim and joe or
# reindexing will fail ~1.5% of the time, this works
# out to needing unique groups of same size as joe
"jolie": np.concatenate(
[
np.random.choice(1000, x, replace=False)
for x in [2, 3, 3, 2, 3, 2, 3, 2]
]
),
"joline": np.random.randn(20).round(3) * 10,
}
)
icol = ["jim", "joe", "jolie"]
def f(val):
return np.nonzero((df["jim"] == val).to_numpy())[0]
i = np.concatenate(list(map(f, idx)))
left = df.set_index(icol).reindex(idx, level="jim")
right = df.iloc[i].set_index(icol)
tm.assert_frame_equal(left, right)
@pytest.mark.parametrize(
"idx, indexer",
[
[
["1st", "2nd", "3rd"],
[2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17],
],
[
["3rd", "2nd", "1st"],
[0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14],
],
[["2nd", "3rd"], [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17]],
[["3rd", "1st"], [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14]],
],
)
def test_reindex_level_verify_repeats(self, idx, indexer):
df = DataFrame(
{
"jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7,
"joe": ["3rd"] * 2
+ ["1st"] * 3
+ ["2nd"] * 3
+ ["1st"] * 2
+ ["3rd"] * 3
+ ["1st"] * 2
+ ["3rd"] * 3
+ ["2nd"] * 2,
# this needs to be jointly unique with jim and joe or
# reindexing will fail ~1.5% of the time, this works
# out to needing unique groups of same size as joe
"jolie": np.concatenate(
[
np.random.choice(1000, x, replace=False)
for x in [2, 3, 3, 2, 3, 2, 3, 2]
]
),
"joline": np.random.randn(20).round(3) * 10,
}
)
icol = ["jim", "joe", "jolie"]
left = df.set_index(icol).reindex(idx, level="joe")
right = df.iloc[indexer].set_index(icol)
tm.assert_frame_equal(left, right)
@pytest.mark.parametrize(
"idx, indexer, check_index_type",
[
[list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6], True],
[list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6], True],
[list("abc"), [3, 2, 1, 8, 7, 6], True],
[list("eca"), [1, 3, 4, 6, 8], True],
[list("edc"), [0, 1, 4, 5, 6], True],
[list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6], True],
[list("edwq"), [0, 4, 5], True],
[list("wq"), [], False],
],
)
def test_reindex_level_verify(self, idx, indexer, check_index_type):
df = DataFrame(
{
"jim": list("B" * 4 + "A" * 2 + "C" * 3),
"joe": list("abcdeabcd")[::-1],
"jolie": [10, 20, 30] * 3,
"joline": np.random.randint(0, 1000, 9),
}
)
icol = ["jim", "joe", "jolie"]
left = df.set_index(icol).reindex(idx, level="joe")
right = df.iloc[indexer].set_index(icol)
tm.assert_frame_equal(left, right, check_index_type=check_index_type)
def test_non_monotonic_reindex_methods(self):
dr = date_range("2013-08-01", periods=6, freq="B")
data = np.random.randn(6, 1)
df = DataFrame(data, index=dr, columns=list("A"))
df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A"))
# index is not monotonic increasing or decreasing
msg = "index must be monotonic increasing or decreasing"
with pytest.raises(ValueError, match=msg):
df_rev.reindex(df.index, method="pad")
with pytest.raises(ValueError, match=msg):
df_rev.reindex(df.index, method="ffill")
with pytest.raises(ValueError, match=msg):
df_rev.reindex(df.index, method="bfill")
with pytest.raises(ValueError, match=msg):
df_rev.reindex(df.index, method="nearest")
def test_reindex_sparse(self):
# https://github.com/pandas-dev/pandas/issues/35286
df = DataFrame(
{"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))}
)
result = df.reindex([0, 2])
expected = DataFrame(
{
"A": [0.0, np.nan],
"B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)),
},
index=[0, 2],
)
tm.assert_frame_equal(result, expected)
def test_reindex(self, float_frame):
datetime_series = tm.makeTimeSeries(nper=30)
newFrame = float_frame.reindex(datetime_series.index)
for col in newFrame.columns:
for idx, val in newFrame[col].items():
if idx in float_frame.index:
if np.isnan(val):
assert np.isnan(float_frame[col][idx])
else:
assert val == float_frame[col][idx]
else:
assert np.isnan(val)
for col, series in newFrame.items():
assert tm.equalContents(series.index, newFrame.index)
emptyFrame = float_frame.reindex(Index([]))
assert len(emptyFrame.index) == 0
# Cython code should be unit-tested directly
nonContigFrame = float_frame.reindex(datetime_series.index[::2])
for col in nonContigFrame.columns:
for idx, val in nonContigFrame[col].items():
if idx in float_frame.index:
if np.isnan(val):
assert np.isnan(float_frame[col][idx])
else:
assert val == float_frame[col][idx]
else:
assert np.isnan(val)
for col, series in nonContigFrame.items():
assert tm.equalContents(series.index, nonContigFrame.index)
# corner cases
# Same index, copies values but not index if copy=False
newFrame = float_frame.reindex(float_frame.index, copy=False)
assert newFrame.index is float_frame.index
# length zero
newFrame = float_frame.reindex([])
assert newFrame.empty
assert len(newFrame.columns) == len(float_frame.columns)
# length zero with columns reindexed with non-empty index
newFrame = float_frame.reindex([])
newFrame = newFrame.reindex(float_frame.index)
assert len(newFrame.index) == len(float_frame.index)
assert len(newFrame.columns) == len(float_frame.columns)
# pass non-Index
newFrame = float_frame.reindex(list(datetime_series.index))
expected = datetime_series.index._with_freq(None)
tm.assert_index_equal(newFrame.index, expected)
# copy with no axes
result = float_frame.reindex()
tm.assert_frame_equal(result, float_frame)
assert result is not float_frame
def test_reindex_nan(self):
df = DataFrame(
[[1, 2], [3, 5], [7, 11], [9, 23]],
index=[2, np.nan, 1, 5],
columns=["joe", "jim"],
)
i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1]
tm.assert_frame_equal(df.reindex(i), df.iloc[j])
df.index = df.index.astype("object")
tm.assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False)
# GH10388
df = DataFrame(
{
"other": ["a", "b", np.nan, "c"],
"date": ["2015-03-22", np.nan, "2012-01-08", np.nan],
"amount": [2, 3, 4, 5],
}
)
df["date"] = pd.to_datetime(df.date)
df["delta"] = (pd.to_datetime("2015-06-18") - df["date"]).shift(1)
left = df.set_index(["delta", "other", "date"]).reset_index()
right = df.reindex(columns=["delta", "other", "date", "amount"])
tm.assert_frame_equal(left, right)
def test_reindex_name_remains(self):
s = Series(np.random.rand(10))
df = DataFrame(s, index=np.arange(len(s)))
i = Series(np.arange(10), name="iname")
df = df.reindex(i)
assert df.index.name == "iname"
df = df.reindex(Index(np.arange(10), name="tmpname"))
assert df.index.name == "tmpname"
s = Series(np.random.rand(10))
df = DataFrame(s.T, index=np.arange(len(s)))
i = Series(np.arange(10), name="iname")
df = df.reindex(columns=i)
assert df.columns.name == "iname"
def test_reindex_int(self, int_frame):
smaller = int_frame.reindex(int_frame.index[::2])
assert smaller["A"].dtype == np.int64
bigger = smaller.reindex(int_frame.index)
assert bigger["A"].dtype == np.float64
smaller = int_frame.reindex(columns=["A", "B"])
assert smaller["A"].dtype == np.int64
def test_reindex_columns(self, float_frame):
new_frame = float_frame.reindex(columns=["A", "B", "E"])
tm.assert_series_equal(new_frame["B"], float_frame["B"])
assert np.isnan(new_frame["E"]).all()
assert "C" not in new_frame
# Length zero
new_frame = float_frame.reindex(columns=[])
assert new_frame.empty
def test_reindex_columns_method(self):
# GH 14992, reindexing over columns ignored method
df = DataFrame(
data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]],
index=[1, 2, 4],
columns=[1, 2, 4],
dtype=float,
)
# default method
result = df.reindex(columns=range(6))
expected = DataFrame(
data=[
[np.nan, 11, 12, np.nan, 13, np.nan],
[np.nan, 21, 22, np.nan, 23, np.nan],
[np.nan, 31, 32, np.nan, 33, np.nan],
],
index=[1, 2, 4],
columns=range(6),
dtype=float,
)
tm.assert_frame_equal(result, expected)
# method='ffill'
result = df.reindex(columns=range(6), method="ffill")
expected = DataFrame(
data=[
[np.nan, 11, 12, 12, 13, 13],
[np.nan, 21, 22, 22, 23, 23],
[np.nan, 31, 32, 32, 33, 33],
],
index=[1, 2, 4],
columns=range(6),
dtype=float,
)
tm.assert_frame_equal(result, expected)
# method='bfill'
result = df.reindex(columns=range(6), method="bfill")
expected = DataFrame(
data=[
[11, 11, 12, 13, 13, np.nan],
[21, 21, 22, 23, 23, np.nan],
[31, 31, 32, 33, 33, np.nan],
],
index=[1, 2, 4],
columns=range(6),
dtype=float,
)
tm.assert_frame_equal(result, expected)
def test_reindex_axes(self):
# GH 3317, reindexing by both axes loses freq of the index
df = DataFrame(
np.ones((3, 3)),
index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)],
columns=["a", "b", "c"],
)
time_freq = date_range("2012-01-01", "2012-01-03", freq="d")
some_cols = ["a", "b"]
index_freq = df.reindex(index=time_freq).index.freq
both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq
seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq
assert index_freq == both_freq
assert index_freq == seq_freq
def test_reindex_fill_value(self):
df = DataFrame(np.random.randn(10, 4))
# axis=0
result = df.reindex(list(range(15)))
assert np.isnan(result.values[-5:]).all()
result = df.reindex(range(15), fill_value=0)
expected = df.reindex(range(15)).fillna(0)
tm.assert_frame_equal(result, expected)
# axis=1
result = df.reindex(columns=range(5), fill_value=0.0)
expected = df.copy()
expected[4] = 0.0
tm.assert_frame_equal(result, expected)
result = df.reindex(columns=range(5), fill_value=0)
expected = df.copy()
expected[4] = 0
tm.assert_frame_equal(result, expected)
result = df.reindex(columns=range(5), fill_value="foo")
expected = df.copy()
expected[4] = "foo"
tm.assert_frame_equal(result, expected)
# other dtypes
df["foo"] = "foo"
result = df.reindex(range(15), fill_value=0)
expected = df.reindex(range(15)).fillna(0)
tm.assert_frame_equal(result, expected)
def test_reindex_dups(self):
# GH4746, reindex on duplicate index error messages
arr = np.random.randn(10)
df = DataFrame(arr, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5])
# set index is ok
result = df.copy()
result.index = list(range(len(df)))
expected = DataFrame(arr, index=list(range(len(df))))
tm.assert_frame_equal(result, expected)
# reindex fails
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df.reindex(index=list(range(len(df))))
def test_reindex_with_duplicate_columns(self):
# reindex is invalid!
df = DataFrame(
[[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"]
)
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df.reindex(columns=["bar"])
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df.reindex(columns=["bar", "foo"])
def test_reindex_axis_style(self):
# https://github.com/pandas-dev/pandas/issues/12392
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
expected = DataFrame(
{"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3]
)
result = df.reindex([0, 1, 3])
tm.assert_frame_equal(result, expected)
result = df.reindex([0, 1, 3], axis=0)
tm.assert_frame_equal(result, expected)
result = df.reindex([0, 1, 3], axis="index")
tm.assert_frame_equal(result, expected)
def test_reindex_positional_warns(self):
# https://github.com/pandas-dev/pandas/issues/12392
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
expected = DataFrame({"A": [1.0, 2], "B": [4.0, 5], "C": [np.nan, np.nan]})
with tm.assert_produces_warning(FutureWarning):
result = df.reindex([0, 1], ["A", "B", "C"])
tm.assert_frame_equal(result, expected)
def test_reindex_axis_style_raises(self):
# https://github.com/pandas-dev/pandas/issues/12392
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex([0, 1], ["A"], axis=1)
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex([0, 1], ["A"], axis="index")
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex(index=[0, 1], axis="index")
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex(index=[0, 1], axis="columns")
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex(columns=[0, 1], axis="columns")
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex(index=[0, 1], columns=[0, 1], axis="columns")
with pytest.raises(TypeError, match="Cannot specify all"):
df.reindex([0, 1], [0], ["A"])
# Mixing styles
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex(index=[0, 1], axis="index")
with pytest.raises(TypeError, match="Cannot specify both 'axis'"):
df.reindex(index=[0, 1], axis="columns")
# Duplicates
with pytest.raises(TypeError, match="multiple values"):
df.reindex([0, 1], labels=[0, 1])
def test_reindex_single_named_indexer(self):
# https://github.com/pandas-dev/pandas/issues/12392
df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]})
result = df.reindex([0, 1], columns=["A"])
expected = DataFrame({"A": [1, 2]})
tm.assert_frame_equal(result, expected)
def test_reindex_api_equivalence(self):
# https://github.com/pandas-dev/pandas/issues/12392
# equivalence of the labels/axis and index/columns API's
df = DataFrame(
[[1, 2, 3], [3, 4, 5], [5, 6, 7]],
index=["a", "b", "c"],
columns=["d", "e", "f"],
)
res1 = df.reindex(["b", "a"])
res2 = df.reindex(index=["b", "a"])
res3 = df.reindex(labels=["b", "a"])
res4 = df.reindex(labels=["b", "a"], axis=0)
res5 = df.reindex(["b", "a"], axis=0)
for res in [res2, res3, res4, res5]:
tm.assert_frame_equal(res1, res)
res1 = df.reindex(columns=["e", "d"])
res2 = df.reindex(["e", "d"], axis=1)
res3 = df.reindex(labels=["e", "d"], axis=1)
for res in [res2, res3]:
tm.assert_frame_equal(res1, res)
with tm.assert_produces_warning(FutureWarning) as m:
res1 = df.reindex(["b", "a"], ["e", "d"])
assert "reindex" in str(m[0].message)
res2 = df.reindex(columns=["e", "d"], index=["b", "a"])
res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1)
for res in [res2, res3]:
tm.assert_frame_equal(res1, res)
def test_reindex_boolean(self):
frame = DataFrame(
np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2]
)
reindexed = frame.reindex(np.arange(10))
assert reindexed.values.dtype == np.object_
assert isna(reindexed[0][1])
reindexed = frame.reindex(columns=range(3))
assert reindexed.values.dtype == np.object_
assert isna(reindexed[1]).all()
def test_reindex_objects(self, float_string_frame):
reindexed = float_string_frame.reindex(columns=["foo", "A", "B"])
assert "foo" in reindexed
reindexed = float_string_frame.reindex(columns=["A", "B"])
assert "foo" not in reindexed
def test_reindex_corner(self, int_frame):
index = Index(["a", "b", "c"])
dm = DataFrame({}).reindex(index=[1, 2, 3])
reindexed = dm.reindex(columns=index)
tm.assert_index_equal(reindexed.columns, index)
# ints are weird
smaller = int_frame.reindex(columns=["A", "B", "E"])
assert smaller["E"].dtype == np.float64
def test_reindex_with_nans(self):
df = DataFrame(
[[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]],
columns=["a", "b"],
index=[100.0, 101.0, np.nan, 102.0, 103.0],
)
result = df.reindex(index=[101.0, 102.0, 103.0])
expected = df.iloc[[1, 3, 4]]
tm.assert_frame_equal(result, expected)
result = df.reindex(index=[103.0])
expected = df.iloc[[4]]
tm.assert_frame_equal(result, expected)
result = df.reindex(index=[101.0])
expected = df.iloc[[1]]
tm.assert_frame_equal(result, expected)
def test_reindex_multi(self):
df = DataFrame(np.random.randn(3, 3))
result = df.reindex(index=range(4), columns=range(4))
expected = df.reindex(list(range(4))).reindex(columns=range(4))
tm.assert_frame_equal(result, expected)
df = DataFrame(np.random.randint(0, 10, (3, 3)))
result = df.reindex(index=range(4), columns=range(4))
expected = df.reindex(list(range(4))).reindex(columns=range(4))
tm.assert_frame_equal(result, expected)
df = DataFrame(np.random.randint(0, 10, (3, 3)))
result = df.reindex(index=range(2), columns=range(2))
expected = df.reindex(range(2)).reindex(columns=range(2))
tm.assert_frame_equal(result, expected)
df = DataFrame(np.random.randn(5, 3) + 1j, columns=["a", "b", "c"])
result = df.reindex(index=[0, 1], columns=["a", "b"])
expected = df.reindex([0, 1]).reindex(columns=["a", "b"])
tm.assert_frame_equal(result, expected)
def test_reindex_multi_categorical_time(self):
# https://github.com/pandas-dev/pandas/issues/21390
midx = MultiIndex.from_product(
[
Categorical(["a", "b", "c"]),
Categorical(date_range("2012-01-01", periods=3, freq="H")),
]
)
df = DataFrame({"a": range(len(midx))}, index=midx)
df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]]
result = df2.reindex(midx)
expected = DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx)
tm.assert_frame_equal(result, expected)
def test_reindex_with_categoricalindex(self):
df = DataFrame(
{
"A": np.arange(3, dtype="int64"),
},
index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"),
)
# reindexing
# convert to a regular index
result = df.reindex(["a", "b", "e"])
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
"B"
)
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(["a", "b"])
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(["e"])
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(["d"])
expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
# since we are actually reindexing with a Categorical
# then return a Categorical
cats = list("cabe")
result = df.reindex(Categorical(["a", "e"], categories=cats))
expected = DataFrame(
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(Categorical(["a"], categories=cats))
expected = DataFrame(
{"A": [0], "B": Series(list("a")).astype(CDT(cats))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(["a", "b", "e"])
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
"B"
)
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(["a", "b"])
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(["e"])
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
# give back the type of categorical that we received
result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
expected = DataFrame(
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
expected = DataFrame(
{"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
).set_index("B")
tm.assert_frame_equal(result, expected, check_index_type=True)
df2 = DataFrame(
{
"A": np.arange(6, dtype="int64"),
},
index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
)
# passed duplicate indexers are not allowed
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df2.reindex(["a", "b"])
# args NotImplemented ATM
msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
with pytest.raises(NotImplementedError, match=msg.format("method")):
df.reindex(["a"], method="ffill")
with pytest.raises(NotImplementedError, match=msg.format("level")):
df.reindex(["a"], level=1)
with pytest.raises(NotImplementedError, match=msg.format("limit")):
df.reindex(["a"], limit=2)
def test_reindex_signature(self):
sig = inspect.signature(DataFrame.reindex)
parameters = set(sig.parameters)
assert parameters == {
"self",
"labels",
"index",
"columns",
"axis",
"limit",
"copy",
"level",
"method",
"fill_value",
"tolerance",
}
def test_reindex_multiindex_ffill_added_rows(self):
# GH#23693
# reindex added rows with nan values even when fill method was specified
mi = MultiIndex.from_tuples([("a", "b"), ("d", "e")])
df = DataFrame([[0, 7], [3, 4]], index=mi, columns=["x", "y"])
mi2 = MultiIndex.from_tuples([("a", "b"), ("d", "e"), ("h", "i")])
result = df.reindex(mi2, axis=0, method="ffill")
expected = DataFrame([[0, 7], [3, 4], [3, 4]], index=mi2, columns=["x", "y"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"kwargs",
[
{"method": "pad", "tolerance": timedelta(seconds=9)},
{"method": "backfill", "tolerance": timedelta(seconds=9)},
{"method": "nearest"},
{"method": None},
],
)
def test_reindex_empty_frame(self, kwargs):
# GH#27315
idx = date_range(start="2020", freq="30s", periods=3)
df = DataFrame([], index=Index([], name="time"), columns=["a"])
result = df.reindex(idx, **kwargs)
expected = DataFrame({"a": [pd.NA] * 3}, index=idx)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"src_idx",
[
Index([]),
CategoricalIndex([]),
],
)
@pytest.mark.parametrize(
"cat_idx",
[
# No duplicates
Index([]),
CategoricalIndex([]),
Index(["A", "B"]),
CategoricalIndex(["A", "B"]),
# Duplicates: GH#38906
Index(["A", "A"]),
CategoricalIndex(["A", "A"]),
],
)
def test_reindex_empty(self, src_idx, cat_idx):
df = DataFrame(columns=src_idx, index=["K"], dtype="f8")
result = df.reindex(columns=cat_idx)
expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"])
def test_reindex_datetimelike_to_object(self, dtype):
# GH#39755 dont cast dt64/td64 to ints
mi = MultiIndex.from_product([list("ABCDE"), range(2)])
dti = date_range("2016-01-01", periods=10)
fv = np.timedelta64("NaT", "ns")
if dtype == "m8[ns]":
dti = dti - dti[0]
fv = np.datetime64("NaT", "ns")
ser = Series(dti, index=mi)
ser[::3] = pd.NaT
df = ser.unstack()
index = df.index.append(Index([1]))
columns = df.columns.append(Index(["foo"]))
res = df.reindex(index=index, columns=columns, fill_value=fv)
expected = DataFrame(
{
0: df[0].tolist() + [fv],
1: df[1].tolist() + [fv],
"foo": np.array(["NaT"] * 6, dtype=fv.dtype),
},
index=index,
)
assert (res.dtypes[[0, 1]] == object).all()
assert res.iloc[0, 0] is pd.NaT
assert res.iloc[-1, 0] is fv
assert res.iloc[-1, 1] is fv
tm.assert_frame_equal(res, expected)
@pytest.mark.parametrize(
"index_df,index_res,index_exp",
[
(
CategoricalIndex([], categories=["A"]),
Index(["A"]),
Index(["A"]),
),
(
CategoricalIndex([], categories=["A"]),
Index(["B"]),
Index(["B"]),
),
(
CategoricalIndex([], categories=["A"]),
CategoricalIndex(["A"]),
CategoricalIndex(["A"]),
),
(
CategoricalIndex([], categories=["A"]),
CategoricalIndex(["B"]),
CategoricalIndex(["B"]),
),
],
)
def test_reindex_not_category(self, index_df, index_res, index_exp):
# GH#28690
df = DataFrame(index=index_df)
result = df.reindex(index=index_res)
expected = DataFrame(index=index_exp)
tm.assert_frame_equal(result, expected)