aoc-2022/venv/Lib/site-packages/pandas/tests/indexes/multi/test_integrity.py

import re

import numpy as np
import pytest

from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

import pandas as pd
from pandas import (
    IntervalIndex,
    MultiIndex,
    RangeIndex,
)
import pandas._testing as tm
from pandas.core.api import Int64Index


def test_labels_dtypes():
    """Check the integer dtype of ``MultiIndex.codes`` for several level sizes."""
    # GH 8456
    pair_index = MultiIndex.from_tuples([("A", 1), ("A", 2)])
    for level_number in (0, 1):
        assert pair_index.codes[level_number].dtype == "int8"

    # size of the second level -> dtype expected for its codes
    size_to_dtype = ((40, "int8"), (400, "int16"), (40000, "int32"))
    for level_size, expected_dtype in size_to_dtype:
        product_index = MultiIndex.from_product([["a"], range(level_size)])
        assert product_index.codes[1].dtype == expected_dtype

    # no code may be negative for either level
    product_index = MultiIndex.from_product([["a"], range(1000)])
    for level_number in (0, 1):
        assert (product_index.codes[level_number] >= 0).all()


def test_values_boxed():
    """``MultiIndex.values`` must equal the object array built from the input tuples."""
    keys = (1, 2, 3, 1, 2, 3)
    stamps = (
        pd.Timestamp("2000-01-01"),
        pd.NaT,
        pd.Timestamp("2000-01-03"),
        pd.Timestamp("2000-01-04"),
        pd.Timestamp("2000-01-02"),
        pd.Timestamp("2000-01-03"),
    )
    tuples = list(zip(keys, stamps))

    mi = MultiIndex.from_tuples(tuples)
    as_object_array = construct_1d_object_array_from_listlike(tuples)
    tm.assert_numpy_array_equal(mi.values, as_object_array)

    # Check that code branches for boxed values produce identical results:
    # slicing the values and taking the values of the slice must agree.
    sliced_values = mi.values[:4]
    values_of_slice = mi[:4].values
    tm.assert_numpy_array_equal(sliced_values, values_of_slice)


def test_values_multiindex_datetimeindex():
    """Round-trip a naive and a tz-aware DatetimeIndex level through ``MultiIndex.values``."""
    # Test to ensure we hit the boxing / nobox part of MI.values
    raw_ints = np.arange(10**18, 10**18 + 5)
    naive = pd.DatetimeIndex(raw_ints)
    aware = pd.DatetimeIndex(raw_ints, tz="US/Central")

    mi = MultiIndex.from_arrays([naive, aware])

    cases = (
        (mi, naive, aware),
        # n_lev > n_lab
        (mi[:2], naive[:2], aware[:2]),
    )
    for index, expected_outer, expected_inner in cases:
        boxed = index.values

        rebuilt_outer = pd.DatetimeIndex([pair[0] for pair in boxed])
        tm.assert_index_equal(rebuilt_outer, expected_outer)

        rebuilt_inner = pd.DatetimeIndex([pair[1] for pair in boxed])
        tm.assert_index_equal(rebuilt_inner, expected_inner)


def test_values_multiindex_periodindex():
    """Round-trip an integer level and a PeriodIndex level through ``MultiIndex.values``.

    The full index and a two-row slice are each converted to an array of
    tuples; every tuple position is rebuilt into an index and compared with
    the data that level was created from.
    """
    # Test to ensure we hit the boxing / nobox part of MI.values
    ints = np.arange(2007, 2012)
    pidx = pd.PeriodIndex(ints, freq="D")

    idx = MultiIndex.from_arrays([ints, pidx])
    result = idx.values

    # pd.Index(..., dtype="int64") replaces Int64Index, which was deprecated
    # in pandas 1.4 and removed in 2.0; it builds an equal int64 index.
    outer = pd.Index([x[0] for x in result], dtype="int64")
    tm.assert_index_equal(outer, pd.Index(ints, dtype="int64"))

    inner = pd.PeriodIndex([x[1] for x in result])
    tm.assert_index_equal(inner, pidx)

    # n_lev > n_lab
    result = idx[:2].values

    outer = pd.Index([x[0] for x in result], dtype="int64")
    tm.assert_index_equal(outer, pd.Index(ints[:2], dtype="int64"))

    inner = pd.PeriodIndex([x[1] for x in result])
    tm.assert_index_equal(inner, pidx[:2])


def test_consistency():
    """Build a large valid MultiIndex, then one with a repeated entry that is not unique."""
    # need to construct an overflow
    outer_level = list(range(70000))
    inner_level = list(range(10))

    # the fact that it works means it's consistent
    MultiIndex(
        levels=[outer_level, inner_level],
        codes=[np.arange(70000), np.repeat(range(10), 7000)],
    )

    # inconsistent: the code pair (1, 1) appears twice
    repeated_outer = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3])
    repeated_inner = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1])
    with_duplicate = MultiIndex(
        levels=[outer_level, inner_level],
        codes=[repeated_outer, repeated_inner],
    )

    assert with_duplicate.is_unique is False


@pytest.mark.slow
def test_hash_collisions():
    """Every entry of a 1000 x 1000 product index must be located at its own position."""
    # non-smoke test that we don't get hash collisions
    level_values = [np.arange(1000), np.arange(1000)]
    mi = MultiIndex.from_product(level_values, names=["one", "two"])

    size = len(mi)
    located = mi.get_indexer(mi.values)
    tm.assert_numpy_array_equal(located, np.arange(size, dtype="intp"))

    # spot-check single-key lookups at both ends of the index
    for position in (0, 1, size - 2, size - 1):
        assert mi.get_loc(mi[position]) == position


def test_dims():
    """Placeholder test: it has no body and performs no checks."""


def test_take_invalid_kwargs():
    """``MultiIndex.take`` must reject an unknown keyword and the ``out``/``mode`` arguments."""
    level_values = [
        ["A", "B"],
        [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")],
    ]
    mi = MultiIndex.from_product(level_values, names=["str", "dt"])
    indices = [1, 2]

    # (expected exception, message pattern, extra keyword arguments)
    rejected_calls = (
        (TypeError, r"take\(\) got an unexpected keyword argument 'foo'", {"foo": 2}),
        (ValueError, "the 'out' parameter is not supported", {"out": indices}),
        (ValueError, "the 'mode' parameter is not supported", {"mode": "clip"}),
    )
    for exc_type, pattern, extra_kwargs in rejected_calls:
        with pytest.raises(exc_type, match=pattern):
            mi.take(indices, **extra_kwargs)


def test_isna_behavior(idx):
    # should not segfault GH5123
    # NOTE: if MI representation changes, may make sense to allow
    # isna(MI)
    msg = "isna is not defined for MultiIndex"
    with pytest.raises(NotImplementedError, match=msg):
        pd.isna(idx)


def test_large_multiindex_error():
    """Missing keys raise ``KeyError`` for frames just below and just above 1,000,000 rows."""
    # GH12527
    # (missing key, pattern the KeyError message must match)
    missing_keys = (
        ((-1, 0), r"^\(-1, 0\)$"),
        ((3, 0), r"^\(3, 0\)$"),
    )
    # two outer values times these inner sizes gives 999,998 and 1,000,002 rows
    for inner_size in (499999, 500001):
        frame = pd.DataFrame(
            1,
            index=MultiIndex.from_product([[1, 2], range(inner_size)]),
            columns=["dest"],
        )
        for key, pattern in missing_keys:
            with pytest.raises(KeyError, match=pattern):
                frame.loc[key, "dest"]


def test_million_record_attribute_error():
    """A missing attribute on a column of a 1,000,000-row frame raises ``AttributeError``."""
    # GH 18165
    values = list(range(1000000))
    mi = MultiIndex.from_tuples(list(zip(values, values)))
    frame = pd.DataFrame({"a": values, "b": values}, index=mi)

    with pytest.raises(
        AttributeError, match="'Series' object has no attribute 'foo'"
    ):
        frame["a"].foo()


def test_can_hold_identifiers(idx):
    key = idx[0]
    assert idx._can_hold_identifiers_and_holds_name(key) is True


def test_metadata_immutable(idx):
    levels, codes = idx.levels, idx.codes
    # shouldn't be able to set at either the top level or base level
    mutable_regex = re.compile("does not support mutable operations")
    with pytest.raises(TypeError, match=mutable_regex):
        levels[0] = levels[0]
    with pytest.raises(TypeError, match=mutable_regex):
        levels[0][0] = levels[0][0]
    # ditto for labels
    with pytest.raises(TypeError, match=mutable_regex):
        codes[0] = codes[0]
    with pytest.raises(ValueError, match="assignment destination is read-only"):
        codes[0][0] = codes[0][0]
    # and for names
    names = idx.names
    with pytest.raises(TypeError, match=mutable_regex):
        names[0] = names[0]


def test_level_setting_resets_attributes():
    ind = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
    assert ind.is_monotonic_increasing
    with tm.assert_produces_warning(FutureWarning):
        ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True)
    # if this fails, probably didn't reset the cache correctly.
    assert not ind.is_monotonic_increasing


def test_rangeindex_fallback_coercion_bug():
    """Concatenate two stacked frames and check the resulting frame and index levels.

    Two 10 x 10 frames are stacked and concatenated column-wise. The result
    must equal a frame indexed by the 10 x 10 product index, and the values
    of each named index level must come back as an int64 index.
    """
    # GH 12893
    foo = pd.DataFrame(np.arange(100).reshape((10, 10)))
    bar = pd.DataFrame(np.arange(100).reshape((10, 10)))
    df = pd.concat({"foo": foo.stack(), "bar": bar.stack()}, axis=1)
    df.index.names = ["fizz", "buzz"]

    # building the repr must not raise
    str(df)
    expected = pd.DataFrame(
        {"bar": np.arange(100), "foo": np.arange(100)},
        index=MultiIndex.from_product([range(10), range(10)], names=["fizz", "buzz"]),
    )
    # check_like=True: column order differs between df and expected
    tm.assert_frame_equal(df, expected, check_like=True)

    # pd.Index(..., dtype="int64") replaces Int64Index, which was deprecated
    # in pandas 1.4 and removed in 2.0; it builds an equal int64 index.
    result = df.index.get_level_values("fizz")
    expected = pd.Index(np.arange(10), dtype="int64", name="fizz").repeat(10)
    tm.assert_index_equal(result, expected)

    result = df.index.get_level_values("buzz")
    expected = pd.Index(np.tile(np.arange(10), 10), dtype="int64", name="buzz")
    tm.assert_index_equal(result, expected)


def test_memory_usage(idx):
    result = idx.memory_usage()
    if len(idx):
        idx.get_loc(idx[0])
        result2 = idx.memory_usage()
        result3 = idx.memory_usage(deep=True)

        # RangeIndex, IntervalIndex
        # don't have engines
        if not isinstance(idx, (RangeIndex, IntervalIndex)):
            assert result2 > result

        if idx.inferred_type == "object":
            assert result3 > result2

    else:

        # we report 0 for no-length
        assert result == 0


def test_nlevels(idx):
    assert idx.nlevels == 2
Give it up for day 1 of smooth brain shenanigans! https://pbs.twimg.com/media/E1bxikWWEAEGYHU.png 2022-12-01 16:50:29 +00:00			`import re`

			`import numpy as np`
			`import pytest`

			`from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike`

			`import pandas as pd`
			`from pandas import (`
			`IntervalIndex,`
			`MultiIndex,`
			`RangeIndex,`
			`)`
			`import pandas._testing as tm`
			`from pandas.core.api import Int64Index`


			`def test_labels_dtypes():`

			`# GH 8456`
			`i = MultiIndex.from_tuples([("A", 1), ("A", 2)])`
			`assert i.codes[0].dtype == "int8"`
			`assert i.codes[1].dtype == "int8"`

			`i = MultiIndex.from_product([["a"], range(40)])`
			`assert i.codes[1].dtype == "int8"`
			`i = MultiIndex.from_product([["a"], range(400)])`
			`assert i.codes[1].dtype == "int16"`
			`i = MultiIndex.from_product([["a"], range(40000)])`
			`assert i.codes[1].dtype == "int32"`

			`i = MultiIndex.from_product([["a"], range(1000)])`
			`assert (i.codes[0] >= 0).all()`
			`assert (i.codes[1] >= 0).all()`


			`def test_values_boxed():`
			`tuples = [`
			`(1, pd.Timestamp("2000-01-01")),`
			`(2, pd.NaT),`
			`(3, pd.Timestamp("2000-01-03")),`
			`(1, pd.Timestamp("2000-01-04")),`
			`(2, pd.Timestamp("2000-01-02")),`
			`(3, pd.Timestamp("2000-01-03")),`
			`]`
			`result = MultiIndex.from_tuples(tuples)`
			`expected = construct_1d_object_array_from_listlike(tuples)`
			`tm.assert_numpy_array_equal(result.values, expected)`
			`# Check that code branches for boxed values produce identical results`
			`tm.assert_numpy_array_equal(result.values[:4], result[:4].values)`


			`def test_values_multiindex_datetimeindex():`
			`# Test to ensure we hit the boxing / nobox part of MI.values`
			`ints = np.arange(10**18, 10**18 + 5)`
			`naive = pd.DatetimeIndex(ints)`

			`aware = pd.DatetimeIndex(ints, tz="US/Central")`

			`idx = MultiIndex.from_arrays([naive, aware])`
			`result = idx.values`

			`outer = pd.DatetimeIndex([x[0] for x in result])`
			`tm.assert_index_equal(outer, naive)`

			`inner = pd.DatetimeIndex([x[1] for x in result])`
			`tm.assert_index_equal(inner, aware)`

			`# n_lev > n_lab`
			`result = idx[:2].values`

			`outer = pd.DatetimeIndex([x[0] for x in result])`
			`tm.assert_index_equal(outer, naive[:2])`

			`inner = pd.DatetimeIndex([x[1] for x in result])`
			`tm.assert_index_equal(inner, aware[:2])`


			`def test_values_multiindex_periodindex():`
			`# Test to ensure we hit the boxing / nobox part of MI.values`
			`ints = np.arange(2007, 2012)`
			`pidx = pd.PeriodIndex(ints, freq="D")`

			`idx = MultiIndex.from_arrays([ints, pidx])`
			`result = idx.values`

			`outer = Int64Index([x[0] for x in result])`
			`tm.assert_index_equal(outer, Int64Index(ints))`

			`inner = pd.PeriodIndex([x[1] for x in result])`
			`tm.assert_index_equal(inner, pidx)`

			`# n_lev > n_lab`
			`result = idx[:2].values`

			`outer = Int64Index([x[0] for x in result])`
			`tm.assert_index_equal(outer, Int64Index(ints[:2]))`

			`inner = pd.PeriodIndex([x[1] for x in result])`
			`tm.assert_index_equal(inner, pidx[:2])`


			`def test_consistency():`
			`# need to construct an overflow`
			`major_axis = list(range(70000))`
			`minor_axis = list(range(10))`

			`major_codes = np.arange(70000)`
			`minor_codes = np.repeat(range(10), 7000)`

			`# the fact that is works means it's consistent`
			`index = MultiIndex(`
			`levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]`
			`)`

			`# inconsistent`
			`major_codes = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3])`
			`minor_codes = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1])`
			`index = MultiIndex(`
			`levels=[major_axis, minor_axis], codes=[major_codes, minor_codes]`
			`)`

			`assert index.is_unique is False`


			`@pytest.mark.slow`
			`def test_hash_collisions():`
			`# non-smoke test that we don't get hash collisions`

			`index = MultiIndex.from_product(`
			`[np.arange(1000), np.arange(1000)], names=["one", "two"]`
			`)`
			`result = index.get_indexer(index.values)`
			`tm.assert_numpy_array_equal(result, np.arange(len(index), dtype="intp"))`

			`for i in [0, 1, len(index) - 2, len(index) - 1]:`
			`result = index.get_loc(index[i])`
			`assert result == i`


			`def test_dims():`
			`pass`


			`def test_take_invalid_kwargs():`
			`vals = [["A", "B"], [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]]`
			`idx = MultiIndex.from_product(vals, names=["str", "dt"])`
			`indices = [1, 2]`

			`msg = r"take\(\) got an unexpected keyword argument 'foo'"`
			`with pytest.raises(TypeError, match=msg):`
			`idx.take(indices, foo=2)`

			`msg = "the 'out' parameter is not supported"`
			`with pytest.raises(ValueError, match=msg):`
			`idx.take(indices, out=indices)`

			`msg = "the 'mode' parameter is not supported"`
			`with pytest.raises(ValueError, match=msg):`
			`idx.take(indices, mode="clip")`


			`def test_isna_behavior(idx):`
			`# should not segfault GH5123`
			`# NOTE: if MI representation changes, may make sense to allow`
			`# isna(MI)`
			`msg = "isna is not defined for MultiIndex"`
			`with pytest.raises(NotImplementedError, match=msg):`
			`pd.isna(idx)`


			`def test_large_multiindex_error():`
			`# GH12527`
			`df_below_1000000 = pd.DataFrame(`
			`1, index=MultiIndex.from_product([[1, 2], range(499999)]), columns=["dest"]`
			`)`
			`with pytest.raises(KeyError, match=r"^\(-1, 0\)$"):`
			`df_below_1000000.loc[(-1, 0), "dest"]`
			`with pytest.raises(KeyError, match=r"^\(3, 0\)$"):`
			`df_below_1000000.loc[(3, 0), "dest"]`
			`df_above_1000000 = pd.DataFrame(`
			`1, index=MultiIndex.from_product([[1, 2], range(500001)]), columns=["dest"]`
			`)`
			`with pytest.raises(KeyError, match=r"^\(-1, 0\)$"):`
			`df_above_1000000.loc[(-1, 0), "dest"]`
			`with pytest.raises(KeyError, match=r"^\(3, 0\)$"):`
			`df_above_1000000.loc[(3, 0), "dest"]`


			`def test_million_record_attribute_error():`
			`# GH 18165`
			`r = list(range(1000000))`
			`df = pd.DataFrame(`
			`{"a": r, "b": r}, index=MultiIndex.from_tuples([(x, x) for x in r])`
			`)`

			`msg = "'Series' object has no attribute 'foo'"`
			`with pytest.raises(AttributeError, match=msg):`
			`df["a"].foo()`


			`def test_can_hold_identifiers(idx):`
			`key = idx[0]`
			`assert idx._can_hold_identifiers_and_holds_name(key) is True`


			`def test_metadata_immutable(idx):`
			`levels, codes = idx.levels, idx.codes`
			`# shouldn't be able to set at either the top level or base level`
			`mutable_regex = re.compile("does not support mutable operations")`
			`with pytest.raises(TypeError, match=mutable_regex):`
			`levels[0] = levels[0]`
			`with pytest.raises(TypeError, match=mutable_regex):`
			`levels[0][0] = levels[0][0]`
			`# ditto for labels`
			`with pytest.raises(TypeError, match=mutable_regex):`
			`codes[0] = codes[0]`
			`with pytest.raises(ValueError, match="assignment destination is read-only"):`
			`codes[0][0] = codes[0][0]`
			`# and for names`
			`names = idx.names`
			`with pytest.raises(TypeError, match=mutable_regex):`
			`names[0] = names[0]`


			`def test_level_setting_resets_attributes():`
			`ind = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])`
			`assert ind.is_monotonic_increasing`
			`with tm.assert_produces_warning(FutureWarning):`
			`ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True)`
			`# if this fails, probably didn't reset the cache correctly.`
			`assert not ind.is_monotonic_increasing`


			`def test_rangeindex_fallback_coercion_bug():`
			`# GH 12893`
			`foo = pd.DataFrame(np.arange(100).reshape((10, 10)))`
			`bar = pd.DataFrame(np.arange(100).reshape((10, 10)))`
			`df = pd.concat({"foo": foo.stack(), "bar": bar.stack()}, axis=1)`
			`df.index.names = ["fizz", "buzz"]`

			`str(df)`
			`expected = pd.DataFrame(`
			`{"bar": np.arange(100), "foo": np.arange(100)},`
			`index=MultiIndex.from_product([range(10), range(10)], names=["fizz", "buzz"]),`
			`)`
			`tm.assert_frame_equal(df, expected, check_like=True)`

			`result = df.index.get_level_values("fizz")`
			`expected = Int64Index(np.arange(10), name="fizz").repeat(10)`
			`tm.assert_index_equal(result, expected)`

			`result = df.index.get_level_values("buzz")`
			`expected = Int64Index(np.tile(np.arange(10), 10), name="buzz")`
			`tm.assert_index_equal(result, expected)`


			`def test_memory_usage(idx):`
			`result = idx.memory_usage()`
			`if len(idx):`
			`idx.get_loc(idx[0])`
			`result2 = idx.memory_usage()`
			`result3 = idx.memory_usage(deep=True)`

			`# RangeIndex, IntervalIndex`
			`# don't have engines`
			`if not isinstance(idx, (RangeIndex, IntervalIndex)):`
			`assert result2 > result`

			`if idx.inferred_type == "object":`
			`assert result3 > result2`

			`else:`

			`# we report 0 for no-length`
			`assert result == 0`


			`def test_nlevels(idx):`
			`assert idx.nlevels == 2`