from datetime import datetime
import random

import numpy as np
import pytest

from pandas._libs.tslibs import iNaT
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm
from pandas.core.interchange.column import PandasColumn
from pandas.core.interchange.dataframe_protocol import (
    ColumnNullType,
    DtypeKind,
)
from pandas.core.interchange.from_dataframe import from_dataframe

# Two categoricals built from the same letters, differing only in ``ordered``.
test_data_categorical = {
    "ordered": pd.Categorical(list("testdata") * 30, ordered=True),
    "unordered": pd.Categorical(list("testdata") * 30, ordered=False),
}

# Shape of every generated fixture: NCOLS columns with NROWS values each.
NCOLS, NROWS = 100, 200


def _make_data(make_one):
    """
    Build a ``{column name: list of values}`` mapping for a test frame.

    Parameters
    ----------
    make_one : callable
        Zero-argument callable invoked once per cell to produce a value.

    Returns
    -------
    dict
        ``NCOLS`` entries, each holding a list of ``NROWS`` values.  Keys are
        ``"col1"`` .. ``f"col{NCOLS}"`` but inserted in rotated order (the
        first key is ``"col51"`` for ``NCOLS == 100``), so the column order
        is deliberately not sorted by name.
    """
    return {
        f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [make_one() for _ in range(NROWS)]
        for i in range(NCOLS)
    }


# NOTE: these fixtures draw from the unseeded global ``random`` module, so the
# concrete values differ between runs; the tests only rely on shape and names.
int_data = _make_data(lambda: random.randint(-100, 100))
uint_data = _make_data(lambda: random.randint(1, 100))
bool_data = _make_data(lambda: random.choice([True, False]))
float_data = _make_data(lambda: random.random())
datetime_data = _make_data(
    lambda: datetime(
        year=random.randint(1900, 2100),
        month=random.randint(1, 12),
        # day is capped at 20 so every (year, month) combination is valid
        day=random.randint(1, 20),
    )
)

string_data = {
    "separator data": [
        "abC|DeF,Hik",
        "234,3245.67",
        "gSaf,qWer|Gre",
        "asd3,4sad|",
        # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed
        np.nan,
    ]
}


@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)])
def test_categorical_dtype(data):
    """Check categorical column metadata and the round trip via from_dataframe.

    ``data`` is a ``(key into test_data_categorical, expected is_ordered)``
    pair.
    """
    df = pd.DataFrame({"A": (test_data_categorical[data[0]])})

    col = df.__dataframe__().get_column_by_name("A")
    assert col.dtype[0] == DtypeKind.CATEGORICAL
    assert col.null_count == 0
    assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
    assert col.num_chunks() == 1
    desc_cat = col.describe_categorical
    assert desc_cat["is_ordered"] == data[1]
    assert desc_cat["is_dictionary"] is True
    assert isinstance(desc_cat["categories"], PandasColumn)
    # The distinct letters of "testdata", in sorted order.
    tm.assert_series_equal(
        desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
    )

    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


@pytest.mark.parametrize(
    "data", [int_data, uint_data, float_data, bool_data, datetime_data]
)
def test_dataframe(data):
    """Check frame-level metadata and that selecting columns by position and
    by name produces equal frames."""
    df = pd.DataFrame(data)

    df2 = df.__dataframe__()

    assert df2.num_columns() == NCOLS
    assert df2.num_rows() == NROWS

    assert list(df2.column_names()) == list(data.keys())

    indices = (0, 2)
    names = tuple(list(data.keys())[idx] for idx in indices)

    result = from_dataframe(df2.select_columns(indices))
    expected = from_dataframe(df2.select_columns_by_name(names))
    tm.assert_frame_equal(result, expected)

    assert isinstance(result.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
    assert isinstance(expected.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)


def test_missing_from_masked():
    """Check ``null_count`` after setting a random number of cells per column
    to ``None``."""
    df = pd.DataFrame(
        {
            "x": np.array([1, 2, 3, 4, 0]),
            "y": np.array([1.5, 2.5, 3.5, 4.5, 0]),
            "z": np.array([True, False, True, True, True]),
        }
    )

    # Fixed seed keeps the number and position of the nulls reproducible.
    rng = np.random.RandomState(42)
    dict_null = {col: rng.randint(low=0, high=len(df)) for col in df.columns}
    for col, num_nulls in dict_null.items():
        null_idx = df.index[
            rng.choice(np.arange(len(df)), size=num_nulls, replace=False)
        ]
        df.loc[null_idx, col] = None

    # Build the interchange object only after the nulls are in place.
    df2 = df.__dataframe__()

    assert df2.get_column_by_name("x").null_count == dict_null["x"]
    assert df2.get_column_by_name("y").null_count == dict_null["y"]
    assert df2.get_column_by_name("z").null_count == dict_null["z"]


@pytest.mark.parametrize(
    "data",
    [
        {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]},
        {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]},
        {
            "x": np.array([True, True, False]),
            "y": np.array([1, 2, 0]),
            "z": np.array([9.2, 10.5, 11.8]),
        },
    ],
)
def test_mixed_data(data):
    """Every column of a frame without missing values reports zero nulls."""
    df = pd.DataFrame(data)
    df2 = df.__dataframe__()

    for col_name in df.columns:
        assert df2.get_column_by_name(col_name).null_count == 0


def test_mixed_missing():
    """Each column below contains exactly two ``None`` entries and must report
    a ``null_count`` of 2."""
    df = pd.DataFrame(
        {
            "x": np.array([True, None, False, None, True]),
            "y": np.array([None, 2, None, 1, 2]),
            "z": np.array([9.2, 10.5, None, 11.8, None]),
        }
    )

    df2 = df.__dataframe__()

    for col_name in df.columns:
        assert df2.get_column_by_name(col_name).null_count == 2


def test_string():
    """Check string column metadata, for the full frame and for a slice."""
    # Five fixture values (one of them NaN) plus an empty string: six rows.
    test_str_data = string_data["separator data"] + [""]
    df = pd.DataFrame({"A": test_str_data})
    col = df.__dataframe__().get_column_by_name("A")

    assert col.size() == 6
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.STRING
    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)

    # Dropping the first row keeps the NaN entry, so null_count stays at 1.
    df_sliced = df[1:]
    col = df_sliced.__dataframe__().get_column_by_name("A")
    assert col.size() == 5
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.STRING
    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)


def test_nonstring_object():
    """Accessing ``dtype`` on an object column holding non-string values must
    raise ``NotImplementedError``."""
    df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
    col = df.__dataframe__().get_column_by_name("A")
    with pytest.raises(NotImplementedError, match="not supported yet"):
        col.dtype


def test_datetime():
    """Check datetime column metadata with one ``NaT`` and the round trip."""
    df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
    col = df.__dataframe__().get_column_by_name("A")

    assert col.size() == 2
    assert col.null_count == 1
    assert col.dtype[0] == DtypeKind.DATETIME
    assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT)

    tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))


@td.skip_if_np_lt("1.23")
def test_categorical_to_numpy_dlpack():
    """The data buffer of a categorical column is readable through
    ``np.from_dlpack`` and holds the int8 codes."""
    # https://github.com/pandas-dev/pandas/issues/48393
    df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])})
    col = df.__dataframe__().get_column_by_name("A")
    result = np.from_dlpack(col.get_buffers()["data"][0])
    expected = np.array([0, 1, 0], dtype="int8")
    tm.assert_numpy_array_equal(result, expected)