942 lines
33 KiB
Python
942 lines
33 KiB
Python
|
import datetime
|
||
|
from datetime import timedelta
|
||
|
import re
|
||
|
from warnings import catch_warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pytest
|
||
|
|
||
|
from pandas._libs.tslibs import Timestamp
|
||
|
import pandas.util._test_decorators as td
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import (
|
||
|
DataFrame,
|
||
|
Series,
|
||
|
_testing as tm,
|
||
|
concat,
|
||
|
date_range,
|
||
|
read_hdf,
|
||
|
)
|
||
|
from pandas.tests.io.pytables.common import (
|
||
|
_maybe_remove,
|
||
|
ensure_clean_path,
|
||
|
ensure_clean_store,
|
||
|
)
|
||
|
|
||
|
pytestmark = pytest.mark.single_cpu
|
||
|
|
||
|
|
||
|
@pytest.mark.filterwarnings("ignore:object name:tables.exceptions.NaturalNameWarning")
def test_append(setup_path):
    """Frames written piecewise with ``append`` must read back unchanged.

    Exercises plain keys, a key with a leading slash, a key containing a
    space, an object column mixing strings with NaN, and unsigned integer
    columns stored both as plain values and as data columns.
    """
    with ensure_clean_store(setup_path) as store:
        # Keys that are not natural Python names are allowed, but you almost
        # always don't want to use them (tables.NaturalNameWarning).
        with catch_warnings(record=True):
            frame = tm.makeTimeDataFrame()
            head = frame[:10]
            tail = frame[10:]

            # two consecutive appends
            _maybe_remove(store, "df1")
            store.append("df1", head)
            store.append("df1", tail)
            tm.assert_frame_equal(store["df1"], frame)

            # first chunk via put(format="table"), remainder via append
            _maybe_remove(store, "df2")
            store.put("df2", head, format="table")
            store.append("df2", tail)
            tm.assert_frame_equal(store["df2"], frame)

            # written with a leading slash, read back without it
            _maybe_remove(store, "df3")
            store.append("/df3", head)
            store.append("/df3", tail)
            tm.assert_frame_equal(store["df3"], frame)

            # A key with a space is allowed, but you almost always don't
            # want to do it (tables.NaturalNameWarning).
            _maybe_remove(store, "/df3 foo")
            store.append("/df3 foo", head)
            store.append("/df3 foo", tail)
            tm.assert_frame_equal(store["df3 foo"], frame)

            # dtype issues - mixed type in a single object column
            mixed = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
            mixed["mixed_column"] = "testing"
            mixed.loc[2, "mixed_column"] = np.nan
            _maybe_remove(store, "df")
            store.append("df", mixed)
            tm.assert_frame_equal(store["df"], mixed)

            # uints - test storage of uints
            u08 = Series(np.random.randint(0, high=255, size=5), dtype=np.uint8)
            u16 = Series(np.random.randint(0, high=65535, size=5), dtype=np.uint16)
            u32 = Series(np.random.randint(0, high=2**30, size=5), dtype=np.uint32)
            u64 = Series([2**58, 2**59, 2**60, 2**61, 2**62], dtype=np.uint64)
            uint_data = DataFrame(
                {"u08": u08, "u16": u16, "u32": u32, "u64": u64},
                index=np.arange(5),
            )
            _maybe_remove(store, "uints")
            store.append("uints", uint_data)
            tm.assert_frame_equal(store["uints"], uint_data)

            # uints - test storage of uints in indexable columns
            # (64-bit indices not yet supported, so u64 is left out)
            _maybe_remove(store, "uints")
            store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
            tm.assert_frame_equal(store["uints"], uint_data)
|
||
|
|
||
|
|
||
|
def test_append_series(setup_path):
    """Series round-trip through ``append``, keeping their name and index.

    Also checks ``select`` with conditions on the values (addressed by the
    Series name) and on the index, and a Series with a two-level MultiIndex.
    """
    with ensure_clean_store(setup_path) as store:
        string_ser = tm.makeStringSeries()
        time_ser = tm.makeTimeSeries()
        named = Series(np.arange(100))

        # basic: unnamed series come back unnamed
        for key, ser in (("ss", string_ser), ("ts", time_ser)):
            store.append(key, ser)
            stored = store[key]
            tm.assert_series_equal(stored, ser)
            assert stored.name is None

        # a named series keeps its name
        named.name = "foo"
        store.append("ns", named)
        stored = store["ns"]
        tm.assert_series_equal(stored, named)
        assert stored.name == named.name

        # select on the values
        tm.assert_series_equal(store.select("ns", "foo>60"), named[named > 60])

        # select on the index and values
        tm.assert_series_equal(
            store.select("ns", "foo>70 and index<90"),
            named[(named > 70) & (named.index < 90)],
        )

        # multi-index
        frame = DataFrame(np.random.randn(5, 1), columns=["A"])
        frame["B"] = np.arange(len(frame))
        frame["C"] = "foo"
        frame.loc[3:5, "C"] = "bar"
        frame = frame.set_index(["C", "B"])
        stacked = frame.stack()
        # stack() adds the column label as a third level; drop it again
        stacked.index = stacked.index.droplevel(2)
        store.append("mi", stacked)
        tm.assert_series_equal(store["mi"], stacked)
|
||
|
|
||
|
|
||
|
def test_append_some_nans(setup_path):
    """Frames with partially or fully NaN columns round-trip via ``append``."""
    with ensure_clean_store(setup_path) as store:
        df = DataFrame(
            {
                "A": Series(np.random.randn(20)).astype("int32"),
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )

        # some nans: blank the first 16 rows of a float, a string and both
        # datetime columns
        df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
        _maybe_remove(store, "df1")
        store.append("df1", df[:10])
        store.append("df1", df[10:])
        tm.assert_frame_equal(store["df1"], df)

        # one entirely-NaN column at a time: first float column, second
        # float column, then a datetime column
        for key, column in (("df1", "A1"), ("df2", "A2"), ("df3", "E")):
            blanked = df.copy()
            blanked[column] = np.nan
            _maybe_remove(store, key)
            store.append(key, blanked[:10])
            store.append(key, blanked[10:])
            tm.assert_frame_equal(store[key], blanked)
|
||
|
|
||
|
|
||
|
def test_append_all_nans(setup_path):
    """Check how ``dropna`` treats rows that are NaN in every column.

    For an all-float frame, all-NaN rows are dropped when ``dropna=True``
    (or when the ``io.hdf.dropna_table`` option is on) and kept otherwise.
    For frames that also hold string or datetime columns the test expects
    every row to be written regardless of ``dropna``.
    """
    with ensure_clean_store(setup_path) as store:

        def roundtrip(key, frame, expected, **append_kwargs):
            # write ``frame`` in two halves under ``key`` and compare the
            # stored table with ``expected``
            _maybe_remove(store, key)
            store.append(key, frame[:10], **append_kwargs)
            store.append(key, frame[10:], **append_kwargs)
            tm.assert_frame_equal(store[key], expected)

        floats = DataFrame(
            {"A1": np.random.randn(20), "A2": np.random.randn(20)},
            index=np.arange(20),
        )
        floats.loc[0:15, :] = np.nan

        # nan some entire rows (dropna=True): only the last 4 rows survive
        roundtrip("df", floats, floats[-4:], dropna=True)

        # nan some entire rows (dropna=False)
        roundtrip("df2", floats, floats, dropna=False)

        # tests the option io.hdf.dropna_table
        with pd.option_context("io.hdf.dropna_table", False):
            roundtrip("df3", floats, floats)

        with pd.option_context("io.hdf.dropna_table", True):
            roundtrip("df4", floats, floats[-4:])

        # nan some entire rows (strings are still written!)
        with_strings = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
            },
            index=np.arange(20),
        )
        with_strings.loc[0:15, :] = np.nan

        roundtrip("df", with_strings, with_strings, dropna=True)
        roundtrip("df2", with_strings, with_strings, dropna=False)

        # nan some entire rows (but since we have dates they are still
        # written!)
        with_dates = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )
        with_dates.loc[0:15, :] = np.nan

        roundtrip("df", with_dates, with_dates, dropna=True)
        roundtrip("df2", with_dates, with_dates, dropna=False)
|
||
|
|
||
|
|
||
|
def test_append_frame_column_oriented(setup_path):
    """Append along the columns axis and select back by column and index."""
    with ensure_clean_store(setup_path) as store:
        # column oriented
        # NOTE: the local must stay named ``df`` - the where-strings below
        # refer to it by name.
        df = tm.makeTimeDataFrame()
        df.index = df.index._with_freq(None)  # freq doesn't round-trip

        _maybe_remove(store, "df1")
        store.append("df1", df.iloc[:, :2], axes=["columns"])
        store.append("df1", df.iloc[:, 2:])
        tm.assert_frame_equal(store["df1"], df)

        only_a = store.select("df1", "columns=A")
        tm.assert_frame_equal(df.reindex(columns=["A"]), only_a)

        # selection on the non-indexable
        first_rows = store.select("df1", ("columns=A", "index=df.index[0:4]"))
        tm.assert_frame_equal(
            df.reindex(columns=["A"], index=df.index[0:4]), first_rows
        )

        # this isn't supported
        err_pattern = re.escape(
            "passing a filterable condition to a non-table indexer "
            "[Filter: Not Initialized]"
        )
        with pytest.raises(TypeError, match=err_pattern):
            store.select("df1", "columns=A and index>df.index[4]")
|
||
|
|
||
|
|
||
|
def test_append_with_different_block_ordering(setup_path):
    """Appends succeed when only the internal block order differs (GH 4096).

    Adding genuinely new columns to an existing table must still raise.
    """
    # GH 4096; using same frames, but different block orderings
    with ensure_clean_store(setup_path) as store:
        for batch in range(10):
            chunk = DataFrame(np.random.randn(10, 2), columns=list("AB"))
            chunk["index"] = range(10)
            chunk["index"] += batch * 10
            chunk["int64"] = Series([1] * len(chunk), dtype="int64")
            chunk["int16"] = Series([1] * len(chunk), dtype="int16")

            # shuffle column order on some iterations by removing and
            # re-adding a column
            if batch % 2 == 0:
                del chunk["int64"]
                chunk["int64"] = Series([1] * len(chunk), dtype="int64")
            if batch % 3 == 0:
                chunk["A"] = chunk.pop("A")

            chunk = chunk.set_index("index")
            store.append("df", chunk)

    # test a different ordering but with more fields (like invalid
    # combinations)
    with ensure_clean_store(setup_path) as store:
        base = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64")
        base["int64"] = Series([1] * len(base), dtype="int64")
        base["int16"] = Series([1] * len(base), dtype="int16")
        store.append("df", base)

        # store additional fields in different blocks
        base["int16_2"] = Series([1] * len(base), dtype="int16")
        err_pattern = re.escape(
            "cannot match existing table structure for [int16] on appending data"
        )
        with pytest.raises(ValueError, match=err_pattern):
            store.append("df", base)

        # store multiple additional fields in different blocks
        base["float_3"] = Series([1.0] * len(base), dtype="float64")
        err_pattern = re.escape(
            "cannot match existing table structure for [A,B] on appending data"
        )
        with pytest.raises(ValueError, match=err_pattern):
            store.append("df", base)
|
||
|
|
||
|
|
||
|
def test_append_with_strings(setup_path):
    """String columns get the right on-disk itemsize when appended.

    Covers automatic sizing, ``min_itemsize`` given as an int or a dict
    (keyed by ``"values"``, ``"index"`` or a column name), the error raised
    when a later append holds a string that is too long, NaNs in string
    columns, and an invalid ``min_itemsize`` key.
    """

    def check_col(key, name, size):
        # Compare the stored itemsize of column ``name`` in table ``key``
        # with ``size``. ``store`` is looked up when this is called, so the
        # helper works for both stores opened below.
        # The second block originally used ``assert <itemsize>, size``,
        # which treats ``size`` as the assertion message and never fails.
        column = getattr(store.get_storer(key).table.description, name)
        assert column.itemsize == size

    with ensure_clean_store(setup_path) as store:
        with catch_warnings(record=True):

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big", df)
            tm.assert_frame_equal(store.select("df_big"), df)
            check_col("df_big", "values_block_1", 15)

            # appending smaller string ok
            df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
            store.append("df_big", df2)
            expected = concat([df, df2])
            tm.assert_frame_equal(store.select("df_big"), expected)
            check_col("df_big", "values_block_1", 15)

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big2", df, min_itemsize={"values": 50})
            tm.assert_frame_equal(store.select("df_big2"), df)
            check_col("df_big2", "values_block_1", 50)

            # bigger string on next append
            store.append("df_new", df)
            df_new = DataFrame(
                [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
            )
            msg = (
                r"Trying to store a string with len \[26\] in "
                r"\[values_block_1\] column but\n"
                r"this column has a limit of \[15\]!\n"
                "Consider using min_itemsize to preset the sizes on these "
                "columns"
            )
            with pytest.raises(ValueError, match=msg):
                store.append("df_new", df_new)

            # min_itemsize on Series index (GH 11412)
            df = tm.makeMixedDataFrame().set_index("C")
            store.append("ss", df["B"], min_itemsize={"index": 4})
            tm.assert_series_equal(store.select("ss"), df["B"])

            # same as above, with data_columns=True
            store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4})
            tm.assert_series_equal(store.select("ss2"), df["B"])

            # min_itemsize in index without appending (GH 10381)
            store.put("ss3", df, format="table", min_itemsize={"index": 6})
            # just make sure there is a longer string:
            df2 = df.copy().reset_index().assign(C="longer").set_index("C")
            store.append("ss3", df2)
            tm.assert_frame_equal(store.select("ss3"), concat([df, df2]))

            # same as above, with a Series
            store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
            store.append("ss4", df2["B"])
            tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]]))

            # with nans
            _maybe_remove(store, "df")
            df = tm.makeTimeDataFrame()
            df["string"] = "foo"
            df.loc[df.index[1:4], "string"] = np.nan
            df["string2"] = "bar"
            df.loc[df.index[4:8], "string2"] = np.nan
            df["string3"] = "bah"
            df.loc[df.index[1:], "string3"] = np.nan
            store.append("df", df)
            result = store.select("df")
            tm.assert_frame_equal(result, df)

    with ensure_clean_store(setup_path) as store:
        df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))

        # a min_itemsize that creates a data_column
        _maybe_remove(store, "df")
        store.append("df", df, min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["B", "A"]

        # a "values" min_itemsize alongside an explicit data column
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
        check_col("df", "B", 200)
        check_col("df", "values_block_0", 200)
        assert store.get_storer("df").data_columns == ["B"]

        # infer the .typ on subsequent appends
        _maybe_remove(store, "df")
        store.append("df", df[:5], min_itemsize=200)
        store.append("df", df[5:], min_itemsize=200)
        tm.assert_frame_equal(store["df"], df)

        # invalid min_itemsize keys
        df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
        _maybe_remove(store, "df")
        msg = re.escape(
            "min_itemsize has the key [foo] which is not an axis or data_column"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
|
||
|
|
||
|
|
||
|
def test_append_with_empty_string(setup_path):
    """An empty string appended after non-empty ones round-trips (GH 12242)."""
    with ensure_clean_store(setup_path) as store:
        # the final chunk consists solely of an empty string
        frame = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
        leading = frame[:-1]
        trailing = frame[-1:]
        store.append("df", leading, min_itemsize={"x": 1})
        store.append("df", trailing, min_itemsize={"x": 1})
        tm.assert_frame_equal(store.select("df"), frame)
|
||
|
|
||
|
|
||
|
def test_append_with_data_columns(setup_path):
    """``data_columns`` are indexed, queryable, and honour ``min_itemsize``.

    Covers numeric and string data columns, combined conditions on several
    data columns and the index, a query yielding an empty frame, and the
    two examples taken from the documentation.
    """

    def check_col(key, name, size):
        # ``store`` is resolved at call time, so this works for every store
        # opened below.
        column = getattr(store.get_storer(key).table.description, name)
        assert column.itemsize == size

    with ensure_clean_store(setup_path) as store:
        # NOTE: this local must stay named ``df`` - a where-string below
        # refers to ``df.index`` by name.
        df = tm.makeTimeDataFrame()
        df.iloc[0, df.columns.get_loc("B")] = 1.0
        _maybe_remove(store, "df")
        store.append("df", df[:2], data_columns=["B"])
        store.append("df", df[2:])
        tm.assert_frame_equal(store["df"], df)

        # check that we have indices created
        table_cols = store._handle.root.df.table.cols
        assert table_cols.index.is_indexed is True
        assert table_cols.B.is_indexed is True

        # data column searching
        tm.assert_frame_equal(store.select("df", "B>0"), df[df.B > 0])

        # data column searching (with an indexable and a data_columns)
        selected = store.select("df", "B>0 and index>df.index[3]")
        later_rows = df.reindex(index=df.index[4:])
        tm.assert_frame_equal(selected, later_rows[later_rows.B > 0])

        # data column selection with a string data_column
        str_frame = df.copy()
        str_frame["string"] = "foo"
        str_frame.loc[str_frame.index[1:4], "string"] = np.nan
        str_frame.loc[str_frame.index[5:6], "string"] = "bar"
        _maybe_remove(store, "df")
        store.append("df", str_frame, data_columns=["string"])
        selected = store.select("df", "string='foo'")
        tm.assert_frame_equal(selected, str_frame[str_frame.string == "foo"])

    # using min_itemsize and a data column: the size may be given per
    # column, as a single int, or under the "values" key
    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "df")
        store.append(
            "df", str_frame, data_columns=["string"], min_itemsize={"string": 30}
        )
        check_col("df", "string", 30)

        _maybe_remove(store, "df")
        store.append("df", str_frame, data_columns=["string"], min_itemsize=30)
        check_col("df", "string", 30)

        _maybe_remove(store, "df")
        store.append(
            "df", str_frame, data_columns=["string"], min_itemsize={"values": 30}
        )
        check_col("df", "string", 30)

    with ensure_clean_store(setup_path) as store:
        str_frame["string2"] = "foobarbah"
        str_frame["string_block1"] = "foobarbah1"
        str_frame["string_block2"] = "foobarbah2"
        _maybe_remove(store, "df")
        store.append(
            "df",
            str_frame,
            data_columns=["string", "string2"],
            min_itemsize={"string": 30, "string2": 40, "values": 50},
        )
        check_col("df", "string", 30)
        check_col("df", "string2", 40)
        check_col("df", "values_block_1", 50)

    with ensure_clean_store(setup_path) as store:
        # multiple data columns
        multi = df.copy()
        multi.iloc[0, multi.columns.get_loc("A")] = 1.0
        multi.iloc[0, multi.columns.get_loc("B")] = -1.0

        multi["string"] = "foo"
        col_pos = multi.columns.get_loc("string")
        multi.iloc[1:4, col_pos] = np.nan
        multi.iloc[5:6, col_pos] = "bar"

        multi["string2"] = "foo"
        col_pos = multi.columns.get_loc("string2")
        multi.iloc[2:5, col_pos] = np.nan
        multi.iloc[7:8, col_pos] = "bar"

        _maybe_remove(store, "df")
        store.append("df", multi, data_columns=["A", "B", "string", "string2"])
        selected = store.select("df", "string='foo' and string2='foo' and A>0 and B<0")
        mask = (
            (multi.string == "foo")
            & (multi.string2 == "foo")
            & (multi.A > 0)
            & (multi.B < 0)
        )
        # FIXME: 2020-05-07 freq check randomly fails in the CI
        tm.assert_frame_equal(selected, multi[mask], check_freq=False)

        # yield an empty frame
        selected = store.select("df", "string='foo' and string2='cool'")
        mask = (multi.string == "foo") & (multi.string2 == "cool")
        tm.assert_frame_equal(selected, multi[mask])

    with ensure_clean_store(setup_path) as store:
        # doc example
        df_dc = df.copy()
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc["string2"] = "cool"
        df_dc["datetime"] = Timestamp("20010102")
        df_dc = df_dc._convert(datetime=True)
        df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan

        _maybe_remove(store, "df_dc")
        store.append(
            "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
        )
        selected = store.select("df_dc", "B>0")
        tm.assert_frame_equal(selected, df_dc[df_dc.B > 0])

        selected = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
        mask = (df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")
        # FIXME: 2020-12-07 intermittent build failures here with freq of
        # None instead of BDay(4)
        tm.assert_frame_equal(selected, df_dc[mask], check_freq=False)

    with ensure_clean_store(setup_path) as store:
        # doc example part 2
        np.random.seed(1234)
        index = date_range("1/1/2000", periods=8)
        df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc[["B", "C"]] = df_dc[["B", "C"]].abs()
        df_dc["string2"] = "cool"

        # on-disk operations
        store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])

        selected = store.select("df_dc", "B>0")
        tm.assert_frame_equal(selected, df_dc[df_dc.B > 0])

        selected = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
        mask = (df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")
        tm.assert_frame_equal(selected, df_dc[mask])
|
||
|
|
||
|
|
||
|
def test_append_hierarchical(setup_path, multiindex_dataframe_random_data):
    """A MultiIndex frame round-trips, including column-subset reads (GH 3748)."""
    df = multiindex_dataframe_random_data
    df.columns.name = None
    expected_subset = df.reindex(columns=["A", "B"])

    with ensure_clean_store(setup_path) as store:
        store.append("mi", df)
        tm.assert_frame_equal(store.select("mi"), df)

        # GH 3748
        subset = store.select("mi", columns=["A", "B"])
        tm.assert_frame_equal(subset, expected_subset)

    # same column selection through the to_hdf / read_hdf entry points
    with ensure_clean_path("test.hdf") as path:
        df.to_hdf(path, "df", format="table")
        subset = read_hdf(path, "df", columns=["A", "B"])
        tm.assert_frame_equal(subset, expected_subset)
|
||
|
|
||
|
|
||
|
def test_append_misc(setup_path):
    """``append`` accepts the ``chunksize`` and ``expectedrows`` keywords."""
    with ensure_clean_store(setup_path) as store:
        frame = tm.makeDataFrame()
        cases = (
            ("df", {"chunksize": 1}),
            ("df1", {"expectedrows": 10}),
        )
        for key, append_kwargs in cases:
            store.append(key, frame, **append_kwargs)
            stored = store.select(key)
            tm.assert_frame_equal(stored, frame)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("chunksize", [10, 200, 1000])
def test_append_misc_chunksize(setup_path, chunksize):
    """A mixed-dtype frame round-trips for several ``chunksize`` values."""
    # more chunksize in append tests
    frame = tm.makeDataFrame()
    frame["string"] = "foo"
    frame["float322"] = 1.0
    frame["float322"] = frame["float322"].astype("float32")
    frame["bool"] = frame["float322"] > 0
    frame["time1"] = Timestamp("20130101")
    frame["time2"] = Timestamp("20130102")

    with ensure_clean_store(setup_path, mode="w") as store:
        store.append("obj", frame, chunksize=chunksize)
        tm.assert_frame_equal(store.select("obj"), frame)
|
||
|
|
||
|
|
||
|
def test_append_misc_empty_frame(setup_path):
    """Appending a zero-length frame writes nothing (GH4273)."""
    # empty frame, GH4273
    with ensure_clean_store(setup_path) as store:
        columns = list("ABC")

        # 0 len: the append creates no node, so selecting raises
        no_rows = DataFrame(columns=columns)
        store.append("df", no_rows)
        with pytest.raises(KeyError, match="'No object named df in the file'"):
            store.select("df")

        # repeated append of 0/non-zero frames
        populated = DataFrame(np.random.rand(10, 3), columns=columns)
        store.append("df", populated)
        tm.assert_frame_equal(store.select("df"), populated)
        store.append("df", no_rows)
        tm.assert_frame_equal(store.select("df"), populated)

        # store: put() with an empty frame does round-trip
        empty = DataFrame(columns=list("ABC"))
        store.put("df2", empty)
        tm.assert_frame_equal(store.select("df2"), empty)
|
||
|
|
||
|
|
||
|
# TODO(ArrayManager) currently we rely on falling back to BlockManager, but
|
||
|
# the conversion from AM->BM converts the invalid object dtype column into
|
||
|
# a datetime64 column no longer raising an error
|
||
|
@td.skip_array_manager_not_yet_implemented
def test_append_raise(setup_path):
    """Invalid input to ``append`` raises with a descriptive message."""
    with ensure_clean_store(setup_path) as store:
        # test append with invalid input to get good error messages

        # list in column
        bad = tm.makeDataFrame()
        bad["invalid"] = [["a"]] * len(bad)
        assert bad.dtypes["invalid"] == np.object_
        serialize_err = re.escape(
            "Cannot serialize the column [invalid]\n"
            "because its data contents are not [string] but [mixed] object dtype"
        )
        with pytest.raises(TypeError, match=serialize_err):
            store.append("df", bad)

        # multiple invalid columns: the same message is expected
        bad["invalid2"] = [["a"]] * len(bad)
        bad["invalid3"] = [["a"]] * len(bad)
        with pytest.raises(TypeError, match=serialize_err):
            store.append("df", bad)

        # datetime with embedded nans as object
        bad = tm.makeDataFrame()
        stamps = Series(datetime.datetime(2001, 1, 2), index=bad.index)
        stamps = stamps.astype(object)
        stamps[0:5] = np.nan
        bad["invalid"] = stamps
        assert bad.dtypes["invalid"] == np.object_
        tz_err = "too many timezones in this block, create separate data columns"
        with pytest.raises(TypeError, match=tz_err):
            store.append("df", bad)

        # directly ndarray
        with pytest.raises(TypeError, match="value must be None, Series, or DataFrame"):
            store.append("df", np.arange(10))

        # series directly
        storer_err = re.escape(
            "cannot properly create the storer for: "
            "[group->df,value-><class 'pandas.core.series.Series'>]"
        )
        with pytest.raises(TypeError, match=storer_err):
            store.append("df", Series(np.arange(10)))

        # appending an incompatible table
        frame = tm.makeDataFrame()
        store.append("df", frame)

        frame["foo"] = "foo"
        axes_err = re.escape(
            "invalid combination of [non_index_axes] on appending data "
            "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
            "[(1, ['A', 'B', 'C', 'D'])]"
        )
        with pytest.raises(ValueError, match=axes_err):
            store.append("df", frame)

        # incompatible type (GH 41897)
        _maybe_remove(store, "df")
        frame["foo"] = Timestamp("20130101")
        store.append("df", frame)
        frame["foo"] = "bar"
        dtype_err = re.escape(
            "invalid combination of [values_axes] on appending data "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->bytes24,kind->string,shape->(1, 30)] "
            "vs current table "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->datetime64,kind->datetime64,shape->None]"
        )
        with pytest.raises(ValueError, match=dtype_err):
            store.append("df", frame)
|
||
|
|
||
|
|
||
|
def test_append_with_timedelta(setup_path):
    """timedelta64 columns can be appended and queried (GH 3577)."""
    # GH 3577
    # append timedelta
    start = Timestamp("20130101")
    frame = DataFrame(
        {
            "A": Timestamp("20130101"),
            "B": [start + timedelta(days=i, seconds=10) for i in range(10)],
        }
    )
    frame["C"] = frame["A"] - frame["B"]
    frame.loc[3:5, "C"] = np.nan

    with ensure_clean_store(setup_path) as store:
        # table
        _maybe_remove(store, "df")
        store.append("df", frame, data_columns=True)
        tm.assert_frame_equal(store.select("df"), frame)

        selected = store.select("df", where="C<100000")
        tm.assert_frame_equal(selected, frame)

        # the same cut-off given as a Timedelta expression and as a string
        selected = store.select("df", where="C<pd.Timedelta('-3D')")
        tm.assert_frame_equal(selected, frame.iloc[3:])

        selected = store.select("df", "C<'-3D'")
        tm.assert_frame_equal(selected, frame.iloc[3:])

        # a bit hacky here as we don't really deal with the NaT properly
        selected = store.select("df", "C<'-500000s'")
        selected = selected.dropna(subset=["C"])
        tm.assert_frame_equal(selected, frame.iloc[6:])

        selected = store.select("df", "C<'-3.5D'")
        selected = selected.iloc[1:]
        tm.assert_frame_equal(selected, frame.iloc[4:])

        # fixed
        _maybe_remove(store, "df2")
        store.put("df2", frame)
        tm.assert_frame_equal(store.select("df2"), frame)
|
||
|
|
||
|
|
||
|
def test_append_to_multiple(setup_path):
    """``append_to_multiple`` validates its arguments and splits a frame."""
    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    right["foo"] = "bar"
    combined = concat([left, right], axis=1)

    with ensure_clean_store(setup_path) as store:
        # exceptions: the selector must be one of the keys of the mapping
        selector_err = "append_to_multiple requires a selector that is in passed dict"
        with pytest.raises(ValueError, match=selector_err):
            store.append_to_multiple(
                {"df1": ["A", "B"], "df2": None}, combined, selector="df3"
            )

        with pytest.raises(ValueError, match=selector_err):
            store.append_to_multiple(
                {"df1": None, "df2": None}, combined, selector="df3"
            )

        # exceptions: the split must be given as a dict
        dict_err = (
            "append_to_multiple must have a dictionary specified as the way to "
            "split the value"
        )
        with pytest.raises(ValueError, match=dict_err):
            store.append_to_multiple("df1", combined, "df1")

        # regular operation
        store.append_to_multiple(
            {"df1": ["A", "B"], "df2": None}, combined, selector="df1"
        )
        selected = store.select_as_multiple(
            ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
        )
        mask = (combined.A > 0) & (combined.B > 0)
        tm.assert_frame_equal(selected, combined[mask])
|
||
|
|
||
|
|
||
|
def test_append_to_multiple_dropna(setup_path):
    """With ``dropna=True`` the split tables keep identical row indexes."""
    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    # blank A and B in the second row so it counts as all-NaN for the
    # selector table
    left.iloc[1, left.columns.get_indexer(["A", "B"])] = np.nan
    combined = concat([left, right], axis=1)

    with ensure_clean_store(setup_path) as store:
        # dropna=True should guarantee rows are synchronized
        store.append_to_multiple(
            {"df1": ["A", "B"], "df2": None}, combined, selector="df1", dropna=True
        )
        selected = store.select_as_multiple(["df1", "df2"])
        tm.assert_frame_equal(selected, combined.dropna())
        tm.assert_index_equal(store.select("df1").index, store.select("df2").index)
|
||
|
|
||
|
|
||
|
def test_append_to_multiple_dropna_false(setup_path):
    """With ``dropna=False`` the split tables may end up with different rows."""
    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    left.iloc[1, left.columns.get_indexer(["A", "B"])] = np.nan
    combined = concat([left, right], axis=1)

    with ensure_clean_store(setup_path) as store:
        with pd.option_context("io.hdf.dropna_table", True):
            # dropna=False shouldn't synchronize row indexes
            store.append_to_multiple(
                {"df1a": ["A", "B"], "df2a": None},
                combined,
                selector="df1a",
                dropna=False,
            )

            nrows_err = "all tables must have exactly the same nrows!"
            with pytest.raises(ValueError, match=nrows_err):
                store.select_as_multiple(["df1a", "df2a"])

            selector_index = store.select("df1a").index
            other_index = store.select("df2a").index
            assert not selector_index.equals(other_index)
|
||
|
|
||
|
|
||
|
def test_append_to_multiple_min_itemsize(setup_path):
    """``append_to_multiple`` accepts a ``min_itemsize`` mapping (GH 11238)."""
    # GH 11238
    numbers = np.arange(1, 21)
    frame = DataFrame(
        {
            "IX": np.arange(1, 21),
            "Num": np.arange(1, 21),
            "BigNum": numbers * 88,
            "Str": ["a"] * 20,
            "LongStr": ["abcde"] * 20,
        }
    )
    first_row = frame.iloc[[0]]

    with ensure_clean_store(setup_path) as store:
        # min_itemsize names columns that live in different target tables
        split = {
            "index": ["IX"],
            "nums": ["Num", "BigNum"],
            "strs": ["Str", "LongStr"],
        }
        store.append_to_multiple(
            split,
            frame.iloc[[0]],
            "index",
            min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
        )
        selected = store.select_as_multiple(["index", "nums", "strs"])
        tm.assert_frame_equal(selected, first_row)
|