import gzip
import io
import os
from pathlib import Path
import subprocess
import sys
import tarfile
import textwrap
import time
import zipfile

import pytest

from pandas.compat import is_platform_windows

import pandas as pd
import pandas._testing as tm

import pandas.io.common as icom

_compression_to_extension = {
    value: key for key, value in icom._extension_to_compression.items()
}


@pytest.mark.parametrize(
    "obj",
    [
        pd.DataFrame(
            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
            columns=["X", "Y", "Z"],
        ),
        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
    ],
)
@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
def test_compression_size(obj, method, compression_only):
    if compression_only == "tar":
        compression_only = {"method": "tar", "mode": "w:gz"}

    with tm.ensure_clean() as path:
        getattr(obj, method)(path, compression=compression_only)
        compressed_size = os.path.getsize(path)
        getattr(obj, method)(path, compression=None)
        uncompressed_size = os.path.getsize(path)
        assert uncompressed_size > compressed_size


@pytest.mark.parametrize(
    "obj",
    [
        pd.DataFrame(
            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
            columns=["X", "Y", "Z"],
        ),
        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
    ],
)
@pytest.mark.parametrize("method", ["to_csv", "to_json"])
def test_compression_size_fh(obj, method, compression_only):
    with tm.ensure_clean() as path:
        with icom.get_handle(
            path,
            "w:gz" if compression_only == "tar" else "w",
            compression=compression_only,
        ) as handles:
            getattr(obj, method)(handles.handle)
            assert not handles.handle.closed
        compressed_size = os.path.getsize(path)

    with tm.ensure_clean() as path:
        with icom.get_handle(path, "w", compression=None) as handles:
            getattr(obj, method)(handles.handle)
            assert not handles.handle.closed
        uncompressed_size = os.path.getsize(path)
        assert uncompressed_size > compressed_size


@pytest.mark.parametrize(
    "write_method, write_kwargs, read_method",
    [
        ("to_csv", {"index": False}, pd.read_csv),
        ("to_json", {}, pd.read_json),
        ("to_pickle", {}, pd.read_pickle),
    ],
)
def test_dataframe_compression_defaults_to_infer(
    write_method, write_kwargs, read_method, compression_only
):
    # GH22004
    input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"])
    extension = _compression_to_extension[compression_only]
    with tm.ensure_clean("compressed" + extension) as path:
        getattr(input, write_method)(path, **write_kwargs)
        output = read_method(path, compression=compression_only)
    tm.assert_frame_equal(output, input)


@pytest.mark.parametrize(
    "write_method,write_kwargs,read_method,read_kwargs",
    [
        ("to_csv", {"index": False, "header": True}, pd.read_csv, {"squeeze": True}),
        ("to_json", {}, pd.read_json, {"typ": "series"}),
        ("to_pickle", {}, pd.read_pickle, {}),
    ],
)
def test_series_compression_defaults_to_infer(
    write_method, write_kwargs, read_method, read_kwargs, compression_only
):
    # GH22004
    input = pd.Series([0, 5, -2, 10], name="X")
    extension = _compression_to_extension[compression_only]
    with tm.ensure_clean("compressed" + extension) as path:
        getattr(input, write_method)(path, **write_kwargs)
        if "squeeze" in read_kwargs:
            kwargs = read_kwargs.copy()
            del kwargs["squeeze"]
            output = read_method(path, compression=compression_only, **kwargs).squeeze(
                "columns"
            )
        else:
            output = read_method(path, compression=compression_only, **read_kwargs)
    tm.assert_series_equal(output, input, check_names=False)
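

# Illustrative sketch, not part of the test suite: the "defaults to infer"
# tests above rely on pandas inferring the compression protocol from the file
# extension. pandas.io.common.infer_compression exposes that mapping directly;
# the helper name _demo_infer_compression is ours, but everything it calls is
# real pandas API.
def _demo_infer_compression() -> None:
    assert icom.infer_compression("data.csv.gz", compression="infer") == "gzip"
    assert icom.infer_compression("data.csv", compression="infer") is None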


def test_compression_warning(compression_only):
    # Assert that passing a file object to to_csv while explicitly specifying a
    # compression protocol triggers a RuntimeWarning, as per GH21227.
    df = pd.DataFrame(
        100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
        columns=["X", "Y", "Z"],
    )
    with tm.ensure_clean() as path:
        with icom.get_handle(path, "w", compression=compression_only) as handles:
            with tm.assert_produces_warning(RuntimeWarning):
                df.to_csv(handles.handle, compression=compression_only)


def test_compression_binary(compression_only):
    """
    Binary file handles support compression.

    GH22555
    """
    df = tm.makeDataFrame()

    # with a file
    with tm.ensure_clean() as path:
        with open(path, mode="wb") as file:
            df.to_csv(file, mode="wb", compression=compression_only)
            file.seek(0)  # file shouldn't be closed
        tm.assert_frame_equal(
            df, pd.read_csv(path, index_col=0, compression=compression_only)
        )

    # with BytesIO
    file = io.BytesIO()
    df.to_csv(file, mode="wb", compression=compression_only)
    file.seek(0)  # file shouldn't be closed
    tm.assert_frame_equal(
        df, pd.read_csv(file, index_col=0, compression=compression_only)
    )


def test_gzip_reproducibility_file_name():
    """
    Gzip should create reproducible archives with mtime.

    Note: Archives created with different filenames will still be different!

    GH 28103
    """
    df = tm.makeDataFrame()
    compression_options = {"method": "gzip", "mtime": 1}

    # test for filename
    with tm.ensure_clean() as path:
        path = Path(path)
        df.to_csv(path, compression=compression_options)
        time.sleep(2)
        output = path.read_bytes()
        df.to_csv(path, compression=compression_options)
        assert output == path.read_bytes()


def test_gzip_reproducibility_file_object():
    """
    Gzip should create reproducible archives with mtime.

    GH 28103
    """
    df = tm.makeDataFrame()
    compression_options = {"method": "gzip", "mtime": 1}

    # test for file object
    buffer = io.BytesIO()
    df.to_csv(buffer, compression=compression_options, mode="wb")
    output = buffer.getvalue()
    time.sleep(2)
    buffer = io.BytesIO()
    df.to_csv(buffer, compression=compression_options, mode="wb")
    assert output == buffer.getvalue()


def test_with_missing_lzma():
    """Tests if import pandas works when lzma is not present."""
    # https://github.com/pandas-dev/pandas/issues/27575
    code = textwrap.dedent(
        """\
        import sys
        sys.modules['lzma'] = None
        import pandas
        """
    )
    subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
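

# Illustrative sketch, not part of the test suite: the reproducibility tests
# above pass compression as a dict so that extra keys (here mtime) are
# forwarded to the underlying gzip.GzipFile constructor; with a fixed mtime,
# two writes of the same data yield byte-identical archives. The helper name
# _demo_reproducible_gzip_bytes is ours.
def _demo_reproducible_gzip_bytes() -> bytes:
    df = pd.DataFrame({"X": [1, 2, 3]})
    buffer = io.BytesIO()
    df.to_csv(buffer, compression={"method": "gzip", "mtime": 1}, mode="wb")
    return buffer.getvalue()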
""" code = textwrap.dedent( """ import sys import pytest sys.modules['lzma'] = None import pandas as pd df = pd.DataFrame() with pytest.raises(RuntimeError, match='lzma module'): df.to_csv('foo.csv', compression='xz') """ ) subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE) @pytest.mark.parametrize( "obj", [ pd.DataFrame( 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], columns=["X", "Y", "Z"], ), pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), ], ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_gzip_compression_level(obj, method): # GH33196 with tm.ensure_clean() as path: getattr(obj, method)(path, compression="gzip") compressed_size_default = os.path.getsize(path) getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1}) compressed_size_fast = os.path.getsize(path) assert compressed_size_default < compressed_size_fast @pytest.mark.parametrize( "obj", [ pd.DataFrame( 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], columns=["X", "Y", "Z"], ), pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), ], ) @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) def test_bzip_compression_level(obj, method): """GH33196 bzip needs file size > 100k to show a size difference between compression levels, so here we just check if the call works when compression is passed as a dict. """ with tm.ensure_clean() as path: getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1}) @pytest.mark.parametrize( "suffix,archive", [ (".zip", zipfile.ZipFile), (".tar", tarfile.TarFile), ], ) def test_empty_archive_zip(suffix, archive): with tm.ensure_clean(filename=suffix) as path: file = archive(path, "w") file.close() with pytest.raises(ValueError, match="Zero files found"): pd.read_csv(path) def test_ambiguous_archive_zip(): with tm.ensure_clean(filename=".zip") as path: file = zipfile.ZipFile(path, "w") file.writestr("a.csv", "foo,bar") file.writestr("b.csv", "foo,bar") file.close() with pytest.raises(ValueError, match="Multiple files found in ZIP file"): pd.read_csv(path) def test_ambiguous_archive_tar(): with tm.ensure_clean_dir() as dir: csvAPath = os.path.join(dir, "a.csv") with open(csvAPath, "w") as a: a.write("foo,bar\n") csvBPath = os.path.join(dir, "b.csv") with open(csvBPath, "w") as b: b.write("foo,bar\n") tarpath = os.path.join(dir, "archive.tar") with tarfile.TarFile(tarpath, "w") as tar: tar.add(csvAPath, "a.csv") tar.add(csvBPath, "b.csv") with pytest.raises(ValueError, match="Multiple files found in TAR archive"): pd.read_csv(tarpath) def test_tar_gz_to_different_filename(): with tm.ensure_clean(filename=".foo") as file: pd.DataFrame( [["1", "2"]], columns=["foo", "bar"], ).to_csv(file, compression={"method": "tar", "mode": "w:gz"}, index=False) with gzip.open(file) as uncompressed: with tarfile.TarFile(fileobj=uncompressed) as archive: members = archive.getmembers() assert len(members) == 1 content = archive.extractfile(members[0]).read().decode("utf8") if is_platform_windows(): expected = "foo,bar\r\n1,2\r\n" else: expected = "foo,bar\n1,2\n" assert content == expected def test_tar_no_error_on_close(): with io.BytesIO() as buffer: with icom._BytesTarFile(fileobj=buffer, mode="w"): pass