"""
|
|
Tests for the pandas custom headers in http(s) requests
|
|
"""
|
|
import gzip
|
|
import http.server
|
|
from io import BytesIO
|
|
import multiprocessing
|
|
import socket
|
|
import time
|
|
import urllib.error
|
|
|
|
import pytest
|
|
|
|
from pandas.compat import is_ci_environment
|
|
import pandas.util._test_decorators as td
|
|
|
|
import pandas as pd
|
|
import pandas._testing as tm
|
|
|
|
pytestmark = pytest.mark.skipif(
|
|
is_ci_environment(),
|
|
reason="This test can hang in our CI min_versions build "
|
|
"and leads to '##[error]The runner has "
|
|
"received a shutdown signal...' in GHA. GH 45651",
|
|
)
|
|
|
|
|
|
class BaseUserAgentResponder(http.server.BaseHTTPRequestHandler):
    """
    Base class for setting up a server that responds with a particular
    file format and the accompanying content-type headers.

    The interfaces of the different io methods differ enough that a
    shared base class with one subclass per format seemed the cleanest
    approach.
    """

    def start_processing_headers(self):
        """
        Shared logic at the start of a GET request.
        """
        self.send_response(200)
        self.requested_from_user_agent = self.headers["User-Agent"]
        response_df = pd.DataFrame(
            {
                "header": [self.requested_from_user_agent],
            }
        )
        return response_df

    def gzip_bytes(self, response_bytes):
        """
        Some web servers will send back gzipped files to save bandwidth.
        """
        with BytesIO() as bio:
            with gzip.GzipFile(fileobj=bio, mode="w") as zipper:
                zipper.write(response_bytes)
            response_bytes = bio.getvalue()
        return response_bytes

    def write_back_bytes(self, response_bytes):
        """
        Shared logic at the end of a GET request.
        """
        self.wfile.write(response_bytes)


class CSVUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()

        self.send_header("Content-Type", "text/csv")
        self.end_headers()

        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        self.write_back_bytes(response_bytes)


class GzippedCSVUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "text/csv")
        self.send_header("Content-Encoding", "gzip")
        self.end_headers()

        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        response_bytes = self.gzip_bytes(response_bytes)

        self.write_back_bytes(response_bytes)


class JSONUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/json")
        self.end_headers()

        response_bytes = response_df.to_json().encode("utf-8")

        self.write_back_bytes(response_bytes)


class GzippedJSONUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Encoding", "gzip")
        self.end_headers()

        response_bytes = response_df.to_json().encode("utf-8")
        response_bytes = self.gzip_bytes(response_bytes)

        self.write_back_bytes(response_bytes)


class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()

        response_bytes = response_df.to_parquet(index=False, engine="pyarrow")

        self.write_back_bytes(response_bytes)


class ParquetFastParquetUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()

        # the fastparquet engine doesn't like to write to a buffer;
        # it can do so via the open_with function being set appropriately,
        # but it automatically calls the close method and wipes the buffer,
        # so write to an fsspec in-memory file and read the bytes back instead

        # protected by an importorskip in the respective test
        import fsspec

        response_df.to_parquet(
            "memory://fastparquet_user_agent.parquet",
            index=False,
            engine="fastparquet",
            compression=None,
        )
        with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f:
            response_bytes = f.read()

        self.write_back_bytes(response_bytes)


class PickleUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()

        bio = BytesIO()
        response_df.to_pickle(bio)
        response_bytes = bio.getvalue()

        self.write_back_bytes(response_bytes)


class StataUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()

        bio = BytesIO()
        response_df.to_stata(bio, write_index=False)
        response_bytes = bio.getvalue()

        self.write_back_bytes(response_bytes)


class AllHeaderCSVResponder(http.server.BaseHTTPRequestHandler):
    """
    Send all request headers back so the round trip can be checked.
    """

    def do_GET(self):
        response_df = pd.DataFrame(self.headers.items())
        self.send_response(200)
        self.send_header("Content-Type", "text/csv")
        self.end_headers()
        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        self.wfile.write(response_bytes)


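# Note on AllHeaderCSVResponder: pd.DataFrame(self.headers.items()) has the
# default integer column labels 0 and 1 (header name, header value), so the
# CSV written above round-trips through read_csv as string columns "0" and
# "1" — which test_server_and_all_custom_headers relies on below.

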
def wait_until_ready(func):
    def inner(*args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs)
            except urllib.error.URLError:
                # Connection refused while the http server is still starting
                time.sleep(0.1)

    return inner


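# Illustrative usage only (hypothetical port, not executed): wrapping a reader
# makes the first request retry until the forked server has bound its socket:
#
#     read_csv_ready = wait_until_ready(pd.read_csv)
#     df = read_csv_ready("http://localhost:8080/data.csv")

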
def process_server(responder, port):
    # serve exactly one request, then exit; each test gets a fresh server
    # process from the responder fixture
    with http.server.HTTPServer(("localhost", port), responder) as server:
        server.handle_request()
    server.server_close()


@pytest.fixture
def responder(request):
    """
    Fixture that starts a local http server in a separate process on localhost
    and returns the port.

    Running in a separate process instead of a thread to allow termination/killing
    of http server upon cleanup.
    """
    # Find an available port
    with socket.socket() as sock:
        sock.bind(("localhost", 0))
        port = sock.getsockname()[1]

    server_process = multiprocessing.Process(
        target=process_server, args=(request.param, port)
    )
    server_process.start()
    yield port
    # Cleanup: wait briefly for the single request to be handled, then
    # escalate from terminate() to kill() if the process will not exit
    server_process.join(10)
    server_process.terminate()
    kill_time = 5
    wait_time = 0
    while server_process.is_alive():
        if wait_time > kill_time:
            server_process.kill()
            break
        else:
            wait_time += 0.1
            time.sleep(0.1)
    server_process.close()


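# The tests below use pytest's indirect parametrization: the responder class
# in each parametrize entry is routed to the fixture as request.param, and the
# fixture yields the port — so the `responder` argument each test receives is
# the port number, not the class.

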
@pytest.mark.parametrize(
    "responder, read_method, parquet_engine",
    [
        (CSVUserAgentResponder, pd.read_csv, None),
        (JSONUserAgentResponder, pd.read_json, None),
        (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
        pytest.param(
            ParquetFastParquetUserAgentResponder,
            pd.read_parquet,
            "fastparquet",
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (PickleUserAgentResponder, pd.read_pickle, None),
        (StataUserAgentResponder, pd.read_stata, None),
        (GzippedCSVUserAgentResponder, pd.read_csv, None),
        (GzippedJSONUserAgentResponder, pd.read_json, None),
    ],
    indirect=["responder"],
)
def test_server_and_default_headers(responder, read_method, parquet_engine):
    if parquet_engine is not None:
        pytest.importorskip(parquet_engine)
        if parquet_engine == "fastparquet":
            pytest.importorskip("fsspec")

    read_method = wait_until_ready(read_method)
    if parquet_engine is None:
        df_http = read_method(f"http://localhost:{responder}")
    else:
        df_http = read_method(f"http://localhost:{responder}", engine=parquet_engine)

    assert not df_http.empty


@pytest.mark.parametrize(
    "responder, read_method, parquet_engine",
    [
        (CSVUserAgentResponder, pd.read_csv, None),
        (JSONUserAgentResponder, pd.read_json, None),
        (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
        pytest.param(
            ParquetFastParquetUserAgentResponder,
            pd.read_parquet,
            "fastparquet",
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (PickleUserAgentResponder, pd.read_pickle, None),
        (StataUserAgentResponder, pd.read_stata, None),
        (GzippedCSVUserAgentResponder, pd.read_csv, None),
        (GzippedJSONUserAgentResponder, pd.read_json, None),
    ],
    indirect=["responder"],
)
def test_server_and_custom_headers(responder, read_method, parquet_engine):
    if parquet_engine is not None:
        pytest.importorskip(parquet_engine)
        if parquet_engine == "fastparquet":
            pytest.importorskip("fsspec")

    custom_user_agent = "Super Cool One"
    df_true = pd.DataFrame({"header": [custom_user_agent]})

    read_method = wait_until_ready(read_method)
    if parquet_engine is None:
        df_http = read_method(
            f"http://localhost:{responder}",
            storage_options={"User-Agent": custom_user_agent},
        )
    else:
        df_http = read_method(
            f"http://localhost:{responder}",
            storage_options={"User-Agent": custom_user_agent},
            engine=parquet_engine,
        )

    tm.assert_frame_equal(df_true, df_http)


@pytest.mark.parametrize(
    "responder, read_method",
    [
        (AllHeaderCSVResponder, pd.read_csv),
    ],
    indirect=["responder"],
)
def test_server_and_all_custom_headers(responder, read_method):
    custom_user_agent = "Super Cool One"
    custom_auth_token = "Super Secret One"
    storage_options = {
        "User-Agent": custom_user_agent,
        "Auth": custom_auth_token,
    }
    read_method = wait_until_ready(read_method)
    df_http = read_method(
        f"http://localhost:{responder}",
        storage_options=storage_options,
    )

    # keep only the headers we sent ourselves
    df_http = df_http[df_http["0"].isin(storage_options.keys())]
    df_http = df_http.sort_values(["0"]).reset_index()
    df_http = df_http[["0", "1"]]

    keys = list(storage_options.keys())
    df_true = pd.DataFrame({"0": keys, "1": [storage_options[k] for k in keys]})
    df_true = df_true.sort_values(["0"])
    df_true = df_true.reset_index().drop(["index"], axis=1)

    tm.assert_frame_equal(df_true, df_http)


@pytest.mark.parametrize(
    "engine",
    [
        "pyarrow",
        "fastparquet",
    ],
)
def test_to_parquet_to_disk_with_storage_options(engine):
    headers = {
        "User-Agent": "custom",
        "Auth": "other_custom",
    }

    pytest.importorskip(engine)

    true_df = pd.DataFrame({"column_name": ["column_value"]})
    msg = (
        "storage_options passed with file object or non-fsspec file path|"
        "storage_options passed with buffer, or non-supported URL"
    )
    with pytest.raises(ValueError, match=msg):
        true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine)
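

# Note: storage_options (and therefore custom headers) are only forwarded for
# URLs handled by fsspec or the http(s) machinery; for a plain local path like
# the one above there is nothing to forward them to, hence the ValueError that
# both engines raise (with the engine-specific messages matched above).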