"""
Tests for the pandas custom headers in http(s) requests
"""
import gzip
import http.server
from io import BytesIO
import multiprocessing
import socket
import time
import urllib.error
import pytest
from pandas.compat import is_ci_environment
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
pytestmark = pytest.mark.skipif(
is_ci_environment(),
reason="This test can hang in our CI min_versions build "
"and leads to '##[error]The runner has "
"received a shutdown signal...' in GHA. GH 45651",
)


class BaseUserAgentResponder(http.server.BaseHTTPRequestHandler):
    """
    Base class for setting up a server that can respond with a particular
    file format and the accompanying content-type headers.

    The interfaces of the different io methods differ enough that this
    seemed the logical way to organize it.
    """

    def start_processing_headers(self):
        """
        Shared logic at the start of a GET request.
        """
        self.send_response(200)
        self.requested_from_user_agent = self.headers["User-Agent"]
        response_df = pd.DataFrame(
            {
                "header": [self.requested_from_user_agent],
            }
        )
        return response_df

    def gzip_bytes(self, response_bytes):
        """
        Some web servers send back gzipped files to save bandwidth.
        """
        with BytesIO() as bio:
            with gzip.GzipFile(fileobj=bio, mode="w") as zipper:
                zipper.write(response_bytes)
            response_bytes = bio.getvalue()
        return response_bytes

    def write_back_bytes(self, response_bytes):
        """
        Shared logic at the end of a GET request.
        """
        self.wfile.write(response_bytes)
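

# Each responder below implements do_GET the same way: build the one-row
# DataFrame from start_processing_headers, serialize it in a single format,
# and write the bytes back, so every pd.read_* entry point can be exercised
# against the same tiny server.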
class CSVUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "text/csv")
        self.end_headers()
        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        self.write_back_bytes(response_bytes)


class GzippedCSVUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "text/csv")
        self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        response_bytes = self.gzip_bytes(response_bytes)
        self.write_back_bytes(response_bytes)


class JSONUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        response_bytes = response_df.to_json().encode("utf-8")
        self.write_back_bytes(response_bytes)


class GzippedJSONUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        response_bytes = response_df.to_json().encode("utf-8")
        response_bytes = self.gzip_bytes(response_bytes)
        self.write_back_bytes(response_bytes)


class ParquetPyArrowUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        response_bytes = response_df.to_parquet(index=False, engine="pyarrow")
        self.write_back_bytes(response_bytes)


class ParquetFastParquetUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        # the fastparquet engine doesn't write directly to a buffer: it can be
        # pointed at one via the open_with function, but it then calls the
        # close method itself and wipes the buffer, so instead write to an
        # fsspec in-memory filesystem and read the bytes back.
        # fsspec is protected by an importorskip in the respective test
        import fsspec

        response_df.to_parquet(
            "memory://fastparquet_user_agent.parquet",
            index=False,
            engine="fastparquet",
            compression=None,
        )
        with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f:
            response_bytes = f.read()
        self.write_back_bytes(response_bytes)


class PickleUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        bio = BytesIO()
        response_df.to_pickle(bio)
        response_bytes = bio.getvalue()
        self.write_back_bytes(response_bytes)


class StataUserAgentResponder(BaseUserAgentResponder):
    def do_GET(self):
        response_df = self.start_processing_headers()
        self.send_header("Content-Type", "application/octet-stream")
        self.end_headers()
        bio = BytesIO()
        response_df.to_stata(bio, write_index=False)
        response_bytes = bio.getvalue()
        self.write_back_bytes(response_bytes)


class AllHeaderCSVResponder(http.server.BaseHTTPRequestHandler):
    """
    Send all request headers back for checking round trip.
    """

    def do_GET(self):
        response_df = pd.DataFrame(self.headers.items())
        self.send_response(200)
        self.send_header("Content-Type", "text/csv")
        self.end_headers()
        response_bytes = response_df.to_csv(index=False).encode("utf-8")
        self.wfile.write(response_bytes)


def wait_until_ready(func):
    """
    Wrap ``func`` so calls are retried until the http server accepts them.
    """

    def inner(*args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs)
            except urllib.error.URLError:
                # Connection refused as http server is starting
                time.sleep(0.1)

    return inner
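

# Example (illustrative names): wrapping pd.read_csv yields a reader that
# retries until the server from the fixture below accepts connections:
#   read_csv_when_ready = wait_until_ready(pd.read_csv)
#   df = read_csv_when_ready(f"http://localhost:{port}")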


def process_server(responder, port):
    """
    Run a server that handles exactly one request, then shuts down.
    """
    with http.server.HTTPServer(("localhost", port), responder) as server:
        server.handle_request()
        server.server_close()


@pytest.fixture
def responder(request):
    """
    Fixture that starts a local http server in a separate process on localhost
    and returns the port.

    Running in a separate process instead of a thread allows the http server
    to be terminated/killed upon cleanup.
    """
    # Find an available port
    with socket.socket() as sock:
        sock.bind(("localhost", 0))
        port = sock.getsockname()[1]

    server_process = multiprocessing.Process(
        target=process_server, args=(request.param, port)
    )
    server_process.start()
    yield port
    # wait for a graceful exit, then escalate: terminate, and after roughly
    # 5 more seconds of polling, kill
    server_process.join(10)
    server_process.terminate()
    kill_time = 5
    wait_time = 0
    while server_process.is_alive():
        if wait_time > kill_time:
            server_process.kill()
            break
        else:
            wait_time += 0.1
            time.sleep(0.1)
    server_process.close()
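

# The tests below use indirect=["responder"]: pytest hands each responder
# class to the fixture via request.param, so every parametrized case gets a
# fresh single-request server on its own port.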
@pytest.mark.parametrize(
    "responder, read_method, parquet_engine",
    [
        (CSVUserAgentResponder, pd.read_csv, None),
        (JSONUserAgentResponder, pd.read_json, None),
        (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
        pytest.param(
            ParquetFastParquetUserAgentResponder,
            pd.read_parquet,
            "fastparquet",
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (PickleUserAgentResponder, pd.read_pickle, None),
        (StataUserAgentResponder, pd.read_stata, None),
        (GzippedCSVUserAgentResponder, pd.read_csv, None),
        (GzippedJSONUserAgentResponder, pd.read_json, None),
    ],
    indirect=["responder"],
)
def test_server_and_default_headers(responder, read_method, parquet_engine):
    if parquet_engine is not None:
        pytest.importorskip(parquet_engine)
        if parquet_engine == "fastparquet":
            pytest.importorskip("fsspec")

    read_method = wait_until_ready(read_method)
    if parquet_engine is None:
        df_http = read_method(f"http://localhost:{responder}")
    else:
        df_http = read_method(f"http://localhost:{responder}", engine=parquet_engine)
    assert not df_http.empty


@pytest.mark.parametrize(
    "responder, read_method, parquet_engine",
    [
        (CSVUserAgentResponder, pd.read_csv, None),
        (JSONUserAgentResponder, pd.read_json, None),
        (ParquetPyArrowUserAgentResponder, pd.read_parquet, "pyarrow"),
        pytest.param(
            ParquetFastParquetUserAgentResponder,
            pd.read_parquet,
            "fastparquet",
            # TODO(ArrayManager) fastparquet
            marks=[
                td.skip_array_manager_not_yet_implemented,
            ],
        ),
        (PickleUserAgentResponder, pd.read_pickle, None),
        (StataUserAgentResponder, pd.read_stata, None),
        (GzippedCSVUserAgentResponder, pd.read_csv, None),
        (GzippedJSONUserAgentResponder, pd.read_json, None),
    ],
    indirect=["responder"],
)
def test_server_and_custom_headers(responder, read_method, parquet_engine):
    if parquet_engine is not None:
        pytest.importorskip(parquet_engine)
        if parquet_engine == "fastparquet":
            pytest.importorskip("fsspec")

    custom_user_agent = "Super Cool One"
    df_true = pd.DataFrame({"header": [custom_user_agent]})

    read_method = wait_until_ready(read_method)
    if parquet_engine is None:
        df_http = read_method(
            f"http://localhost:{responder}",
            storage_options={"User-Agent": custom_user_agent},
        )
    else:
        df_http = read_method(
            f"http://localhost:{responder}",
            storage_options={"User-Agent": custom_user_agent},
            engine=parquet_engine,
        )

    tm.assert_frame_equal(df_true, df_http)


@pytest.mark.parametrize(
    "responder, read_method",
    [
        (AllHeaderCSVResponder, pd.read_csv),
    ],
    indirect=["responder"],
)
def test_server_and_all_custom_headers(responder, read_method):
    custom_user_agent = "Super Cool One"
    custom_auth_token = "Super Secret One"
    storage_options = {
        "User-Agent": custom_user_agent,
        "Auth": custom_auth_token,
    }
    read_method = wait_until_ready(read_method)
    df_http = read_method(
        f"http://localhost:{responder}",
        storage_options=storage_options,
    )

    # the responder echoes back a two-column frame of (header name, value)
    # pairs; the CSV round trip stringifies the positional labels to "0"/"1"
    df_http = df_http[df_http["0"].isin(storage_options.keys())]
    df_http = df_http.sort_values(["0"]).reset_index()
    df_http = df_http[["0", "1"]]

    keys = list(storage_options.keys())
    df_true = pd.DataFrame({"0": keys, "1": [storage_options[k] for k in keys]})
    df_true = df_true.sort_values(["0"])
    df_true = df_true.reset_index().drop(["index"], axis=1)

    tm.assert_frame_equal(df_true, df_http)


@pytest.mark.parametrize(
    "engine",
    [
        "pyarrow",
        "fastparquet",
    ],
)
def test_to_parquet_to_disk_with_storage_options(engine):
    headers = {
        "User-Agent": "custom",
        "Auth": "other_custom",
    }

    pytest.importorskip(engine)

    true_df = pd.DataFrame({"column_name": ["column_value"]})
    msg = (
        "storage_options passed with file object or non-fsspec file path|"
        "storage_options passed with buffer, or non-supported URL"
    )
    with pytest.raises(ValueError, match=msg):
        true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine)
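

# To run just this module locally (invocation is illustrative):
#   pytest pandas/tests/io/test_user_agent.py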