[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()


-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    #   def test_metrics_exist(use_v1, server, client):
-    #       ...
-    #       expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #       for metric in expected:
-    #           assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
-    yield request.param
-
-
 @pytest.fixture(scope="module")
 def default_server_args():
     return [
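For context on what was deleted: the use_v1 fixture leaned on pytest's module-scoped parametrization, where every test in the module runs once per parameter value and a per-function marker such as @skip_v1 cannot reach the fixture. A minimal sketch of that pattern, with illustrative names (backend, test_greeting) that are not from the vLLM test file:

```python
import pytest


@pytest.fixture(scope="module", params=["v0", "v1"])
def backend(request):
    # Module-scoped: the module's tests are run once per param value,
    # so a per-function decorator cannot switch this off for one test.
    yield request.param


def test_greeting(backend):
    # Tests can branch on the fixture value, which is how the removed
    # use_v1 fixture was meant to be consumed.
    expected = {"v0": "hello v0", "v1": "hello v1"}[backend]
    assert expected.endswith(backend)
```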
@@ -63,13 +45,11 @@ def default_server_args():
         f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ],
 )
-def server(use_v1, default_server_args, request):
+def server(default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
         yield remote_server
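After the change, the server fixture no longer threads a VLLM_USE_V1 override through env_dict; it only appends the optional CLI flag carried in request.param and starts the server. The flag-appending pattern in isolation looks roughly like the sketch below, where launch is a stand-in context manager rather than the real RemoteOpenAIServer, and the default args and version value are placeholders:

```python
import contextlib

import pytest


@contextlib.contextmanager
def launch(model, args):
    # Stand-in for RemoteOpenAIServer: pretend to start a server with the
    # given CLI args and yield a handle the tests can poke at.
    yield {"model": model, "args": list(args)}


@pytest.fixture(scope="module", params=["", "--show-hidden-metrics-for-version=0.9"])
def server(request):
    default_server_args = ["--max-model-len", "1024"]
    if request.param:
        # Each parametrization may append one extra flag, mirroring the diff.
        default_server_args.append(request.param)
    with launch("TinyLlama/TinyLlama-1.1B-Chat-v1.0", default_server_args) as s:
        yield s


def test_flag_is_applied(server):
    assert "--max-model-len" in server["args"]
```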
@@ -129,7 +109,8 @@ EXPECTED_VALUES = {

 @pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
@@ -145,7 +126,7 @@ async def test_metrics_counts(

     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+        if (metric_family not in EXPECTED_METRICS_V1) or (
             not server.show_hidden_metrics
             and metric_family in HIDDEN_DEPRECATED_METRICS
         ):
@@ -183,62 +164,6 @@ async def test_metrics_counts(
         assert found_metric, f"Did not find {metric_family} in prom endpoint"


-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [

 @pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     # sending a request triggers the metrics to be logged.
     await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
     )

     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK

-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    for metric in EXPECTED_METRICS_V1:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
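The updated test consults only EXPECTED_METRICS_V1 and skips hidden deprecated families unless the server exposes them via --show-hidden-metrics-for-version. Outside the test harness, the same presence check can be sketched against any running /metrics endpoint; the metric names below come from the diff, while the helper name and base URL are assumptions:

```python
from http import HTTPStatus

import requests

EXPECTED = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
]


def metrics_present(base_url: str) -> bool:
    # Scrape the Prometheus exposition text and confirm every expected
    # family name appears somewhere in it.
    response = requests.get(f"{base_url}/metrics")
    assert response.status_code == HTTPStatus.OK
    return all(name in response.text for name in EXPECTED)


# Example usage against a locally running server (assumed address):
# metrics_present("http://localhost:8000")
```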
@@ -322,10 +251,11 @@ async def test_metrics_exist(

 @pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
     )

     # Expect no running requests or kvcache usage
@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(

     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
     )

     # Expect running requests and kvcache usage
@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(

     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
     )

     assert running_requests_after == 0, (
@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
     )


-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""

     response = requests.get(server.url_for("metrics"))
@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None

-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"

     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
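_get_running_metrics_from_api goes one step further and reads numeric sample values out of the exposition text instead of substring-matching. A standalone sketch of that parsing step using prometheus_client; the payload is invented for illustration, while the family names match the test:

```python
from prometheus_client.parser import text_string_to_metric_families

# Invented exposition text standing in for a real /metrics response.
METRICS_TEXT = """\
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{model_name="m"} 2.0
# TYPE vllm:kv_cache_usage_perc gauge
vllm:kv_cache_usage_perc{model_name="m"} 0.25
"""


def read_gauge(text: str, name: str) -> float | None:
    # Walk every family and sample, returning the first value whose
    # sample name matches the requested gauge.
    for family in text_string_to_metric_families(text):
        for sample in family.samples:
            if sample.name == name:
                return sample.value
    return None


print(read_gauge(METRICS_TEXT, "vllm:kv_cache_usage_perc"))  # 0.25
```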
@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     return running_requests, waiting_requests, kv_cache_usage


-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
     input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501

     base_url = "0.0.0.0"
@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
             "--port",
             port,
         ],
-        env={"VLLM_USE_V1": "1"},
     )

     def is_server_up(url):
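The last hunk drops the env={"VLLM_USE_V1": "1"} override from the run_batch launch; with the argument gone, the child process simply inherits the parent's environment. A tiny illustration of that subprocess behaviour, with a placeholder command rather than the real vllm invocation:

```python
import os
import subprocess
import sys

# When env is omitted, the child inherits os.environ wholesale, so no
# VLLM_USE_V1-style override has to be threaded through explicitly.
os.environ["DEMO_FLAG"] = "1"
out = subprocess.run(
    [sys.executable, "-c", "import os; print(os.environ.get('DEMO_FLAG'))"],
    capture_output=True,
    text=True,
)
print(out.stdout.strip())  # "1"
```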