[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Author: Cyrus Leung
Date: 2025-10-07 23:42:31 +08:00
Committed by: GitHub
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Parent: c0a7b89d8e
Commit: 1e4ecca1d0

51 changed files with 817 additions and 1275 deletions
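With V0 gone, these tests no longer parametrize on a `use_v1` fixture or set `VLLM_USE_V1`; they assume the V1 engine throughout. A minimal sketch of the resulting fixture shape (the real fixture is additionally parametrized over extra server CLI args, as the diff below shows; `RemoteOpenAIServer` and `MODEL_NAME` come from the test module):

@pytest.fixture(scope="module")
def server(default_server_args):
    # No VLLM_USE_V1 env override: the server always runs the V1 engine.
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server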

@@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()
 
 
-@pytest.fixture(scope="module", params=[True])
-def use_v1(request):
-    # Module-scoped variant of run_with_both_engines
-    #
-    # Use this fixture to run a test with both v0 and v1, and
-    # also to conditionalize the test logic e.g.
-    #
-    #   def test_metrics_exist(use_v1, server, client):
-    #       ...
-    #       expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
-    #       for metric in expected:
-    #           assert metric in response.text
-    #
-    # @skip_v1 wouldn't work here because this is a module-level
-    # fixture - per-function decorators would have no effect
-    yield request.param
-
-
 @pytest.fixture(scope="module")
 def default_server_args():
     return [
@@ -63,13 +45,11 @@ def default_server_args():
         f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
     ],
 )
-def server(use_v1, default_server_args, request):
+def server(default_server_args, request):
     if request.param:
         default_server_args.append(request.param)
-    env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
-    with RemoteOpenAIServer(
-        MODEL_NAME, default_server_args, env_dict=env_dict
-    ) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
         yield remote_server
@@ -129,7 +109,8 @@ EXPECTED_VALUES = {
 
 @pytest.mark.asyncio
 async def test_metrics_counts(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     for _ in range(_NUM_REQUESTS):
         # sending a request triggers the metrics to be logged.
@@ -145,7 +126,7 @@ async def test_metrics_counts(
 
     # Loop over all expected metric_families
     for metric_family, suffix_values_list in EXPECTED_VALUES.items():
-        if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
+        if (metric_family not in EXPECTED_METRICS_V1) or (
            not server.show_hidden_metrics
            and metric_family in HIDDEN_DEPRECATED_METRICS
        ):
@@ -183,62 +164,6 @@ async def test_metrics_counts(
     assert found_metric, f"Did not find {metric_family} in prom endpoint"
 
 
-EXPECTED_METRICS = [
-    "vllm:num_requests_running",
-    "vllm:num_requests_waiting",
-    "vllm:gpu_cache_usage_perc",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
-    "vllm:e2e_request_latency_seconds_sum",
-    "vllm:e2e_request_latency_seconds_bucket",
-    "vllm:e2e_request_latency_seconds_count",
-    "vllm:request_queue_time_seconds_sum",
-    "vllm:request_queue_time_seconds_bucket",
-    "vllm:request_queue_time_seconds_count",
-    "vllm:request_inference_time_seconds_sum",
-    "vllm:request_inference_time_seconds_bucket",
-    "vllm:request_inference_time_seconds_count",
-    "vllm:request_prefill_time_seconds_sum",
-    "vllm:request_prefill_time_seconds_bucket",
-    "vllm:request_prefill_time_seconds_count",
-    "vllm:request_decode_time_seconds_sum",
-    "vllm:request_decode_time_seconds_bucket",
-    "vllm:request_decode_time_seconds_count",
-    "vllm:request_prompt_tokens_sum",
-    "vllm:request_prompt_tokens_bucket",
-    "vllm:request_prompt_tokens_count",
-    "vllm:request_generation_tokens_sum",
-    "vllm:request_generation_tokens_bucket",
-    "vllm:request_generation_tokens_count",
-    "vllm:request_params_n_sum",
-    "vllm:request_params_n_bucket",
-    "vllm:request_params_n_count",
-    "vllm:request_params_max_tokens_sum",
-    "vllm:request_params_max_tokens_bucket",
-    "vllm:request_params_max_tokens_count",
-    "vllm:iteration_tokens_total",
-    "vllm:num_preemptions_total",
-    "vllm:prompt_tokens_total",
-    "vllm:generation_tokens_total",
-    "vllm:request_success_total",
-    "vllm:cache_config_info",
-    # labels in cache_config_info
-    "block_size",
-    "cache_dtype",
-    "cpu_offload_gb",
-    "enable_prefix_caching",
-    "gpu_memory_utilization",
-    "num_cpu_blocks",
-    "num_gpu_blocks",
-    "num_gpu_blocks_override",
-    "sliding_window",
-    "swap_space_bytes",
-]
-
 EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
@@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
 
 @pytest.mark.asyncio
 async def test_metrics_exist(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     # sending a request triggers the metrics to be logged.
     await client.completions.create(
-        model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
+        model=MODEL_NAME,
+        prompt="Hello, my name is",
+        max_tokens=5,
+        temperature=0.0,
     )
 
     response = requests.get(server.url_for("metrics"))
     assert response.status_code == HTTPStatus.OK
 
-    for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
+    for metric in EXPECTED_METRICS_V1:
         if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
             continue
         assert metric in response.text
@@ -322,10 +251,11 @@ async def test_metrics_exist(
 
 @pytest.mark.asyncio
 async def test_abort_metrics_reset(
-    server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
+    server: RemoteOpenAIServer,
+    client: openai.AsyncClient,
 ):
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
     )
 
     # Expect no running requests or kvcache usage
@@ -351,7 +281,7 @@ async def test_abort_metrics_reset(
 
     # Check that we have running requests
     running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
-        server, use_v1
+        server
    )
 
     # Expect running requests and kvcache usage
@@ -371,7 +301,7 @@ async def test_abort_metrics_reset(
 
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests_after, waiting_requests_after, kv_cache_usage_after = (
-        _get_running_metrics_from_api(server, use_v1)
+        _get_running_metrics_from_api(server)
     )
 
     assert running_requests_after == 0, (
@@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
     )
 
 
-def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
+def _get_running_metrics_from_api(server: RemoteOpenAIServer):
     """Return (running_count, waiting_count, kv_cache_usage)"""
 
     response = requests.get(server.url_for("metrics"))
@@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     # Verify running and waiting requests counts and KV cache usage are zero
     running_requests, waiting_requests, kv_cache_usage = None, None, None
 
-    kv_cache_usage_metric = (
-        "vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
-    )
+    kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
 
     for family in text_string_to_metric_families(response.text):
         if family.name == "vllm:num_requests_running":
@@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
     return running_requests, waiting_requests, kv_cache_usage
 
 
-def test_metrics_exist_run_batch(use_v1: bool):
+def test_metrics_exist_run_batch():
     input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}"""  # noqa: E501
 
     base_url = "0.0.0.0"
@@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
             "--port",
             port,
         ],
-        env={"VLLM_USE_V1": "1"},
     )
 
     def is_server_up(url):
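
For reference, the gauge scraping these tests rely on follows the standard prometheus_client parsing pattern. A self-contained sketch, assuming only the metric names visible in the diff above (`read_gauges` is an illustrative helper, not part of the test file):

import requests
from prometheus_client.parser import text_string_to_metric_families


def read_gauges(metrics_url: str) -> dict[str, float]:
    # Scrape the /metrics endpoint and pull out the V1 gauges the tests
    # assert on (running/waiting request counts and KV cache usage).
    wanted = {
        "vllm:num_requests_running",
        "vllm:num_requests_waiting",
        "vllm:kv_cache_usage_perc",
    }
    response = requests.get(metrics_url)
    response.raise_for_status()

    values: dict[str, float] = {}
    for family in text_string_to_metric_families(response.text):
        if family.name in wanted:
            # These gauges carry a single sample each, so take its value.
            values[family.name] = family.samples[0].value
    return values

In the tests themselves the equivalent lookup is done by `_get_running_metrics_from_api(server)`, which reads the same endpoint via `server.url_for("metrics")`.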