[ROCm][CI] Fix tool use test stability: disable skinny GEMM and prefix caching, eliminate batch variance (#35553)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-03-06 01:15:12 -06:00
parent 5afb387bd4
commit 807d680337
5 changed files with 33 additions and 17 deletions
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@@ -9,14 +9,13 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio

-from vllm.platforms import current_platform
-
 # downloading lora to test lora requests
-from ...utils import RemoteOpenAIServer
+from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer

 # any model with a chat template should work here
 MODEL_NAME = "Qwen/Qwen3-0.6B"

+
 tools = [
    {
        "type": "function",
@@ -142,19 +141,11 @@ def server():
        "--gpu-memory-utilization",
        "0.4",
        "--enforce-eager",
-    ]
+    ] + ROCM_EXTRA_ARGS

-    rocm_args = {
-        "--max-num-seqs": "1",
-        "--no-enable-prefix-caching": None,
-    }
-    if current_platform.is_rocm():
-        for k, v in rocm_args.items():
-            args.append(k)
-            if v is not None:
-                args.append(v)
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, env_dict=ROCM_ENV_OVERRIDES
+    ) as remote_server:
        yield remote_server


@@ -239,12 +230,13 @@ def k2_server():
        "qwen3",
        "--gpu-memory-utilization",
        "0.4",
-    ]
+    ] + ROCM_EXTRA_ARGS
    # hack to test kimi_k2 tool use tool_id format.
    # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
+        env_dict=ROCM_ENV_OVERRIDES,
        override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
    ) as remote_server:
        yield remote_server