[ROCm][CI] Fix tool use test stability - disable skinny GEMM, prefix caching, eliminate batch variance (#35553)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -9,14 +9,13 @@ import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio

 from vllm.platforms import current_platform

-# downloading lora to test lora requests
-from ...utils import RemoteOpenAIServer
+from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer

 # any model with a chat template should work here
 MODEL_NAME = "Qwen/Qwen3-0.6B"


 tools = [
     {
         "type": "function",
@@ -142,19 +141,11 @@ def server():
         "--gpu-memory-utilization",
         "0.4",
         "--enforce-eager",
-    ]
-
-    rocm_args = {
-        "--max-num-seqs": "1",
-        "--no-enable-prefix-caching": None,
-    }
-    if current_platform.is_rocm():
-        for k, v in rocm_args.items():
-            args.append(k)
-            if v is not None:
-                args.append(v)
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    ] + ROCM_EXTRA_ARGS
+
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, env_dict=ROCM_ENV_OVERRIDES
+    ) as remote_server:
         yield remote_server
@@ -239,12 +230,13 @@ def k2_server():
         "qwen3",
         "--gpu-memory-utilization",
         "0.4",
-    ]
+    ] + ROCM_EXTRA_ARGS
     # hack to test kimi_k2 tool use tool_id format.
     # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
     with RemoteOpenAIServer(
         MODEL_NAME,
         args,
+        env_dict=ROCM_ENV_OVERRIDES,
         override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
     ) as remote_server:
         yield remote_server
Reference in New Issue
Block a user