diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py
index aa56dfd6b..6cd4fd7a1 100644
--- a/tests/entrypoints/openai/test_serving_tokens.py
+++ b/tests/entrypoints/openai/test_serving_tokens.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
+
 import httpx
 import pytest
 import pytest_asyncio
@@ -46,6 +48,27 @@ def server(request):
         "--max-model-len",
         "1024",
         "--enforce-eager",
+        # On ROCm (e.g. MI355X/gfx950), bf16 GEMM results can differ by
+        # 1 ULP when the batch dimension (M) changes, because different M
+        # values cause the Tensile backend to select different tile
+        # configurations with different fp32 accumulation orders. With
+        # prefix caching, cache-miss prefills compute all tokens in one
+        # pass (large M) while cache-hit requests compute only the
+        # uncached suffix (small M), seeding a divergence that amplifies
+        # through the residual stream and flips argmax tokens.
+        # See: https://github.com/vllm-project/vllm/issues/33123
+        #
+        # Either disable prefix caching entirely, or enable it with
+        # --deterministic-prefix-caching which forces cache-miss prefills
+        # to split at block boundaries so the suffix GEMM shape is always
+        # identical regardless of cache state.
+        #
+        # Option A: disable prefix caching
+        "--no-enable-prefix-caching",
+        #
+        # Option B: deterministic prefix caching
+        # "--enable-prefix-caching",
+        # "--deterministic-prefix-caching",
     ]
 
     extra_args = getattr(request, "param", None)
@@ -56,7 +79,11 @@ def server(request):
             else [str(extra_args)]
         )
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    envs = os.environ.copy()
+    # See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3888060787
+    envs["VLLM_ROCM_USE_SKINNY_GEMM"] = "0"
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
         yield remote_server