From a8a47c17b68fbd4229a86cc1d4202ebc94bdb9fe Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sun, 22 Feb 2026 03:03:44 -0600 Subject: [PATCH] [ROCm][CI] Fix flaky embedding chat test by using tolerance-based comparison (#35050) Signed-off-by: Andreas Karatzas --- .../entrypoints/pooling/embed/test_online.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py index d2a5974b7..89341670c 100644 --- a/tests/entrypoints/pooling/embed/test_online.py +++ b/tests/entrypoints/pooling/embed/test_online.py @@ -58,13 +58,19 @@ if current_platform.is_rocm(): torch.backends.cuda.enable_mem_efficient_sdp(False) torch.backends.cuda.enable_math_sdp(True) +# On ROCm, floating-point reductions in attention and GEMM kernels are +# non-associative and sensitive to batch geometry. Force LLM instances +# into an identical, deterministic execution mode: +ROCM_DETERMINISM_ARGS: list[str] = ( + ["--max-num-seqs", "1"] if current_platform.is_rocm() else [] +) + @pytest.fixture(scope="module") def server(): args = [ "--runner", "pooling", - # use half precision for speed and memory savings in CI environment "--dtype", DTYPE, "--enforce-eager", @@ -72,12 +78,9 @@ def server(): "512", "--chat-template", DUMMY_CHAT_TEMPLATE, + *ROCM_DETERMINISM_ARGS, ] - # ROCm: Use Flex Attention to support encoder-only self-attention. - if current_platform.is_rocm(): - args.extend(["--attention-backend", "FLEX_ATTENTION"]) - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -343,8 +346,15 @@ async def test_chat_request( assert chat_embeddings.id is not None assert completion_embeddings.id is not None assert chat_embeddings.created <= completion_embeddings.created - assert chat_embeddings.model_dump(exclude={"id", "created"}) == ( - completion_embeddings.model_dump(exclude={"id", "created"}) + # Use tolerance-based comparison for embeddings + check_embeddings_close( + embeddings_0_lst=[d.embedding for d in chat_embeddings.data], + embeddings_1_lst=[d.embedding for d in completion_embeddings.data], + name_0="chat", + name_1="completion", + ) + assert chat_embeddings.model_dump(exclude={"id", "created", "data"}) == ( + completion_embeddings.model_dump(exclude={"id", "created", "data"}) ) # test add_generation_prompt