From eb1629da2496fe351251abe1905f88021fbc3509 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Wed, 21 Jan 2026 23:55:25 -0600 Subject: [PATCH] [ROCm][CI] Fix AITER test flakiness by using explicit attention backend (#32346) Signed-off-by: Andreas Karatzas Signed-off-by: Matthew Wong Co-authored-by: Matthew Wong --- .buildkite/test-amd.yaml | 2 +- tests/models/language/generation/test_common.py | 6 +++++- .../E=8,N=3584,device_name=AMD_Instinct_MI325X.json | 8 ++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e383c10dd..e87aed027 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -866,7 +866,7 @@ steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 5a90cb85f..1425bb044 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -160,8 +160,12 @@ def test_models( tokenizer_name=model_info.tokenizer or model, tokenizer_mode=model_info.tokenizer_mode, trust_remote_code=model_info.trust_remote_code, - max_num_seqs=2, + # Remove the effects of batch variance on ROCm since batch invariance + # is not yet supported. + # See: https://github.com/vllm-project/vllm/issues/27433 + max_num_seqs=1 if current_platform.is_rocm() else 2, enable_prompt_embeds=use_prompt_embeds, + compilation_config={"cudagraph_capture_sizes": [1, 2]}, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json index de9d0aba7..7dbc0f888 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json @@ -45,14 +45,14 @@ }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, + "num_warps": 2, "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { "BLOCK_SIZE_M": 16,