From e27078ea807cf5655c6be3b59608e55c1682756d Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Thu, 15 Jan 2026 03:32:48 +0800
Subject: [PATCH] [Bugfix][ROCm][performance] Resolve the performance regression of Qwen3-Next-80B-A3B-Thinking under rocm_attn (#32336)

Signed-off-by: vllmellm
---
 vllm/v1/attention/backends/rocm_attn.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 6ec6825cc..73747aaed 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
         # ROCM paged attention kernel only supports block sizes 16 and 32
         # due to shared memory (LDS) constraints on AMD GPUs.
         # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
-        return [16, 32]
+
+        # However, the [16, 32] limit only applies to the native C++ kernel;
+        # vLLM should also support non-standard block sizes via the Triton
+        # path, as addressed in https://github.com/vllm-project/vllm/pull/31380,
+        # where the Triton kernel under rocm_attn did not support inference
+        # for the non-standard Qwen3-Next model with a block_size of 544.
+        # The Triton kernel has been fixed so that the standard block sizes
+        # keep the original bit-addressing logic, while the non-standard size
+        # uses the optimized kernel logic.
+        return [16, 32, 544]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
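
Note: the sketch below is illustrative only and is not code from vllm/v1/attention/backends/rocm_attn.py or the actual Triton kernel. It assumes that the comment's distinction between the original "bit-addressing logic" for the standard sizes and the path taken for non-standard sizes comes down to shift/mask indexing (only valid for power-of-two block sizes such as 16 and 32) versus generic div/mod indexing (needed for a size like 544). The helper name slot_for_token is hypothetical.

# Illustrative sketch, not the vLLM kernel: maps a logical token position
# to a (block_index, offset_in_block) pair for a given KV-cache block size.
def slot_for_token(token_pos: int, block_size: int) -> tuple[int, int]:
    if block_size & (block_size - 1) == 0:
        # Power-of-two block size (16, 32): shifts and masks suffice,
        # which is what the original "bit-addressing" logic relies on.
        shift = block_size.bit_length() - 1
        return token_pos >> shift, token_pos & (block_size - 1)
    # Non-power-of-two block size (e.g. 544 for Qwen3-Next): fall back to
    # integer division and modulo, as a generic addressing path must.
    return token_pos // block_size, token_pos % block_size


if __name__ == "__main__":
    # Same logical position, three block sizes: the first two take the
    # shift/mask branch, the last takes the div/mod branch.
    for bs in (16, 32, 544):
        print(bs, slot_for_token(1234, bs))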