From e27078ea807cf5655c6be3b59608e55c1682756d Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Thu, 15 Jan 2026 03:32:48 +0800
Subject: [PATCH] [Bugfix][ROCm][performance] Resolve the performance regression of Qwen3-Next-80B-A3B-Thinking under rocm_attn (#32336)

Signed-off-by: vllmellm
---
 vllm/v1/attention/backends/rocm_attn.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 6ec6825cc..73747aaed 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -167,7 +167,16 @@ class RocmAttentionBackend(AttentionBackend):
         # ROCM paged attention kernel only supports block sizes 16 and 32
         # due to shared memory (LDS) constraints on AMD GPUs.
         # See csrc/rocm/attention.cu CALL_CUSTOM_LAUNCHER_BLK macro.
-        return [16, 32]
+
+        # However, the [16, 32] limit only applies to the native C++ kernel;
+        # vLLM should also support non-standard block sizes via the Triton
+        # path, as addressed in https://github.com/vllm-project/vllm/pull/31380,
+        # where the Triton kernel under rocm_attn did not support inference
+        # for the non-standard Qwen3-Next model with a block_size of 544.
+        # The Triton kernel has been fixed so that the standard block sizes
+        # keep the original bit-addressing logic, while the non-standard size
+        # uses the optimized kernel logic.
+        return [16, 32, 544]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
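
Note: the sketch below is illustrative only and is not code from vllm/v1/attention/backends/rocm_attn.py or the actual Triton kernel. It assumes that the comment's distinction between the original "bit-addressing logic" for the standard sizes and the path taken for non-standard sizes comes down to shift/mask indexing (only valid for power-of-two block sizes such as 16 and 32) versus generic div/mod indexing (needed for a size like 544). The helper name slot_for_token is hypothetical.

# Illustrative sketch, not the vLLM kernel: maps a logical token position
# to a (block_index, offset_in_block) pair for a given KV-cache block size.
def slot_for_token(token_pos: int, block_size: int) -> tuple[int, int]:
    if block_size & (block_size - 1) == 0:
        # Power-of-two block size (16, 32): shifts and masks suffice,
        # which is what the original "bit-addressing" logic relies on.
        shift = block_size.bit_length() - 1
        return token_pos >> shift, token_pos & (block_size - 1)
    # Non-power-of-two block size (e.g. 544 for Qwen3-Next): fall back to
    # integer division and modulo, as a generic addressing path must.
    return token_pos // block_size, token_pos % block_size


if __name__ == "__main__":
    # Same logical position, three block sizes: the first two take the
    # shift/mask branch, the last takes the div/mod branch.
    for bs in (16, 32, 544):
        print(bs, slot_for_token(1234, bs))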