From 22dffca9822987f0e912bfd9635e94bbdd05def3 Mon Sep 17 00:00:00 2001
From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Date: Tue, 6 Jan 2026 21:32:46 +0400
Subject: [PATCH] [PERF] Speed-up of GDN attention decode part (Qwen3-Next) (#31722)

Signed-off-by: Vadim Gimpelson
---
 vllm/model_executor/layers/fla/ops/fused_recurrent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/fla/ops/fused_recurrent.py b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
index 0f2750478..91b07129d 100644
--- a/vllm/model_executor/layers/fla/ops/fused_recurrent.py
+++ b/vllm/model_executor/layers/fla/ops/fused_recurrent.py
@@ -189,7 +189,7 @@ def fused_recurrent_gated_delta_rule_fwd(
     B, T, H, K, V = *k.shape, v.shape[-1]
     HV = v.shape[2]
     N = B if cu_seqlens is None else len(cu_seqlens) - 1
-    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 32)
     NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
     assert NK == 1, "NK > 1 is not supported yet"
     num_stages = 3