[Perf] fuse kernels in gdn (#37813)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
This commit is contained in:
209
tests/kernels/test_fused_gdn_post_conv.py
Normal file
209
tests/kernels/test_fused_gdn_post_conv.py
Normal file
@@ -0,0 +1,209 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for fused_gdn_prefill_post_conv kernel.

Verifies that the fused kernel matches the reference:
split → rearrange → contiguous → l2norm → gating
"""

import pytest
import torch
import torch.nn.functional as F

from vllm.model_executor.layers.fla.ops.fused_gdn_prefill_post_conv import (
    fused_post_conv_prep,
)

def reference_post_conv(
    conv_output: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    A_log: torch.Tensor,
    dt_bias: torch.Tensor,
    H: int,
    K: int,
    V: int,
    apply_l2norm: bool = True,
    output_g_exp: bool = False,
):
    """Reference implementation using individual ops.

    Splits the packed conv output into per-head q/k/v, optionally
    L2-normalizes q and k in fp32, and derives the gating terms
    ``g`` (decay) and ``beta`` (update gate) from ``a``/``b``.
    """
    num_tokens = conv_output.shape[0]
    num_v_heads = A_log.shape[0]

    # Gating is independent of the q/k/v split, so compute it up front.
    softplus_in = a.float() + dt_bias.float()
    g = -torch.exp(A_log.float()) * F.softplus(softplus_in, beta=1.0, threshold=20.0)
    if output_g_exp:
        g = g.exp()
    beta_out = b.float().sigmoid()

    # Slice the packed projection and restore the per-head layout.
    qk_width = H * K
    q = conv_output[:, :qk_width].reshape(num_tokens, H, K).contiguous()
    k = conv_output[:, qk_width : 2 * qk_width].reshape(num_tokens, H, K).contiguous()
    v = conv_output[:, 2 * qk_width :].reshape(num_tokens, num_v_heads, V).contiguous()

    # L2-normalize q/k head vectors in fp32, then cast back to input dtype.
    if apply_l2norm:
        q = F.normalize(q.float(), p=2, dim=-1, eps=1e-6).to(conv_output.dtype)
        k = F.normalize(k.float(), p=2, dim=-1, eps=1e-6).to(conv_output.dtype)

    return q, k, v, g, beta_out
# Qwen3.5-35B config: H=16, HV=32, K=128, V=128
# Qwen3.5-397B config: H=16, HV=64, K=128, V=128
@pytest.mark.parametrize(
    "H, HV, K, V",
    [
        (16, 32, 128, 128),  # 35B
        (16, 64, 128, 128),  # 397B
        (4, 8, 64, 64),  # small
    ],
)
@pytest.mark.parametrize("L", [1, 16, 128, 512, 2048])
@pytest.mark.parametrize("apply_l2norm", [True, False])
@pytest.mark.parametrize("output_g_exp", [True, False])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_fused_post_conv_correctness(H, HV, K, V, L, apply_l2norm, output_g_exp, dtype):
    """Test fused kernel matches reference for all configs."""
    torch.manual_seed(42)
    device = "cuda"
    qkv_dim = 2 * H * K + HV * V

    # Random inputs with the packed [q | k | v] layout on the last dim.
    conv_output = torch.randn(L, qkv_dim, dtype=dtype, device=device)
    a = torch.randn(L, HV, dtype=dtype, device=device)
    b = torch.randn(L, HV, dtype=dtype, device=device)
    A_log = torch.randn(HV, dtype=torch.float32, device=device) - 2.0
    dt_bias = torch.randn(HV, dtype=torch.float32, device=device) * 0.1

    # Reference path: individual torch ops.
    ref_q, ref_k, ref_v, ref_g, ref_beta = reference_post_conv(
        conv_output, a, b, A_log, dt_bias, H, K, V, apply_l2norm, output_g_exp
    )

    # Fused Triton kernel path.
    fused_q, fused_k, fused_v, fused_g, fused_beta = fused_post_conv_prep(
        conv_output,
        a,
        b,
        A_log,
        dt_bias,
        num_k_heads=H,
        head_k_dim=K,
        head_v_dim=V,
        apply_l2norm=apply_l2norm,
        output_g_exp=output_g_exp,
    )

    # Shapes.
    assert fused_q.shape == (L, H, K), f"q shape: {fused_q.shape}"
    assert fused_k.shape == (L, H, K), f"k shape: {fused_k.shape}"
    assert fused_v.shape == (L, HV, V), f"v shape: {fused_v.shape}"
    assert fused_g.shape == (L, HV), f"g shape: {fused_g.shape}"
    assert fused_beta.shape == (L, HV), f"beta shape: {fused_beta.shape}"

    # Dtypes and contiguity: q/k/v keep the input dtype and must be
    # contiguous; g/beta are computed and returned in fp32.
    for out in (fused_q, fused_k, fused_v):
        assert out.dtype == dtype
        assert out.is_contiguous()
    for out in (fused_g, fused_beta):
        assert out.dtype == torch.float32

    # Values: looser tolerance for q/k when l2norm is applied (fp32
    # normalization of bf16 inputs accumulates more rounding error).
    tol_qk = 1e-2 if apply_l2norm else 1e-3
    torch.testing.assert_close(fused_q, ref_q, atol=tol_qk, rtol=tol_qk)
    torch.testing.assert_close(fused_k, ref_k, atol=tol_qk, rtol=tol_qk)
    torch.testing.assert_close(fused_v, ref_v, atol=1e-3, rtol=1e-3)
    torch.testing.assert_close(fused_g, ref_g, atol=1e-4, rtol=1e-4)
    torch.testing.assert_close(fused_beta, ref_beta, atol=1e-4, rtol=1e-4)
@pytest.mark.parametrize("L", [1, 64, 256])
def test_fused_post_conv_sanity(L):
    """Sanity checks: no NaN, unit-norm q/k, beta in (0,1)."""
    torch.manual_seed(0)
    device = "cuda"
    H, HV, K, V = 16, 32, 128, 128
    qkv_dim = 2 * H * K + HV * V

    conv_output = torch.randn(L, qkv_dim, dtype=torch.bfloat16, device=device)
    a = torch.randn(L, HV, dtype=torch.bfloat16, device=device)
    b = torch.randn(L, HV, dtype=torch.bfloat16, device=device)
    A_log = torch.randn(HV, dtype=torch.float32, device=device) - 2.0
    dt_bias = torch.randn(HV, dtype=torch.float32, device=device)

    q, k, v, g, beta = fused_post_conv_prep(
        conv_output,
        a,
        b,
        A_log,
        dt_bias,
        num_k_heads=H,
        head_k_dim=K,
        head_v_dim=V,
    )

    # No NaNs in any output.
    for name, out in (("q", q), ("k", k), ("v", v), ("g", g), ("beta", beta)):
        assert not torch.isnan(out).any(), f"NaN in {name}"

    # l2norm is on by default, so every q/k head vector has unit norm.
    for out in (q, k):
        norms = torch.norm(out.float(), dim=-1)
        torch.testing.assert_close(
            norms, torch.ones_like(norms), atol=1e-3, rtol=1e-3
        )

    # Sigmoid output must stay within [0, 1] (bf16 may saturate to the
    # endpoints, hence the closed-interval check).
    assert (beta >= 0).all() and (beta <= 1).all(), "beta out of range"
def test_fused_post_conv_l0():
    """Test L=0 edge case.

    An empty token batch must not crash the kernel and must yield
    correctly-shaped empty outputs for all five tensors.
    """
    device = "cuda"
    H, HV, K, V = 16, 32, 128, 128
    qkv_dim = 2 * H * K + HV * V

    conv_output = torch.empty(0, qkv_dim, dtype=torch.bfloat16, device=device)
    a = torch.empty(0, HV, dtype=torch.bfloat16, device=device)
    b = torch.empty(0, HV, dtype=torch.bfloat16, device=device)
    A_log = torch.randn(HV, dtype=torch.float32, device=device)
    dt_bias = torch.randn(HV, dtype=torch.float32, device=device)

    q, k, v, g, beta = fused_post_conv_prep(
        conv_output,
        a,
        b,
        A_log,
        dt_bias,
        num_k_heads=H,
        head_k_dim=K,
        head_v_dim=V,
    )
    # Check all five outputs (the original only checked q and g), so a
    # mis-sized empty allocation in any tensor is caught.
    assert q.shape == (0, H, K)
    assert k.shape == (0, H, K)
    assert v.shape == (0, HV, V)
    assert g.shape == (0, HV)
    assert beta.shape == (0, HV)
|
||||
Reference in New Issue
Block a user