# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for fused_gdn_prefill_post_conv kernel.

Verifies that the fused kernel matches the reference:
split → rearrange → contiguous → l2norm → gating
"""

import pytest
import torch
import torch.nn.functional as F

from vllm.model_executor.layers.fla.ops.fused_gdn_prefill_post_conv import (
    fused_post_conv_prep,
)
def reference_post_conv(
    conv_output: torch.Tensor,
    a: torch.Tensor,
    b: torch.Tensor,
    A_log: torch.Tensor,
    dt_bias: torch.Tensor,
    H: int,
    K: int,
    V: int,
    apply_l2norm: bool = True,
    output_g_exp: bool = False,
):
    """Reference implementation built from individual torch ops.

    Splits the packed conv output into q/k/v segments, reshapes each into
    per-head layout, optionally L2-normalizes q and k (in fp32), and derives
    the gating terms g and beta from a/b using A_log and dt_bias.
    """
    seq_len = conv_output.shape[0]
    num_v_heads = A_log.shape[0]

    # Carve the packed projection into its q / k / v segments along the
    # feature dimension: [H*K | H*K | HV*V].
    qk_width = H * K
    q_seg = conv_output[:, :qk_width]
    k_seg = conv_output[:, qk_width : 2 * qk_width]
    v_seg = conv_output[:, 2 * qk_width :]

    # Per-head layout, materialized contiguously.
    q = q_seg.reshape(seq_len, H, K).contiguous()
    k = k_seg.reshape(seq_len, H, K).contiguous()
    v = v_seg.reshape(seq_len, num_v_heads, V).contiguous()

    if apply_l2norm:
        # Normalize in fp32 for accuracy, then cast back to the input dtype.
        q = F.normalize(q.float(), p=2, dim=-1, eps=1e-6).to(conv_output.dtype)
        k = F.normalize(k.float(), p=2, dim=-1, eps=1e-6).to(conv_output.dtype)

    # Gating: g = -exp(A_log) * softplus(a + dt_bias), computed in fp32.
    pre_act = a.float() + dt_bias.float()
    gate = F.softplus(pre_act, beta=1.0, threshold=20.0)
    g = -A_log.float().exp() * gate

    if output_g_exp:
        # Optionally return exp(g) instead of g itself.
        g = g.exp()

    beta_out = torch.sigmoid(b.float())

    return q, k, v, g, beta_out
# Qwen3.5-35B config: H=16, HV=32, K=128, V=128
# Qwen3.5-397B config: H=16, HV=64, K=128, V=128
@pytest.mark.parametrize(
    "H, HV, K, V",
    [
        (16, 32, 128, 128),  # 35B
        (16, 64, 128, 128),  # 397B
        (4, 8, 64, 64),  # small
    ],
)
@pytest.mark.parametrize("L", [1, 16, 128, 512, 2048])
@pytest.mark.parametrize("apply_l2norm", [True, False])
@pytest.mark.parametrize("output_g_exp", [True, False])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_fused_post_conv_correctness(H, HV, K, V, L, apply_l2norm, output_g_exp, dtype):
    """Fused kernel must agree with the op-by-op reference on every config."""
    torch.manual_seed(42)
    device = "cuda"
    qkv_dim = 2 * H * K + HV * V

    conv_output = torch.randn(L, qkv_dim, dtype=dtype, device=device)
    a = torch.randn(L, HV, dtype=dtype, device=device)
    b = torch.randn(L, HV, dtype=dtype, device=device)
    A_log = torch.randn(HV, dtype=torch.float32, device=device) - 2.0
    dt_bias = torch.randn(HV, dtype=torch.float32, device=device) * 0.1

    # Reference path: plain torch ops.
    ref_q, ref_k, ref_v, ref_g, ref_beta = reference_post_conv(
        conv_output, a, b, A_log, dt_bias, H, K, V, apply_l2norm, output_g_exp
    )

    # Fused path under test.
    fused_q, fused_k, fused_v, fused_g, fused_beta = fused_post_conv_prep(
        conv_output,
        a,
        b,
        A_log,
        dt_bias,
        num_k_heads=H,
        head_k_dim=K,
        head_v_dim=V,
        apply_l2norm=apply_l2norm,
        output_g_exp=output_g_exp,
    )

    # Shapes.
    shape_checks = (
        ("q", fused_q, (L, H, K)),
        ("k", fused_k, (L, H, K)),
        ("v", fused_v, (L, HV, V)),
        ("g", fused_g, (L, HV)),
        ("beta", fused_beta, (L, HV)),
    )
    for name, tensor, want in shape_checks:
        assert tensor.shape == want, f"{name} shape: {tensor.shape}"

    # Dtypes: q/k/v keep the input dtype, g/beta come back as fp32.
    for tensor in (fused_q, fused_k, fused_v):
        assert tensor.dtype == dtype
    for tensor in (fused_g, fused_beta):
        assert tensor.dtype == torch.float32

    # q/k/v must be materialized contiguously.
    for tensor in (fused_q, fused_k, fused_v):
        assert tensor.is_contiguous()

    # Values — allow a looser tolerance on q/k when l2norm is applied.
    tol_qk = 1e-2 if apply_l2norm else 1e-3
    torch.testing.assert_close(fused_q, ref_q, atol=tol_qk, rtol=tol_qk)
    torch.testing.assert_close(fused_k, ref_k, atol=tol_qk, rtol=tol_qk)
    torch.testing.assert_close(fused_v, ref_v, atol=1e-3, rtol=1e-3)
    torch.testing.assert_close(fused_g, ref_g, atol=1e-4, rtol=1e-4)
    torch.testing.assert_close(fused_beta, ref_beta, atol=1e-4, rtol=1e-4)
@pytest.mark.parametrize("L", [1, 64, 256])
|
|
def test_fused_post_conv_sanity(L):
|
|
"""Sanity checks: no NaN, unit-norm q/k, beta in (0,1)."""
|
|
torch.manual_seed(0)
|
|
device = "cuda"
|
|
H, HV, K, V = 16, 32, 128, 128
|
|
qkv_dim = 2 * H * K + HV * V
|
|
|
|
conv_output = torch.randn(L, qkv_dim, dtype=torch.bfloat16, device=device)
|
|
a = torch.randn(L, HV, dtype=torch.bfloat16, device=device)
|
|
b = torch.randn(L, HV, dtype=torch.bfloat16, device=device)
|
|
A_log = torch.randn(HV, dtype=torch.float32, device=device) - 2.0
|
|
dt_bias = torch.randn(HV, dtype=torch.float32, device=device)
|
|
|
|
q, k, v, g, beta = fused_post_conv_prep(
|
|
conv_output,
|
|
a,
|
|
b,
|
|
A_log,
|
|
dt_bias,
|
|
num_k_heads=H,
|
|
head_k_dim=K,
|
|
head_v_dim=V,
|
|
)
|
|
|
|
# Basic sanity
|
|
assert not torch.isnan(q).any(), "NaN in q"
|
|
assert not torch.isnan(k).any(), "NaN in k"
|
|
assert not torch.isnan(v).any(), "NaN in v"
|
|
assert not torch.isnan(g).any(), "NaN in g"
|
|
assert not torch.isnan(beta).any(), "NaN in beta"
|
|
|
|
# L2 norm check: each head vector should have unit norm
|
|
q_norms = torch.norm(q.float(), dim=-1)
|
|
k_norms = torch.norm(k.float(), dim=-1)
|
|
torch.testing.assert_close(q_norms, torch.ones_like(q_norms), atol=1e-3, rtol=1e-3)
|
|
torch.testing.assert_close(k_norms, torch.ones_like(k_norms), atol=1e-3, rtol=1e-3)
|
|
|
|
# Beta should be in (0, 1)
|
|
assert (beta >= 0).all() and (beta <= 1).all(), "beta out of range"
|
|
|
|
|
|
def test_fused_post_conv_l0():
    """The kernel must handle an empty (L=0) batch without crashing."""
    device = "cuda"
    H, HV, K, V = 16, 32, 128, 128
    qkv_dim = 2 * H * K + HV * V

    # Zero-length sequence inputs; parameter tensors stay non-empty.
    conv_output = torch.empty(0, qkv_dim, dtype=torch.bfloat16, device=device)
    a = torch.empty(0, HV, dtype=torch.bfloat16, device=device)
    b = torch.empty(0, HV, dtype=torch.bfloat16, device=device)
    A_log = torch.randn(HV, dtype=torch.float32, device=device)
    dt_bias = torch.randn(HV, dtype=torch.float32, device=device)

    q, k, v, g, beta = fused_post_conv_prep(
        conv_output,
        a,
        b,
        A_log,
        dt_bias,
        num_k_heads=H,
        head_k_dim=K,
        head_v_dim=V,
    )

    # Outputs are empty along the sequence dimension but correctly shaped.
    assert q.shape == (0, H, K)
    assert g.shape == (0, HV)