[Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-12-12 08:57:47 -05:00
parent 91401c7a26
commit 3e41992fec
30 changed files with 1372 additions and 256 deletions
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -274,6 +274,7 @@ def test_cutlass_moe_8_bit_no_graph(
    per_act_token: bool,
    per_out_ch: bool,
    monkeypatch,
+    workspace_init,
    ep_size: int | None = None,
 ):
    current_platform.seed_everything(7)
@@ -329,6 +330,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
    per_act_token: bool,
    per_out_ch: bool,
    monkeypatch,
+    workspace_init,
 ):
    current_platform.seed_everything(7)
    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
@@ -385,9 +387,19 @@ def test_cutlass_moe_8_bit_EP(
    per_out_channel: bool,
    ep_size: int,
    monkeypatch,
+    workspace_init,
 ):
    test_cutlass_moe_8_bit_no_graph(
-        m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size
+        m,
+        n,
+        k,
+        e,
+        topk,
+        per_act_token,
+        per_out_channel,
+        monkeypatch,
+        workspace_init,
+        ep_size,
    )


@@ -419,9 +431,19 @@ def test_cutlass_moe_8_bit_EP_large(
    per_out_channel: bool,
    ep_size: int,
    monkeypatch,
+    workspace_init,
 ):
    test_cutlass_moe_8_bit_no_graph(
-        m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size
+        m,
+        n,
+        k,
+        e,
+        topk,
+        per_act_token,
+        per_out_channel,
+        monkeypatch,
+        workspace_init,
+        ep_size,
    )


@@ -445,6 +467,7 @@ def test_run_cutlass_moe_fp8(
    per_act_token: bool,
    per_out_channel: bool,
    ep_size: int,
+    workspace_init,
 ):
    current_platform.seed_everything(7)
    with set_current_vllm_config(vllm_config):