[Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-12-12 08:57:47 -05:00
parent 91401c7a26
commit 3e41992fec
30 changed files with 1372 additions and 256 deletions
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import (
    is_deep_gemm_supported,
 )
 from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
+from vllm.v1.worker.workspace import init_workspace_manager

 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe(
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
 ):
+    device = torch.device(f"cuda:{pgi.local_rank}")
+    init_workspace_manager(device)
+
    current_platform.seed_everything(pgi.rank)

    w1 = w1.to(device=torch.cuda.current_device())
@@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe(
    topk: int,
    world_dp_size: tuple[int, int],
    disable_deepgemm_ue8m0,
+    workspace_init,
 ):
    """
    Tests for High-Throughput DeepEP + DeepGemm integration.
@@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe(
    block_size: list[int],
    world_dp_size: tuple[int, int],
    disable_deepgemm_ue8m0,
+    workspace_init,
 ):
    """
    Tests for Low-Latency DeepEP + DeepGemm integration.