[Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson
2025-12-12 08:57:47 -05:00
committed by GitHub
parent 91401c7a26
commit 3e41992fec
30 changed files with 1372 additions and 256 deletions

View File

@@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import (
is_deep_gemm_supported,
)
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe(
w1_scale: torch.Tensor,
w2_scale: torch.Tensor,
):
device = torch.device(f"cuda:{pgi.local_rank}")
init_workspace_manager(device)
current_platform.seed_everything(pgi.rank)
w1 = w1.to(device=torch.cuda.current_device())
@@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe(
topk: int,
world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0,
workspace_init,
):
"""
Tests for High-Throughput DeepEP + DeepGemm integration.
@@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe(
block_size: list[int],
world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0,
workspace_init,
):
"""
Tests for Low-Latency DeepEP + DeepGemm integration.