[Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
Lucas Wilkinson
2025-12-12 08:57:47 -05:00
committed by GitHub
parent 91401c7a26
commit 3e41992fec
30 changed files with 1372 additions and 256 deletions

View File

@@ -54,6 +54,7 @@ from vllm.v1.outputs import (
from vllm.v1.utils import report_usage_stats
from vllm.v1.worker.utils import is_residual_scattered_for_sp
from vllm.v1.worker.worker_base import WorkerBase
from vllm.v1.worker.workspace import init_workspace_manager
logger = init_logger(__name__)
@@ -255,6 +256,10 @@ class Worker(WorkerBase):
else:
raise RuntimeError(f"Not support device type: {self.device_config.device}")
# Initialize workspace manager
num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1
init_workspace_manager(self.device, num_ubatches)
# Construct the model runner
if self.use_v2_model_runner:
from vllm.v1.worker.gpu.model_runner import (