Implements dual-chunk-flash-attn backend for dual chunk attention with sparse attention support (#11844)

Tao He
2025-05-13 10:52:47 +08:00
committed by GitHub
parent f6518b2b48
commit 60f7624334
17 changed files with 2444 additions and 32 deletions

@@ -929,6 +929,23 @@ class ModelConfig:
                     "Number of experts in the model must be greater than 0 "
                     "when expert parallelism is enabled.")
+
+    def verify_dual_chunk_attention_config(
+        self,
+        load_config: "LoadConfig",
+    ) -> None:
+        if hasattr(self.hf_config, "dual_chunk_attention_config"):
+            # Try loading the sparse attention config
+            from vllm.model_executor.model_loader.weight_utils import (
+                get_sparse_attention_config)
+            sparse_attn_config = get_sparse_attention_config(self, load_config)
+            if sparse_attn_config:
+                self.hf_config.dual_chunk_attention_config[
+                    "sparse_attention_config"] = sparse_attn_config
+                if "sparse_attention_enabled" not in \
+                        self.hf_config.dual_chunk_attention_config:
+                    self.hf_config.dual_chunk_attention_config[
+                        "sparse_attention_enabled"] = True
 
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
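
The new helper mutates hf_config in place: when a sparse attention config is found, it is attached under "sparse_attention_config", and sparse attention is switched on unless the model config already pins "sparse_attention_enabled". A minimal sketch of that behavior outside vLLM (the SimpleNamespace stand-in, the field names inside the dicts, and the fake sparse config are hypothetical):

    from types import SimpleNamespace

    # Hypothetical stand-in for a HF config carrying dual chunk attention
    # settings; the "chunk_size" field is purely illustrative.
    hf_config = SimpleNamespace(
        dual_chunk_attention_config={"chunk_size": 262144})

    # Pretend get_sparse_attention_config() returned a non-empty dict.
    sparse_attn_config = {"pattern": "vertical_and_slash"}  # illustrative

    if hasattr(hf_config, "dual_chunk_attention_config") and sparse_attn_config:
        hf_config.dual_chunk_attention_config[
            "sparse_attention_config"] = sparse_attn_config
        # Same effect as the "not in" check in the diff: an explicit
        # "sparse_attention_enabled" in the model config is preserved.
        hf_config.dual_chunk_attention_config.setdefault(
            "sparse_attention_enabled", True)

    print(hf_config.dual_chunk_attention_config["sparse_attention_enabled"])
    # True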
@@ -4187,6 +4204,8 @@ class VllmConfig:
                                                        self.speculative_config,
                                                        self.device_config)
             self.model_config.verify_with_parallel_config(self.parallel_config)
+            self.model_config.verify_dual_chunk_attention_config(
+                self.load_config)
 
         if self.cache_config is not None:
             self.cache_config.verify_with_parallel_config(self.parallel_config)
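
For context, a sketch of how the new backend would be selected at inference time. It assumes the backend added by this PR registers under the name DUAL_CHUNK_FLASH_ATTN for vLLM's standard VLLM_ATTENTION_BACKEND override, and that the served model's HF config ships a dual_chunk_attention_config; the model name is illustrative:

    import os

    # Assumed backend name introduced by this PR.
    os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"

    from vllm import LLM, SamplingParams

    # Illustrative long-context model whose HF config carries
    # dual_chunk_attention_config; verify_dual_chunk_attention_config()
    # above would then attach any sparse attention config at load time.
    llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-1M")
    outputs = llm.generate(["Long-context prompt ..."],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)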