Implements dual-chunk-flash-attn backend for dual chunk attention with sparse attention support (#11844)

Tao He
2025-05-13 10:52:47 +08:00
committed by GitHub
parent f6518b2b48
commit 60f7624334
17 changed files with 2444 additions and 32 deletions

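For context, the constraints asserted in the diff below (CUDA device only, eager mode only, V0 engine only) imply a launch sequence roughly like the following. This is a minimal sketch, not code from this commit: it assumes the backend is selected via the VLLM_ATTENTION_BACKEND environment variable using the string "DUAL_CHUNK_FLASH_ATTN" (the presumed value behind STR_DUAL_CHUNK_FLASH_ATTN_VAL), and the model name is a hypothetical placeholder.

# Minimal sketch (not from this commit): enabling the new backend under the
# constraints the diff asserts -- CUDA device, eager mode, V0 engine.
import os

# Assumed value behind STR_DUAL_CHUNK_FLASH_ATTN_VAL.
os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
os.environ["VLLM_USE_V1"] = "0"  # the diff rejects the V1 engine

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # hypothetical model choice
    enforce_eager=True,  # CUDA graphs are not supported with this backend
)
outputs = llm.generate(["Hello"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)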

@@ -37,8 +37,8 @@ from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (FlexibleArgumentParser, GiB_bytes, is_in_doc_build,
-                        is_in_ray_actor)
+from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
+                        GiB_bytes, is_in_doc_build, is_in_ray_actor)
 # yapf: enable
@@ -983,6 +983,17 @@ class EngineArgs:
         assert self.enable_chunked_prefill is not None
+        if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
+            assert self.enforce_eager, (
+                "Cuda graph is not supported with DualChunkFlashAttention. "
+                "To run the model in eager mode, set 'enforce_eager=True' "
+                "or use '--enforce-eager' in the CLI.")
+            assert current_platform.is_cuda(), (
+                "DualChunkFlashAttention is only supported on CUDA platform.")
+            assert not use_v1, (
+                "DualChunkFlashAttention is not supported on V1 engine. "
+                "To run the model in V0 engine, try set 'VLLM_USE_V1=0'")
+
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,