Implements dual-chunk-flash-attn backend for dual chunk attention with sparse attention support (#11844)
@@ -37,8 +37,8 @@ from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import (FlexibleArgumentParser, GiB_bytes, is_in_doc_build,
-                        is_in_ray_actor)
+from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser,
+                        GiB_bytes, is_in_doc_build, is_in_ray_actor)
 
 # yapf: enable
 
@@ -983,6 +983,17 @@ class EngineArgs:
 
         assert self.enable_chunked_prefill is not None
 
+        if envs.VLLM_ATTENTION_BACKEND in [STR_DUAL_CHUNK_FLASH_ATTN_VAL]:
+            assert self.enforce_eager, (
+                "Cuda graph is not supported with DualChunkFlashAttention. "
+                "To run the model in eager mode, set 'enforce_eager=True' "
+                "or use '--enforce-eager' in the CLI.")
+            assert current_platform.is_cuda(), (
+                "DualChunkFlashAttention is only supported on CUDA platform.")
+            assert not use_v1, (
+                "DualChunkFlashAttention is not supported on V1 engine. "
+                "To run the model in V0 engine, try set 'VLLM_USE_V1=0'")
+
         cache_config = CacheConfig(
             block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,