Remove V0 attention backends (#25351)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Woosuk Kwon
Date: 2025-09-21 16:03:28 -07:00
Committed by: GitHub
Parent: af7dfb0d1a
Commit: bc6e542d9f

28 changed files with 143 additions and 7376 deletions


@@ -32,8 +32,7 @@ from vllm.transformers_utils.config import (
 from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
                                                  is_runai_obj_uri)
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType,
-                        LazyLoader, common_broadcastable_dtype)
+from vllm.utils import LayerBlockType, LazyLoader, common_broadcastable_dtype
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
@@ -1103,10 +1102,6 @@ class ModelConfig:
             self.hf_config.dual_chunk_attention_config[
                 "sparse_attention_enabled"] = True
-
-            if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL:
-                raise ValueError("please set VLLM_ATTENTION_BACKEND to "
-                                 f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")
 
     def verify_with_parallel_config(
         self,
         parallel_config: ParallelConfig,
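
For context, a minimal standalone sketch of the guard this hunk deletes: before this commit, enabling dual-chunk sparse attention required the attention backend to be pinned via the VLLM_ATTENTION_BACKEND environment variable. The env var name and the error message are taken from the removed lines; the literal value of STR_DUAL_CHUNK_FLASH_ATTN_VAL and the helper function name are assumptions, not part of this commit.

    # Illustrative reconstruction only; not vLLM source.
    import os

    STR_DUAL_CHUNK_FLASH_ATTN_VAL = "DUAL_CHUNK_FLASH_ATTN"  # assumed value

    def require_dual_chunk_backend() -> None:
        """Hypothetical helper mirroring the removed check."""
        backend = os.environ.get("VLLM_ATTENTION_BACKEND")
        if backend != STR_DUAL_CHUNK_FLASH_ATTN_VAL:
            raise ValueError("please set VLLM_ATTENTION_BACKEND to "
                             f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")

With the V0 backends gone, this explicit environment-variable check is no longer performed here.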