Add tree attention backend for v1 (part 1) (#20401)

Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
This commit is contained in:
Giancarlo Delfin
2025-08-03 22:13:26 -07:00
committed by GitHub
parent c2e75b3c11
commit aa7012eb6d
12 changed files with 1098 additions and 25 deletions

View File

@@ -1454,7 +1454,6 @@ class EngineArgs:
"Please consider using other speculative decoding methods "
"such as ngram, medusa, eagle, or deepseek_mtp.")
# No XFormers so far.
V1_BACKENDS = [
"FLASH_ATTN_VLLM_V1",
"FLASH_ATTN",
@@ -1469,6 +1468,7 @@ class EngineArgs:
"ROCM_AITER_MLA",
"TORCH_SDPA_VLLM_V1",
"FLEX_ATTENTION",
"TREE_ATTN",
]
if (envs.is_set("VLLM_ATTENTION_BACKEND")
and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS):