[Core] Add Flashinfer TRTLLM Backend for Flashinfer decode path (SM100). (#19825)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: shuw <shuw@nvidia.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Pavani Majety
2025-07-11 02:23:23 -07:00
committed by GitHub
parent 8020e98c9f
commit 7bd4c37ae7
8 changed files with 667 additions and 56 deletions

View File

@@ -1424,6 +1424,8 @@ class EngineArgs:
from vllm.attention.utils.fa_utils import (
flash_attn_supports_fp8)
supported = flash_attn_supports_fp8()
elif envs.VLLM_USE_TRTLLM_DECODE_ATTENTION:
supported = True
if not supported:
_raise_or_fallback(feature_name="--kv-cache-dtype",
recommend_to_remove=False)