[Core] Add Flashinfer TRTLLM Backend for Flashinfer decode path (SM100). (#19825)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: shuw <shuw@nvidia.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Pavani Majety
2025-07-11 02:23:23 -07:00
committed by GitHub
parent 8020e98c9f
commit 7bd4c37ae7
8 changed files with 667 additions and 56 deletions

View File

@@ -24,6 +24,7 @@ from vllm.distributed.kv_transfer.kv_connector.utils import (
from vllm.logger import init_logger
logger = init_logger(__name__)
_KV_CACHE_LAYOUT_OVERRIDE = None
@dataclass
@@ -103,6 +104,7 @@ class AttentionMetadataBuilder(abc.ABC, Generic[M]):
@functools.lru_cache
def get_kv_cache_layout():
global _KV_CACHE_LAYOUT_OVERRIDE
# Override with format specified by the user.
cache_layout = envs.VLLM_KV_CACHE_LAYOUT
if cache_layout is None:
@@ -110,10 +112,16 @@ def get_kv_cache_layout():
else:
logger.info_once("`VLLM_KV_CACHE_LAYOUT` environment variable " \
"detected. Setting KV cache layout to %s.", cache_layout)
if _KV_CACHE_LAYOUT_OVERRIDE is not None:
cache_layout = _KV_CACHE_LAYOUT_OVERRIDE
return cache_layout
def set_kv_cache_layout(cache_layout: str) -> None:
    """Set a programmatic override for the KV cache layout.

    The value is stored in the module-level ``_KV_CACHE_LAYOUT_OVERRIDE``,
    which ``get_kv_cache_layout`` returns whenever it is not ``None``.

    Args:
        cache_layout: Layout identifier to return from subsequent
            ``get_kv_cache_layout`` calls.
    """
    global _KV_CACHE_LAYOUT_OVERRIDE
    _KV_CACHE_LAYOUT_OVERRIDE = cache_layout
    # get_kv_cache_layout is memoized with functools.lru_cache and takes no
    # arguments, so a value computed before this call would otherwise be
    # returned forever and the override would be silently ignored.
    get_kv_cache_layout.cache_clear()
@dataclass
class PerLayerParameters:
"""