[Kernel] Add nvfp4 gemm flashinfer backends (#22346)

Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
nvjullin
2025-08-15 04:03:55 +08:00
committed by GitHub
parent b8ff05361a
commit 279a5f31b3
9 changed files with 369 additions and 39 deletions

View File

@@ -1101,6 +1101,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_TRTLLM_ATTENTION":
lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),
# If set to 1, forces the use of the TRTLLM FP4 GEMM backend in flashinfer.
# Otherwise (default: 0), uses the first available of: flashinfer cutlass
# GEMM, vllm cutlass GEMM, marlin GEMM.
"VLLM_USE_TRTLLM_FP4_GEMM":
lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))),
# Controls garbage collection during CUDA graph capture.
# If set to 0 (default), enables GC freezing to speed up capture time.
# If set to 1, allows GC to run during capture.
@@ -1208,6 +1214,7 @@ def compute_hash() -> str:
"VLLM_DP_SIZE",
"VLLM_USE_STANDALONE_COMPILE",
"VLLM_FUSED_MOE_CHUNK_SIZE",
"VLLM_USE_TRTLLM_FP4_GEMM",
]
for key in environment_variables_to_hash:
if key in environment_variables: