[Kernel] Add nvfp4 gemm flashinfer backends (#22346)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -1101,6 +1101,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_TRTLLM_ATTENTION":
|
||||
lambda: os.getenv("VLLM_USE_TRTLLM_ATTENTION", None),
|
||||
|
||||
# If set to 1, force the use of TRTLLM FP4 GEMM backend in flashinfer.
|
||||
# Otherwise, uses the first available of: flashinfer cutlass GEMM,
|
||||
# vllm cutlass GEMM, marlin GEMM.
|
||||
"VLLM_USE_TRTLLM_FP4_GEMM":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_TRTLLM_FP4_GEMM", "0"))),
|
||||
|
||||
# Controls garbage collection during CUDA graph capture.
|
||||
# If set to 0 (default), enables GC freezing to speed up capture time.
|
||||
# If set to 1, allows GC to run during capture.
|
||||
@@ -1208,6 +1214,7 @@ def compute_hash() -> str:
|
||||
"VLLM_DP_SIZE",
|
||||
"VLLM_USE_STANDALONE_COMPILE",
|
||||
"VLLM_FUSED_MOE_CHUNK_SIZE",
|
||||
"VLLM_USE_TRTLLM_FP4_GEMM",
|
||||
]
|
||||
for key in environment_variables_to_hash:
|
||||
if key in environment_variables:
|
||||
|
||||
Reference in New Issue
Block a user