[Perf] Enable FlashInfer DeepGEMM swapAB on SM90 by default (#34924)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
@@ -159,7 +159,7 @@ if TYPE_CHECKING:
         "relax",
     ] = "relax"
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
-    VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER: bool = False
+    VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER: bool = True
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
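
The `if TYPE_CHECKING:` hunk above only changes the type-checker-facing annotation; at runtime the value comes from the lazy lambda in the `environment_variables` dict shown in the next hunk. A minimal, self-contained sketch of that pattern (not the actual vllm/envs.py; the module-level `__getattr__` wiring here is an assumption about how such lazy lookups are commonly implemented):

import os
from typing import TYPE_CHECKING, Any
from collections.abc import Callable

if TYPE_CHECKING:
    # Annotation only: lets type checkers see the attribute and its type.
    VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER: bool = True

# Runtime source of truth: lazy lambdas keyed by variable name.
environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER": lambda: bool(
        int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1"))
    ),
}

def __getattr__(name: str) -> Any:
    # Evaluate the lambda on each attribute access, so overrides exported
    # after import are still picked up.
    if name in environment_variables:
        return environment_variables[name]()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
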
@@ -1198,7 +1198,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Allow use of FlashInfer FP8 block-scale GEMM for linear layers.
     # This uses TensorRT-LLM kernels and requires SM90+ (Hopper).
     "VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER": lambda: bool(
-        int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "0"))
+        int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1"))
     ),
     # Allow use of FlashInfer BF16 MoE kernels for fused moe ops.
     "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
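
Because the lambda parses the raw string with bool(int(...)), flipping the default only affects cases where the variable is unset; exporting VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0 restores the old behavior, and non-numeric values such as "false" would raise ValueError under this parsing. A small illustrative check of those semantics (the `_flag` helper is hypothetical, written only to mirror the lambda above):

import os

def _flag(name: str, default: str) -> bool:
    # Mirrors the env-var lambda: unset -> default string -> int -> bool.
    return bool(int(os.getenv(name, default)))

os.environ.pop("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", None)
assert _flag("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1") is True   # enabled by default after this change

os.environ["VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER"] = "0"            # explicit opt-out
assert _flag("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1") is False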