[NVIDIA][test] Tests for flashinfer TRTLLM BF16 MoE (#33715)

Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
Linda
2026-02-11 13:38:11 +01:00
committed by GitHub
parent 0f5e55e7a8
commit 275e0d2a99
7 changed files with 296 additions and 1 deletions

View File

@@ -78,7 +78,10 @@ def select_unquantized_moe_backend(
activation_format=activation_format,
)
flashinfer_trtllm_moe_enabled = (
has_flashinfer() and envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported
has_flashinfer()
and envs.VLLM_USE_FLASHINFER_MOE_FP16
and trtllm_supported
and envs.VLLM_FLASHINFER_MOE_BACKEND == "latency"
)
# FlashInfer CUTLASS MoE is only supported on Hopper and later GPUs
flashinfer_cutlass_moe_enabled = (
@@ -98,11 +101,19 @@ def select_unquantized_moe_backend(
backend = UnquantizedMoeBackend.FLASHINFER_TRTLLM
elif flashinfer_cutlass_moe_enabled:
backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
if trtllm_supported:
logger.info_once(
"FlashInfer TRTLLM MoE is available but not enabled, "
"consider setting VLLM_FLASHINFER_MOE_BACKEND=latency "
"to enable it for better performance.",
scope="local",
)
else:
if not envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported:
logger.info_once(
"FlashInfer TRTLLM MoE is available but not enabled, "
"consider setting VLLM_USE_FLASHINFER_MOE_FP16=1 "
"and VLLM_FLASHINFER_MOE_BACKEND=latency "
"to enable it for better performance.",
scope="local",
)