[NVIDIA][test] Tests for flashinfer TRTLLM BF16 MoE (#33715)

Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
2026-02-11 13:38:11 +01:00
parent 0f5e55e7a8
commit 275e0d2a99
7 changed files with 296 additions and 1 deletion
--- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -78,7 +78,10 @@ def select_unquantized_moe_backend(
        activation_format=activation_format,
    )
    flashinfer_trtllm_moe_enabled = (
-        has_flashinfer() and envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported
+        has_flashinfer()
+        and envs.VLLM_USE_FLASHINFER_MOE_FP16
+        and trtllm_supported
+        and envs.VLLM_FLASHINFER_MOE_BACKEND == "latency"
    )
    # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
    flashinfer_cutlass_moe_enabled = (
@@ -98,11 +101,19 @@ def select_unquantized_moe_backend(
            backend = UnquantizedMoeBackend.FLASHINFER_TRTLLM
        elif flashinfer_cutlass_moe_enabled:
            backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
+            if trtllm_supported:
+                logger.info_once(
+                    "FlashInfer TRTLLM MoE is available but not enabled, "
+                    "consider setting VLLM_FLASHINFER_MOE_BACKEND=latency "
+                    "to enable it for better performance.",
+                    scope="local",
+                )
        else:
            if not envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported:
                logger.info_once(
                    "FlashInfer TRTLLM MoE is available but not enabled, "
                    "consider setting VLLM_USE_FLASHINFER_MOE_FP16=1 "
+                    "and VLLM_FLASHINFER_MOE_BACKEND=latency "
                    "to enable it for better performance.",
                    scope="local",
                )