[NVIDIA][test] Tests for flashinfer TRTLLM BF16 MoE (#33715)
Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
This commit is contained in:
@@ -78,7 +78,10 @@ def select_unquantized_moe_backend(
|
||||
activation_format=activation_format,
|
||||
)
|
||||
flashinfer_trtllm_moe_enabled = (
|
||||
has_flashinfer() and envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported
|
||||
has_flashinfer()
|
||||
and envs.VLLM_USE_FLASHINFER_MOE_FP16
|
||||
and trtllm_supported
|
||||
and envs.VLLM_FLASHINFER_MOE_BACKEND == "latency"
|
||||
)
|
||||
# FlashInfer CUTLASS MoE is only supported on Hopper and later GPUs
|
||||
flashinfer_cutlass_moe_enabled = (
|
||||
@@ -98,11 +101,19 @@ def select_unquantized_moe_backend(
|
||||
backend = UnquantizedMoeBackend.FLASHINFER_TRTLLM
|
||||
elif flashinfer_cutlass_moe_enabled:
|
||||
backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS
|
||||
if trtllm_supported:
|
||||
logger.info_once(
|
||||
"FlashInfer TRTLLM MoE is available but not enabled, "
|
||||
"consider setting VLLM_FLASHINFER_MOE_BACKEND=latency "
|
||||
"to enable it for better performance.",
|
||||
scope="local",
|
||||
)
|
||||
else:
|
||||
if not envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported:
|
||||
logger.info_once(
|
||||
"FlashInfer TRTLLM MoE is available but not enabled, "
|
||||
"consider setting VLLM_USE_FLASHINFER_MOE_FP16=1 "
|
||||
"and VLLM_FLASHINFER_MOE_BACKEND=latency "
|
||||
"to enable it for better performance.",
|
||||
scope="local",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user