[BugFix][Performance] Restore flashinfer autotuning for all scenarios (#27904)

This commit is contained in:
Varun Sundar Rabindranath
2025-11-04 02:56:21 -05:00
committed by GitHub
parent 53f6e81dfd
commit 4022a9d279
4 changed files with 14 additions and 44 deletions

View File

@@ -127,10 +127,17 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
"routing_method_type": 1,
"do_finalize": True,
"output": output,
"tune_max_num_tokens": self.max_capture_size,
"tune_max_num_tokens": max(self.max_capture_size, 1),
}
from flashinfer import trtllm_fp4_block_scale_routed_moe
trtllm_fp4_block_scale_routed_moe(**kwargs)
from vllm.utils.flashinfer import autotune
with autotune(False):
# TODO: Autotuning is explicitly disabled for this call for now.
# Re-enable it once
# https://github.com/flashinfer-ai/flashinfer/issues/2023 is
# resolved.
trtllm_fp4_block_scale_routed_moe(**kwargs)
return output

View File

@@ -1047,7 +1047,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
None,
1 if renormalize else 0, # routing_method_type, renormalize
True, # do finalize
tune_max_num_tokens=self.max_capture_size,
tune_max_num_tokens=max(self.max_capture_size, 1),
)[0]
return trtllm_gen_output
elif (
@@ -1122,7 +1122,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
tp_rank=self.moe.tp_rank,
ep_size=self.moe.ep_size,
ep_rank=self.moe.ep_rank,
tune_max_num_tokens=self.max_capture_size,
tune_max_num_tokens=max(self.max_capture_size, 1),
**extra_kwargs,
)