[BugFix][Performance] Restore flashinfer autotuning for all scenarios (#27904)
This commit is contained in:
committed by
GitHub
parent
53f6e81dfd
commit
4022a9d279
@@ -127,10 +127,17 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
|
||||
"routing_method_type": 1,
|
||||
"do_finalize": True,
|
||||
"output": output,
|
||||
- "tune_max_num_tokens": self.max_capture_size,
|
||||
+ "tune_max_num_tokens": max(self.max_capture_size, 1),
|
||||
}
|
||||
|
||||
from flashinfer import trtllm_fp4_block_scale_routed_moe
|
||||
|
||||
- trtllm_fp4_block_scale_routed_moe(**kwargs)
|
||||
+ from vllm.utils.flashinfer import autotune
|
||||
|
||||
+ with autotune(False):
|
||||
+     # Enable autotune when,
|
||||
+     # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
|
||||
+     # resolved.
|
||||
+     trtllm_fp4_block_scale_routed_moe(**kwargs)
|
||||
|
||||
return output
|
||||
|
||||
@@ -1047,7 +1047,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
None,
|
||||
1 if renormalize else 0, # routing_method_type, renormalize
|
||||
True, # do finalize
|
||||
- tune_max_num_tokens=self.max_capture_size,
|
||||
+ tune_max_num_tokens=max(self.max_capture_size, 1),
|
||||
)[0]
|
||||
return trtllm_gen_output
|
||||
elif (
|
||||
@@ -1122,7 +1122,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
tp_rank=self.moe.tp_rank,
|
||||
ep_size=self.moe.ep_size,
|
||||
ep_rank=self.moe.ep_rank,
|
||||
- tune_max_num_tokens=self.max_capture_size,
|
||||
+ tune_max_num_tokens=max(self.max_capture_size, 1),
|
||||
**extra_kwargs,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user