Update Flashinfer to 0.2.14.post1 (#23537)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Author: weiliang
Date: 2025-08-26 09:30:44 +08:00 (committed by GitHub)
Parent: 906e461ed6
Commit: ae067888d6
5 changed files with 14 additions and 7 deletions

vllm/model_executor/layers/quantization/mxfp4.py

@@ -6,6 +6,7 @@ import torch
 from torch.nn.parameter import Parameter
 
 from vllm import envs
+from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                    FusedMoEMethodBase)
@@ -113,6 +114,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.max_capture_size = get_current_vllm_config(
+        ).compilation_config.max_capture_size
 
         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
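
Note: `max_capture_size` is the largest batch size vLLM will capture as a CUDA graph. A minimal sketch of the lookup the two added lines perform, written as a standalone function for illustration only (it assumes a vLLM config context is active, as it is when the quant method is constructed during engine startup):

from vllm.config import get_current_vllm_config

def lookup_max_capture_size() -> int:
    # compilation_config.max_capture_size is the largest num_tokens
    # vLLM captures as a CUDA graph; this commit uses it as the upper
    # bound for FlashInfer autotuning (see tune_max_num_tokens below).
    return get_current_vllm_config().compilation_config.max_capture_size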
@@ -520,7 +523,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 x_scale = None
             else:
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                    *x.shape[:-1], -1)
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
                 None,  # routing_bias
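
Note: the old `reshape(-1)` flattened the scales into one long vector, losing the token dimension; the new reshape keeps the leading dims so the scales line up with `x` row for row. A rough shape check, assuming one e4m3 scale per 32-element block of the hidden dim (the block size and tensor sizes here are assumptions for this sketch, not taken from the diff):

import torch

num_tokens, hidden = 8, 2880  # illustrative sizes
x = torch.randn(num_tokens, hidden, dtype=torch.bfloat16)
# Stand-in for mxfp8_quantize's scale output: one scale byte per
# 32-wide block of x (block size assumed for this sketch).
x_scale = torch.ones(num_tokens, hidden // 32, dtype=torch.uint8)

old = x_scale.view(torch.float8_e4m3fn).reshape(-1)
new = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1)
print(old.shape)  # torch.Size([720])   -- token dim lost
print(new.shape)  # torch.Size([8, 90]) -- matches x.shape[:-1]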
@@ -549,6 +553,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
+                tune_max_num_tokens=self.max_capture_size,
             )[0]
             return trtllm_gen_output
         else:
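
Note: `tune_max_num_tokens` caps the batch sizes FlashInfer's autotuner profiles for this kernel, and tying it to the CUDA-graph capture limit avoids tuning shapes the server will never replay inside a graph. The snippet below only illustrates the idea of the cap; the function name and candidate grid are made up for this sketch and are not FlashInfer internals:

def tuning_candidates(tune_max_num_tokens: int) -> list[int]:
    # Hypothetical candidate set; a real tuner picks its own grid.
    all_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
    return [s for s in all_sizes if s <= tune_max_num_tokens]

# With max_capture_size = 512, nothing above 512 gets profiled:
print(tuning_candidates(512))  # [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]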