Update Flashinfer to 0.2.14.post1 (#23537)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: siyuanf <siyuanf@nvidia.com>
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -6,6 +6,7 @@ import torch
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import get_current_vllm_config
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
|
||||
FusedMoEMethodBase)
|
||||
@@ -113,6 +114,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
self.topk_indices_dtype = None
|
||||
self.moe = moe
|
||||
self.use_marlin = self._should_use_marlin()
|
||||
self.max_capture_size = get_current_vllm_config(
|
||||
).compilation_config.max_capture_size
|
||||
|
||||
if current_platform.is_device_capability(100) and not has_flashinfer():
|
||||
logger.warning_once(
|
||||
@@ -520,7 +523,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
x_scale = None
|
||||
else:
|
||||
x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8
|
||||
- x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
|
||||
+ x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
|
||||
+ *x.shape[:-1], -1)
|
||||
trtllm_gen_output = trtllm_fp4_block_scale_moe(
|
||||
router_logits.to(torch.bfloat16),
|
||||
None, # routing_bias
|
||||
@@ -549,6 +553,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
self._get_tile_tokens_dim(x, top_k),
|
||||
1 if renormalize else 0, # routing_method_type, renormalize
|
||||
True, # do finalize
|
||||
tune_max_num_tokens=self.max_capture_size,
|
||||
)[0]
|
||||
return trtllm_gen_output
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user