[LoRA] Support FusedMoE LoRA Triton kernel for mxfp4 (#28971)

Signed-off-by: Xin Yang <xyangx@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
@@ -30,6 +30,7 @@ from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
 )
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
     OAITritonExperts,
+    UnfusedOAITritonExperts,
 )
 from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts
 from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
@@ -83,8 +84,21 @@ def get_mxfp4_backend_with_lora() -> Mxfp4Backend:
     if not current_platform.is_cuda():
         return Mxfp4Backend.NONE
 
-    logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
-    return Mxfp4Backend.MARLIN
+    # If FlashInfer is not available, try either Marlin or Triton
+    triton_kernels_supported = (
+        has_triton_kernels()
+        and is_torch_equal_or_newer("2.8.0")
+        # NOTE: triton_kernels are only confirmed to work on SM90 and SM100
+        # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317
+        # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
+        and (9, 0) <= current_platform.get_device_capability() < (11, 0)
+    )
+    if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported:
+        logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
+        return Mxfp4Backend.MARLIN
+
+    logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend")
+    return Mxfp4Backend.TRITON
 
 
 def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
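Outside the vLLM tree, the new gating reduces to a single predicate over the
triton_kernels install, the torch version, and the GPU's SM capability. Below
is a minimal, self-contained sketch of that decision; the helper functions are
illustrative stand-ins written for this example, not vLLM's actual utilities.

# Sketch of the backend-selection predicate introduced above. The helpers
# are stand-ins for vLLM's own utility functions.
import importlib.util

import torch
from packaging.version import Version


def has_triton_kernels() -> bool:
    # Stand-in: check whether the `triton_kernels` package is importable.
    return importlib.util.find_spec("triton_kernels") is not None


def is_torch_equal_or_newer(target: str) -> bool:
    # Compare the base torch version, ignoring local suffixes like "+cu121".
    return Version(torch.__version__.split("+")[0]) >= Version(target)


def triton_backend_supported(force_marlin: bool) -> bool:
    """True when the LoRA-aware Triton path can be used instead of Marlin."""
    if not torch.cuda.is_available():
        return False
    capability = torch.cuda.get_device_capability()  # e.g. (9, 0) on H100
    supported = (
        has_triton_kernels()
        and is_torch_equal_or_newer("2.8.0")
        # Only SM90/SM100 are confirmed to work; see the NOTE in the diff.
        and (9, 0) <= capability < (11, 0)
    )
    # force_marlin mirrors envs.VLLM_MXFP4_USE_MARLIN: it wins even when
    # the Triton path would otherwise be usable.
    return supported and not force_marlin

Note that the real function also returns Mxfp4Backend.NONE up front on
non-CUDA platforms, which this sketch folds into the cuda.is_available() check.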
@@ -854,6 +868,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
         elif self.mxfp4_backend == Mxfp4Backend.MARLIN:
             return MarlinExperts(self.moe_quant_config)
         elif self.mxfp4_backend == Mxfp4Backend.TRITON:
+            if self.moe.is_lora_enabled:
+                return UnfusedOAITritonExperts(self.moe_quant_config)
             return OAITritonExperts(self.moe_quant_config)
         else:
             raise NotImplementedError(
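End to end, the new path is exercised by serving an mxfp4-quantized model with
LoRA enabled: on SM90/SM100 the Triton backend is picked automatically, and the
LoRA branch above selects the unfused experts. A usage sketch follows; the
model name and adapter path are illustrative, not part of this PR.

# Illustrative usage: serve an mxfp4 MoE model with LoRA so that the
# UnfusedOAITritonExperts path added in this PR is selected on supported GPUs.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Set VLLM_MXFP4_USE_MARLIN=1 in the environment to force the Marlin backend
# instead of Triton (the escape hatch checked in the diff above).

llm = LLM(
    model="openai/gpt-oss-20b",  # example model that ships mxfp4 MoE weights
    enable_lora=True,            # enables the LoRA-aware expert selection
)

outputs = llm.generate(
    ["Explain mixture-of-experts in one sentence."],
    SamplingParams(max_tokens=64),
    # Adapter name, id, and path are placeholders for a real LoRA adapter.
    lora_request=LoRARequest("example-adapter", 1, "/path/to/lora_adapter"),
)
print(outputs[0].outputs[0].text)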