[Kernel] Add NVFP4 MoE CUTLASS support for SM120 (#29242)

Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2025-11-25 09:59:07 -05:00
committed by GitHub
parent dbc3d9991a
commit e502098643
8 changed files with 264 additions and 30 deletions

View File

@@ -103,7 +103,7 @@ __all__ = [
"CompressedTensorsW8A8Int8MoEMethod",
"CompressedTensorsWNA16MarlinMoEMethod",
"CompressedTensorsWNA16MoEMethod",
"CompressedTensorsW4A4MoeMethod",
"CompressedTensorsW4A4Nvfp4MoeMethod",
"CompressedTensorsW4A8Int8MoEMethod",
]
@@ -171,7 +171,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
quant_config, layer.moe_config
)
elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
return CompressedTensorsW4A4MoeMethod(layer.moe_config)
return CompressedTensorsW4A4Nvfp4MoeMethod(layer.moe_config)
elif (
quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
@@ -188,7 +188,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
)
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
class CompressedTensorsW4A4Nvfp4MoeMethod(CompressedTensorsMoEMethod):
def __init__(self, moe: FusedMoEConfig):
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
detect_nvfp4_moe_support,
@@ -205,8 +205,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
self.flashinfer_moe_backend = get_flashinfer_moe_backend()
logger.info_once(
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
" for CompressedTensorsW4A4MoeMethod."
" for CompressedTensorsW4A4Nvfp4MoeMethod."
)
elif self.use_marlin:
logger.info_once("Using Marlin for CompressedTensorsW4A4Nvfp4MoeMethod.")
else:
logger.info_once("Using Cutlass for CompressedTensorsW4A4Nvfp4MoeMethod.")
def create_weights(
self,
@@ -612,7 +616,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
assert expert_map is None, (
"Expert Parallelism / expert_map "
"is currently not supported for "
"CompressedTensorsW4A4MoeMethod."
"CompressedTensorsW4A4Nvfp4MoeMethod."
)
assert self.moe_quant_config is not None

View File

@@ -1132,6 +1132,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
" for ModelOptNvFp4FusedMoE."
)
elif self.use_marlin:
logger.info_once("Using Marlin for ModelOptNvFp4FusedMoE.")
else:
logger.info_once("Using Cutlass for ModelOptNvFp4FusedMoE.")
def maybe_make_prepare_finalize(
self,