[Kernel] Add NVFP4 MoE CUTLASS support for SM120 (#29242)
Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
@@ -103,7 +103,7 @@ __all__ = [
|
||||
"CompressedTensorsW8A8Int8MoEMethod",
|
||||
"CompressedTensorsWNA16MarlinMoEMethod",
|
||||
"CompressedTensorsWNA16MoEMethod",
|
||||
"CompressedTensorsW4A4MoeMethod",
|
||||
"CompressedTensorsW4A4Nvfp4MoeMethod",
|
||||
"CompressedTensorsW4A8Int8MoEMethod",
|
||||
]
|
||||
|
||||
@@ -171,7 +171,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
|
||||
quant_config, layer.moe_config
|
||||
)
|
||||
elif quant_config._is_fp4a4_nvfp4(weight_quant, input_quant):
|
||||
return CompressedTensorsW4A4MoeMethod(layer.moe_config)
|
||||
return CompressedTensorsW4A4Nvfp4MoeMethod(layer.moe_config)
|
||||
elif (
|
||||
quant_config._is_fp8_w8a8_sm90(weight_quant, input_quant)
|
||||
or quant_config._is_fp8_w8a8_sm100(weight_quant, input_quant)
|
||||
@@ -188,7 +188,7 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
|
||||
|
||||
class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
class CompressedTensorsW4A4Nvfp4MoeMethod(CompressedTensorsMoEMethod):
|
||||
def __init__(self, moe: FusedMoEConfig):
|
||||
from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501
|
||||
detect_nvfp4_moe_support,
|
||||
@@ -205,8 +205,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
self.flashinfer_moe_backend = get_flashinfer_moe_backend()
|
||||
logger.info_once(
|
||||
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
|
||||
" for CompressedTensorsW4A4MoeMethod."
|
||||
" for CompressedTensorsW4A4Nvfp4MoeMethod."
|
||||
)
|
||||
elif self.use_marlin:
|
||||
logger.info_once("Using Marlin for CompressedTensorsW4A4Nvfp4MoeMethod.")
|
||||
else:
|
||||
logger.info_once("Using Cutlass for CompressedTensorsW4A4Nvfp4MoeMethod.")
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
@@ -612,7 +616,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
||||
assert expert_map is None, (
|
||||
"Expert Parallelism / expert_map "
|
||||
"is currently not supported for "
|
||||
"CompressedTensorsW4A4MoeMethod."
|
||||
"CompressedTensorsW4A4Nvfp4MoeMethod."
|
||||
)
|
||||
assert self.moe_quant_config is not None
|
||||
|
||||
|
||||
@@ -1132,6 +1132,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels"
|
||||
" for ModelOptNvFp4FusedMoE."
|
||||
)
|
||||
elif self.use_marlin:
|
||||
logger.info_once("Using Marlin for ModelOptNvFp4FusedMoE.")
|
||||
else:
|
||||
logger.info_once("Using Cutlass for ModelOptNvFp4FusedMoE.")
|
||||
|
||||
def maybe_make_prepare_finalize(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user