[MoE Refactor][3/N] Deprecate cutlass block quant fp8 (b200) (#30990)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
@@ -118,9 +118,8 @@ class Fp8MoeBackend(Enum):
|
||||
FLASHINFER_TRTLLM = 1
|
||||
FLASHINFER_CUTLASS = 2
|
||||
DEEPGEMM = 3
|
||||
CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
|
||||
MARLIN = 5
|
||||
TRITON = 6
|
||||
MARLIN = 4
|
||||
TRITON = 5
|
||||
|
||||
|
||||
def get_fp8_moe_backend(
|
||||
@@ -191,17 +190,6 @@ def get_fp8_moe_backend(
|
||||
logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
|
||||
return Fp8MoeBackend.DEEPGEMM
|
||||
|
||||
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
|
||||
if (
|
||||
current_platform.is_cuda()
|
||||
and current_platform.is_device_capability_family(100)
|
||||
and block_quant
|
||||
):
|
||||
logger.info_once(
|
||||
"Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
|
||||
)
|
||||
return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
|
||||
|
||||
# default to Triton
|
||||
logger.info_once("Using Triton backend for FP8 MoE")
|
||||
return Fp8MoeBackend.TRITON
|
||||
@@ -752,9 +740,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
|
||||
self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
|
||||
self.allow_cutlass_block_scaled_grouped_gemm = (
|
||||
self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
|
||||
)
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
@@ -1316,9 +1301,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
expert_map=layer.expert_map,
|
||||
quant_config=self.moe_quant_config,
|
||||
allow_deep_gemm=self.allow_deep_gemm,
|
||||
allow_cutlass_block_scaled_grouped_gemm=(
|
||||
self.allow_cutlass_block_scaled_grouped_gemm
|
||||
),
|
||||
)
|
||||
|
||||
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
|
||||
|
||||
Reference in New Issue
Block a user