[MoE Refactor][3/N] Deprecate cutlass block quant fp8 (b200) (#30990)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
Robert Shaw
2025-12-19 16:09:54 -05:00
committed by GitHub
parent 5f6477d1d0
commit 83a317f650
8 changed files with 3 additions and 704 deletions

View File

@@ -118,9 +118,8 @@ class Fp8MoeBackend(Enum):
FLASHINFER_TRTLLM = 1
FLASHINFER_CUTLASS = 2
DEEPGEMM = 3
-CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
-MARLIN = 5
-TRITON = 6
+MARLIN = 4
+TRITON = 5
def get_fp8_moe_backend(
@@ -191,17 +190,6 @@ def get_fp8_moe_backend(
logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
return Fp8MoeBackend.DEEPGEMM
-# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
-if (
-current_platform.is_cuda()
-and current_platform.is_device_capability_family(100)
-and block_quant
-):
-logger.info_once(
-"Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
-)
-return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
# default to Triton
logger.info_once("Using Triton backend for FP8 MoE")
return Fp8MoeBackend.TRITON
@@ -752,9 +740,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
)
self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
-self.allow_cutlass_block_scaled_grouped_gemm = (
-self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
-)
def create_weights(
self,
@@ -1316,9 +1301,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
expert_map=layer.expert_map,
quant_config=self.moe_quant_config,
allow_deep_gemm=self.allow_deep_gemm,
-allow_cutlass_block_scaled_grouped_gemm=(
-self.allow_cutlass_block_scaled_grouped_gemm
-),
)
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: