[MoE Refactor][3/N] Deprecate cutlass block quant fp8 (b200) (#30990)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-12-19 16:09:54 -05:00
parent 5f6477d1d0
commit 83a317f650
8 changed files with 3 additions and 704 deletions
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -118,9 +118,8 @@ class Fp8MoeBackend(Enum):
    FLASHINFER_TRTLLM = 1
    FLASHINFER_CUTLASS = 2
    DEEPGEMM = 3
-    CUTLASS_BLOCK_SCALED_GROUPED_GEMM = 4
-    MARLIN = 5
-    TRITON = 6
+    MARLIN = 4
+    TRITON = 5


 def get_fp8_moe_backend(
@@ -191,17 +190,6 @@ def get_fp8_moe_backend(
            logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
            return Fp8MoeBackend.DEEPGEMM

-    # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
-    if (
-        current_platform.is_cuda()
-        and current_platform.is_device_capability_family(100)
-        and block_quant
-    ):
-        logger.info_once(
-            "Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
-        )
-        return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
-
    # default to Triton
    logger.info_once("Using Triton backend for FP8 MoE")
    return Fp8MoeBackend.TRITON
@@ -752,9 +740,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            )

        self.allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM
-        self.allow_cutlass_block_scaled_grouped_gemm = (
-            self.fp8_backend == Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
-        )

    def create_weights(
        self,
@@ -1316,9 +1301,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                expert_map=layer.expert_map,
                quant_config=self.moe_quant_config,
                allow_deep_gemm=self.allow_deep_gemm,
-                allow_cutlass_block_scaled_grouped_gemm=(
-                    self.allow_cutlass_block_scaled_grouped_gemm
-                ),
            )

        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: