[Feature] Add SM103 (Blackwell Ultra) Support to vLLM (#30484)

Signed-off-by: LopezCastroRoberto <robertol.c510@gmail.com>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-12-13 04:34:23 +01:00
parent 57e9bf1864
commit 4fa7ce46f3
21 changed files with 53 additions and 33 deletions
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -137,7 +137,7 @@ def get_fp8_moe_backend(
    if (
        current_platform.is_cuda()
        and (
-            current_platform.is_device_capability(100)
+            current_platform.is_device_capability_family(100)
            or current_platform.is_device_capability(90)
        )
        and envs.VLLM_USE_FLASHINFER_MOE_FP8
@@ -148,7 +148,7 @@ def get_fp8_moe_backend(
            logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
            return Fp8MoeBackend.FLASHINFER_TRTLLM
        else:
-            if block_quant and current_platform.is_device_capability(100):
+            if block_quant and current_platform.is_device_capability_family(100):
                raise ValueError(
                    "FlashInfer FP8 MoE throughput backend does not "
                    "support block quantization. Please use "
@@ -193,7 +193,7 @@ def get_fp8_moe_backend(
    # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
    if (
        current_platform.is_cuda()
-        and current_platform.is_device_capability(100)
+        and current_platform.is_device_capability_family(100)
        and block_quant
    ):
        logger.info_once(