[Feature] Add SM103 (Blackwell Ultra) Support to vLLM (#30484)

Signed-off-by: LopezCastroRoberto <robertol.c510@gmail.com>
Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Roberto L. Castro authored 2025-12-13 04:34:23 +01:00, committed by GitHub
commit 4fa7ce46f3 (parent 57e9bf1864)
21 changed files with 53 additions and 33 deletions

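The change swaps exact capability checks (`is_device_capability(100)`, which matches compute capability 10.0 only) for family checks (`is_device_capability_family(100)`, which matches any 10.x device), so SM103 (Blackwell Ultra, compute capability 10.3) takes the same FP8 MoE paths as SM100. A minimal sketch of the intended semantics, using standalone stand-ins for the two platform helpers (illustrative assumptions, not vLLM's actual implementations):

    from typing import NamedTuple

    class DeviceCapability(NamedTuple):
        major: int
        minor: int

        def to_int(self) -> int:
            # e.g. (10, 3) -> 103, the integer form used in the checks below
            return self.major * 10 + self.minor

    def is_device_capability(cap: DeviceCapability, value: int) -> bool:
        # Exact match: 100 accepts SM100 (10.0) only, so SM103 (10.3) fails.
        return cap.to_int() == value

    def is_device_capability_family(cap: DeviceCapability, value: int) -> bool:
        # Family match: 100 accepts any 10.x device, so SM103 qualifies.
        return cap.major == value // 10

    sm103 = DeviceCapability(10, 3)
    assert not is_device_capability(sm103, 100)
    assert is_device_capability_family(sm103, 100)

Note that the SM90 (Hopper) branch below keeps the exact check, since only the 10.x family gains a new member here.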

@@ -137,7 +137,7 @@ def get_fp8_moe_backend(
     if (
         current_platform.is_cuda()
         and (
-            current_platform.is_device_capability(100)
+            current_platform.is_device_capability_family(100)
             or current_platform.is_device_capability(90)
         )
         and envs.VLLM_USE_FLASHINFER_MOE_FP8
@@ -148,7 +148,7 @@ def get_fp8_moe_backend(
         logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
         return Fp8MoeBackend.FLASHINFER_TRTLLM
     else:
-        if block_quant and current_platform.is_device_capability(100):
+        if block_quant and current_platform.is_device_capability_family(100):
             raise ValueError(
                 "FlashInfer FP8 MoE throughput backend does not "
                 "support block quantization. Please use "
@@ -193,7 +193,7 @@ def get_fp8_moe_backend(
     # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
     if (
         current_platform.is_cuda()
-        and current_platform.is_device_capability(100)
+        and current_platform.is_device_capability_family(100)
         and block_quant
     ):
         logger.info_once(
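
Taken together, the three hunks mean a block-quantized FP8 MoE model on an SM103 part now resolves to the same backend it would on SM100. A condensed, assumption-laden sketch of that selection order (the enum members, flag names, and Triton fallback are simplified stand-ins for the real function, not a copy of it):

    from enum import Enum

    class Fp8MoeBackend(Enum):
        FLASHINFER_TRTLLM = "flashinfer_trtllm"
        FLASHINFER_THROUGHPUT = "flashinfer_throughput"
        CUTLASS_BLOCK_SCALED_GROUPED_GEMM = "cutlass_block_scaled_grouped_gemm"
        TRITON = "triton"

    def pick_fp8_moe_backend(major: int, minor: int, block_quant: bool,
                             use_flashinfer_moe_fp8: bool,
                             prefer_trtllm: bool) -> Fp8MoeBackend:
        # Family check: every 10.x device (SM100, SM103, ...) matches;
        # the minor version is deliberately ignored.
        in_sm100_family = major == 10
        if use_flashinfer_moe_fp8 and in_sm100_family:
            if prefer_trtllm:
                return Fp8MoeBackend.FLASHINFER_TRTLLM
            if block_quant:
                raise ValueError("FlashInfer FP8 MoE throughput backend "
                                 "does not support block quantization.")
            return Fp8MoeBackend.FLASHINFER_THROUGHPUT
        if in_sm100_family and block_quant:
            return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
        return Fp8MoeBackend.TRITON

    # SM103 (10.3) now lands on the same CUTLASS path as SM100 (10.0):
    assert pick_fp8_moe_backend(10, 3, block_quant=True,
                                use_flashinfer_moe_fp8=False,
                                prefer_trtllm=False) \
        is Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM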