[Feature] Add SM103 (Blackwell Ultra) Support to vLLM (#30484)
Signed-off-by: LopezCastroRoberto <robertol.c510@gmail.com> Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com> Co-authored-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
committed by
GitHub
parent
57e9bf1864
commit
4fa7ce46f3
@@ -137,7 +137,7 @@ def get_fp8_moe_backend(
|
||||
if (
|
||||
current_platform.is_cuda()
|
||||
and (
|
||||
current_platform.is_device_capability(100)
|
||||
current_platform.is_device_capability_family(100)
|
||||
or current_platform.is_device_capability(90)
|
||||
)
|
||||
and envs.VLLM_USE_FLASHINFER_MOE_FP8
|
||||
@@ -148,7 +148,7 @@ def get_fp8_moe_backend(
|
||||
logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100")
|
||||
return Fp8MoeBackend.FLASHINFER_TRTLLM
|
||||
else:
|
||||
if block_quant and current_platform.is_device_capability(100):
|
||||
if block_quant and current_platform.is_device_capability_family(100):
|
||||
raise ValueError(
|
||||
"FlashInfer FP8 MoE throughput backend does not "
|
||||
"support block quantization. Please use "
|
||||
@@ -193,7 +193,7 @@ def get_fp8_moe_backend(
|
||||
# CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
|
||||
if (
|
||||
current_platform.is_cuda()
|
||||
and current_platform.is_device_capability(100)
|
||||
and current_platform.is_device_capability_family(100)
|
||||
and block_quant
|
||||
):
|
||||
logger.info_once(
|
||||
|
||||
Reference in New Issue
Block a user