From 66e601ef795c27b43025ec7f4f864cac561c322e Mon Sep 17 00:00:00 2001
From: IriKa
Date: Wed, 28 Jan 2026 00:04:05 +0800
Subject: [PATCH] Support compressed-tensors with nvfp4 or fp8 weights and
 modelopt with nvfp4 weights on Turing (#33076)

Signed-off-by: IriKa Qiu
---
 .../schemes/compressed_tensors_w4a16_nvfp4.py       | 2 +-
 .../schemes/compressed_tensors_w8a16_fp8.py         | 4 ++--
 vllm/model_executor/layers/quantization/modelopt.py | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
index d2701a464..d0a924471 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py
@@ -29,7 +29,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
     @classmethod
     def get_min_capability(cls) -> int:
         # don't restrict as emulations
-        return 80
+        return 75
 
     def create_weights(
         self,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 904a9f5d4..11b34064c 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -34,8 +34,8 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
 
     @classmethod
     def get_min_capability(cls) -> int:
-        # ampere and up
-        return 80
+        # turing and up
+        return 75
 
     # W8A8-Fp8 kernels support only per-tensor and per-channel cases.
     # So if we have a fused module (QKV, MLP) with per tensor scales,
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 3b59f76d3..2046561fb 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -96,6 +96,7 @@ from vllm.model_executor.parameter import (
     PerTensorScaleParameter,
 )
 from vllm.model_executor.utils import replace_parameter
+from vllm.platforms import current_platform
 from vllm.utils.flashinfer import (
     flashinfer_scaled_fp4_mm,
     has_flashinfer,
@@ -1110,7 +1111,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
         self.backend = "none"
 
         if envs.VLLM_NVFP4_GEMM_BACKEND is None:
-            if has_flashinfer():
+            if current_platform.has_device_capability(100) and has_flashinfer():
                 self.backend = "flashinfer-cutlass"
             elif cutlass_fp4_supported():
                 self.backend = "cutlass"
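
Editor's note: a brief sketch of the capability convention this patch relies
on. vLLM's get_min_capability() values encode CUDA compute capability as
major * 10 + minor, so 75 is Turing (SM 7.5), 80 is Ampere (SM 8.0), and 100
is Blackwell (SM 10.0). Lowering the two compressed-tensors schemes from 80
to 75 lets their weight-only (emulated) nvfp4/fp8 paths run on Turing, while
the modelopt change keeps the FlashInfer CUTLASS backend auto-selected only
on capability-100 devices, so lower-capability GPUs fall through to the
cutlass_fp4_supported() check instead. The snippet below is a hypothetical
illustration of that convention, not vLLM code: device_capability_int and
scheme_is_supported are made-up names; only torch.cuda.get_device_capability()
is a real API.

    import torch

    def device_capability_int() -> int:
        # torch reports (major, minor), e.g. (7, 5) on Turing; fold it
        # into the single-integer form used above: 7 * 10 + 5 == 75.
        major, minor = torch.cuda.get_device_capability()
        return major * 10 + minor

    def scheme_is_supported(min_capability: int) -> bool:
        # A scheme whose get_min_capability() returns 75 now admits
        # Turing GPUs; with the old value of 80 it required Ampere.
        return device_capability_int() >= min_capability

    if __name__ == "__main__":
        if torch.cuda.is_available():
            print("compute capability:", device_capability_int())
            print("w4a16 nvfp4 usable:", scheme_is_supported(75))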