Support compressed-tensors with nvfp4 or fp8 weights and modelopt with nvfp4 weights on Turing (#33076)
Signed-off-by: IriKa Qiu <qiujie.jq@gmail.com>
This commit is contained in:
@@ -29,7 +29,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
|
||||
@classmethod
|
||||
def get_min_capability(cls) -> int:
|
||||
# don't restrict as emulations
|
||||
- return 80
|
||||
+ return 75
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
|
||||
@@ -34,8 +34,8 @@ class CompressedTensorsW8A16Fp8(CompressedTensorsScheme):
|
||||
|
||||
@classmethod
|
||||
def get_min_capability(cls) -> int:
|
||||
- # ampere and up
|
||||
- return 80
|
||||
+ # turing and up
|
||||
+ return 75
|
||||
|
||||
# W8A8-Fp8 kernels support only per-tensor and per-channel cases.
|
||||
# So if we have a fused module (QKV, MLP) with per tensor scales,
|
||||
|
||||
@@ -96,6 +96,7 @@ from vllm.model_executor.parameter import (
|
||||
PerTensorScaleParameter,
|
||||
)
|
||||
from vllm.model_executor.utils import replace_parameter
|
||||
+ from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import (
|
||||
flashinfer_scaled_fp4_mm,
|
||||
has_flashinfer,
|
||||
@@ -1110,7 +1111,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase):
|
||||
|
||||
self.backend = "none"
|
||||
if envs.VLLM_NVFP4_GEMM_BACKEND is None:
|
||||
- if has_flashinfer():
|
||||
+ if current_platform.has_device_capability(100) and has_flashinfer():
|
||||
self.backend = "flashinfer-cutlass"
|
||||
elif cutlass_fp4_supported():
|
||||
self.backend = "cutlass"
|
||||
|
||||
Reference in New Issue
Block a user