diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index b73462bfd..30f69f621 100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -89,22 +89,33 @@ def test_models(example_prompts, model_name) -> None: EAGER = [True, False] +SM_100_NVFP4_BACKENDS = [ + "flashinfer-cudnn", + "flashinfer-trtllm", + "flashinfer-cutlass", +] + -@pytest.mark.skipif( - not current_platform.has_device_capability(100), - reason="modelopt_fp4 is not supported on this GPU type.", -) @pytest.mark.parametrize("model", ["nvidia/Llama-3.1-8B-Instruct-NVFP4"]) @pytest.mark.parametrize("eager", EAGER) @pytest.mark.parametrize( "backend", [ + "emulation", "flashinfer-cudnn", "flashinfer-trtllm", # the small seq_len ensures trtllm_8x4_layout backend is used "flashinfer-cutlass", ], ) def test_nvfp4(vllm_runner, model, eager, backend, monkeypatch): + if ( + not current_platform.has_device_capability(100) + and backend in SM_100_NVFP4_BACKENDS + ): + pytest.skip( + f"The backend {backend} is not supported with current_platform.has_device_capability(100) == False" + ) + monkeypatch.setenv("VLLM_NVFP4_GEMM_BACKEND", backend) with vllm_runner(model, enforce_eager=eager) as llm: output = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index f23506b00..badcac005 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -366,9 +366,6 @@ def test_compressed_tensors_kv_cache_fp8_per_attn_head(vllm_runner): assert output -@pytest.mark.skipif( - not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform." -) @pytest.mark.parametrize( "args", [ @@ -398,7 +395,7 @@ def test_compressed_tensors_nvfp4(vllm_runner, args): assert qkv_proj.scheme.group_size == 16 llm.apply_model(check_model) - output = llm.generate_greedy("Hello my name is", max_tokens=4) + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) print(output) assert output diff --git a/vllm/envs.py b/vllm/envs.py index c2f8ca8c5..d2af9e64d 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1464,6 +1464,10 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend # - "flashinfer-cutlass": use flashinfer cutlass GEMM backend # - "marlin": use marlin GEMM backend (for GPUs without native FP4 support) + # - "emulation": + # use BF16/FP16 GEMM, dequantizing weights and running QDQ on activations. + # This is only meant for research purposes to run on devices where NVFP4 + # GEMM kernels are not available. # - : automatically pick an available backend "VLLM_NVFP4_GEMM_BACKEND": env_with_choices( "VLLM_NVFP4_GEMM_BACKEND", @@ -1474,6 +1478,7 @@ environment_variables: dict[str, Callable[[], Any]] = { "flashinfer-cutlass", "cutlass", "marlin", + "emulation", ], ), # Controls garbage collection during CUDA graph capture. diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py index a3b53626b..fff738726 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py @@ -5,10 +5,12 @@ from collections.abc import Callable import torch from torch.nn.parameter import Parameter +from vllm.logger import init_logger from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme, ) from vllm.model_executor.layers.quantization.utils.nvfp4_utils import ( + NvFp4LinearBackend, apply_nvfp4_linear, convert_to_nvfp4_linear_kernel_format, select_nvfp4_linear_backend, @@ -19,6 +21,9 @@ from vllm.model_executor.parameter import ( PerTensorScaleParameter, ) +logger = init_logger(__name__) + + __all__ = ["CompressedTensorsW4A4Fp4"] @@ -27,6 +32,10 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): self.backend = select_nvfp4_linear_backend() self.group_size = 16 + self.swizzle = None + if self.backend == NvFp4LinearBackend.EMULATION: + self.swizzle = False + @classmethod def get_min_capability(cls) -> int: return 75 @@ -89,6 +98,19 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): # Rename CT checkpoint names to standardized names layer.weight = layer.weight_packed del layer.weight_packed + + if ( + torch.unique(layer.input_global_scale).numel() != 1 + or torch.unique(layer.weight_global_scale).numel() != 1 + ): + logger.warning_once( + "In NVFP4 linear, the global scale for input or weight are different" + " for parallel layers (e.g. q_proj, k_proj, v_proj). This " + " will likely result in reduced accuracy. Please verify the model" + " accuracy. Consider using a checkpoint with a shared global NVFP4" + " scale for fused layers." + ) + # Process global scales (CT stores as divisors, i.e. 1/scale) input_global_scale_inv = layer.input_global_scale.max().to(torch.float32) layer.input_global_scale = Parameter( @@ -121,4 +143,5 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme): layer=layer, x=x, bias=bias, + swizzle=self.swizzle, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7871b774e..53b15950d 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -71,6 +71,7 @@ from vllm.model_executor.layers.quantization.utils.mxfp8_utils import ( mxfp8_e4m3_quantize, ) from vllm.model_executor.layers.quantization.utils.nvfp4_utils import ( + NvFp4LinearBackend, apply_nvfp4_linear, convert_to_nvfp4_linear_kernel_format, select_nvfp4_linear_backend, @@ -1074,6 +1075,10 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): self.marlin_input_dtype = None self.backend = select_nvfp4_linear_backend() + self.swizzle = None + if self.backend == NvFp4LinearBackend.EMULATION: + self.swizzle = False + def create_weights( self, layer: torch.nn.Module, @@ -1149,10 +1154,23 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): layer.register_parameter("weight_scale", weight_scale) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + if ( + torch.unique(layer.input_scale).numel() != 1 + or torch.unique(layer.weight_scale_2).numel() != 1 + ): + logger.warning_once( + "In NVFP4 linear, the global scale for input or weight are different" + " for parallel layers (e.g. q_proj, k_proj, v_proj). This " + " will likely results in reduce accuracy. Please verify the model" + " accuracy. Consider using a checkpoint with a shared global NVFP4" + " scale for parallel layers." + ) + # Rename ModelOpt checkpoint names to standardized names input_global_scale = layer.input_scale.max().to(torch.float32) layer.input_global_scale = Parameter(input_global_scale, requires_grad=False) del layer.input_scale + weight_global_scale = layer.weight_scale_2.max().to(torch.float32) layer.weight_global_scale = Parameter(weight_global_scale, requires_grad=False) del layer.weight_scale_2 @@ -1179,6 +1197,7 @@ class ModelOptNvFp4LinearMethod(LinearMethodBase): layer=layer, x=x, bias=bias, + swizzle=self.swizzle, ) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index 4fd484ede..19473fa32 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -24,7 +24,7 @@ logger = init_logger(__name__) def is_fp4_marlin_supported(): - return current_platform.has_device_capability(75) + return current_platform.is_cuda() and current_platform.has_device_capability(75) def _nvfp4_compute_scale_factor( diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py index 62b480210..9a0c52b62 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from types import SimpleNamespace + import torch from vllm.scalar_type import scalar_types @@ -11,9 +13,10 @@ __all__ = [ ] FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT4_E2M1_MAX_RECIPROCAL = 1 / FLOAT4_E2M1_MAX -kE2M1ToFloat = torch.tensor( - [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32 +kE2M1ToFloat_handle = SimpleNamespace( + val=torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32) ) @@ -29,8 +32,9 @@ def break_fp4_bytes(a, dtype): # Vectorized sign and magnitude extraction signs = (combined & 0x08).to(torch.bool) # Sign bits abs_vals = (combined & 0x07).to(torch.long) + + kE2M1 = kE2M1ToFloat_handle.val # Device-aware lookup and sign application - kE2M1 = kE2M1ToFloat.to(device=a.device) values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0) # Reshape to final form return values.reshape(m, n * 2).to(dtype=dtype) @@ -47,7 +51,12 @@ def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size): def dequantize_to_dtype( - tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16 + tensor_fp4: torch.Tensor, + tensor_sf: torch.Tensor, + global_scale: torch.Tensor | float, + dtype: torch.dtype, + block_size: int = 16, + swizzle: bool | None = True, ): """Dequantize the fp4 tensor back to high precision.""" # Two fp4 values are packed into one uint8. @@ -57,8 +66,10 @@ def dequantize_to_dtype( tensor_f32 = break_fp4_bytes(tensor_fp4, torch.float32) tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size) tensor_sf = tensor_sf.view(torch.float8_e4m3fn) - tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size) - tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale + + if swizzle: + tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size) + tensor_sf_dtype = tensor_sf.to(torch.float32) * global_scale # scale the tensor out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k) @@ -67,7 +78,8 @@ def dequantize_to_dtype( def get_reciprocal(x): if isinstance(x, torch.Tensor): - return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x) + # torch.where yields operation not permitted when stream is capturing. + return 1.0 / (x + (x == 0) * 1e8) elif isinstance(x, (float, int)): return 0.0 if x == 0 else 1.0 / x else: @@ -94,7 +106,7 @@ def ref_nvfp4_quant(x, global_scale, block_size): m, n = x.shape x = torch.reshape(x, (m, n // block_size, block_size)) vec_max = torch.max(torch.abs(x), dim=-1, keepdim=True)[0].to(torch.float32) - scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX)) + scale = global_scale * (vec_max * FLOAT4_E2M1_MAX_RECIPROCAL) scale = torch.clamp(scale, max=448, min=-448) scale = scale.to(torch.float8_e4m3fn).to(torch.float32) output_scale = get_reciprocal(scale * get_reciprocal(global_scale)) @@ -111,6 +123,7 @@ def run_nvfp4_emulations( weight: torch.Tensor, weight_scale_swizzled: torch.Tensor, weight_global_scale: torch.Tensor, + swizzle: bool | None = True, ): group_size = 16 x_m, x_k = x.shape @@ -132,8 +145,8 @@ def run_nvfp4_emulations( weight_scale_swizzled.data, weight_global_scale, output_dtype, - x.device, group_size, + swizzle=swizzle, ) # matmul diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py index f21f2ef23..4796cc6c9 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py @@ -17,31 +17,99 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( prepare_fp4_layer_for_marlin, ) from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import ( + kE2M1ToFloat_handle, run_nvfp4_emulations, ) from vllm.platforms import current_platform from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm, has_flashinfer +from vllm.utils.import_utils import has_fbgemm_gpu from vllm.utils.math_utils import round_up logger = init_logger(__name__) +# NOTE: This is ordered by preferred backend. +# Example: if both are available, FLASHINFER_CUTLASS is preferred to VLLM_CUTLASS. class NvFp4LinearBackend(Enum): - VLLM_CUTLASS = "cutlass" FLASHINFER_CUTLASS = "flashinfer-cutlass" + VLLM_CUTLASS = "cutlass" + MARLIN = "marlin" FLASHINFER_TRTLLM = "flashinfer-trtllm" FLASHINFER_CUDNN = "flashinfer-cudnn" FBGEMM = "fbgemm" - MARLIN = "marlin" EMULATION = "emulation" +NVFP4_LINEAR_BACKENDS = list(NvFp4LinearBackend) + + +def is_backend_supported(backend: NvFp4LinearBackend) -> tuple[bool, str | None]: + reason = None + supported = True + + if backend == NvFp4LinearBackend.FLASHINFER_CUTLASS: + # cutlass_fp4_supported() checks that the vLLM NVFP4 kernels (both + # quantization and GEMM) were compiled for the current SM version. + # FlashInfer backends still rely on the vLLM quantization kernels, + # so we gate them on the same check. + supported = ( + cutlass_fp4_supported() + and current_platform.has_device_capability(100) + and has_flashinfer() + ) + + if not supported: + reason = "FlashInfer is required, >=sm_100 is required" + elif backend == NvFp4LinearBackend.VLLM_CUTLASS: + supported = cutlass_fp4_supported() + if not supported: + reason = "Cutlass is required" + elif backend == NvFp4LinearBackend.MARLIN: + supported = is_fp4_marlin_supported() + if not supported: + reason = "Marlin is required" + elif backend in [ + NvFp4LinearBackend.FLASHINFER_TRTLLM, + NvFp4LinearBackend.FLASHINFER_CUDNN, + ]: + supported = has_flashinfer() + if not supported: + reason = "FlashInfer is required" + elif backend == NvFp4LinearBackend.FBGEMM: + supported = has_fbgemm_gpu() + if not supported: + reason = "fbgemm_gpu is required" + elif backend == NvFp4LinearBackend.EMULATION: + # e.g. AMD Instinct does not support native NVFP4. + unsupported_reasons = {} + for other_backend in NVFP4_LINEAR_BACKENDS: + if other_backend == NvFp4LinearBackend.EMULATION: + continue + other_supported, other_reason = is_backend_supported(other_backend) + if not other_supported: + unsupported_reasons[other_backend] = other_reason + + if unsupported_reasons: + unsupported_reasons_str = "\n - ".join( + [f"{b.value}: {r}" for b, r in unsupported_reasons.items()] + ) + logger.warning_once( + f"NVFP4 linear falling back to the slow and unoptimized " + f"backend=NvFp4LinearBackend.EMULATION as no optimized backend is " + f"available (unavailable reasons:\n - {unsupported_reasons_str}\n). " + "In case you expect one of these backend to be used, " + "please verify your environment." + ) + + return supported, reason + + def select_nvfp4_linear_backend() -> NvFp4LinearBackend: """ Select the best available NVFP4 GEMM backend based on environment configuration and platform capabilities. """ - backend: NvFp4LinearBackend | None = None + selected_backend: NvFp4LinearBackend | None = None if envs.VLLM_USE_FBGEMM: try: @@ -51,51 +119,36 @@ def select_nvfp4_linear_backend() -> NvFp4LinearBackend: "Backend fbgemm requires fbgemm.f4f4bf16 operator, " "Please install with: pip install fbgemm-gpu-genai" ) from exc - backend = NvFp4LinearBackend.FBGEMM + selected_backend = NvFp4LinearBackend.FBGEMM elif envs.VLLM_USE_NVFP4_CT_EMULATIONS: - backend = NvFp4LinearBackend.EMULATION + selected_backend = NvFp4LinearBackend.EMULATION elif envs.VLLM_NVFP4_GEMM_BACKEND is None: - # Auto-select best available backend. - # cutlass_fp4_supported() checks that the vLLM NVFP4 kernels (both - # quantization and GEMM) were compiled for the current SM version. - # FlashInfer backends still rely on the vLLM quantization kernels, - # so we gate them on the same check. - if ( - cutlass_fp4_supported() - and current_platform.has_device_capability(100) - and has_flashinfer() - ): - backend = NvFp4LinearBackend.FLASHINFER_CUTLASS - elif cutlass_fp4_supported(): - backend = NvFp4LinearBackend.VLLM_CUTLASS - elif is_fp4_marlin_supported(): - backend = NvFp4LinearBackend.MARLIN + for backend in NVFP4_LINEAR_BACKENDS: + supported, reason = is_backend_supported(backend) + if supported: + selected_backend = backend + break else: - backend = NvFp4LinearBackend(envs.VLLM_NVFP4_GEMM_BACKEND) + selected_backend = NvFp4LinearBackend(envs.VLLM_NVFP4_GEMM_BACKEND) - # Validate that the backend is supported - if backend in ( - NvFp4LinearBackend.FLASHINFER_CUTLASS, - NvFp4LinearBackend.FLASHINFER_TRTLLM, - NvFp4LinearBackend.FLASHINFER_CUDNN, - ): - assert has_flashinfer(), f"FlashInfer is required for {backend}" - assert cutlass_fp4_supported(), ( - f"{backend} requires vLLM NVFP4 quantization kernels compiled " - f"for the current GPU (SM {current_platform.get_device_capability()})" - ) - elif backend == NvFp4LinearBackend.VLLM_CUTLASS: - assert cutlass_fp4_supported(), f"Cutlass is required for {backend}" - elif backend == NvFp4LinearBackend.MARLIN: - assert is_fp4_marlin_supported(), f"Marlin is required for {backend}" - elif backend is None: + if selected_backend is None: raise ValueError( f"No NVFP4 GEMM backend selected, " - f"available backends: {list(NvFp4LinearBackend)}" + f"available backends: {NVFP4_LINEAR_BACKENDS}" ) - logger.info_once(f"Using {backend} for NVFP4 GEMM") - return backend + supported, reason = is_backend_supported(selected_backend) + + if not supported: + raise ValueError( + f"The selected backend={selected_backend} is not supported in current " + f"environment. Reason: {reason}. Current environment: " + f"{envs.VLLM_USE_FBGEMM=}, {envs.VLLM_USE_NVFP4_CT_EMULATIONS=}, " + f"{envs.VLLM_NVFP4_GEMM_BACKEND}." + ) + + logger.info_once(f"Using {selected_backend} for NVFP4 GEMM") + return selected_backend def prepare_weights_for_nvfp4_flashinfer_trtllm( @@ -183,6 +236,10 @@ def convert_to_nvfp4_linear_kernel_format( layer.weight = torch.nn.Parameter(weight, requires_grad=False) layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False) layer.weights_padding_cols = weights_padding_cols + elif backend == NvFp4LinearBackend.EMULATION: + # We can not call `.to(device)` during cuda graph capture - do it here instead. + # (operation not permitted when stream is capturing) + kE2M1ToFloat_handle.val = kE2M1ToFloat_handle.val.to(layer.weight.device) def apply_nvfp4_linear( @@ -190,6 +247,7 @@ def apply_nvfp4_linear( layer: torch.nn.Module, x: torch.Tensor, bias: torch.Tensor | None = None, + swizzle: bool | None = None, ) -> torch.Tensor: """ Apply NVFP4 linear transformation using the specified backend. @@ -220,6 +278,7 @@ def apply_nvfp4_linear( weight=weight, weight_scale_swizzled=weight_scale, weight_global_scale=weight_global_scale, + swizzle=swizzle, ) if bias is not None: out = out + bias diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 7b7135366..2ba4ef3fe 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -409,6 +409,7 @@ class RocmPlatform(Platform): "mxfp4", "torchao", "bitsandbytes", + "modelopt_fp4", ] @classmethod diff --git a/vllm/utils/import_utils.py b/vllm/utils/import_utils.py index e7f966b27..b72108b4b 100644 --- a/vllm/utils/import_utils.py +++ b/vllm/utils/import_utils.py @@ -461,3 +461,8 @@ def has_aiter() -> bool: def has_mori() -> bool: """Whether the optional `mori` package is available.""" return _has_module("mori") + + +def has_fbgemm_gpu() -> bool: + """Whether the optional `fbgemm_gpu` package is available.""" + return _has_module("fbgemm_gpu")