diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ff5ce20d1..5eadc6bd8 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -731,7 +731,7 @@ steps: - label: Quantization Test # 70min timeout_in_minutes: 90 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking source_file_dependencies: diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 535f02820..7fb38708c 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -644,6 +644,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): assert output +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform." +) @pytest.mark.parametrize( "args", [ @@ -762,7 +765,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner): input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op assert isinstance(input_quant_op, QuantFP8) - assert input_quant_op._forward_method == input_quant_op.forward_cuda + assert input_quant_op._forward_method in ( + input_quant_op.forward_cuda, + input_quant_op.forward_hip, + ) llm.apply_model(check_model) diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 797b565b9..40ca587bc 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -10,6 +10,7 @@ from dataclasses import dataclass import pytest from vllm.config import ModelConfig +from vllm.platforms import current_platform @dataclass @@ -23,20 +24,44 @@ MODEL_ARG_EXPTYPES = [ # AUTOGPTQ # compat: autogptq <=0.7.1 is_marlin_format: bool # Model Serialized in Exllama Format. - ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"), - ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"), + ( + "TheBloke/Llama-2-7B-Chat-GPTQ", + None, + "gptq_marlin" if current_platform.is_cuda() else "gptq", + ), + ( + "TheBloke/Llama-2-7B-Chat-GPTQ", + "marlin", + "gptq_marlin" if current_platform.is_cuda() else "ERROR", + ), ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"), ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"), # compat: autogptq >=0.8.0 use checkpoint_format: str # Model Serialized in Exllama Format. 
- ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"), - ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"), + ( + "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", + None, + "gptq_marlin" if current_platform.is_cuda() else "gptq", + ), + ( + "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", + "marlin", + "gptq_marlin" if current_platform.is_cuda() else "ERROR", + ), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"), ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"), # AUTOAWQ - ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"), + ( + "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", + None, + "awq_marlin" if current_platform.is_cuda() else "awq", + ), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"), - ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"), + ( + "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", + "marlin", + "awq_marlin" if current_platform.is_cuda() else "ERROR", + ), ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"), ] diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index 1591ce1c4..3b58614e5 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -66,7 +66,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch): monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto") # Test wNa16 compare_two_settings( - "nm-testing/tinyllama-oneshot-w4a16-channel-v2", + "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16", ["--enforce_eager"], ["--enforce_eager", "--cpu-offload-gb", "1"], max_wait_seconds=480, diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7b7b6c1d0..e8abe0d41 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -36,7 +36,9 @@ MODELS = [ reason="FP8 is not supported on this GPU type.", ) @pytest.mark.parametrize("model_id", MODELS) -@pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "force_marlin", [False] if current_platform.is_rocm() else [False, True] +) @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False] ) @@ -125,7 +127,9 @@ def test_kv_cache_model_load_and_run( reason="FP8 is not supported on this GPU type.", ) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -@pytest.mark.parametrize("force_marlin", [False, True]) +@pytest.mark.parametrize( + "force_marlin", [False] if current_platform.is_rocm() else [False, True] +) @pytest.mark.parametrize( "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False] ) @@ -197,10 +201,10 @@ def test_scaled_fp8_quant(dtype) -> None: def quantize_ref(tensor, inv_scale): # The reference implementation that fully aligns to # the kernel being tested. 
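
Note: the force_marlin changes above follow the platform-gated parametrization pattern used throughout this patch: combinations that a platform cannot run are pruned at collection time instead of failing or being skipped at run time. A minimal sketch of the pattern, assuming only pytest and vLLM's current_platform helper (the test body is a placeholder, not part of this patch):

import pytest

from vllm.platforms import current_platform

# Marlin kernels are CUDA-only, so ROCm runs exercise just the non-Marlin path.
FORCE_MARLIN = [False] if current_platform.is_rocm() else [False, True]


@pytest.mark.skipif(
    not (current_platform.is_cuda() or current_platform.is_rocm()),
    reason="Example targets CUDA/ROCm only.",
)
@pytest.mark.parametrize("force_marlin", FORCE_MARLIN)
def test_example(force_marlin: bool) -> None:
    # A real test would load an FP8 checkpoint here; omitted in this sketch.
    assert force_marlin in (False, True)
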
- finfo = torch.finfo(torch.float8_e4m3fn) + finfo = torch.finfo(current_platform.fp8_dtype()) scale = inv_scale.reciprocal() qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max) - qweight = qweight.to(torch.float8_e4m3fn) + qweight = qweight.to(current_platform.fp8_dtype()) return qweight def per_tensor_dequantize(tensor, inv_scale, dtype): @@ -267,6 +271,10 @@ def test_scaled_fp8_quant(dtype) -> None: ) +@pytest.mark.skipif( + current_platform.is_fp8_fnuz(), + reason="FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms", +) @pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod]) # FP8 weight reloading does not support online quantization @pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True]) # skip False diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 37fe2dd32..f35e49094 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear from vllm.model_executor.layers.quantization.utils.gptq_utils import ( get_dynamic_override, ) +from vllm.platforms import current_platform PROMPT = "On the surface of Mars, we found" @@ -21,7 +22,10 @@ PROMPT = "On the surface of Mars, we found" # The second layer is quantized using bits=8, group_size=32 # All other layers (layer index >= 2) are not quantized MODEL_QUANT = [ - ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", True), + ( + "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + current_platform.is_cuda(), + ), ( "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", False, diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 61efd2ce6..6858062b9 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`. """ import pytest -import torch from tests.quantization.utils import is_quant_method_supported from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod from vllm.platforms import current_platform -UNSUPPORTED_STR = ( - "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only " - "support output dtype of bfloat16. torch.float16 is specified." 
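
Note: quantize_ref above now keys the FP8 range off current_platform.fp8_dtype(), which resolves to torch.float8_e4m3fnuz on platforms where current_platform.is_fp8_fnuz() is true and to torch.float8_e4m3fn otherwise. A standalone sketch of the reference round-trip the test exercises, with the dtype hard-coded here as an assumption in place of the platform lookup:

import torch

fp8_dtype = torch.float8_e4m3fn  # assumption; current_platform.fp8_dtype() in vLLM


def quantize_ref(tensor: torch.Tensor, inv_scale: torch.Tensor) -> torch.Tensor:
    # Scale into the FP8 range, clamp to the representable extremes, then cast.
    finfo = torch.finfo(fp8_dtype)
    scale = inv_scale.reciprocal()
    q = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
    return q.to(fp8_dtype)


def dequantize_ref(
    q: torch.Tensor, inv_scale: torch.Tensor, dtype: torch.dtype
) -> torch.Tensor:
    # Undo the scale in float32 before casting back to the requested dtype.
    return (q.to(torch.float32) * inv_scale).to(dtype)


x = torch.randn(16, 16)
inv_scale = x.abs().max() / torch.finfo(fp8_dtype).max
x_rt = dequantize_ref(quantize_ref(x, inv_scale), inv_scale, torch.float32)
assert (x - x_rt).abs().max() < 0.3  # FP8 is lossy; only a coarse round-trip check
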
-) - @pytest.fixture(scope="function", autouse=True) def enable_pickle(monkeypatch): @@ -30,24 +24,17 @@ def enable_pickle(monkeypatch): reason="PTPC FP8 is not supported on this GPU type.", ) @pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.") -@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"]) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: - try: - llm = vllm_runner( - "facebook/opt-125m", - dtype=dtype, - quantization="ptpc_fp8", - enforce_eager=True, - kv_cache_dtype=kv_cache_dtype, - ) - except AssertionError as e: - if str(e) == UNSUPPORTED_STR: - # If the error message matches, the test passes - return - else: - # If the error message does not match, re-raise the exception - raise + llm = vllm_runner( + "facebook/opt-125m", + dtype=dtype, + quantization="ptpc_fp8", + enforce_eager=True, + kv_cache_dtype=kv_cache_dtype, + allow_deprecated_quantization=True, + ) with llm: @@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: assert attn._k_scale == 1.0 assert attn._v_scale == 1.0 + # For GPUs with hardware support, we keep weights in fp8 if current_platform.has_device_capability(94): - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fnuz + assert fc1.weight.dtype == current_platform.fp8_dtype() llm.apply_model(check_model) diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 20a425b72..cf3da37b0 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool: if not (current_platform.is_cuda() or current_platform.is_rocm()): return False + try: + current_platform.verify_quantization(quant_method) + except ValueError: + return False + capability = current_platform.get_device_capability() assert capability is not None diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 1c1587ebe..f0f8868c1 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -5,6 +5,7 @@ from typing import Literal, get_args from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -98,6 +99,9 @@ def register_quantization_config(quantization: str): ) else: QUANTIZATION_METHODS.append(quantization) + # Automatically assume the custom quantization config is supported + if sq := current_platform.supported_quantization: + sq.append(quantization) if not issubclass(quant_config_cls, QuantizationConfig): raise ValueError( diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index 760f1f7f7..e4286f91b 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -9,6 +9,9 @@ from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm triton_scaled_mm, ) from vllm.model_executor.layers.quantization.utils import replace_parameter +from 
vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, +) from vllm.platforms import current_platform from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig @@ -37,6 +40,20 @@ class TritonScaledMMLinearKernel(ScaledMMLinearKernel): torch.nn.Parameter(weight.t().data, requires_grad=False), ) + # WEIGHT SCALE + # Triton kernel supports only per-tensor and per-channel. + # If we have a fused module (QKV, MLP) with per tensor scales (thus N + # scales being passed to the kernel), convert to the per-channel case. + is_fused_module = len(layer.logical_widths) > 1 + weight_scale = getattr(layer, self.w_s_name) + if is_fused_module and not self.config.is_channelwise: + weight_scale = convert_to_channelwise(weight_scale, layer.logical_widths) + replace_parameter( + layer, + self.w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False), + ) + # INPUT SCALE if self.config.is_static_input_scheme: input_scale = getattr(layer, self.i_s_name) diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index ed8a2c7fa..80efc29de 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -103,21 +103,25 @@ class PTPCFp8LinearMethod(Fp8LinearMethod): ) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) - - assert layer.weight.data.dtype == torch.bfloat16, ( - f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501 - ) - # Quantize the weights. - qweight, weight_scale = ops.scaled_fp8_quant( - layer.weight, scale=None, use_per_token_if_dynamic=True + assert layer.weight.data.dtype not in (torch.float16, torch.float32), ( + "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support " + f"output dtype of bfloat16. {layer.weight.data.dtype} is specified." ) - # Update the layer with the new values. - layer.weight = Parameter( - qweight.t(), requires_grad=False - ) # Pretranspose the weight - layer.weight_scale = Parameter(weight_scale, requires_grad=False) + if layer.weight.data.dtype == torch.bfloat16: + # Quantize the weights. + qweight, weight_scale = ops.scaled_fp8_quant( + layer.weight, scale=None, use_per_token_if_dynamic=True + ) + + # Update the layer with the new values. + layer.weight = Parameter( + qweight.t(), requires_grad=False + ) # Pretranspose the weight + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + else: + assert layer.weight.data.dtype == current_platform.fp8_dtype() + assert getattr(layer, "weight_scale", None) is not None layer.input_scale = None def apply( diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 3a55dd36d..d5df0013a 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -170,7 +170,9 @@ class RocmPlatform(Platform): supported_quantization: list[str] = [ "awq", + "awq_marlin", # will be overwritten with awq "gptq", + "gptq_marlin", # will be overwritten with gptq "fp8", "compressed-tensors", "fbgemm_fp8",
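
Note: with awq_marlin/gptq_marlin accepted on ROCm (and custom configs auto-appended in register_quantization_config), the tests' is_quant_method_supported helper can lean on Platform.verify_quantization, which raises ValueError for methods not in the platform's supported_quantization list. A small sketch of that check, mirroring the tests/quantization/utils.py change above:

from vllm.platforms import current_platform


def quant_method_allowed(quant_method: str) -> bool:
    # verify_quantization raises ValueError when the method is not listed in the
    # platform's supported_quantization; translate that into a boolean.
    try:
        current_platform.verify_quantization(quant_method)
    except ValueError:
        return False
    return True


# On ROCm, "gptq_marlin"/"awq_marlin" now pass this check even though the config
# layer later rewrites them to the plain "gptq"/"awq" kernels.
if __name__ == "__main__":
    for method in ("gptq", "gptq_marlin", "awq_marlin"):
        print(method, quant_method_allowed(method))
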