[Hardware][AMD][CI][Bugfix] Fix AMD Quantization test group (#31713)
Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com>
@@ -644,6 +644,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
         assert output
 
 
+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 @pytest.mark.parametrize(
     "args",
     [
@@ -762,7 +765,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
 
         input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
         assert isinstance(input_quant_op, QuantFP8)
-        assert input_quant_op._forward_method == input_quant_op.forward_cuda
+        assert input_quant_op._forward_method in (
+            input_quant_op.forward_cuda,
+            input_quant_op.forward_hip,
+        )
 
     llm.apply_model(check_model)
@@ -10,6 +10,7 @@ from dataclasses import dataclass
 import pytest
 
 from vllm.config import ModelConfig
+from vllm.platforms import current_platform
 
 
 @dataclass
@@ -23,20 +24,44 @@ MODEL_ARG_EXPTYPES = [
     # AUTOGPTQ
     # compat: autogptq <=0.7.1 is_marlin_format: bool
     # Model Serialized in Exllama Format.
-    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
-    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
+    (
+        "TheBloke/Llama-2-7B-Chat-GPTQ",
+        None,
+        "gptq_marlin" if current_platform.is_cuda() else "gptq",
+    ),
+    (
+        "TheBloke/Llama-2-7B-Chat-GPTQ",
+        "marlin",
+        "gptq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
     ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
     ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
     # compat: autogptq >=0.8.0 use checkpoint_format: str
     # Model Serialized in Exllama Format.
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
+    (
+        "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+        None,
+        "gptq_marlin" if current_platform.is_cuda() else "gptq",
+    ),
+    (
+        "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+        "marlin",
+        "gptq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
     ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
     ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
     # AUTOAWQ
-    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
+    (
+        "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ",
+        None,
+        "awq_marlin" if current_platform.is_cuda() else "awq",
+    ),
     ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
-    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),
+    (
+        "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ",
+        "marlin",
+        "awq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
     ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
 ]
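Each row above is a (checkpoint, explicit quantization argument, expected resolved method) triple, with "ERROR" marking combinations that must be rejected; the Marlin-fused methods are only expected on CUDA. The consuming test body is not part of this excerpt, so the snippet below is just a toy restatement of the table's fallback rules (toy_expected is illustrative only, not a vLLM API):

# Toy summary of what the (model, quantization_arg, expected) rows encode:
# Marlin-fused methods only resolve on CUDA, an implicit request falls back
# to the plain kernel elsewhere, and mismatched or unsupported requests are
# expected to error out.
def toy_expected(checkpoint_format: str, quantization_arg: str | None, is_cuda: bool) -> str:
    fused = {"gptq": "gptq_marlin", "awq": "awq_marlin"}
    if quantization_arg in (None, "marlin"):
        if is_cuda:
            return fused[checkpoint_format]
        # Off CUDA: fall back to the plain kernel, but an explicit
        # "marlin" request cannot be honoured.
        return checkpoint_format if quantization_arg is None else "ERROR"
    return quantization_arg if quantization_arg == checkpoint_format else "ERROR"

assert toy_expected("gptq", None, is_cuda=False) == "gptq"
assert toy_expected("gptq", "marlin", is_cuda=False) == "ERROR"
assert toy_expected("awq", "marlin", is_cuda=True) == "awq_marlin"
assert toy_expected("awq", "gptq", is_cuda=True) == "ERROR"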
@@ -66,7 +66,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
     monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
     # Test wNa16
     compare_two_settings(
-        "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+        "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16",
         ["--enforce_eager"],
         ["--enforce_eager", "--cpu-offload-gb", "1"],
         max_wait_seconds=480,
@@ -36,7 +36,9 @@ MODELS = [
     reason="FP8 is not supported on this GPU type.",
 )
 @pytest.mark.parametrize("model_id", MODELS)
-@pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "force_marlin", [False] if current_platform.is_rocm() else [False, True]
+)
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
 )
@@ -125,7 +127,9 @@ def test_kv_cache_model_load_and_run(
     reason="FP8 is not supported on this GPU type.",
 )
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-@pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "force_marlin", [False] if current_platform.is_rocm() else [False, True]
+)
 @pytest.mark.parametrize(
     "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
 )
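The two hunks above replace a fixed [False, True] grid with one computed at collection time, so ROCm runs simply never collect the force_marlin=True case, while only ROCm runs vary use_rocm_aiter. A minimal, self-contained sketch of the same pattern, with _is_rocm() hard-coded as a stand-in for current_platform.is_rocm() so it runs without vLLM installed:

import pytest

def _is_rocm() -> bool:
    # Stand-in for current_platform.is_rocm(); pretend we are on a CUDA box.
    return False

@pytest.mark.parametrize(
    "force_marlin", [False] if _is_rocm() else [False, True]
)
@pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if _is_rocm() else [False]
)
def test_param_grid_sketch(force_marlin: bool, use_rocm_aiter: bool) -> None:
    # With _is_rocm() == False this collects force_marlin in {False, True}
    # and use_rocm_aiter == False; on ROCm the grid would instead vary
    # use_rocm_aiter and pin force_marlin to False.
    assert isinstance(force_marlin, bool)
    assert isinstance(use_rocm_aiter, bool)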
@@ -197,10 +201,10 @@ def test_scaled_fp8_quant(dtype) -> None:
     def quantize_ref(tensor, inv_scale):
         # The reference implementation that fully aligns to
         # the kernel being tested.
-        finfo = torch.finfo(torch.float8_e4m3fn)
+        finfo = torch.finfo(current_platform.fp8_dtype())
         scale = inv_scale.reciprocal()
         qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
-        qweight = qweight.to(torch.float8_e4m3fn)
+        qweight = qweight.to(current_platform.fp8_dtype())
         return qweight
 
     def per_tensor_dequantize(tensor, inv_scale, dtype):
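The quantize_ref change swaps the hard-coded torch.float8_e4m3fn for current_platform.fp8_dtype(), so the clamping range tracks the platform's FP8 flavour. A minimal sketch of that idea, assuming the platform helper resolves to float8_e4m3fnuz on fnuz-only ROCm GPUs and to OCP float8_e4m3fn elsewhere; the helper below is a local stand-in, not vLLM's implementation:

import torch

def _fp8_dtype(is_fnuz_platform: bool) -> torch.dtype:
    # Stand-in for current_platform.fp8_dtype(): fnuz-only ROCm GPUs use the
    # e4m3fnuz encoding, everything else uses OCP e4m3fn.
    return torch.float8_e4m3fnuz if is_fnuz_platform else torch.float8_e4m3fn

def quantize_ref_sketch(tensor: torch.Tensor, inv_scale: torch.Tensor,
                        is_fnuz_platform: bool = False) -> torch.Tensor:
    fp8 = _fp8_dtype(is_fnuz_platform)
    finfo = torch.finfo(fp8)  # max/min differ between e4m3fn and e4m3fnuz
    scale = inv_scale.reciprocal()
    qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
    return qweight.to(fp8)

x = torch.randn(4, 8)
print(quantize_ref_sketch(x, torch.tensor(1.0)).dtype)        # torch.float8_e4m3fn
print(quantize_ref_sketch(x, torch.tensor(1.0), True).dtype)  # torch.float8_e4m3fnuz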
@@ -267,6 +271,10 @@ def test_scaled_fp8_quant(dtype) -> None:
     )
 
 
+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms",
+)
 @pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod])
 # FP8 weight reloading does not support online quantization
 @pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True])  # skip False
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear
 from vllm.model_executor.layers.quantization.utils.gptq_utils import (
     get_dynamic_override,
 )
+from vllm.platforms import current_platform
 
 PROMPT = "On the surface of Mars, we found"
@@ -21,7 +22,10 @@ PROMPT = "On the surface of Mars, we found"
 # The second layer is quantized using bits=8, group_size=32
 # All other layers (layer index >= 2) are not quantized
 MODEL_QUANT = [
-    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", True),
+    (
+        "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+        current_platform.is_cuda(),
+    ),
     (
         "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
         False,
@@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
 """
 
 import pytest
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
 from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
 from vllm.platforms import current_platform
 
-UNSUPPORTED_STR = (
-    "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
-    "support output dtype of bfloat16. torch.float16 is specified."
-)
-
-
 @pytest.fixture(scope="function", autouse=True)
 def enable_pickle(monkeypatch):
@@ -30,24 +24,17 @@ def enable_pickle(monkeypatch):
     reason="PTPC FP8 is not supported on this GPU type.",
 )
 @pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
-@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
-    try:
-        llm = vllm_runner(
-            "facebook/opt-125m",
-            dtype=dtype,
-            quantization="ptpc_fp8",
-            enforce_eager=True,
-            kv_cache_dtype=kv_cache_dtype,
-        )
-    except AssertionError as e:
-        if str(e) == UNSUPPORTED_STR:
-            # If the error message matches, the test passes
-            return
-        else:
-            # If the error message does not match, re-raise the exception
-            raise
+    llm = vllm_runner(
+        "facebook/opt-125m",
+        dtype=dtype,
+        quantization="ptpc_fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
+        allow_deprecated_quantization=True,
+    )
 
     with llm:
@@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
             assert attn._k_scale == 1.0
             assert attn._v_scale == 1.0
 
-        # For GPUs with hardware support, we keep weights in fp8
-        if current_platform.has_device_capability(94):
-            assert fc1.weight.dtype == torch.float8_e4m3fnuz
+        # For GPUs with hardware support, we keep weights in fp8
+        assert fc1.weight.dtype == current_platform.fp8_dtype()
 
     llm.apply_model(check_model)
@@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool:
     if not (current_platform.is_cuda() or current_platform.is_rocm()):
         return False
 
+    try:
+        current_platform.verify_quantization(quant_method)
+    except ValueError:
+        return False
+
     capability = current_platform.get_device_capability()
     assert capability is not None