Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -14,14 +14,15 @@ FLOAT8_E8M0_MAX_EXP = 127
FLOAT4_EXP_BIAS = 1
FLOAT4_MANTISSA_BITS = 1
FLOAT16_VAL_TO_ADD = (1 << (FLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1))
FLOAT16_SIGN_EXPONENT_MASK = ((
(1 << (FLOAT16_EXP_BITS + 1)) - 1) << FLOAT16_MANTISSA_BITS)
FLOAT16_VAL_TO_ADD = 1 << (FLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1)
FLOAT16_SIGN_EXPONENT_MASK = (
(1 << (FLOAT16_EXP_BITS + 1)) - 1
) << FLOAT16_MANTISSA_BITS
BFLOAT16_VAL_TO_ADD = (1 <<
(BFLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1))
BFLOAT16_SIGN_EXPONENT_MASK = ((
(1 << (BFLOAT16_EXP_BITS + 1)) - 1) << BFLOAT16_MANTISSA_BITS)
BFLOAT16_VAL_TO_ADD = 1 << (BFLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1)
BFLOAT16_SIGN_EXPONENT_MASK = (
(1 << (BFLOAT16_EXP_BITS + 1)) - 1
) << BFLOAT16_MANTISSA_BITS
def e8m0_to_half(scale, half_dtype: torch.dtype):
@@ -30,19 +31,19 @@ def e8m0_to_half(scale, half_dtype: torch.dtype):
scale_exp = scale.to(torch.int16) - 127
# This can be implemented with bitwise operations in a proper kernel.
scale_half = 2.0**(scale_exp.to(torch.float))
scale_half = 2.0 ** (scale_exp.to(torch.float))
return scale_half.to(half_dtype)
def upcast_fp4_to_fp16_or_bf16(val, float_dtype: torch.dtype,
half_exp_bias: int, half_mantissa_bits: int):
def upcast_fp4_to_fp16_or_bf16(
val, float_dtype: torch.dtype, half_exp_bias: int, half_mantissa_bits: int
):
assert val.dtype == torch.uint8
unpacked = torch.zeros(*val.shape[:-1],
val.shape[-1] * 2,
dtype=torch.uint8,
device=val.device)
unpacked = torch.zeros(
*val.shape[:-1], val.shape[-1] * 2, dtype=torch.uint8, device=val.device
)
unpacked[..., 1::2] = (val >> 4) & 0x0F # Extract high 4 bits.
unpacked[..., ::2] = val & 0x0F # Extract low 4 bits.
@@ -72,8 +73,11 @@ def upcast_fp4_to_fp16_or_bf16(val, float_dtype: torch.dtype,
new_exp = new_exp.to(torch.int32)
sign = sign.to(torch.int32)
qdq_val = (sign << 15) + (new_exp << half_mantissa_bits) + (
new_mantissa << (half_mantissa_bits - 1))
qdq_val = (
(sign << 15)
+ (new_exp << half_mantissa_bits)
+ (new_mantissa << (half_mantissa_bits - 1))
)
assert qdq_val.max() <= 65535
assert qdq_val.min() >= 0
@@ -84,8 +88,9 @@ def upcast_fp4_to_fp16_or_bf16(val, float_dtype: torch.dtype,
return result
def dq_mxfp4_torch(x: torch.Tensor, scale: torch.Tensor,
float_dtype: torch.dtype) -> torch.Tensor:
def dq_mxfp4_torch(
x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
) -> torch.Tensor:
assert x.dtype == torch.uint8
assert scale.dtype == torch.uint8
@@ -98,10 +103,12 @@ def dq_mxfp4_torch(x: torch.Tensor, scale: torch.Tensor,
scale_half = e8m0_to_half(scale, half_dtype=float_dtype)
x_half = upcast_fp4_to_fp16_or_bf16(x,
float_dtype=float_dtype,
half_exp_bias=half_exp_bias,
half_mantissa_bits=half_mantissa_bits)
x_half = upcast_fp4_to_fp16_or_bf16(
x,
float_dtype=float_dtype,
half_exp_bias=half_exp_bias,
half_mantissa_bits=half_mantissa_bits,
)
x_half = x_half.reshape(*x_half.shape[:-1], -1, 32)
x_half = x_half * scale_half[..., None]
@@ -110,8 +117,9 @@ def dq_mxfp4_torch(x: torch.Tensor, scale: torch.Tensor,
return x_half
def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
half_exp_bias: int):
def fp16_to_fp4_simulate(
val, half_mantissa_bits: int, half_exp_bits: int, half_exp_bias: int
):
# Casts an fp16/bf16 input to the restricted values of float4_e2m1,
# that is to say [0., 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, -0.0,
# -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0].
@@ -119,7 +127,7 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
float_type = val.dtype
# "rshift_cuda" not implemented for 'UInt16'
val_view = val.view(torch.int16) #.to(torch.int32)
val_view = val.view(torch.int16) # .to(torch.int32)
exp = val_view >> half_mantissa_bits
exp = exp & ((1 << half_exp_bits) - 1)
@@ -147,23 +155,15 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
tail = mantissa_plus_one & ((1 << tail_bits) - 1)
round_close = (tail < half) # round towards 0
round_away = (tail > half) # round away from 0
round_close = tail < half # round towards 0
round_away = tail > half # round away from 0
tie = tail == half
new_mantissa_close = torch.zeros(val.shape,
device=val.device,
dtype=torch.bool)
new_exp_close = torch.zeros(val.shape,
device=val.device,
dtype=torch.uint16)
new_mantissa_close = torch.zeros(val.shape, device=val.device, dtype=torch.bool)
new_exp_close = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
new_mantissa_away = torch.zeros(val.shape,
device=val.device,
dtype=torch.bool)
new_exp_away = torch.zeros(val.shape,
device=val.device,
dtype=torch.uint16)
new_mantissa_away = torch.zeros(val.shape, device=val.device, dtype=torch.bool)
new_exp_away = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
new_exp_tie = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
@@ -202,27 +202,29 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
new_exp_tie = (exp > (half_exp_bias - 2)) * (exp + (mantissa_last == 1))
# Gather round up, round down and tie.
new_exp = round_away * new_exp_away \
+ round_close * new_exp_close \
+ tie * new_exp_tie
new_exp = (
round_away * new_exp_away + round_close * new_exp_close + tie * new_exp_tie
)
new_mantissa = round_away * new_mantissa_away \
+ round_close * new_mantissa_close
new_mantissa = round_away * new_mantissa_away + round_close * new_mantissa_close
# if new_exp > 3:
# new_mantissa = 1
new_mantissa = new_mantissa + (new_exp >
(2 + half_exp_bias)) * (new_mantissa == 0)
new_mantissa = new_mantissa + (new_exp > (2 + half_exp_bias)) * (new_mantissa == 0)
# Clamp the exponent to acceptable values.
new_exp = (new_exp >= (half_exp_bias - 2)) * torch.clamp(
new_exp, half_exp_bias - 2, half_exp_bias + 2)
new_exp, half_exp_bias - 2, half_exp_bias + 2
)
sign = sign.to(torch.int32)
new_mantissa = new_mantissa.to(torch.int32)
qdq_val = (sign << 15) + (new_exp << half_mantissa_bits) + (
new_mantissa << (half_mantissa_bits - 1))
qdq_val = (
(sign << 15)
+ (new_exp << half_mantissa_bits)
+ (new_mantissa << (half_mantissa_bits - 1))
)
assert qdq_val.max() <= 65535
assert qdq_val.min() >= 0
@@ -233,8 +235,9 @@ def fp16_to_fp4_simulate(val, half_mantissa_bits: int, half_exp_bits: int,
return result
def qdq_mxfp4_torch(x: torch.Tensor,
scale_calculation_mode: str = "even") -> torch.Tensor:
def qdq_mxfp4_torch(
x: torch.Tensor, scale_calculation_mode: str = "even"
) -> torch.Tensor:
half_dtype = x.dtype
if half_dtype == torch.float16:
@@ -258,8 +261,7 @@ def qdq_mxfp4_torch(x: torch.Tensor,
block_max = block_max.view(torch.uint16).to(torch.int32)
block_max_uint = torch.bitwise_and(block_max + val_to_add,
sign_exponent_mask)
block_max_uint = torch.bitwise_and(block_max + val_to_add, sign_exponent_mask)
assert block_max_uint.max() <= 65535
assert block_max_uint.min() >= 0
@@ -268,20 +270,23 @@ def qdq_mxfp4_torch(x: torch.Tensor,
block_max = block_max_uint.view(half_dtype)
scale_exp = FLOAT8_E8M0_MAX_EXP + torch.floor(torch.log2(block_max)).to(
torch.int32) - 2
scale_exp = (
FLOAT8_E8M0_MAX_EXP + torch.floor(torch.log2(block_max)).to(torch.int32) - 2
)
scale_exp = torch.clamp(scale_exp, 0, 2 * FLOAT8_E8M0_MAX_EXP)
scale = 2.0**(scale_exp - FLOAT8_E8M0_MAX_EXP)
scale = 2.0 ** (scale_exp - FLOAT8_E8M0_MAX_EXP)
scale = scale.to(half_dtype)
x = x / scale[..., None]
x_fp4 = fp16_to_fp4_simulate(x,
half_exp_bits=half_exp_bits,
half_mantissa_bits=half_mantissa_bits,
half_exp_bias=half_exp_bias)
x_fp4 = fp16_to_fp4_simulate(
x,
half_exp_bits=half_exp_bits,
half_mantissa_bits=half_mantissa_bits,
half_exp_bias=half_exp_bias,
)
x_fp4 = x_fp4 * scale[..., None]
return x_fp4.reshape(*x_fp4.shape[:-2], -1)

View File

@@ -1,11 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and inference for quantized HF models supported
on the AutoRound.
on the AutoRound.
Validating the configuration and printing results for manual checking.
Validating the configuration and printing results for manual checking.
Run `pytest tests/quantization/test_auto_round.py`.
Run `pytest tests/quantization/test_auto_round.py`.
"""
import pytest
@@ -14,18 +14,19 @@ from vllm.platforms import current_platform
MODELS = [
"OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc", ##auto_round:auto_gptq
"Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound" ##auto_round:auto_awq
"Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound", ##auto_round:auto_awq
]
@pytest.mark.skipif(not current_platform.is_cpu()
and not current_platform.is_xpu()
and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.")
@pytest.mark.skipif(
not current_platform.is_cpu()
and not current_platform.is_xpu()
and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.",
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
with vllm_runner(model) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=8)
output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
assert output
print(f"{output[0][1]}")

View File

@@ -11,8 +11,9 @@ from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
if not current_platform.is_device_capability(100):
pytest.skip("This test only runs on Blackwell GPUs (SM100).",
allow_module_level=True)
pytest.skip(
"This test only runs on Blackwell GPUs (SM100).", allow_module_level=True
)
os.environ["FLASHINFER_NVCC_THREADS"] = "16"
@@ -22,7 +23,6 @@ dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
def can_initialize(model: str, extra_args: Optional[list[str]] = None):
# Server arguments
extra_args = extra_args if extra_args is not None else []
server_args = [
@@ -40,10 +40,11 @@ def can_initialize(model: str, extra_args: Optional[list[str]] = None):
# Launch server and make a simple request
with RemoteOpenAIServer(
model,
server_args,
max_wait_seconds=1000, # Due to FlashInfer compile
override_hf_configs=dummy_hf_overrides) as server:
model,
server_args,
max_wait_seconds=1000, # Due to FlashInfer compile
override_hf_configs=dummy_hf_overrides,
) as server:
client = server.get_client()
# Make a simple request to verify the server works
completion = client.completions.create(
@@ -59,20 +60,21 @@ def can_initialize(model: str, extra_args: Optional[list[str]] = None):
## Llama4 ##
@pytest.mark.skip(reason=(
"RuntimeError: run_moe() Expected a value of type "
"'Optional[List[Tensor]]' for argument '_9' but instead found type "
"'list'."))
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
@pytest.mark.skip(
reason=(
"RuntimeError: run_moe() Expected a value of type "
"'Optional[List[Tensor]]' for argument '_9' but instead found type "
"'list'."
)
)
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")
@@ -100,24 +102,25 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
can_initialize("deepseek-ai/DeepSeek-V3.1")
@pytest.mark.skip(reason=("Known issue: lack of kernel support. "
"Expected failure: assert self.block_quant is None"))
def test_deepseek_fp8_block_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
@pytest.mark.skip(
reason=(
"Known issue: lack of kernel support. "
"Expected failure: assert self.block_quant is None"
)
)
def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("deepseek-ai/DeepSeek-V3.1")
def test_deepseek_fp8_block_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
can_initialize("deepseek-ai/DeepSeek-V3.1")
def test_deepseek_nvfp4_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")
@@ -138,13 +141,11 @@ def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
can_initialize("openai/gpt-oss-20b")
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
monkeypatch: pytest.MonkeyPatch):
def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
can_initialize("openai/gpt-oss-20b")
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
monkeypatch: pytest.MonkeyPatch):
def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
can_initialize("openai/gpt-oss-20b")

View File

@@ -13,18 +13,25 @@ from compressed_tensors.quantization import QuantizationType
from tests.models.utils import check_logprobs_close
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensors24, CompressedTensorsLinearMethod,
CompressedTensorsW4A4Fp4, CompressedTensorsW4A8Fp8,
CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
CompressedTensors24,
CompressedTensorsLinearMethod,
CompressedTensorsW4A4Fp4,
CompressedTensorsW4A8Fp8,
CompressedTensorsW4A16Fp4,
CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8,
CompressedTensorsW8A8Int8,
CompressedTensorsW8A16Fp8,
CompressedTensorsWNA16,
)
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
W8A8BlockFp8LinearOp)
from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported)
cutlass_fp4_supported,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
sparse_cutlass_supported)
sparse_cutlass_supported,
)
from vllm.platforms import current_platform
# AITER only supports per-channel-per-channel INT8 gemm
@@ -32,7 +39,7 @@ from vllm.platforms import current_platform
# It does not support mixed-precision MM or mixed quantization schemes.
ROCM_AITER_SUPPORTED_INT8_MODEL = [
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
]
# TritonScaledMMLinearKernel only supports symmetric quantization.
@@ -80,8 +87,10 @@ def enable_pickle(monkeypatch):
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
model_path, strategy, quant_type, shape_0, is_symmetric = model_args
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
if (
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
with vllm_runner(model_path, enforce_eager=True) as llm:
@@ -106,14 +115,10 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
assert zp_valid(gate_up_proj.input_zero_point)
assert zp_valid(down_proj.input_zero_point)
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(o_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(gate_up_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(down_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
assert qkv_proj.scheme.strategy == strategy
@@ -151,7 +156,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_compressed_tensors_w8a8_logprobs(
hf_runner,
vllm_runner,
@@ -162,15 +168,15 @@ def test_compressed_tensors_w8a8_logprobs(
use_aiter,
monkeypatch,
):
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
if (
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
pytest.skip(
f"Skip model {model_path} as it is not support by aiter.")
pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -178,18 +184,20 @@ def test_compressed_tensors_w8a8_logprobs(
# skip language translation prompt for the static per tensor models
if model_path in (
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
):
example_prompts = example_prompts[0:-1]
with hf_runner(model_path, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
with vllm_runner(model_path, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
example_prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=hf_outputs,
@@ -225,7 +233,8 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
],
)
@pytest.mark.parametrize(
"use_aiter", [True, False] if current_platform.is_rocm() else [False])
"use_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_compressed_tensors_w8a8_dynamic_per_token(
vllm_runner,
model_args,
@@ -234,14 +243,15 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
):
model_path, strategy = model_args
if current_platform.is_rocm(
) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
if (
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
pytest.skip(
f"Skip model {model_path} as it is not support by aiter.")
pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -252,8 +262,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
assert not qkv_proj.scheme.is_static_input_scheme
assert qkv_proj.scheme.strategy == strategy
@@ -267,21 +276,60 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
@pytest.mark.parametrize(
"wNa16_args",
[("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8,
True, False),
("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8, True,
False),
("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4,
True, False),
("nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256", "group", 128,
8, False, False),
("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
"channel", None, 8, False, False),
("nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
"group", 128, 8, False, True)],
[
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"channel",
None,
8,
True,
False,
),
(
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"group",
128,
8,
True,
False,
),
(
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"channel",
None,
4,
True,
False,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-awq-group128-asym256",
"group",
128,
8,
False,
False,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-Channel",
"channel",
None,
8,
False,
False,
),
(
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
"group",
128,
8,
False,
True,
),
],
)
@pytest.mark.skipif(
not current_platform.is_cuda(), reason="The tests are skipped on non-CUDA platform."
)
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="The tests are skipped on non-CUDA platform.")
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
with vllm_runner(model) as llm:
@@ -290,13 +338,11 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
assert qkv_proj.scheme.strategy == strategy
assert qkv_proj.scheme.group_size == (-1
if group is None else group)
assert qkv_proj.scheme.group_size == (-1 if group is None else group)
assert qkv_proj.scheme.pack_factor == pack_factor
assert qkv_proj.scheme.symmetric == symmetric
@@ -308,8 +354,9 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
assert output
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test is skipped on non-CUDA platform.")
@pytest.mark.skipif(
not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
with vllm_runner(model_path) as llm:
@@ -319,8 +366,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
assert qkv_proj.weight_packed.dtype is torch.int32
@@ -339,8 +385,7 @@ def test_compressed_tensors_fp8(vllm_runner):
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(
qkv_proj.scheme,
(CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8),
@@ -362,9 +407,11 @@ def test_compressed_tensors_fp8(vllm_runner):
@pytest.mark.skipif(
not current_platform.is_kv_cache_dtype_supported("fp8", None),
reason="FP8 KV cache is not supported on this device.")
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test is skipped on non-CUDA platform.")
reason="FP8 KV cache is not supported on this device.",
)
@pytest.mark.skipif(
not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
def test_compressed_tensors_kv_cache(vllm_runner):
model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
@@ -376,10 +423,7 @@ def test_compressed_tensors_kv_cache(vllm_runner):
not sparse_cutlass_supported(),
reason="Sparse FP8 is not yet supported on this GPU type.",
)
def _test_2of4_quant_models(qkv_proj,
weight_strategy,
input_strategy,
format="dense"):
def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="dense"):
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensors24)
@@ -393,8 +437,7 @@ def _test_2of4_quant_models(qkv_proj,
@pytest.mark.skipif(
not current_platform.is_cuda()
or not current_platform.has_device_capability(90),
not current_platform.is_cuda() or not current_platform.has_device_capability(90),
reason="Sparse FP8 is not yet supported on this GPU type.",
)
@pytest.mark.parametrize(
@@ -441,8 +484,7 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
@pytest.mark.skipif(
not current_platform.is_cuda()
or not current_platform.has_device_capability(90),
not current_platform.is_cuda() or not current_platform.has_device_capability(90),
reason="Sparse FP8 is not yet supported on this GPU type.",
)
@pytest.mark.parametrize(
@@ -603,17 +645,14 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensors24)
assert qkv_proj.scheme.weight_quant is None
assert qkv_proj.scheme.input_quant is None
assert not qkv_proj.scheme.quantized
assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
sparsity_map = (
qkv_proj.quant_method.quantization_config.sparsity_scheme_map
) # noqa: E501
sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501
assert sparsity_map.get("Linear").format == "dense"
assert sparsity_map.get("Linear").sparsity_structure == "2:4"
@@ -629,7 +668,8 @@ def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
reason="Cutlass is not yet supported on this GPU type.",
)
@pytest.mark.parametrize(
"args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")])
"args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]
)
def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
model = args_2of4
with vllm_runner(model) as llm:
@@ -638,17 +678,14 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensors24)
assert qkv_proj.scheme.weight_quant is None
assert qkv_proj.scheme.input_quant is None
assert not qkv_proj.scheme.quantized
assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
sparsity_map = (
qkv_proj.quant_method.quantization_config.sparsity_scheme_map
) # noqa: E501
sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501
assert sparsity_map.get("Linear").format == "sparse-24-bitmask"
assert sparsity_map.get("Linear").sparsity_structure == "2:4"
@@ -661,9 +698,11 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
@pytest.mark.parametrize(
"args",
[("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16",
CompressedTensorsW4A16Fp4),
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4)])
[
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
],
)
def test_compressed_tensors_nvfp4(vllm_runner, args):
model, scheme = args
with vllm_runner(model, enforce_eager=True) as llm:
@@ -672,11 +711,12 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
if isinstance(qkv_proj.scheme, scheme) or isinstance(
qkv_proj.scheme,
CompressedTensorsW4A16Fp4) and not cutlass_fp4_supported():
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
if (
isinstance(qkv_proj.scheme, scheme)
or isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
and not cutlass_fp4_supported()
):
assert True
else:
raise AssertionError("FP4 Scheme Mismatch")
@@ -690,13 +730,13 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
@pytest.mark.skipif(
not current_platform.is_cuda()
or not current_platform.has_device_capability(90),
not current_platform.is_cuda() or not current_platform.has_device_capability(90),
reason="W4A8 FP8 is not yet supported on this GPU type.",
)
@pytest.mark.parametrize("args", [
("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)
])
@pytest.mark.parametrize(
"args",
[("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)],
)
def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
model, scheme = args
with vllm_runner(model, enforce_eager=True) as llm:
@@ -710,8 +750,7 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
down_proj = layer.mlp.down_proj
for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
assert isinstance(proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(proj.scheme, scheme)
assert proj.weight_packed.dtype is torch.int32
@@ -725,22 +764,27 @@ def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
assert output
@pytest.mark.skipif(not current_platform.is_cuda(),
reason="This test is skipped on non-CUDA platform.")
@pytest.mark.parametrize("model,prompt,exp_perplexity", [
(
"nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
"Flat is better than nested.\nSparse is better than dense.",
150.0,
),
(
"nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
"Flat is better than nested.\nSparse is better than dense.",
150.0,
),
])
def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
exp_perplexity):
@pytest.mark.skipif(
not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
@pytest.mark.parametrize(
"model,prompt,exp_perplexity",
[
(
"nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
"Flat is better than nested.\nSparse is better than dense.",
150.0,
),
(
"nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
"Flat is better than nested.\nSparse is better than dense.",
150.0,
),
],
)
def test_compressed_tensors_transforms_perplexity(
vllm_runner, model, prompt, exp_perplexity
):
with vllm_runner(model, enforce_eager=True) as llm:
perplexity = llm.generate_prompt_perplexity([prompt])[0]
print(perplexity)
@@ -750,26 +794,24 @@ def test_compressed_tensors_transforms_perplexity(vllm_runner, model, prompt,
def test_compressed_tensors_fp8_block_enabled(vllm_runner):
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
with vllm_runner(model_path) as llm:
fp8_dtype = current_platform.fp8_dtype()
def check_model(model):
layer = model.model.layers[0]
qkv_proj = layer.self_attn.qkv_proj
assert isinstance(qkv_proj.quant_method,
CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
assert isinstance(qkv_proj.scheme.w8a8_block_fp8_linear,
W8A8BlockFp8LinearOp)
assert isinstance(
qkv_proj.scheme.w8a8_block_fp8_linear, W8A8BlockFp8LinearOp
)
assert qkv_proj.weight.dtype is fp8_dtype
assert qkv_proj.weight_scale.dtype is torch.float32
assert len(qkv_proj.weight.shape) == 2
assert len(qkv_proj.weight_scale.shape) == 2
input_quant_op = \
qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
assert isinstance(input_quant_op, QuantFP8)
assert input_quant_op._forward_method == input_quant_op.forward_cuda

View File

@@ -33,7 +33,6 @@ MODEL_ARG_EXPTYPES = [
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
# AUTOAWQ
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
@@ -55,4 +54,5 @@ def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
assert found_quantization_type == expected_type, (
f"Expected quant_type == {expected_type} for {model_path}, "
f"but found {found_quantization_type} "
f"for no --quantization {quantization_arg} case")
f"for no --quantization {quantization_arg} case"
)

View File

@@ -1,77 +1,108 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py
import pytest
from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
compare_two_settings("meta-llama/Llama-3.2-1B-Instruct",
["--quantization", "fp8"],
["--quantization", "fp8", "--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test loading a quantized checkpoint
compare_two_settings("neuralmagic/Qwen2-1.5B-Instruct-FP8", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test GPTQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test GPTQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
["--quantization", "gptq"],
["--quantization", "gptq", "--cpu-offload-gb", "1"],
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test AWQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test AWQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
["--quantization", "awq"],
["--quantization", "awq", "--cpu-offload-gb", "1"],
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test wNa16
compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test w4a16_marlin24
compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
[], ["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test w8a8
compare_two_settings(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py
import pytest
from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="fp8 is not supported on this GPU type.",
)
def test_cpu_offload_fp8():
    """Exercise fp8 models with and without ``--cpu-offload-gb 1``.

    Each case is handed to ``compare_two_settings`` twice-over: once with
    its baseline CLI arguments and once with the CPU-offload flag appended.
    What exactly the helper compares is defined in ``..utils`` and is not
    visible here (presumably the outputs of the two configurations).
    """
    offload_flags = ["--cpu-offload-gb", "1"]
    cases = (
        # Test quantization of an unquantized checkpoint
        ("meta-llama/Llama-3.2-1B-Instruct", ["--quantization", "fp8"]),
        # Test loading a quantized checkpoint
        ("neuralmagic/Qwen2-1.5B-Instruct-FP8", []),
    )
    for checkpoint, baseline_args in cases:
        # Concatenation builds a fresh list, so the two argument lists
        # passed to the helper never alias each other.
        compare_two_settings(
            checkpoint,
            baseline_args,
            baseline_args + offload_flags,
            max_wait_seconds=480,
        )
@pytest.mark.skipif(
    not is_quant_method_supported("gptq_marlin"),
    reason="gptq_marlin is not supported on this GPU type.",
)
def test_cpu_offload_gptq(monkeypatch):
    """Exercise a GPTQ checkpoint with and without ``--cpu-offload-gb 1``.

    The same checkpoint is run first with no explicit quantization flag
    (labelled "GPTQ Marlin" in the original comments) and then with
    ``--quantization gptq``; each run is compared against itself plus the
    CPU-offload flag via ``compare_two_settings``.
    """
    # This quant method is sensitive to dummy weights, so we force real weights
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")

    checkpoint = "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
    offload_flags = ["--cpu-offload-gb", "1"]
    baselines = (
        # Test GPTQ Marlin
        [],
        # Test GPTQ
        ["--quantization", "gptq"],
    )
    for baseline_args in baselines:
        compare_two_settings(
            checkpoint,
            baseline_args,
            baseline_args + offload_flags,
            max_wait_seconds=480,
        )
@pytest.mark.skipif(
    not is_quant_method_supported("awq_marlin"),
    reason="awq_marlin is not supported on this GPU type.",
)
def test_cpu_offload_awq(monkeypatch):
    """Exercise an AWQ checkpoint with and without ``--cpu-offload-gb 1``.

    The same checkpoint is run first with no explicit quantization flag
    (labelled "AWQ Marlin" in the original comments) and then with
    ``--quantization awq``; each run is compared against itself plus the
    CPU-offload flag via ``compare_two_settings``.
    """
    # This quant method is sensitive to dummy weights, so we force real weights
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")

    checkpoint = "Qwen/Qwen2-1.5B-Instruct-AWQ"
    offload_flags = ["--cpu-offload-gb", "1"]
    baselines = (
        # Test AWQ Marlin
        [],
        # Test AWQ
        ["--quantization", "awq"],
    )
    for baseline_args in baselines:
        compare_two_settings(
            checkpoint,
            baseline_args,
            baseline_args + offload_flags,
            max_wait_seconds=480,
        )
@pytest.mark.skipif(
    not is_quant_method_supported("gptq_marlin"),
    reason="gptq_marlin is not supported on this GPU type.",
)
def test_cpu_offload_compressed_tensors(monkeypatch):
    """Exercise compressed-tensors checkpoints with and without CPU offload.

    Every checkpoint is run with no extra arguments and again with
    ``--cpu-offload-gb 1`` through ``compare_two_settings``.

    NOTE(review): the skip condition is keyed on ``gptq_marlin`` support,
    exactly as in the original; whether that is the intended gate for all
    three checkpoints cannot be confirmed from this file alone.
    """
    # This quant method is sensitive to dummy weights, so we force real weights
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")

    checkpoints = (
        # Test wNa16
        "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
        # Test w4a16_marlin24
        "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
        # Test w8a8
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    )
    for checkpoint in checkpoints:
        # Fresh list literals per call, matching the original call sites.
        compare_two_settings(
            checkpoint,
            [],
            ["--cpu-offload-gb", "1"],
            max_wait_seconds=480,
        )

View File

@@ -2,9 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# flake8: noqa
"""Tests experts_int8 quantization startup and generation,
"""Tests experts_int8 quantization startup and generation,
doesn't test correctness
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
@@ -14,8 +15,10 @@ from ..models.registry import HF_EXAMPLE_MODELS
MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("experts_int8"),
reason="ExpertsInt8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
@@ -30,6 +33,5 @@ def test_model_experts_int8_startup(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model, dtype=dtype,
quantization="experts_int8") as vllm_model:
with vllm_runner(model, dtype=dtype, quantization="experts_int8") as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -4,13 +4,16 @@
Run `pytest tests/quantization/test_fp8.py --forked`.
"""
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod)
from vllm.model_executor.layers.quantization.fp8 import (
Fp8KVCacheMethod,
Fp8LinearMethod,
)
from vllm.platforms import current_platform
MODELS = [
@@ -20,15 +23,18 @@ MODELS = [
]
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
use_rocm_aiter: bool, monkeypatch) -> None:
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_model_load_and_run(
vllm_runner, model_id: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch
) -> None:
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -50,13 +56,17 @@ KV_CACHE_MODELS = [
]
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
use_rocm_aiter: bool, monkeypatch):
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_kv_cache_model_load_and_run(
vllm_runner, model_id: str, use_rocm_aiter: bool, monkeypatch
):
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -93,14 +103,22 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
print(outputs[0][1])
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
use_rocm_aiter: bool, monkeypatch) -> None:
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_load_fp16_model(
vllm_runner,
kv_cache_dtype: str,
force_marlin: bool,
use_rocm_aiter: bool,
monkeypatch,
) -> None:
if use_rocm_aiter:
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
@@ -110,9 +128,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner("facebook/opt-125m",
quantization="fp8",
kv_cache_dtype=kv_cache_dtype) as llm:
with vllm_runner(
"facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype
) as llm:
def check_model(model):
fc1 = model.model.decoder.layers[0].fc1
@@ -139,26 +157,29 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
pytest.skip(
"Skip `test_load_fp16_model`. "
"It only runs on ROCm platform with FP8 compute."
" e.g. MI300X and above.")
" e.g. MI300X and above."
)
else: # unsupported platform
pytest.skip("Skip `test_load_fp16_model`. "
"It only runs on CUDA and ROCm platform.")
pytest.skip(
"Skip `test_load_fp16_model`. "
"It only runs on CUDA and ROCm platform."
)
llm.apply_model(check_model)
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),
reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
def quantize_ref(tensor, inv_scale):
# The reference implementation that fully aligns to
# the kernel being tested.
finfo = torch.finfo(torch.float8_e4m3fn)
scale = inv_scale.reciprocal()
qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min,
max=finfo.max)
qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
qweight = qweight.to(torch.float8_e4m3fn)
return qweight
@@ -177,26 +198,23 @@ def test_scaled_fp8_quant(dtype) -> None:
# Reference dynamic quantization
y = quantize_ref(x, inv_scale)
torch.testing.assert_close(ref_y,
per_tensor_dequantize(y, inv_scale, dtype))
torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
# Static quantization
y, _ = ops.scaled_fp8_quant(x, inv_scale)
torch.testing.assert_close(ref_y,
per_tensor_dequantize(y, inv_scale, dtype))
torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
# Padding
y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
assert y.shape[0] == 17
torch.testing.assert_close(
ref_y,
per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
dtype))
per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale, dtype),
)
# non-contiguous input with padding
m, n, padded_stride = 975, 512, 576
padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") *
13).to(dtype)
padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") * 13).to(dtype)
x_nc = padded_tensor[:, :n] # shape (m, n) with stride (padded_stride, 1)
assert not x_nc.is_contiguous()
@@ -209,19 +227,21 @@ def test_scaled_fp8_quant(dtype) -> None:
# reference dynamic quantization
y_nc = quantize_ref(x_nc, inv_scale_nc)
torch.testing.assert_close(
ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype))
ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
)
# static quantization
y_nc, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc)
torch.testing.assert_close(
ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype))
ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
)
# padding after non-contiguous input quantization
y_nc_pad, _ = ops.scaled_fp8_quant(x_nc,
inv_scale_nc,
num_token_padding=m + 10)
y_nc_pad, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc, num_token_padding=m + 10)
assert y_nc_pad.shape[0] == m + 10
torch.testing.assert_close(
ref_y_nc,
per_tensor_dequantize(torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]),
inv_scale_nc, dtype))
per_tensor_dequantize(
torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), inv_scale_nc, dtype
),
)

View File

@@ -10,10 +10,10 @@ import torch
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinLinearMethod)
from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinearMethod
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
get_dynamic_override)
get_dynamic_override,
)
PROMPT = "On the surface of Mars, we found"
@@ -21,56 +21,59 @@ PROMPT = "On the surface of Mars, we found"
# The second layer is quantized using bits=8, group_size=32
# All other layers (layer index >= 2) are not quantized
MODEL_QUANT = [
("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
True),
("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
False),
("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", True),
(
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
False,
),
]
@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
monkeypatch):
def test_gptq_with_dynamic(
vllm_runner, model_id: str, use_marlin_kernel: bool, monkeypatch
):
# `LLM.apply_model` requires pickling a function.
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
GPTQLinearMethod)
linear_method_cls = (
GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
)
with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm:
def check_model(model):
for name, submodule in model.named_modules():
if name == "lm_head":
assert isinstance(submodule.quant_method,
linear_method_cls)
elif name == 'model.layers.0.self_attn.qkv_proj':
assert isinstance(submodule.quant_method, linear_method_cls)
elif name == "model.layers.0.self_attn.qkv_proj":
# The first layer is quantized using bits=4, group_size=128
# desc_act=True
assert isinstance(submodule.quant_method,
linear_method_cls)
assert isinstance(submodule.quant_method, linear_method_cls)
config = submodule.quant_method.quant_config
assert config.weight_bits == 4
assert config.group_size == 128
assert config.desc_act
elif name == 'model.layers.1.self_attn.qkv_proj':
elif name == "model.layers.1.self_attn.qkv_proj":
# The second layer is quantized using bits=8, group_size=32
# desc_act=False
assert isinstance(submodule.quant_method,
linear_method_cls)
assert isinstance(submodule.quant_method, linear_method_cls)
config = submodule.quant_method.quant_config
assert get_dynamic_override(config,
layer_name=name,
key="bits") == 8
assert get_dynamic_override(config,
layer_name=name,
key="group_size") == 32
assert (
get_dynamic_override(config, layer_name=name, key="bits") == 8
)
assert (
get_dynamic_override(config, layer_name=name, key="group_size")
== 32
)
assert not get_dynamic_override(
config, layer_name=name, key="desc_act")
elif (name == 'model.layers.2.self_attn.qkv_proj'
or name == 'model.layers.2.mlp.gate_up_proj'):
config, layer_name=name, key="desc_act"
)
elif (
name == "model.layers.2.self_attn.qkv_proj"
or name == "model.layers.2.mlp.gate_up_proj"
):
# All other layers (layer index >= 2) are not quantized
assert isinstance(submodule.quant_method,
UnquantizedLinearMethod)
assert isinstance(submodule.quant_method, UnquantizedLinearMethod)
llm.apply_model(check_model)

View File

@@ -1,11 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and inference for quantized HF models supported
on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
Validating the configuration and printing results for manual checking.
on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
Run `pytest tests/quantization/test_ipex_quant.py`.
Validating the configuration and printing results for manual checking.
Run `pytest tests/quantization/test_ipex_quant.py`.
"""
import pytest
@@ -19,14 +19,14 @@ MODELS = [
DTYPE = ["bfloat16"]
@pytest.mark.skipif(not current_platform.is_cpu()
and not current_platform.is_xpu(),
reason="only supports Intel CPU/XPU backend.")
@pytest.mark.skipif(
not current_platform.is_cpu() and not current_platform.is_xpu(),
reason="only supports Intel CPU/XPU backend.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
with vllm_runner(model, dtype=dtype) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
print(output)

View File

@@ -9,10 +9,10 @@ import pytest
import torch
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinLinearMethod)
from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinearMethod
from vllm.model_executor.layers.vocab_parallel_embedding import (
UnquantizedEmbeddingMethod)
UnquantizedEmbeddingMethod,
)
PROMPT = "On the surface of Mars, we found"
@@ -31,20 +31,20 @@ def test_lm_head(
) -> None:
# `LLM.apply_model` requires pickling a function.
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_id, dtype=torch.float16,
max_model_len=2048) as vllm_model:
with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model:
def check_model(model):
lm_head_layer = model.lm_head
if lm_head_quantized:
assert isinstance(lm_head_layer.quant_method,
(GPTQLinearMethod, GPTQMarlinLinearMethod))
assert isinstance(
lm_head_layer.quant_method,
(GPTQLinearMethod, GPTQMarlinLinearMethod),
)
else:
assert isinstance(lm_head_layer.quant_method,
UnquantizedEmbeddingMethod)
assert isinstance(
lm_head_layer.quant_method, UnquantizedEmbeddingMethod
)
vllm_model.apply_model(check_model)
print(
vllm_model.generate_greedy(["Hello my name is"],
max_tokens=10)[0][1])
print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=10)[0][1])

View File

@@ -19,21 +19,26 @@ def enable_pickle(monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@pytest.mark.skipif(not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_checkpoint_setup(vllm_runner):
"""Test ModelOpt FP8 checkpoint loading and structure validation."""
# TODO: provide a small publicly available test checkpoint
model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
"TinyLlama-1.1B-Chat-v1.0-fp8-0710")
model_path = (
"/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
"TinyLlama-1.1B-Chat-v1.0-fp8-0710"
)
# Skip test if checkpoint doesn't exist
if not os.path.exists(model_path):
pytest.skip(f"Test checkpoint not found at {model_path}. "
"This test requires a local ModelOpt FP8 checkpoint.")
pytest.skip(
f"Test checkpoint not found at {model_path}. "
"This test requires a local ModelOpt FP8 checkpoint."
)
with vllm_runner(model_path, quantization="modelopt",
enforce_eager=True) as llm:
with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
def check_model(model):
layer = model.model.layers[0]
@@ -45,11 +50,12 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
# Check that ModelOpt quantization method is properly applied
from vllm.model_executor.layers.quantization.modelopt import (
ModelOptFp8LinearMethod)
ModelOptFp8LinearMethod,
)
assert isinstance(qkv_proj.quant_method, ModelOptFp8LinearMethod)
assert isinstance(o_proj.quant_method, ModelOptFp8LinearMethod)
assert isinstance(gate_up_proj.quant_method,
ModelOptFp8LinearMethod)
assert isinstance(gate_up_proj.quant_method, ModelOptFp8LinearMethod)
assert isinstance(down_proj.quant_method, ModelOptFp8LinearMethod)
# Check weight dtype is FP8
@@ -59,23 +65,23 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
assert down_proj.weight.dtype == torch.float8_e4m3fn
# Check scales are present and have correct dtype
assert hasattr(qkv_proj, 'weight_scale')
assert hasattr(qkv_proj, 'input_scale')
assert hasattr(qkv_proj, "weight_scale")
assert hasattr(qkv_proj, "input_scale")
assert qkv_proj.weight_scale.dtype == torch.float32
assert qkv_proj.input_scale.dtype == torch.float32
assert hasattr(o_proj, 'weight_scale')
assert hasattr(o_proj, 'input_scale')
assert hasattr(o_proj, "weight_scale")
assert hasattr(o_proj, "input_scale")
assert o_proj.weight_scale.dtype == torch.float32
assert o_proj.input_scale.dtype == torch.float32
assert hasattr(gate_up_proj, 'weight_scale')
assert hasattr(gate_up_proj, 'input_scale')
assert hasattr(gate_up_proj, "weight_scale")
assert hasattr(gate_up_proj, "input_scale")
assert gate_up_proj.weight_scale.dtype == torch.float32
assert gate_up_proj.input_scale.dtype == torch.float32
assert hasattr(down_proj, 'weight_scale')
assert hasattr(down_proj, 'input_scale')
assert hasattr(down_proj, "weight_scale")
assert hasattr(down_proj, "input_scale")
assert down_proj.weight_scale.dtype == torch.float32
assert down_proj.input_scale.dtype == torch.float32

View File

@@ -4,18 +4,19 @@
Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
"""
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
from vllm.model_executor.layers.quantization.ptpc_fp8 import (
PTPCFp8LinearMethod)
from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
from vllm.platforms import current_platform
UNSUPPORTED_STR = (
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
"support output dtype of bfloat16. torch.float16 is specified.")
"support output dtype of bfloat16. torch.float16 is specified."
)
@pytest.fixture(scope="function", autouse=True)
@@ -24,18 +25,21 @@ def enable_pickle(monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"),
reason="PTPC FP8 is not supported on this GPU type.")
@pytest.mark.skipif(not current_platform.is_rocm(),
reason="This test is for ROCm GPU.")
@pytest.mark.skipif(
not is_quant_method_supported("ptpc_fp8"),
reason="PTPC FP8 is not supported on this GPU type.",
)
@pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
try:
llm = vllm_runner("facebook/opt-125m",
dtype=dtype,
quantization="ptpc_fp8",
kv_cache_dtype=kv_cache_dtype)
llm = vllm_runner(
"facebook/opt-125m",
dtype=dtype,
quantization="ptpc_fp8",
kv_cache_dtype=kv_cache_dtype,
)
except AssertionError as e:
if str(e) == UNSUPPORTED_STR:
# If the error message matches, the test passes

View File

@@ -19,23 +19,27 @@ import torch
from packaging import version
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
QuarkLinearMethod, QuarkW8A8Fp8, QuarkW8A8Int8)
QuarkLinearMethod,
QuarkW8A8Fp8,
QuarkW8A8Int8,
)
from vllm.platforms import current_platform
from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
importlib.metadata.version("amd-quark")
) >= version.parse("0.8.99")
if QUARK_MXFP4_AVAILABLE:
from quark.torch.export.nn.modules.realquantizer import (
StaticScaledRealQuantizer)
from quark.torch.export.nn.modules.realquantizer import StaticScaledRealQuantizer
from quark.torch.kernel import mx as mx_kernel
from quark.torch.quantization.config.config import FP4PerGroupSpec
try:
huggingface_hub.list_repo_refs(
"amd/Llama-3.3-70B-Instruct-WMXFP4-AMXFP4-KVFP8-Scale-UINT8-SQ")
"amd/Llama-3.3-70B-Instruct-WMXFP4-AMXFP4-KVFP8-Scale-UINT8-SQ"
)
HF_HUB_AMD_ORG_ACCESS = True
except huggingface_hub.errors.RepositoryNotFoundError:
HF_HUB_AMD_ORG_ACCESS = False
@@ -47,13 +51,13 @@ def enable_pickle(monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8'])
@pytest.mark.parametrize('tp', [1])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("tp", [1])
def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
with vllm_runner(model_path,
kv_cache_dtype=kv_cache_dtype,
tensor_parallel_size=tp) as llm:
with vllm_runner(
model_path, kv_cache_dtype=kv_cache_dtype, tensor_parallel_size=tp
) as llm:
def check_model(model):
layer = model.model.layers[0]
@@ -74,7 +78,7 @@ def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
assert output
@pytest.mark.parametrize('tp', [1])
@pytest.mark.parametrize("tp", [1])
def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
@@ -89,8 +93,7 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
assert qkv_proj.weight_scale.shape[0] == qkv_proj.weight.shape[
1]
assert qkv_proj.weight_scale.shape[0] == qkv_proj.weight.shape[1]
assert qkv_proj.weight_scale.shape[1] == 1
llm.apply_model(check_model)
@@ -99,7 +102,7 @@ def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
assert output
@pytest.mark.parametrize('tp', [1])
@pytest.mark.parametrize("tp", [1])
def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
with vllm_runner(model_path, tensor_parallel_size=tp) as llm:
@@ -125,16 +128,18 @@ def test_quark_fp8_parity(vllm_runner):
llm_kwargs = {
"tensor_parallel_size": 1,
"enforce_eager": True,
"gpu_memory_utilization": 0.1
"gpu_memory_utilization": 0.1,
}
with (vllm_runner(quark_model_id, **llm_kwargs) as
quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
with (
vllm_runner(quark_model_id, **llm_kwargs) as quark_handle,
vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle,
):
def get_state_dict(model):
return {k: v.cpu() for k, v in model.state_dict().items()}
quark_state_dict, = quark_handle.apply_model(get_state_dict)
fp8_state_dict, = fp8_handle.apply_model(get_state_dict)
(quark_state_dict,) = quark_handle.apply_model(get_state_dict)
(fp8_state_dict,) = fp8_handle.apply_model(get_state_dict)
assert fp8_state_dict.keys() == quark_state_dict.keys()
@@ -164,16 +169,17 @@ ACCURACY_CONFIGS = [
# Private model.
GSM8KAccuracyTestConfig(
model_name="amd/DeepSeek-R1-WMXFP4-AMXFP4-Scale-UINT8-MoE-Quant",
excepted_value=0.96),
excepted_value=0.96,
),
]
@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(
not HF_HUB_AMD_ORG_ACCESS,
reason="Read access to huggingface.co/amd is required for this test.")
reason="Read access to huggingface.co/amd is required for this test.",
)
def test_mxfp4_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
if torch.cuda.device_count() < 8:
pytest.skip(
@@ -195,28 +201,26 @@ def test_mxfp4_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
EXPECTED_VALUE = config.excepted_value
measured_value = results["results"][task]["exact_match,strict-match"]
assert (measured_value - rtol < EXPECTED_VALUE
and measured_value + rtol > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
assert (
measured_value - rtol < EXPECTED_VALUE
and measured_value + rtol > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
del os.environ["VLLM_USE_TRITON_FLASH_ATTN"]
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("scalings",
[[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype,
scalings: list[int]):
@pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype, scalings: list[int]):
torch.manual_seed(0)
hidden_size = 64 * 32
inp = (torch.rand(1, hidden_size, dtype=float_dtype, device="cuda") -
0.5) * 2
inp = (torch.rand(1, hidden_size, dtype=float_dtype, device="cuda") - 0.5) * 2
for i in range(hidden_size // 32):
inp[:, i * 32:(i + 1) *
32] = inp[:, i * 32:(i + 1) * 32] * scalings[i % len(scalings)]
inp[:, i * 32 : (i + 1) * 32] = (
inp[:, i * 32 : (i + 1) * 32] * scalings[i % len(scalings)]
)
inp_kernel = inp.clone()
inp_kernel_clone = inp_kernel.clone()
@@ -225,20 +229,20 @@ def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype,
res_torch = qdq_mxfp4_torch(inp_kernel, "even")
for i in range(hidden_size // 32):
assert torch.all(torch.isfinite(res_hip[:, i * 32:(i + 1) * 32]))
assert torch.all(torch.isfinite(res_torch[:, i * 32:(i + 1) * 32]))
assert torch.all(torch.isfinite(res_hip[:, i * 32 : (i + 1) * 32]))
assert torch.all(torch.isfinite(res_torch[:, i * 32 : (i + 1) * 32]))
torch.testing.assert_close(res_hip[:, i * 32:(i + 1) * 32],
res_torch[:, i * 32:(i + 1) * 32])
torch.testing.assert_close(
res_hip[:, i * 32 : (i + 1) * 32], res_torch[:, i * 32 : (i + 1) * 32]
)
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("scalings",
[[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
def test_mxfp4_dequant_kernel_match_quark(float_dtype: torch.dtype,
scalings: list[int]):
@pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
def test_mxfp4_dequant_kernel_match_quark(
float_dtype: torch.dtype, scalings: list[int]
):
qspec = FP4PerGroupSpec(
ch_axis=-1,
group_size=32,
@@ -265,8 +269,9 @@ def test_mxfp4_dequant_kernel_match_quark(float_dtype: torch.dtype,
# Make it so that different groups have different scales.
for i in range(hidden_size // 32):
w[:, i * 32:(i + 1) *
32] = w[:, i * 32:(i + 1) * 32] * scalings[i % len(scalings)]
w[:, i * 32 : (i + 1) * 32] = (
w[:, i * 32 : (i + 1) * 32] * scalings[i % len(scalings)]
)
observer(w)
scale, _ = observer._calculate_qparams()

View File

@@ -6,18 +6,25 @@ See https://github.com/vllm-project/vllm/issues/11926 for more details.
Run `pytest tests/quantization/test_register_quantization_config.py`.
"""
from typing import Any, Optional
import pytest
import torch
import torch.nn.functional as F
from vllm.model_executor.layers.linear import LinearBase # noqa: E501
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.linear import (
LinearBase, # noqa: E501
UnquantizedLinearMethod,
)
from vllm.model_executor.layers.quantization import (
QuantizationMethods, get_quantization_config, register_quantization_config)
QuantizationMethods,
get_quantization_config,
register_quantization_config,
)
from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501
QuantizationConfig)
QuantizationConfig,
)
class FakeQuantLinearMethod(UnquantizedLinearMethod):
@@ -28,10 +35,12 @@ class FakeQuantLinearMethod(UnquantizedLinearMethod):
super().__init__()
self.num_bits = num_bits
def apply(self,
layer: "torch.nn.Module",
x: "torch.Tensor",
bias: Optional["torch.Tensor"] = None) -> "torch.Tensor":
def apply(
self,
layer: "torch.nn.Module",
x: "torch.Tensor",
bias: Optional["torch.Tensor"] = None,
) -> "torch.Tensor":
"""Perform fake quantization before the linear layer."""
# Calculate the scales dynamically
@@ -40,8 +49,11 @@ class FakeQuantLinearMethod(UnquantizedLinearMethod):
scales = (max_val - min_val) / (2**self.num_bits - 1)
# Fake quantize the input
quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1),
2**(self.num_bits - 1) - 1)
quant_x = torch.clamp(
torch.round(x / scales),
-(2 ** (self.num_bits - 1)),
2 ** (self.num_bits - 1) - 1,
)
dequant_x = quant_x * scales
return F.linear(dequant_x, layer.weight, bias)
@@ -79,8 +91,9 @@ class CustomQuantConfig(QuantizationConfig):
"""Create a config class from the model's quantization config."""
return CustomQuantConfig(num_bits=config.get("num_bits", 8))
def get_quant_method(self, layer: "torch.nn.Module",
prefix: str) -> Optional["FakeQuantLinearMethod"]:
def get_quant_method(
self, layer: "torch.nn.Module", prefix: str
) -> Optional["FakeQuantLinearMethod"]:
"""Get the quantize method to use for the quantized layer."""
if isinstance(layer, LinearBase):
return FakeQuantLinearMethod(num_bits=self.num_bits)
@@ -99,18 +112,20 @@ def test_register_quantization_config():
register_quantization_config("custom_quant")(CustomQuantConfig)
@pytest.mark.parametrize(argnames="model",
argvalues=[
"meta-llama/Llama-3.2-1B-Instruct",
])
@pytest.mark.parametrize(
argnames="model",
argvalues=[
"meta-llama/Llama-3.2-1B-Instruct",
],
)
def test_custom_quant(vllm_runner, model, monkeypatch):
"""Test infer with the custom quantization method."""
# `LLM.apply_model` requires pickling a function.
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
with vllm_runner(model_name=model,
quantization="custom_quant",
enforce_eager=True) as llm:
with vllm_runner(
model_name=model, quantization="custom_quant", enforce_eager=True
) as llm:
def check_model(model):
layer = model.model.layers[0]

View File

@@ -1,9 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright © 2025, Oracle and/or its affiliates.
"""Tests RTN quantization startup and generation,
"""Tests RTN quantization startup and generation,
doesn't test correctness
"""
import pytest
from tests.quantization.utils import is_quant_method_supported
@@ -14,8 +15,10 @@ MODELS = [
]
@pytest.mark.skipif(not is_quant_method_supported("rtn"),
reason="RTN is not supported on this GPU type.")
@pytest.mark.skipif(
not is_quant_method_supported("rtn"),
reason="RTN is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
@@ -27,6 +30,5 @@ def test_model_rtn_startup(
dtype: str,
max_tokens: int,
) -> None:
with vllm_runner(model, dtype=dtype, quantization="rtn") as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -13,12 +13,13 @@ TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_pre_quantized_model(vllm_runner):
with vllm_runner("drisspg/fp8-opt-125m",
quantization="torchao",
dtype="bfloat16",
enforce_eager=True) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
with vllm_runner(
"drisspg/fp8-opt-125m",
quantization="torchao",
dtype="bfloat16",
enforce_eager=True,
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
@@ -28,17 +29,18 @@ def test_pre_quantized_model(vllm_runner):
[
"cuda:0",
# {"": "cuda"},
])
def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
pt_load_map_location):
],
)
def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_location):
torch._dynamo.reset()
model_name = "jerryzh168/opt-125m-int8wo-partial-quant"
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location=pt_load_map_location) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
with vllm_runner(
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location=pt_load_map_location,
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
@@ -47,12 +49,13 @@ def test_opt_125m_int8wo_model_loading_with_params(vllm_runner,
def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
torch._dynamo.reset()
model_name = "jerryzh168/opt-125m-int4wo-per-module"
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0") as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
with vllm_runner(
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0",
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
@@ -61,12 +64,13 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
torch._dynamo.reset()
model_name = "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao"
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0") as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
with vllm_runner(
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0",
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
@@ -75,17 +79,18 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
@pytest.mark.skip(
reason="since torchao nightly is only compatible with torch nightly"
"currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
"torchao tests that requires newer versions (0.14.0.dev+) for now")
"torchao tests that requires newer versions (0.14.0.dev+) for now"
)
def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
torch._dynamo.reset()
model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2"
"-0.14.0.dev")
with vllm_runner(model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0") as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
model_name = "torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2-0.14.0.dev"
with vllm_runner(
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0",
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
@@ -101,22 +106,24 @@ def test_on_the_fly_quant_config_dict_json(vllm_runner):
import json
from torchao.core.config import config_to_dict
from torchao.quantization import (
Float8DynamicActivationFloat8WeightConfig, PerRow)
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
granularity=PerRow())
granularity=PerRow()
)
hf_overrides = {
"quantization_config_dict_json":
json.dumps(config_to_dict(torchao_quant_config))
"quantization_config_dict_json": json.dumps(
config_to_dict(torchao_quant_config)
)
}
with vllm_runner(model_name=model_name,
dtype="bfloat16",
pt_load_map_location="cuda:0",
quantization="torchao",
hf_overrides=hf_overrides) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
with vllm_runner(
model_name=model_name,
dtype="bfloat16",
pt_load_map_location="cuda:0",
quantization="torchao",
hf_overrides=hf_overrides,
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
@@ -132,8 +139,7 @@ def test_on_the_fly_quant_config_file(vllm_runner):
from tempfile import NamedTemporaryFile
from torchao.core.config import config_to_dict
from torchao.quantization import (
Float8DynamicActivationFloat8WeightConfig, PerRow)
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
@@ -144,13 +150,14 @@ def test_on_the_fly_quant_config_file(vllm_runner):
config_file_name = str(f.name)
hf_overrides = {"quantization_config_file": config_file_name}
with vllm_runner(model_name=model_name,
dtype="bfloat16",
pt_load_map_location="cuda:0",
quantization="torchao",
hf_overrides=hf_overrides) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
with vllm_runner(
model_name=model_name,
dtype="bfloat16",
pt_load_map_location="cuda:0",
quantization="torchao",
hf_overrides=hf_overrides,
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
assert output
@@ -160,17 +167,18 @@ def test_reload_weights():
import json
from torchao.core.config import config_to_dict
from torchao.quantization import (
Float8DynamicActivationFloat8WeightConfig, PerRow)
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow
from vllm import LLM, SamplingParams
torchao_quant_config = Float8DynamicActivationFloat8WeightConfig(
granularity=PerRow())
granularity=PerRow()
)
hf_overrides = {
"quantization_config_dict_json":
json.dumps(config_to_dict(torchao_quant_config))
"quantization_config_dict_json": json.dumps(
config_to_dict(torchao_quant_config)
)
}
llm = LLM(
@@ -182,12 +190,9 @@ def test_reload_weights():
hf_overrides=hf_overrides,
)
# Update load format from `dummy` to `auto`
llm.collective_rpc("update_config",
args=({
"load_config": {
"load_format": "auto"
}
}, ))
llm.collective_rpc(
"update_config", args=({"load_config": {"load_format": "auto"}},)
)
# Now reload real weights inplace
llm.collective_rpc("reload_weights")
prompts = [