[Frontend] new online quantization frontend (#38138)

Signed-off-by: Vasiliy Kuznetsov <vasiliy@meta.com>
This commit is contained in:
Vasiliy Kuznetsov
2026-04-03 11:58:39 -04:00
committed by GitHub
parent 97f92c6b47
commit 7b1a7423be
13 changed files with 1205 additions and 0 deletions

View File

@@ -0,0 +1,179 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests online quantization."""
import pytest
import torch
from tests.quantization.utils import (
_test_online_quant_peak_mem_impl,
is_quant_method_supported,
)
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.online.fp8 import (
Fp8PerBlockOnlineLinearMethod,
Fp8PerBlockOnlineMoEMethod,
Fp8PerTensorOnlineLinearMethod,
Fp8PerTensorOnlineMoEMethod,
)
from vllm.platforms import current_platform
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize(
    "quant_scheme,online_quant_args,expected_linear_cls,expected_moe_cls",
    [
        # plain quantization='fp8_per_tensor', no extra config
        (
            "fp8_per_tensor",
            None,
            Fp8PerTensorOnlineLinearMethod,
            Fp8PerTensorOnlineMoEMethod,
        ),
        # plain quantization='fp8_per_block', no extra config
        (
            "fp8_per_block",
            None,
            Fp8PerBlockOnlineLinearMethod,
            Fp8PerBlockOnlineMoEMethod,
        ),
        # quantization='online' with linear_scheme_override and
        # moe_scheme_override
        (
            "online",
            {
                "linear_scheme_override": "fp8_per_block",
                "moe_scheme_override": "fp8_per_tensor",
            },
            Fp8PerBlockOnlineLinearMethod,
            Fp8PerTensorOnlineMoEMethod,
        ),
        # ignore list holding one direct layer name and one regex
        (
            "fp8_per_tensor",
            # qkv_proj is fused from q_proj/k_proj/v_proj, so currently the
            # ignore regex must match the unfused shard names
            # TODO(future PR): also make 're:.*qkv_proj.*' work
            {"ignore": ["model.layers.1.self_attn.o_proj", "re:.*[qkv]_proj"]},
            Fp8PerTensorOnlineLinearMethod,
            Fp8PerTensorOnlineMoEMethod,
        ),
    ],
)
@pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_online_quantization(
    vllm_runner,
    quant_scheme: str,
    online_quant_args: dict | None,
    expected_linear_cls,
    expected_moe_cls,
    use_rocm_aiter: bool,
    monkeypatch,
) -> None:
    """
    Checks the online quantization frontend configuration: picking a
    quant scheme, overriding the scheme per layer type, and ignoring
    layers.

    Each case loads the model with the given `quantization` value (and
    optional `quantization_config`), inspects the quant method attached
    to selected layers, then runs a short greedy generation.

    Performance and peak memory usage are not covered here.
    """
    if use_rocm_aiter:
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

    # `LLM.apply_model` requires pickling a function.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    # a tiny model with both dense and MoE layers
    model_name = "ibm-granite/granite-3.0-1b-a400m-base"

    runner_kwargs = {"quantization": quant_scheme, "enforce_eager": True}
    if online_quant_args is not None:
        runner_kwargs["quantization_config"] = online_quant_args

    def check_model(model):
        # the layer paths used below are hardcoded for this model
        assert model_name == "ibm-granite/granite-3.0-1b-a400m-base"

        decoder_layers = model.model.layers
        first_layer = decoder_layers[0]
        o_proj = first_layer.self_attn.o_proj
        moe = first_layer.block_sparse_moe.experts

        # the test inputs are crafted so that o_proj and moe in layer 0
        # are never on the ignore list, i.e. they are always quantized
        assert isinstance(o_proj.quant_method, expected_linear_cls)
        if moe is not None:
            assert isinstance(moe.quant_method, expected_moe_cls)

        if current_platform.is_cuda():
            assert o_proj.weight.dtype == torch.float8_e4m3fn
        elif current_platform.is_rocm():
            assert o_proj.weight.dtype == current_platform.fp8_dtype()
        else:
            pytest.skip("Only runs on CUDA and ROCm.")

        # The remaining checks only apply when an ignore list was given.
        if not (isinstance(online_quant_args, dict) and "ignore" in online_quant_args):
            return

        # only model.layers.1.self_attn.o_proj is skipped
        for layer_idx, layer in enumerate(decoder_layers):
            wanted_cls = (
                UnquantizedLinearMethod if layer_idx == 1 else expected_linear_cls
            )
            assert isinstance(layer.self_attn.o_proj.quant_method, wanted_cls)

        # every .*self_attn.qkv_proj is skipped
        for layer in decoder_layers:
            qkv_proj = layer.self_attn.qkv_proj
            assert isinstance(qkv_proj.quant_method, UnquantizedLinearMethod)

    with vllm_runner(model_name, **runner_kwargs) as llm:
        llm.apply_model(check_model)

        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        print(outputs[0][1])
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="FP8 is not supported on this GPU type.",
)
def test_online_quant_peak_mem(
    vllm_runner,
    caplog_mp_spawn,
    monkeypatch,
) -> None:
    """
    Runs the shared peak-memory check for the `fp8_per_tensor` online
    quantization scheme.

    All of the work is delegated to `_test_online_quant_peak_mem_impl`;
    this test only selects the quantization value and forwards the
    fixtures.
    """
    _test_online_quant_peak_mem_impl(
        quantization_arg_value="fp8_per_tensor",
        vllm_runner=vllm_runner,
        caplog_mp_spawn=caplog_mp_spawn,
        monkeypatch=monkeypatch,
    )
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="FP8 is not supported on this GPU type.",
)
def test_online_quant_load_format_dummy(
    vllm_runner,
    monkeypatch,
    caplog,
) -> None:
    """
    Smoke test: `fp8_per_tensor` online quantization combined with
    `load_format="dummy"` must load and complete a short greedy
    generation.

    Nothing is asserted about the generated text; it is only printed.
    """
    model_name = "ibm-granite/granite-3.0-1b-a400m-base"
    prompts = ["The future of AI is"]

    with vllm_runner(
        model_name,
        quantization="fp8_per_tensor",
        enforce_eager=True,
        load_format="dummy",
    ) as llm:
        outputs = llm.generate_greedy(prompts, max_tokens=4)
        # first prompt's result; index 1 is what the original test prints
        print(outputs[0][1])

View File

@@ -1,6 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
import regex as re
from vllm.model_executor.layers.quantization import get_quantization_config
from vllm.platforms import current_platform
@@ -21,3 +25,74 @@ def is_quant_method_supported(quant_method: str) -> bool:
min_capability = get_quantization_config(quant_method).get_min_capability()
return capability.to_int() >= min_capability
def _test_online_quant_peak_mem_impl(
    quantization_arg_value,
    vllm_runner,
    caplog_mp_spawn,
    monkeypatch,
) -> None:
    """
    Loads a model with online quantization and checks the memory figures
    that show up in the captured logs.

    The model is loaded with `quantization=quantization_arg_value`, a
    short greedy generation is run, and the captured DEBUG logs are
    searched for the first "Model loading took ... GiB memory" line and
    the first "Peak GPU memory after loading weights: ... GiB" line.
    Both numbers must be found and must stay below the model-specific
    thresholds hardcoded at the end of this function.

    Args:
        quantization_arg_value: value forwarded as `quantization=` to
            `vllm_runner`.
        vllm_runner: fixture used as a context manager yielding the LLM.
        caplog_mp_spawn: fixture used as a context manager; called with a
            log level, it yields a holder whose `.text` is parsed here.
        monkeypatch: pytest fixture used to set environment variables.
    """
    # Note: `allenai/OLMoE-1B-7B-0125-Instruct` was selected because:
    # 1. it covers both Linear and MoE paths
    # 2. it is already used by other tests in CI, so adding it here
    #    does not increase disk space for CI runners
    # `ibm-granite/granite-3.0-1b-a400m-base` would have been preferred
    # (believed to be the smallest MoE model in vLLM: 2.5 GiB bf16,
    # 1.3 GiB fp8), but adding one more model made CI run out of disk
    # space.
    model_name = "allenai/OLMoE-1B-7B-0125-Instruct"

    # Force spawn to ensure caplog_mp_spawn works consistently
    # (it relies on VLLM_LOGGING_CONFIG_PATH which spawn reads but fork ignores)
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    with (
        caplog_mp_spawn(logging.DEBUG) as log_holder,
        vllm_runner(
            model_name,
            quantization=quantization_arg_value,
            enforce_eager=True,
        ) as llm,
    ):
        outputs = llm.generate_greedy(["The future of AI is"], max_tokens=4)
        print(outputs[0][1])

    log_text = log_holder.text
    log_lines = log_text.splitlines()

    def first_reported_gib(pattern):
        # Value captured by the first line matching `pattern`, or None when
        # no line matches.
        hits = (re.search(pattern, line) for line in log_lines)
        found = next((hit for hit in hits if hit), None)
        return None if found is None else float(found.group(1))

    # Parse memory usage from captured logs
    model_memory_gib = first_reported_gib(r"Model loading took ([\d.]+) GiB memory")
    peak_memory_gib = first_reported_gib(
        r"Peak GPU memory after loading weights: ([\d.]+) GiB"
    )

    assert model_memory_gib is not None, "Could not find model loading memory log"
    assert peak_memory_gib is not None, "Could not find peak memory log"
    print(f"GPU memory used after loading weights: {model_memory_gib} GiB")
    print(f"Peak GPU memory usage while loading weights: {peak_memory_gib} GiB")

    # model specific, allenai/OLMoE-1B-7B-0125-Instruct fp8 online quant
    # uses 6.65 GiB for weight loading (bf16 checkpoint is ~12.89 GiB)
    expected_model_memory_gib = 6.7

    # for allenai/OLMoE-1B-7B-0125-Instruct the number we see today is 9.06
    # GiB, which is 1.36x above model_memory_gib. A slightly higher number is
    # expected as when we load and quantize weights in a streaming fashion we
    # need to have individual weights in bf16 + fp8 alive at the same time.
    expected_peak_memory_gib = expected_model_memory_gib * 1.4

    assert model_memory_gib < expected_model_memory_gib, (
        f"{model_memory_gib=} higher than {expected_model_memory_gib}"
    )
    assert peak_memory_gib < expected_peak_memory_gib, (
        f"{peak_memory_gib=} higher than {expected_peak_memory_gib}"
    )