fp8 online quant: split out Fp8OnlineLinearMethod (#32189)

This commit is contained in:
Vasiliy Kuznetsov
2026-01-20 18:13:22 -05:00
committed by GitHub
parent 22375f8d13
commit d2389c1262
2 changed files with 143 additions and 110 deletions

View File

@@ -133,7 +133,7 @@ def test_kv_cache_model_load_and_run(
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
-def test_load_fp16_model(
+def test_online_quantization(
vllm_runner,
kv_cache_dtype: str,
force_marlin: bool,
@@ -191,6 +191,9 @@ def test_load_fp16_model(
llm.apply_model(check_model)
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
print(outputs[0][1])
@pytest.mark.skipif(
not is_quant_method_supported("fp8"),