fp8 online quant: split out Fp8OnlineLinearMethod (#32189)

2026-01-20 18:13:22 -05:00
parent 22375f8d13
commit d2389c1262
2 changed files with 143 additions and 110 deletions
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -133,7 +133,7 @@ def test_kv_cache_model_load_and_run(
@pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
 )
-def test_load_fp16_model(
+def test_online_quantization(
    vllm_runner,
    kv_cache_dtype: str,
    force_marlin: bool,
@@ -191,6 +191,9 @@ def test_load_fp16_model(

        llm.apply_model(check_model)

+        outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
+        print(outputs[0][1])
+

@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),