[Frontend] new online quantization frontend (#38138)

Signed-off-by: Vasiliy Kuznetsov <vasiliy@meta.com>
2026-04-03 11:58:39 -04:00
parent 97f92c6b47
commit 7b1a7423be
13 changed files with 1205 additions and 0 deletions
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import logging
+
+import regex as re
+
 from vllm.model_executor.layers.quantization import get_quantization_config
 from vllm.platforms import current_platform

@@ -21,3 +25,74 @@ def is_quant_method_supported(quant_method: str) -> bool:
    min_capability = get_quantization_config(quant_method).get_min_capability()

    return capability.to_int() >= min_capability
+
+
+def _test_online_quant_peak_mem_impl(
+    quantization_arg_value,
+    vllm_runner,
+    caplog_mp_spawn,
+    monkeypatch,
+) -> None:
+    # Note: `allenai/OLMoE-1B-7B-0125-Instruct` was selected because:
+    # 1. it covers both Linear and MoE paths
+    # 2. it is already used by other tests in CI, so adding it here
+    #    does not increase disk space for CI runners
+    # I really wanted to use `ibm-granite/granite-3.0-1b-a400m-base`
+    # which I think is the smallest MoE model in vLLM (2.5 GiB bf16,
+    # 1.3 GiB fp8), but could not as adding one more model makes CI
+    # run out of disk space.
+    model_name = "allenai/OLMoE-1B-7B-0125-Instruct"
+
+    # Force spawn to ensure caplog_mp_spawn works consistently
+    # (it relies on VLLM_LOGGING_CONFIG_PATH which spawn reads but fork ignores)
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+    with (
+        caplog_mp_spawn(logging.DEBUG) as log_holder,
+        vllm_runner(
+            model_name,
+            quantization=quantization_arg_value,
+            enforce_eager=True,
+        ) as llm,
+    ):
+        outputs = llm.generate_greedy(["The future of AI is"], max_tokens=4)
+        print(outputs[0][1])
+
+    log_text = log_holder.text
+
+    # Parse memory usage from captured logs
+    model_memory_gib = None
+    peak_memory_gib = None
+    for line in log_text.splitlines():
+        if model_memory_gib is None:
+            match = re.search(r"Model loading took ([\d.]+) GiB memory", line)
+            if match:
+                model_memory_gib = float(match.group(1))
+        if peak_memory_gib is None:
+            match = re.search(
+                r"Peak GPU memory after loading weights: ([\d.]+) GiB", line
+            )
+            if match:
+                peak_memory_gib = float(match.group(1))
+
+    assert model_memory_gib is not None, "Could not find model loading memory log"
+    assert peak_memory_gib is not None, "Could not find peak memory log"
+    print(f"GPU memory used after loading weights: {model_memory_gib} GiB")
+    print(f"Peak GPU memory usage while loading weights: {peak_memory_gib} GiB")
+
+    # model specific, allenai/OLMoE-1B-7B-0125-Instruct fp8 online quant
+    # uses 6.65 GiB for weight loading (bf16 checkpoint is ~12.89 GiB)
+    expected_model_memory_gib = 6.7
+
+    # for allenai/OLMoE-1B-7B-0125-Instruct the number we see today is 9.06
+    # GiB, which is 1.36x above model_memory_gib. A slightly higher number is
+    # expected as when we load and quantize weights in a streaming fashion we
+    # need to have individual weights in bf16 + fp8 alive at the same time.
+    expected_peak_memory_gib = expected_model_memory_gib * 1.4
+
+    assert model_memory_gib < expected_model_memory_gib, (
+        f"{model_memory_gib=} higher than {expected_model_memory_gib}"
+    )
+    assert peak_memory_gib < expected_peak_memory_gib, (
+        f"{peak_memory_gib=} higher than {expected_peak_memory_gib}"
+    )