# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import logging

import regex as re

from vllm.model_executor.layers.quantization import get_quantization_config
from vllm.platforms import current_platform


def is_quant_method_supported(quant_method: str) -> bool:
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not (current_platform.is_cuda() or current_platform.is_rocm()):
        return False

    try:
        current_platform.verify_quantization(quant_method)
    except ValueError:
        return False

    capability = current_platform.get_device_capability()
    assert capability is not None

    min_capability = get_quantization_config(quant_method).get_min_capability()

    return capability.to_int() >= min_capability


def _test_online_quant_peak_mem_impl(
    quantization_arg_value,
    vllm_runner,
    caplog_mp_spawn,
    monkeypatch,
) -> None:
    # Note: `allenai/OLMoE-1B-7B-0125-Instruct` was selected because:
    # 1. it covers both Linear and MoE paths
    # 2. it is already used by other tests in CI, so adding it here
    #    does not increase disk space for CI runners
    # I really wanted to use `ibm-granite/granite-3.0-1b-a400m-base`,
    # which I think is the smallest MoE model in vLLM (2.5 GiB bf16,
    # 1.3 GiB fp8), but could not, as adding one more model makes CI
    # run out of disk space.
    model_name = "allenai/OLMoE-1B-7B-0125-Instruct"

    # Force spawn to ensure caplog_mp_spawn works consistently
    # (it relies on VLLM_LOGGING_CONFIG_PATH, which spawn reads but
    # fork ignores).
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")

    with (
        caplog_mp_spawn(logging.DEBUG) as log_holder,
        vllm_runner(
            model_name,
            quantization=quantization_arg_value,
            enforce_eager=True,
        ) as llm,
    ):
        outputs = llm.generate_greedy(["The future of AI is"], max_tokens=4)
        print(outputs[0][1])
    log_text = log_holder.text

    # Parse memory usage from captured logs
    model_memory_gib = None
    peak_memory_gib = None
    for line in log_text.splitlines():
        if model_memory_gib is None:
            match = re.search(r"Model loading took ([\d.]+) GiB memory", line)
            if match:
                model_memory_gib = float(match.group(1))
        if peak_memory_gib is None:
            match = re.search(
                r"Peak GPU memory after loading weights: ([\d.]+) GiB", line
            )
            if match:
                peak_memory_gib = float(match.group(1))

    assert model_memory_gib is not None, "Could not find model loading memory log"
    assert peak_memory_gib is not None, "Could not find peak memory log"

    print(f"GPU memory used after loading weights: {model_memory_gib} GiB")
    print(f"Peak GPU memory usage while loading weights: {peak_memory_gib} GiB")

    # Model-specific: allenai/OLMoE-1B-7B-0125-Instruct fp8 online quant
    # uses 6.65 GiB for weight loading (the bf16 checkpoint is ~12.89 GiB).
    expected_model_memory_gib = 6.7
    # For allenai/OLMoE-1B-7B-0125-Instruct the number we see today is
    # 9.06 GiB, which is 1.36x above model_memory_gib. A slightly higher
    # number is expected, as when we load and quantize weights in a
    # streaming fashion we need to have individual weights in bf16 + fp8
    # alive at the same time.
    expected_peak_memory_gib = expected_model_memory_gib * 1.4

    assert model_memory_gib < expected_model_memory_gib, (
        f"{model_memory_gib=} higher than {expected_model_memory_gib}"
    )
    assert peak_memory_gib < expected_peak_memory_gib, (
        f"{peak_memory_gib=} higher than {expected_peak_memory_gib}"
    )
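

# A hedged usage sketch (not part of the original file): one plausible way
# the `_test_online_quant_peak_mem_impl` helper above could be wired into a
# pytest test. The fixture names (vllm_runner, caplog_mp_spawn, monkeypatch)
# come from the helper's signature; the test name, the "fp8" argument, and
# guarding with is_quant_method_supported are assumptions based on common
# vLLM test conventions, not confirmed by this file.
import pytest  # would normally live with the imports at the top of the file


@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="fp8 is not supported on this GPU type",
)
def test_online_quant_peak_mem_fp8(
    vllm_runner, caplog_mp_spawn, monkeypatch
) -> None:
    # Delegates to the shared implementation with online fp8 quantization.
    _test_online_quant_peak_mem_impl(
        "fp8", vllm_runner, caplog_mp_spawn, monkeypatch
    )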