diff --git a/tests/models/multimodal/generation/test_memory_leak.py b/tests/models/multimodal/generation/test_memory_leak.py
new file mode 100644
index 000000000..1d8046fdc
--- /dev/null
+++ b/tests/models/multimodal/generation/test_memory_leak.py
@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Check that repeated multimodal chat requests do not grow memory usage."""
+
+import gc
+import random
+import string
+import sys
+import weakref
+
+import pytest
+import torch
+
+from tests.models.registry import HF_EXAMPLE_MODELS
+from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.platforms import current_platform
+from vllm.utils.mem_utils import KiB_bytes, MiB_bytes, format_mib
+
+MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"
+RANDOM_PREFIX_LEN = 100
+TEST_IMAGE_NAMES = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "Grayscale_8bits_palette_sample_image.png",
+]
+MAX_MODEL_LEN = 8192
+REQUESTS_PER_ROUND = 4
+WARMUP_ROUNDS = 1
+MEASURED_ROUNDS = 16
+# Allowed growth over the post-warmup baseline; 0 means no growth is tolerated.
+GPU_GROWTH_THRESHOLD_MIB = 0
+CPU_PEAK_GROWTH_THRESHOLD_MIB = 0
+
+SAMPLING_PARAMS = SamplingParams(
+    temperature=0.0,
+    max_tokens=16,
+)
+
+
+def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
+    """Build one user chat message with a random text prefix and one image."""
+    # Avoid obscuring memory leaks because of prefix caching
+    random_text = "".join(random.choices(string.ascii_uppercase, k=RANDOM_PREFIX_LEN))
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": f"Ignore this random string: {random_text}",
+                },
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "text",
+                    "text": "Describe this image in one short sentence.",
+                },
+            ],
+        }
+    ]
+
+
+def _build_request_batch(
+    image_urls: list[str],
+) -> list[list[ChatCompletionMessageParam]]:
+    """Build REQUESTS_PER_ROUND conversations, cycling through image_urls."""
+    return [
+        _make_messages(image_urls[i % len(image_urls)])
+        for i in range(REQUESTS_PER_ROUND)
+    ]
+
+
+def _ru_maxrss_bytes() -> int | None:
+    """Return this process's peak RSS in bytes, or None without `resource`."""
+    try:
+        import resource
+    except ImportError:
+        return None
+
+    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    if rss <= 0:
+        return 0
+
+    # Linux reports kilobytes, macOS reports bytes.
+    return rss if sys.platform == "darwin" else rss * KiB_bytes
+
+
+def _gpu_used_bytes() -> int:
+    """Return device memory in use (total minus free) after synchronizing."""
+    torch.accelerator.synchronize()
+    free_bytes, total_bytes = current_platform.mem_get_info()
+    return int(total_bytes - free_bytes)
+
+
+def _format_mib(num_bytes: int | None) -> str:
+    """Format a byte count for assertion messages; None becomes "n/a"."""
+    if num_bytes is None:
+        return "n/a"
+
+    return f"{format_mib(num_bytes)} MiB"
+
+
+@pytest.fixture(scope="function")
+def llm(monkeypatch):
+    """Yield a weak proxy to a freshly loaded LLM and clean up afterwards."""
+    # Run the skip checks before loading the model: fixture setup happens
+    # before the test body, so checking there would be too late to skip.
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(MODEL_NAME)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    llm_kwargs = dict(
+        model=MODEL_NAME,
+        enforce_eager=True,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=REQUESTS_PER_ROUND,
+        limit_mm_per_prompt={"image": 1},
+        seed=0,
+        disable_log_stats=True,
+        gpu_memory_utilization=0.8,
+    )
+    if current_platform.is_rocm():
+        llm_kwargs["attention_backend"] = "TRITON_ATTN"
+
+    llm = LLM(**llm_kwargs)
+
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("image_urls", [TEST_IMAGE_NAMES], indirect=True)
+def test_qwen3_vl_no_memory_leak(llm, image_urls: list[str]) -> None:
+    """Assert GPU usage and CPU peak RSS do not grow past the warmup baseline."""
+    request_batch = _build_request_batch(image_urls)
+
+    # Establish a warmup baseline after model load and the first multimodal
+    # requests complete. Later rounds should remain near this steady state.
+    for _ in range(WARMUP_ROUNDS):
+        outputs = llm.chat(request_batch, sampling_params=SAMPLING_PARAMS)
+        assert len(outputs) == len(request_batch)
+        assert llm.llm_engine.get_num_unfinished_requests() == 0
+        del outputs
+
+    gc.collect()
+    warmup_gpu = _gpu_used_bytes()
+    warmup_cpu_peak = _ru_maxrss_bytes()
+
+    post_warmup_gpu_samples: list[int] = []
+    post_warmup_cpu_peak_samples: list[int] = []
+
+    for _ in range(MEASURED_ROUNDS):
+        outputs = llm.chat(request_batch, sampling_params=SAMPLING_PARAMS)
+        assert len(outputs) == len(request_batch)
+        assert llm.llm_engine.get_num_unfinished_requests() == 0
+        del outputs
+
+        gc.collect()
+        post_warmup_gpu_samples.append(_gpu_used_bytes())
+        cpu_peak = _ru_maxrss_bytes()
+        if cpu_peak is not None:
+            post_warmup_cpu_peak_samples.append(cpu_peak)
+
+    gpu_growth = max(post_warmup_gpu_samples) - warmup_gpu
+    gpu_threshold = GPU_GROWTH_THRESHOLD_MIB * MiB_bytes
+
+    assert gpu_growth <= gpu_threshold, (
+        "Qwen3-VL GPU memory kept growing after warmup. "
+        f"warmup_baseline={_format_mib(warmup_gpu)}, "
+        f"post_warmup_samples={[_format_mib(x) for x in post_warmup_gpu_samples]}, "
+        f"gpu_growth={_format_mib(gpu_growth)}, "
+        f"gpu_threshold={GPU_GROWTH_THRESHOLD_MIB} MiB"
+    )
+
+    if warmup_cpu_peak is not None and post_warmup_cpu_peak_samples:
+        cpu_peak_growth = max(post_warmup_cpu_peak_samples) - warmup_cpu_peak
+        cpu_threshold = CPU_PEAK_GROWTH_THRESHOLD_MIB * MiB_bytes
+
+        assert cpu_peak_growth <= cpu_threshold, (
+            "Qwen3-VL CPU peak RSS kept growing after warmup. "
+            f"warmup_ru_maxrss={_format_mib(warmup_cpu_peak)}, "
+            f"post_warmup_ru_maxrss={[_format_mib(x) for x in post_warmup_cpu_peak_samples]}, "  # noqa: E501
+            f"cpu_peak_growth={_format_mib(cpu_peak_growth)}, "
+            f"cpu_peak_threshold={CPU_PEAK_GROWTH_THRESHOLD_MIB} MiB"
+        )
diff --git a/vllm/utils/mem_constants.py b/vllm/utils/mem_constants.py
index 62b725fbb..b2f9a037c 100644
--- a/vllm/utils/mem_constants.py
+++ b/vllm/utils/mem_constants.py
@@ -1,5 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+KB_bytes = 1_000
+"""The number of bytes in one kilobyte (KB)."""
+
+KiB_bytes = 1 << 10
+"""The number of bytes in one kibibyte (KiB)."""
+
 MB_bytes = 1_000_000
 """The number of bytes in one megabyte (MB)."""
 
diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index e6a60a0c1..1b723f3f1 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -13,7 +13,11 @@ import torch.types
 
 from vllm.platforms import current_platform
 
-from .mem_constants import GiB_bytes, MiB_bytes
+from .mem_constants import GiB_bytes, KiB_bytes, MiB_bytes
+
+
+def format_kib(b: int) -> str:
+    return f"{round(b / KiB_bytes, 2)}"
 
 
 def format_mib(b: int) -> str: