[Tests] Add Qwen3-VL multimodal memory leak check (#39268)

Signed-off-by: Lalit Laxminarayan Bangad <lalitbangad@gmail.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-04-09 04:54:46 -07:00
parent df2503e125
commit 91eea72330
3 changed files with 193 additions and 1 deletions
--- a/tests/models/multimodal/generation/test_memory_leak.py
+++ b/tests/models/multimodal/generation/test_memory_leak.py
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import gc
+import random
+import string
+import sys
+import weakref
+
+import pytest
+import torch
+
+from tests.models.registry import HF_EXAMPLE_MODELS
+from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.platforms import current_platform
+from vllm.utils.mem_utils import KiB_bytes, MiB_bytes, format_mib
+
+# Model ID used for the registry lookup and for the LLM under test.
+MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"
+# Number of random uppercase letters placed at the start of every prompt.
+RANDOM_PREFIX_LEN = 100
+# Passed to the test through the indirectly parametrized `image_urls` argument
+# (the fixture that resolves these names is defined outside this file).
+TEST_IMAGE_NAMES = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "Grayscale_8bits_palette_sample_image.png",
+]
+# Forwarded to the LLM as `max_model_len`.
+MAX_MODEL_LEN = 8192
+# Conversations per `llm.chat` call; also forwarded as `max_num_seqs`.
+REQUESTS_PER_ROUND = 4
+# Rounds run before the baseline memory readings are taken.
+WARMUP_ROUNDS = 1
+# Rounds run, and sampled, after the baseline.
+MEASURED_ROUNDS = 16
+# Allowed growth over the warmup baseline, in MiB. Zero means any growth fails.
+GPU_GROWTH_THRESHOLD_MIB = 0
+CPU_PEAK_GROWTH_THRESHOLD_MIB = 0
+
+# Sampling settings shared by every request in every round.
+SAMPLING_PARAMS = SamplingParams(
+    temperature=0.0,
+    max_tokens=16,
+)
+
+
+def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
+    # Avoid obscuring memory leaks because of prefix caching
+    random_text = "".join(random.choices(string.ascii_uppercase, k=RANDOM_PREFIX_LEN))
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": f"Ignore this random string: {random_text}",
+                },
+                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "text",
+                    "text": "Describe this image in one short sentence.",
+                },
+            ],
+        }
+    ]
+
+
+def _build_request_batch(
+    image_urls: list[str],
+) -> list[list[ChatCompletionMessageParam]]:
+    return [
+        _make_messages(image_urls[i % len(image_urls)])
+        for i in range(REQUESTS_PER_ROUND)
+    ]
+
+
+def _ru_maxrss_bytes() -> int | None:
+    try:
+        import resource
+    except ImportError:
+        return None
+
+    rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    if rss <= 0:
+        return 0
+
+    # Linux reports kilobytes, macOS reports bytes.
+    return rss if sys.platform == "darwin" else rss * KiB_bytes
+
+
+def _gpu_used_bytes() -> int:
+    torch.accelerator.synchronize()
+    free_bytes, total_bytes = current_platform.mem_get_info()
+    return int(total_bytes - free_bytes)
+
+
+def _format_mib(num_bytes: int | None) -> str:
+    if num_bytes is None:
+        return "n/a"
+
+    return f"{format_mib(num_bytes)} MiB"
+
+
+@pytest.fixture(scope="function")
+def llm(monkeypatch):
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm_kwargs = dict(
+        model=MODEL_NAME,
+        enforce_eager=True,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_seqs=REQUESTS_PER_ROUND,
+        limit_mm_per_prompt={"image": 1},
+        seed=0,
+        disable_log_stats=True,
+        gpu_memory_utilization=0.8,
+    )
+    if current_platform.is_rocm():
+        llm_kwargs["attention_backend"] = "TRITON_ATTN"
+
+    llm = LLM(**llm_kwargs)
+
+    yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("image_urls", [TEST_IMAGE_NAMES], indirect=True)
+def test_qwen3_vl_no_memory_leak(llm, image_urls: list[str]) -> None:
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(MODEL_NAME)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+
+    request_batch = _build_request_batch(image_urls)
+
+    # Establish a warmup baseline after model load and the first multimodal
+    # requests complete. Later rounds should remain near this steady state.
+    for _ in range(WARMUP_ROUNDS):
+        outputs = llm.chat(request_batch, sampling_params=SAMPLING_PARAMS)
+        assert len(outputs) == len(request_batch)
+        assert llm.llm_engine.get_num_unfinished_requests() == 0
+        del outputs
+
+    gc.collect()
+    warmup_gpu = _gpu_used_bytes()
+    warmup_cpu_peak = _ru_maxrss_bytes()
+
+    post_warmup_gpu_samples: list[int] = []
+    post_warmup_cpu_peak_samples: list[int] = []
+
+    for _ in range(MEASURED_ROUNDS):
+        outputs = llm.chat(request_batch, sampling_params=SAMPLING_PARAMS)
+        assert len(outputs) == len(request_batch)
+        assert llm.llm_engine.get_num_unfinished_requests() == 0
+        del outputs
+
+        gc.collect()
+        post_warmup_gpu_samples.append(_gpu_used_bytes())
+        cpu_peak = _ru_maxrss_bytes()
+        if cpu_peak is not None:
+            post_warmup_cpu_peak_samples.append(cpu_peak)
+
+    gpu_growth = max(post_warmup_gpu_samples) - warmup_gpu
+    gpu_threshold = GPU_GROWTH_THRESHOLD_MIB * MiB_bytes
+
+    assert gpu_growth <= gpu_threshold, (
+        "Qwen3-VL GPU memory kept growing after warmup. "
+        f"warmup_baseline={_format_mib(warmup_gpu)}, "
+        f"post_warmup_samples={[_format_mib(x) for x in post_warmup_gpu_samples]}, "
+        f"gpu_growth={_format_mib(gpu_growth)}, "
+        f"gpu_threshold={GPU_GROWTH_THRESHOLD_MIB} MiB"
+    )
+
+    if warmup_cpu_peak is not None and post_warmup_cpu_peak_samples:
+        cpu_peak_growth = max(post_warmup_cpu_peak_samples) - warmup_cpu_peak
+        cpu_threshold = CPU_PEAK_GROWTH_THRESHOLD_MIB * MiB_bytes
+
+        assert cpu_peak_growth <= cpu_threshold, (
+            "Qwen3-VL CPU peak RSS kept growing after warmup. "
+            f"warmup_ru_maxrss={_format_mib(warmup_cpu_peak)}, "
+            f"post_warmup_ru_maxrss={[_format_mib(x) for x in post_warmup_cpu_peak_samples]}, "  # noqa: E501
+            f"cpu_peak_growth={_format_mib(cpu_peak_growth)}, "
+            f"cpu_peak_threshold={CPU_PEAK_GROWTH_THRESHOLD_MIB} MiB"
+        )