[Tests] Add Qwen3-VL multimodal memory leak check (#39268)
Signed-off-by: Lalit Laxminarayan Bangad <lalitbangad@gmail.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
182
tests/models/multimodal/generation/test_memory_leak.py
Normal file
182
tests/models/multimodal/generation/test_memory_leak.py
Normal file
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import gc
|
||||
import random
|
||||
import string
|
||||
import sys
|
||||
import weakref
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.models.registry import HF_EXAMPLE_MODELS
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.mem_utils import KiB_bytes, MiB_bytes, format_mib
|
||||
|
||||
# Hugging Face model id loaded by the ``llm`` fixture.
MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"

# Number of random uppercase letters placed at the start of every prompt.
RANDOM_PREFIX_LEN = 100

# Image names handed to the ``image_urls`` fixture via indirect parametrization.
TEST_IMAGE_NAMES = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "Grayscale_8bits_palette_sample_image.png",
]

# Engine sizing: ``max_model_len``, and the number of requests per round
# (also used as ``max_num_seqs``).
MAX_MODEL_LEN = 8192
REQUESTS_PER_ROUND = 4

# Rounds run before the baseline is recorded, and rounds sampled afterwards.
WARMUP_ROUNDS = 1
MEASURED_ROUNDS = 16

# Allowed growth over the warmup baseline, in MiB; 0 tolerates no growth.
GPU_GROWTH_THRESHOLD_MIB = 0
CPU_PEAK_GROWTH_THRESHOLD_MIB = 0

# Temperature 0.0 and at most 16 generated tokens for every request.
SAMPLING_PARAMS = SamplingParams(temperature=0.0, max_tokens=16)
|
||||
|
||||
|
||||
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
    """Build a one-message user conversation asking to describe one image.

    The message content has three parts, in order: a text part carrying
    ``RANDOM_PREFIX_LEN`` random uppercase ASCII letters, the image at
    ``image_url``, and the fixed "describe this image" instruction.
    """
    # Avoid obscuring memory leaks because of prefix caching: every prompt
    # starts with its own random string.
    prefix_chars = random.choices(string.ascii_uppercase, k=RANDOM_PREFIX_LEN)
    noise = "".join(prefix_chars)

    user_message: ChatCompletionMessageParam = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": f"Ignore this random string: {noise}",
            },
            {"type": "image_url", "image_url": {"url": image_url}},
            {
                "type": "text",
                "text": "Describe this image in one short sentence.",
            },
        ],
    }
    return [user_message]
|
||||
|
||||
|
||||
def _build_request_batch(
    image_urls: list[str],
) -> list[list[ChatCompletionMessageParam]]:
    """Build ``REQUESTS_PER_ROUND`` conversations, cycling through ``image_urls``.

    Request ``k`` uses ``image_urls[k % len(image_urls)]``, so the images are
    reused round-robin when there are fewer images than requests.
    """
    num_images = len(image_urls)
    return [
        _make_messages(image_urls[request_idx % num_images])
        for request_idx in range(REQUESTS_PER_ROUND)
    ]
|
||||
|
||||
|
||||
def _ru_maxrss_bytes() -> int | None:
    """Return this process's ``ru_maxrss`` converted to bytes.

    Returns:
        ``None`` when the ``resource`` module cannot be imported, ``0`` when
        the reported value is not positive, otherwise the value in bytes.
    """
    try:
        import resource
    except ImportError:
        return None

    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if peak <= 0:
        return 0

    # Linux reports kilobytes, macOS reports bytes.
    if sys.platform == "darwin":
        return peak
    return peak * KiB_bytes
|
||||
|
||||
|
||||
def _gpu_used_bytes() -> int:
    """Return total minus free device memory, in bytes.

    The accelerator is synchronized first; the free/total figures come from
    ``current_platform.mem_get_info()``.
    """
    torch.accelerator.synchronize()
    mem_free, mem_total = current_platform.mem_get_info()
    used = mem_total - mem_free
    return int(used)
|
||||
|
||||
|
||||
def _format_mib(num_bytes: int | None) -> str:
    """Render a byte count for assertion messages.

    Returns ``"n/a"`` for ``None``; otherwise the ``format_mib`` rendering of
    ``num_bytes`` followed by ``" MiB"``.
    """
    return "n/a" if num_bytes is None else f"{format_mib(num_bytes)} MiB"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def llm(monkeypatch):
    """Yield a weak proxy to a freshly built ``LLM`` for ``MODEL_NAME``.

    Setup skips the test when the model is not available online or the
    installed transformers version is unsupported, sets
    ``VLLM_ENABLE_V1_MULTIPROCESSING`` to ``"0"``, and builds the engine.
    Teardown drops the only strong reference to the engine and then calls
    ``cleanup_dist_env_and_memory()``.
    """
    # Fixtures are set up before the test body runs, so the availability
    # checks must happen here; otherwise the model would be loaded before a
    # skip could be issued.
    model_info = HF_EXAMPLE_MODELS.find_hf_info(MODEL_NAME)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    llm_kwargs = dict(
        model=MODEL_NAME,
        enforce_eager=True,
        max_model_len=MAX_MODEL_LEN,
        max_num_seqs=REQUESTS_PER_ROUND,
        limit_mm_per_prompt={"image": 1},
        seed=0,
        disable_log_stats=True,
        gpu_memory_utilization=0.8,
    )
    if current_platform.is_rocm():
        llm_kwargs["attention_backend"] = "TRITON_ATTN"

    llm = LLM(**llm_kwargs)

    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    yield weakref.proxy(llm)

    del llm

    cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def _run_round(
    llm,
    request_batch: list[list[ChatCompletionMessageParam]],
) -> None:
    """Run one chat batch and check that every request completed."""
    outputs = llm.chat(request_batch, sampling_params=SAMPLING_PARAMS)
    assert len(outputs) == len(request_batch)
    assert llm.llm_engine.get_num_unfinished_requests() == 0
    # ``outputs`` is released when this function returns.


@pytest.mark.core_model
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_NAMES], indirect=True)
def test_qwen3_vl_no_memory_leak(llm, image_urls: list[str]) -> None:
    """Check that GPU memory and peak CPU RSS do not grow after warmup.

    A baseline is recorded after ``WARMUP_ROUNDS`` batches; the maximum over
    ``MEASURED_ROUNDS`` further batches must stay within the configured
    thresholds of that baseline. The CPU check is skipped when no
    ``ru_maxrss`` reading is available.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(MODEL_NAME)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    # NOTE(review): the batch (including its random prefixes) is built once
    # and reused in every round -- confirm this reuse is intended.
    request_batch = _build_request_batch(image_urls)

    # Establish a warmup baseline after model load and the first multimodal
    # requests complete. Later rounds should remain near this steady state.
    for _ in range(WARMUP_ROUNDS):
        _run_round(llm, request_batch)

    gc.collect()
    baseline_gpu = _gpu_used_bytes()
    baseline_cpu_peak = _ru_maxrss_bytes()

    gpu_samples: list[int] = []
    cpu_peak_samples: list[int] = []

    for _ in range(MEASURED_ROUNDS):
        _run_round(llm, request_batch)

        gc.collect()
        gpu_samples.append(_gpu_used_bytes())
        round_cpu_peak = _ru_maxrss_bytes()
        if round_cpu_peak is not None:
            cpu_peak_samples.append(round_cpu_peak)

    gpu_limit = GPU_GROWTH_THRESHOLD_MIB * MiB_bytes
    gpu_delta = max(gpu_samples) - baseline_gpu

    assert gpu_delta <= gpu_limit, (
        "Qwen3-VL GPU memory kept growing after warmup. "
        f"warmup_baseline={_format_mib(baseline_gpu)}, "
        f"post_warmup_samples={[_format_mib(x) for x in gpu_samples]}, "
        f"gpu_growth={_format_mib(gpu_delta)}, "
        f"gpu_threshold={GPU_GROWTH_THRESHOLD_MIB} MiB"
    )

    # Without a baseline or any samples there is nothing to compare on the
    # CPU side.
    if baseline_cpu_peak is None or not cpu_peak_samples:
        return

    cpu_limit = CPU_PEAK_GROWTH_THRESHOLD_MIB * MiB_bytes
    cpu_delta = max(cpu_peak_samples) - baseline_cpu_peak

    assert cpu_delta <= cpu_limit, (
        "Qwen3-VL CPU peak RSS kept growing after warmup. "
        f"warmup_ru_maxrss={_format_mib(baseline_cpu_peak)}, "
        f"post_warmup_ru_maxrss={[_format_mib(x) for x in cpu_peak_samples]}, "
        f"cpu_peak_growth={_format_mib(cpu_delta)}, "
        f"cpu_peak_threshold={CPU_PEAK_GROWTH_THRESHOLD_MIB} MiB"
    )
|
||||
Reference in New Issue
Block a user