183 lines
5.6 KiB
Python
183 lines
5.6 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import gc
|
|
import random
|
|
import string
|
|
import sys
|
|
import weakref
|
|
|
|
import pytest
|
|
import torch
|
|
|
|
from tests.models.registry import HF_EXAMPLE_MODELS
|
|
from vllm import LLM, SamplingParams
|
|
from vllm.distributed import cleanup_dist_env_and_memory
|
|
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
|
|
from vllm.platforms import current_platform
|
|
from vllm.utils.mem_utils import KiB_bytes, MiB_bytes, format_mib
|
|
|
|
MODEL_NAME = "Qwen/Qwen3-VL-4B-Instruct"
|
|
RANDOM_PREFIX_LEN = 100
|
|
TEST_IMAGE_NAMES = [
|
|
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
|
|
"Grayscale_8bits_palette_sample_image.png",
|
|
]
|
|
MAX_MODEL_LEN = 8192
|
|
REQUESTS_PER_ROUND = 4
|
|
WARMUP_ROUNDS = 1
|
|
MEASURED_ROUNDS = 16
|
|
GPU_GROWTH_THRESHOLD_MIB = 0
|
|
CPU_PEAK_GROWTH_THRESHOLD_MIB = 0
|
|
|
|
SAMPLING_PARAMS = SamplingParams(
|
|
temperature=0.0,
|
|
max_tokens=16,
|
|
)
|
|
|
|
|
|
def _make_messages(image_url: str) -> list[ChatCompletionMessageParam]:
|
|
# Avoid obscuring memory leaks because of prefix caching
|
|
random_text = "".join(random.choices(string.ascii_uppercase, k=RANDOM_PREFIX_LEN))
|
|
|
|
return [
|
|
{
|
|
"role": "user",
|
|
"content": [
|
|
{
|
|
"type": "text",
|
|
"text": f"Ignore this random string: {random_text}",
|
|
},
|
|
{"type": "image_url", "image_url": {"url": image_url}},
|
|
{
|
|
"type": "text",
|
|
"text": "Describe this image in one short sentence.",
|
|
},
|
|
],
|
|
}
|
|
]
|
|
|
|
|
|
def _build_request_batch(
|
|
image_urls: list[str],
|
|
) -> list[list[ChatCompletionMessageParam]]:
|
|
return [
|
|
_make_messages(image_urls[i % len(image_urls)])
|
|
for i in range(REQUESTS_PER_ROUND)
|
|
]
|
|
|
|
|
|
def _ru_maxrss_bytes() -> int | None:
|
|
try:
|
|
import resource
|
|
except ImportError:
|
|
return None
|
|
|
|
rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
if rss <= 0:
|
|
return 0
|
|
|
|
# Linux reports kilobytes, macOS reports bytes.
|
|
return rss if sys.platform == "darwin" else rss * KiB_bytes
|
|
|
|
|
|
def _gpu_used_bytes() -> int:
|
|
torch.accelerator.synchronize()
|
|
free_bytes, total_bytes = current_platform.mem_get_info()
|
|
return int(total_bytes - free_bytes)
|
|
|
|
|
|
def _format_mib(num_bytes: int | None) -> str:
|
|
if num_bytes is None:
|
|
return "n/a"
|
|
|
|
return f"{format_mib(num_bytes)} MiB"
|
|
|
|
|
|
@pytest.fixture(scope="function")
|
|
def llm(monkeypatch):
|
|
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
|
|
|
|
# pytest caches the fixture so we use weakref.proxy to
|
|
# enable garbage collection
|
|
llm_kwargs = dict(
|
|
model=MODEL_NAME,
|
|
enforce_eager=True,
|
|
max_model_len=MAX_MODEL_LEN,
|
|
max_num_seqs=REQUESTS_PER_ROUND,
|
|
limit_mm_per_prompt={"image": 1},
|
|
seed=0,
|
|
disable_log_stats=True,
|
|
gpu_memory_utilization=0.8,
|
|
)
|
|
if current_platform.is_rocm():
|
|
llm_kwargs["attention_backend"] = "TRITON_ATTN"
|
|
|
|
llm = LLM(**llm_kwargs)
|
|
|
|
yield weakref.proxy(llm)
|
|
|
|
del llm
|
|
|
|
cleanup_dist_env_and_memory()
|
|
|
|
|
|
@pytest.mark.core_model
|
|
@pytest.mark.parametrize("image_urls", [TEST_IMAGE_NAMES], indirect=True)
|
|
def test_no_memory_leak(llm, image_urls: list[str]) -> None:
|
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(MODEL_NAME)
|
|
model_info.check_available_online(on_fail="skip")
|
|
model_info.check_transformers_version(on_fail="skip")
|
|
|
|
request_batch = _build_request_batch(image_urls)
|
|
|
|
# Establish a warmup baseline after model load and the first multimodal
|
|
# requests complete. Later rounds should remain near this steady state.
|
|
for _ in range(WARMUP_ROUNDS):
|
|
outputs = llm.chat(request_batch, sampling_params=SAMPLING_PARAMS)
|
|
assert len(outputs) == len(request_batch)
|
|
assert llm.llm_engine.get_num_unfinished_requests() == 0
|
|
del outputs
|
|
|
|
gc.collect()
|
|
warmup_gpu = _gpu_used_bytes()
|
|
warmup_cpu_peak = _ru_maxrss_bytes()
|
|
|
|
post_warmup_gpu_samples: list[int] = []
|
|
post_warmup_cpu_peak_samples: list[int] = []
|
|
|
|
for _ in range(MEASURED_ROUNDS):
|
|
outputs = llm.chat(request_batch, sampling_params=SAMPLING_PARAMS)
|
|
assert len(outputs) == len(request_batch)
|
|
assert llm.llm_engine.get_num_unfinished_requests() == 0
|
|
del outputs
|
|
|
|
gc.collect()
|
|
post_warmup_gpu_samples.append(_gpu_used_bytes())
|
|
cpu_peak = _ru_maxrss_bytes()
|
|
if cpu_peak is not None:
|
|
post_warmup_cpu_peak_samples.append(cpu_peak)
|
|
|
|
gpu_growth = max(post_warmup_gpu_samples) - warmup_gpu
|
|
gpu_threshold = GPU_GROWTH_THRESHOLD_MIB * MiB_bytes
|
|
|
|
assert gpu_growth <= gpu_threshold, (
|
|
"Qwen3-VL GPU memory kept growing after warmup. "
|
|
f"warmup_baseline={_format_mib(warmup_gpu)}, "
|
|
f"post_warmup_samples={[_format_mib(x) for x in post_warmup_gpu_samples]}, "
|
|
f"gpu_growth={_format_mib(gpu_growth)}, "
|
|
f"gpu_threshold={GPU_GROWTH_THRESHOLD_MIB} MiB"
|
|
)
|
|
|
|
if warmup_cpu_peak is not None and post_warmup_cpu_peak_samples:
|
|
cpu_peak_growth = max(post_warmup_cpu_peak_samples) - warmup_cpu_peak
|
|
cpu_threshold = CPU_PEAK_GROWTH_THRESHOLD_MIB * MiB_bytes
|
|
|
|
assert cpu_peak_growth <= cpu_threshold, (
|
|
"Qwen3-VL CPU peak RSS kept growing after warmup. "
|
|
f"warmup_ru_maxrss={_format_mib(warmup_cpu_peak)}, "
|
|
f"post_warmup_ru_maxrss={[_format_mib(x) for x in post_warmup_cpu_peak_samples]}, " # noqa: E501
|
|
f"cpu_peak_growth={_format_mib(cpu_peak_growth)}, "
|
|
f"cpu_peak_threshold={CPU_PEAK_GROWTH_THRESHOLD_MIB} MiB"
|
|
)
|