# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Any

import librosa
import pytest
from transformers import AutoModelForSpeechSeq2Seq

from vllm.assets.audio import AudioAsset
from vllm.platforms import current_platform

from ....conftest import HfRunner, PromptAudioInput, VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close

VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
HF_PROMPT = ""

# Whisper expects 16kHz audio
WHISPER_SAMPLE_RATE = 16000
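# NOTE: VLLM_PROMPT spells out Whisper's decoder prefix tokens explicitly
# (start-of-transcript, language, task, no-timestamps). The HF runner is given
# an empty text prompt because the HuggingFace generation path is assumed to
# build this prefix itself from the model's generation config. A vLLM request
# built from these constants mirrors the calls used further down in this file:
#
#   vllm_model.generate_greedy(
#       [VLLM_PROMPT], max_tokens=50, audios=[(waveform, WHISPER_SAMPLE_RATE)]
#   )
#
# where ``waveform`` is any mono float array resampled to 16 kHz.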


@pytest.fixture(autouse=True)
def use_spawn_for_whisper(monkeypatch):
    """Whisper has issues with forked workers, use spawn instead."""
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
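    # The underlying issue is assumed to be runtime state (e.g. CUDA context)
    # inherited by forked workers; "spawn" starts each worker in a fresh
    # interpreter instead.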


def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]],
    model: str,
    *,
    max_model_len: int,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
    enforce_eager: bool = True,
) -> None:
    """Inference results should be the same between HF and vLLM.

    All the audio fixtures for the test are from AudioAsset.
    For the HuggingFace runner, we provide the raw audio as input.
    For the vLLM runner, we provide MultiModalDataDict objects
    and the corresponding MultiModalConfig as input.
    """
    with vllm_runner(
        model,
        dtype=dtype,
        max_model_len=max_model_len,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        limit_mm_per_prompt={"audio": 2},
        enforce_eager=enforce_eager,
        disable_custom_all_reduce=True,
    ) as vllm_model:
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                vllm_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for vllm_prompts, _, audios in inputs
        ]

    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        hf_outputs_per_case = [
            hf_model.generate_greedy_logprobs_limit(
                hf_prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                audios=audios,
            )
            for _, hf_prompts, audios in inputs
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
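        # check_logprobs_close is assumed to accept a mismatching token as long
        # as it still appears in the other run's top-``num_logprobs``
        # candidates, tolerating small numerical differences between backends.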
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )


@pytest.fixture
def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]:
    audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
    inputs = []
    for asset in audio_assets:
        audio, orig_sr = asset.audio_and_sample_rate
        # Resample to Whisper's expected sample rate (16kHz)
        if orig_sr != WHISPER_SAMPLE_RATE:
            audio = librosa.resample(
                audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE
            )
        # vLLM prompts, HF prompts, audio inputs
        inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)]))
    return inputs


def check_model_available(model: str) -> None:
    """Skip the calling test if the model or its required transformers
    version is unavailable."""
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")


@pytest.mark.core_model
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("enforce_eager", [True, False])
def test_models(
    hf_runner,
    vllm_runner,
    model: str,
    dtype: str,
    num_logprobs: int,
    input_audios,
    enforce_eager: bool,
) -> None:
    check_model_available(model)
    if current_platform.is_cpu() and not enforce_eager:
        pytest.skip("Skipping test for CPU with non-eager mode")
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=200,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
        enforce_eager=enforce_eager,
    )


@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models_distributed(
    hf_runner,
    vllm_runner,
    model: str,
    distributed_executor_backend: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    input_audios,
) -> None:
    check_model_available(model)
    run_test(
        hf_runner,
        vllm_runner,
        input_audios,
        model,
        dtype=dtype,
        max_model_len=448,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=False,
    )


@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
def test_encoder_cache_cleanup(
    vllm_runner,
    model: str,
    input_audios,
    monkeypatch,
) -> None:
    """Test that encoder cache is properly cleaned up after requests complete.

    This is a regression test for a bug where encoder cache entries were freed
    in the same scheduling step they were allocated, before the model could use
    them.
    """
    # Set single-process mode to access the model runner's encoder cache directly
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
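    # (With multiprocessing left on, the engine core is assumed to live in a
    # separate worker process, so the model runner internals accessed below
    # would not be reachable from the test process.)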
    check_model_available(model)

    with vllm_runner(
        model,
        dtype="half",
        max_model_len=448,
        tensor_parallel_size=1,
        limit_mm_per_prompt={"audio": 2},
        enforce_eager=True,
    ) as vllm_model:
        engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
        model_runner = engine_core.model_executor.driver_worker.worker.model_runner
        encoder_cache = model_runner.encoder_cache
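        # encoder_cache is assumed to hold one entry per multimodal input whose
        # encoder output is still pending; it should drain to empty once every
        # request referencing those inputs has finished.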

        # Run multiple sequential requests to ensure cache is properly managed
        for vllm_prompts, _, audios in input_audios:
            vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)

        # After all requests complete, encoder cache should be empty
        cache_size = len(encoder_cache)
        assert cache_size == 0, (
            f"Encoder cache should be empty after all requests complete, "
            f"but has {cache_size} entries. This indicates encoder cache "
            f"entries are not being properly freed."
        )