# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for applying default registered multimodal LoRAs.
"""
import os
import unittest.mock as mock

import pytest
from huggingface_hub import snapshot_download
from vllm.lora.request import LoRARequest

from ..conftest import AudioTestAssets, VllmRunner

MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
IMAGE_LORA_PATH = os.path.join(MODEL_PATH, "vision-lora")
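# The downloaded snapshot ships its speech and vision LoRA adapters as
# subdirectories ("speech-lora", "vision-lora"), so we reference them by path.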
AUDIO_PROMPT = "<|user|><|audio_1|>Can you transcribe this audio?<|end|><|assistant|>"  # noqa: E501
# Responses are greedy decoded; we just check the end of
# the generated text. If the lora is inactive, this model
# generates commentary on the transcription.
RESPONSE_SUFFIX_WITH_LORA = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501
RESPONSE_SUFFIX_WITHOUT_LORA = "Certainly! Here is the transcription of the audio you provided:\n\nThe first words I spoke in the original phonograph record: A little piece of practical poetry. Mary had a little lamb; its fleece was white as snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501
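# Both suffixes reproduce the model's greedy outputs verbatim (including
# transcription errors such as "chronograph"), so don't edit them by hand.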
VLLM_RUNNER_BASE_KWARGS = {
    "model_name": MODEL_PATH,
    "dtype": "half",
    "enable_lora": True,
    "max_num_seqs": 2,
    # max_lora_rank must cover the rank of the bundled adapters (up to 320).
    "max_lora_rank": 320,
    # Keep these LoRA tests on short-RoPE for determinism post-LongRoPE change.
    "max_model_len": 4096,
    "gpu_memory_utilization": 0.8,
    "limit_mm_per_prompt": {"audio": 1},
    "enforce_eager": True,
}


def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs):
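    """Run a single greedy generation and check the end of the output text.

    Any extra kwargs (e.g. default_mm_loras) are applied as overrides on top
    of VLLM_RUNNER_BASE_KWARGS before constructing the runner.
    """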
    inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])]

    # Apply any additional kwargs as overrides to the base kwargs
    vllm_runner_kwargs = {**VLLM_RUNNER_BASE_KWARGS, **kwargs}

    with vllm_runner(**vllm_runner_kwargs) as vllm_model:
        vllm_outputs_with_default_lora = [
            vllm_model.generate_greedy(
                prompts,
                max_tokens=128,
                audios=audios,
                lora_request=lora_request,
            )
            for prompts, audios in inputs
        ]

        assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix)


def test_active_default_mm_lora(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that we can use the default audio lora."""
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=None,
        default_mm_loras={"audio": AUDIO_LORA_PATH},
        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
    )


def test_inactive_default_mm_lora(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that modalities are filtered properly."""
    # Default image lora won't be active since we only pass audio
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=None,
        default_mm_loras={"image": IMAGE_LORA_PATH},
        expected_suffix=RESPONSE_SUFFIX_WITHOUT_LORA,
    )


def test_default_mm_lora_succeeds_with_redundant_lora_request(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that redundantly providing the lora works."""
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=LoRARequest("audio", 1, AUDIO_LORA_PATH),
        default_mm_loras={"audio": AUDIO_LORA_PATH},
        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
    )


def test_default_mm_lora_fails_with_overridden_lora_request(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    """Ensure that if the lora_request conflicts with default_mm_loras,
    we use the lora_request."""
    run_test(
        vllm_runner,
        audio_assets,
        lora_request=LoRARequest("speech", 2, AUDIO_LORA_PATH),
        default_mm_loras={"audio": IMAGE_LORA_PATH},
        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
    )


def test_default_mm_lora_does_not_expand_string_reqs(vllm_runner):
    class MockEngineException(Exception):
        pass

    # Regression test for ensuring default multimodal lora resolution
    # does not expand the lora req if the prompt type is a string.
    vllm_runner_kwargs = {
        **VLLM_RUNNER_BASE_KWARGS,
        "default_mm_loras": {"audio": AUDIO_LORA_PATH},
    }

    # Avoid the full generation call since these tests are expensive;
    # just check what lora request is actually submitted to the engine
    mock_err = "Engine is mocked for this test"
    with (
        mock.patch(
            "vllm.v1.engine.llm_engine.LLMEngine.add_request",
            side_effect=MockEngineException(mock_err),
        ) as mock_add_request,
        vllm_runner(**vllm_runner_kwargs) as vllm_model,
    ):
        # Die once we actually submit the request to the engine
        with pytest.raises(MockEngineException):
            vllm_model.llm.generate(prompts=AUDIO_PROMPT)

        # Then check to make sure the submitted lora request
        # and text prompt were zipped together correctly
        engine_args, engine_kwargs = mock_add_request.call_args
        assert engine_kwargs["lora_request"] is None
        assert engine_kwargs["prompt_text"] == AUDIO_PROMPT