# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

import openai  # use the official client for correctness check
import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download

from ...conftest import AudioTestAssets
from ...utils import RemoteOpenAIServer
# NOTE - the tests in this module are currently analogous to test_chat, but are
# separated to avoid OOM killing due to module-scoped servers, since we
# need a multimodal model for these tests.

# Contains a modality specific lora alongside the base model
MULTIMODAL_MODEL_NAME = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MULTIMODAL_MODEL_NAME, "speech-lora")

# Exact transcription the model emits when the speech lora is active; the test
# below asserts equality against this to prove the default mm lora was applied.
ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501
@pytest.fixture(scope="module")
def multimodal_server():  # noqa: F811
    """Module-scoped server running the multimodal model with a default
    audio lora, so every test in this module shares one (expensive) launch."""
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "half",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"speech={AUDIO_LORA_PATH}",
        "--max-lora-rank",
        "320",
        "--max-num-seqs",
        "2",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.8",
        # map the audio modality to the speech lora by default
        "--default-mm-loras",
        f'{{"audio": "{AUDIO_LORA_PATH}"}}',
    ]

    # Generous timeout: multimodal model load + lora init can be slow in CI.
    with RemoteOpenAIServer(
        MULTIMODAL_MODEL_NAME, args, max_wait_seconds=480
    ) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def multi_modal_client(multimodal_server):
    """Async OpenAI client bound to the shared multimodal server."""
    async with multimodal_server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # base model with default lora should give the same response as lora model
    "model_name",
    [MULTIMODAL_MODEL_NAME, "speech"],
)
async def test_default_mm_lora_chat_completions(
    model_name: str,
    multi_modal_client: openai.AsyncOpenAI,
    audio_assets: AudioTestAssets,
):
    """Audio chat requests should produce the speech-lora transcription
    whether the lora is selected explicitly ("speech") or applied implicitly
    via --default-mm-loras when requesting the base model by name."""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you transcribe this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_assets[0].url},
                },
            ],
        }
    ]

    # temperature=0.0 makes the generation deterministic so an exact string
    # comparison against ACTIVE_MM_LORA_RESPONSE is meaningful.
    chat_completion = await multi_modal_client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=128, temperature=0.0
    )

    assert len(chat_completion.choices) > 0
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
    assert message.content == ACTIVE_MM_LORA_RESPONSE