[LoRA] Cleanup LoRA unused code (#29611)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
@@ -16,7 +16,7 @@ from vllm.version import __version__ as VLLM_VERSION
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -19,6 +19,14 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    """Download zephyr LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+
+
 @pytest.fixture(scope="module")
 def server(zephyr_lora_files):  # noqa: F811
     args = [
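Note: the qwen3_lora_files fixture consumed by the updated tests is not defined in any hunk shown here; it presumably lives in a shared conftest. A minimal sketch of what it would look like, mirroring the zephyr fixture added above — the repo id below is a placeholder, not taken from this commit:

    import pytest

    @pytest.fixture(scope="module")
    def qwen3_lora_files():
        """Download Qwen3 LoRA files once per test session."""
        from huggingface_hub import snapshot_download

        # Placeholder repo id; the real adapter repo is not shown in this diff.
        return snapshot_download(repo_id="some-org/qwen3-0.6b-lora")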
@@ -8,7 +8,7 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -20,7 +20,6 @@ def server():
         "--max-model-len",
         "8192",
         "--enforce-eager",
-        # lora config below
         "--max-num-seqs",
         "128",
         "--enable-chunked-prefill",
@@ -13,9 +13,8 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 BADREQUEST_CASES = [
     (
@@ -33,11 +32,11 @@ BADREQUEST_CASES = [
 
 
 @pytest.fixture(scope="module", params=[True])
-def server_with_lora_modules_json(request, zephyr_lora_files):
+def server_with_lora_modules_json(request, qwen3_lora_files):
     # Define the json format LoRA module configurations
     lora_module_1 = {
-        "name": "zephyr-lora",
-        "path": zephyr_lora_files,
+        "name": "qwen3-lora",
+        "path": qwen3_lora_files,
         "base_model_name": MODEL_NAME,
     }
@@ -74,7 +73,7 @@ async def client(server_with_lora_modules_json):
 
 
 @pytest.mark.asyncio
-async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
@@ -82,17 +81,17 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
     assert served_model.parent is None
-    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
+    assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[0].id == "qwen3-lora"
 
 
 @pytest.mark.asyncio
-async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
     response = await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files},
+        body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files},
     )
     # Ensure adapter loads before querying /models
     assert "success" in response
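vLLM also exposes a matching unload endpoint for adapters registered at runtime. A hedged sketch of the counterpart call (not part of this commit), assuming the same client as above:

    # Sketch: unload the dynamically loaded adapter by name.
    await client.post(
        "unload_lora_adapter",
        cast_to=str,
        body={"lora_name": "qwen3-lora-3"},
    )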
@@ -100,9 +99,9 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_file
     models = await client.models.list()
     models = models.data
     dynamic_lora_model = models[-1]
-    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.root == qwen3_lora_files
     assert dynamic_lora_model.parent == MODEL_NAME
-    assert dynamic_lora_model.id == "zephyr-lora-3"
+    assert dynamic_lora_model.id == "qwen3-lora-3"
 
 
 @pytest.mark.asyncio
@@ -134,7 +133,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
 async def test_dynamic_lora_badrequests(
     client: openai.AsyncOpenAI,
     tmp_path,
-    zephyr_lora_files,
+    qwen3_lora_files,
     test_name: str,
     config_change: dict,
     expected_error: str,
@@ -143,7 +142,7 @@ async def test_dynamic_lora_badrequests(
     test_dir = tmp_path / test_name
 
     # Copy adapter files
-    shutil.copytree(zephyr_lora_files, test_dir)
+    shutil.copytree(qwen3_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"
@@ -167,7 +166,7 @@ async def test_dynamic_lora_badrequests(
 
 @pytest.mark.asyncio
 async def test_multiple_lora_adapters(
-    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
 ):
     """Validate that many loras can be dynamically registered and inferenced
     with concurrently"""
@@ -178,7 +177,7 @@ async def test_multiple_lora_adapters(
         await client.post(
             "load_lora_adapter",
             cast_to=str,
-            body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
         )
         for _ in range(3):
             await client.completions.create(
@@ -199,7 +198,7 @@ async def test_multiple_lora_adapters(
 
 @pytest.mark.asyncio
 async def test_loading_invalid_adapters_does_not_break_others(
-    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
 ):
     invalid_files = tmp_path / "invalid_files"
     invalid_files.mkdir()
@@ -215,7 +214,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
     while not stop_good_requests_event.is_set():
         try:
             batch = await client.completions.create(
-                model="zephyr-lora",
+                model="qwen3-lora",
                 prompt=["Hello there", "Foo bar bazz buzz"],
                 max_tokens=5,
             )
@@ -254,7 +253,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
     await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": "valid", "lora_path": zephyr_lora_files},
+        body={"lora_name": "valid", "lora_path": qwen3_lora_files},
     )
     await client.completions.create(
         model="valid",
@@ -267,7 +266,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
 async def test_beam_search_with_lora_adapters(
     client: openai.AsyncOpenAI,
     tmp_path,
-    zephyr_lora_files,
+    qwen3_lora_files,
 ):
     """Validate that async beam search can be used with lora."""
@@ -275,7 +274,7 @@ async def test_beam_search_with_lora_adapters(
     await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+        body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
     )
     for _ in range(3):
         await client.completions.create(
@@ -8,13 +8,13 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def server(qwen3_lora_files):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -25,7 +25,7 @@ def server(zephyr_lora_files):
         # lora config below
         "--enable-lora",
         "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
+        f"qwen3-lora={qwen3_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
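Besides the name=path shorthand above, --lora-modules also accepts a JSON object per module, as exercised by server_with_lora_modules_json earlier in this diff. A sketch of building that argument; the adapter path is a hypothetical placeholder:

    import json

    # JSON form of one --lora-modules entry, mirroring lora_module_1 above.
    lora_module = {
        "name": "qwen3-lora",
        "path": "/path/to/qwen3-lora",  # placeholder, not a value from this diff
        "base_model_name": "Qwen/Qwen3-0.6B",
    }
    args = ["--lora-modules", json.dumps(lora_module)]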
@@ -45,12 +45,12 @@ async def client(server):
 
 
 @pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
     lora_models = models[1:]
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
-    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
+    assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
+    assert lora_models[0].id == "qwen3-lora"
@@ -8,7 +8,7 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -110,8 +110,9 @@ async def test_single_completion(client: openai.AsyncOpenAI):
     choice = completion.choices[0]
     assert len(choice.text) >= 5
     assert choice.finish_reason == "length"
+    # When using Qwen3-0.6B, prompt tokens=[9707, 11, 847, 829, 374]
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11
+        completion_tokens=5, prompt_tokens=5, total_tokens=10
     )
 
     # test using token IDs
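The updated usage numbers follow from the Qwen3 tokenizer producing 5 prompt tokens where zephyr produced 6. A quick check of the token ids cited in the new comment, assuming the test prompt is "Hello, my name is":

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    ids = tok("Hello, my name is")["input_ids"]
    print(ids)       # expected per the comment: [9707, 11, 847, 829, 374]
    print(len(ids))  # 5 prompt tokens + 5 completion tokens = 10 total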
@@ -11,11 +11,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files):
+def default_server_args(qwen3_lora_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -28,7 +28,7 @@ def default_server_args(zephyr_lora_files):
         # lora config
         "--enable-lora",
         "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
+        f"qwen3-lora={qwen3_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -10,7 +10,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -10,7 +10,7 @@ from vllm.version import __version__ as VLLM_VERSION
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")