[LoRA] Cleanup LoRA unused code (#29611)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
@@ -16,7 +16,7 @@ from vllm.version import __version__ as VLLM_VERSION
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -19,6 +19,14 @@ from ...utils import RemoteOpenAIServer
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    """Download zephyr LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
+
+
 @pytest.fixture(scope="module")
 def server(zephyr_lora_files):  # noqa: F811
     args = [
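Note: the qwen3_lora_files fixture consumed by the updated tests is not defined in any hunk shown here; it presumably lives in a shared conftest. A minimal sketch of what it would look like, mirroring the zephyr fixture added above — the repo id below is a placeholder, not taken from this commit:

    import pytest

    @pytest.fixture(scope="module")
    def qwen3_lora_files():
        """Download Qwen3 LoRA files once per test session."""
        from huggingface_hub import snapshot_download

        # Placeholder repo id; the real adapter repo is not shown in this diff.
        return snapshot_download(repo_id="some-org/qwen3-0.6b-lora")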
@@ -8,7 +8,7 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -20,7 +20,6 @@ def server():
         "--max-model-len",
         "8192",
         "--enforce-eager",
-        # lora config below
         "--max-num-seqs",
         "128",
         "--enable-chunked-prefill",
@@ -13,9 +13,8 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 BADREQUEST_CASES = [
     (
@@ -33,11 +32,11 @@ BADREQUEST_CASES = [
 
 
 @pytest.fixture(scope="module", params=[True])
-def server_with_lora_modules_json(request, zephyr_lora_files):
+def server_with_lora_modules_json(request, qwen3_lora_files):
     # Define the json format LoRA module configurations
     lora_module_1 = {
-        "name": "zephyr-lora",
-        "path": zephyr_lora_files,
+        "name": "qwen3-lora",
+        "path": qwen3_lora_files,
         "base_model_name": MODEL_NAME,
     }
@@ -74,7 +73,7 @@ async def client(server_with_lora_modules_json):
 
 
 @pytest.mark.asyncio
-async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_static_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
@@ -82,17 +81,17 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
     assert served_model.parent is None
-    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
+    assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
+    assert lora_models[0].id == "qwen3-lora"
 
 
 @pytest.mark.asyncio
-async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files):
     response = await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files},
+        body={"lora_name": "qwen3-lora-3", "lora_path": qwen3_lora_files},
     )
     # Ensure adapter loads before querying /models
     assert "success" in response
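vLLM also exposes a matching unload endpoint for adapters registered at runtime. A hedged sketch of the counterpart call (not part of this commit), assuming the same client as above:

    # Sketch: unload the dynamically loaded adapter by name.
    await client.post(
        "unload_lora_adapter",
        cast_to=str,
        body={"lora_name": "qwen3-lora-3"},
    )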
@@ -100,9 +99,9 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, zephyr_lora_file
     models = await client.models.list()
     models = models.data
     dynamic_lora_model = models[-1]
-    assert dynamic_lora_model.root == zephyr_lora_files
+    assert dynamic_lora_model.root == qwen3_lora_files
     assert dynamic_lora_model.parent == MODEL_NAME
-    assert dynamic_lora_model.id == "zephyr-lora-3"
+    assert dynamic_lora_model.id == "qwen3-lora-3"
 
 
 @pytest.mark.asyncio
@@ -134,7 +133,7 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, tmp_path):
 async def test_dynamic_lora_badrequests(
     client: openai.AsyncOpenAI,
     tmp_path,
-    zephyr_lora_files,
+    qwen3_lora_files,
     test_name: str,
     config_change: dict,
     expected_error: str,
@@ -143,7 +142,7 @@ async def test_dynamic_lora_badrequests(
     test_dir = tmp_path / test_name
 
     # Copy adapter files
-    shutil.copytree(zephyr_lora_files, test_dir)
+    shutil.copytree(qwen3_lora_files, test_dir)
 
     # Load and modify configuration
     config_path = test_dir / "adapter_config.json"
@@ -167,7 +166,7 @@ async def test_dynamic_lora_badrequests(
 
 @pytest.mark.asyncio
 async def test_multiple_lora_adapters(
-    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
 ):
     """Validate that many loras can be dynamically registered and inferenced
     with concurrently"""
@@ -178,7 +177,7 @@ async def test_multiple_lora_adapters(
         await client.post(
             "load_lora_adapter",
             cast_to=str,
-            body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+            body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
         )
         for _ in range(3):
             await client.completions.create(
@@ -199,7 +198,7 @@ async def test_multiple_lora_adapters(
 
 @pytest.mark.asyncio
 async def test_loading_invalid_adapters_does_not_break_others(
-    client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files
+    client: openai.AsyncOpenAI, tmp_path, qwen3_lora_files
 ):
     invalid_files = tmp_path / "invalid_files"
     invalid_files.mkdir()
@@ -215,7 +214,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
     while not stop_good_requests_event.is_set():
         try:
             batch = await client.completions.create(
-                model="zephyr-lora",
+                model="qwen3-lora",
                 prompt=["Hello there", "Foo bar bazz buzz"],
                 max_tokens=5,
             )
@@ -254,7 +253,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
     await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": "valid", "lora_path": zephyr_lora_files},
+        body={"lora_name": "valid", "lora_path": qwen3_lora_files},
     )
     await client.completions.create(
         model="valid",
@@ -267,7 +266,7 @@ async def test_loading_invalid_adapters_does_not_break_others(
 async def test_beam_search_with_lora_adapters(
     client: openai.AsyncOpenAI,
     tmp_path,
-    zephyr_lora_files,
+    qwen3_lora_files,
 ):
     """Validate that async beam search can be used with lora."""
@@ -275,7 +274,7 @@ async def test_beam_search_with_lora_adapters(
     await client.post(
         "load_lora_adapter",
         cast_to=str,
-        body={"lora_name": adapter_name, "lora_path": str(zephyr_lora_files)},
+        body={"lora_name": adapter_name, "lora_path": str(qwen3_lora_files)},
     )
     for _ in range(3):
         await client.completions.create(
@@ -8,13 +8,13 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 # technically this needs Mistral-7B-v0.1 as base, but we're not testing
 # generation quality here
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def server(qwen3_lora_files):
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -25,7 +25,7 @@ def server(zephyr_lora_files):
         # lora config below
         "--enable-lora",
         "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
+        f"qwen3-lora={qwen3_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
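Besides the name=path shorthand above, --lora-modules also accepts a JSON object per module, as exercised by server_with_lora_modules_json earlier in this diff. A sketch of building that argument; the adapter path is a hypothetical placeholder:

    import json

    # JSON form of one --lora-modules entry, mirroring lora_module_1 above.
    lora_module = {
        "name": "qwen3-lora",
        "path": "/path/to/qwen3-lora",  # placeholder, not a value from this diff
        "base_model_name": "Qwen/Qwen3-0.6B",
    }
    args = ["--lora-modules", json.dumps(lora_module)]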
@@ -45,12 +45,12 @@ async def client(server):
 
 
 @pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
+async def test_check_models(client: openai.AsyncOpenAI, qwen3_lora_files):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
     lora_models = models[1:]
     assert served_model.id == MODEL_NAME
     assert served_model.root == MODEL_NAME
-    assert all(lora_model.root == zephyr_lora_files for lora_model in lora_models)
-    assert lora_models[0].id == "zephyr-lora"
+    assert all(lora_model.root == qwen3_lora_files for lora_model in lora_models)
+    assert lora_models[0].id == "qwen3-lora"
@@ -8,7 +8,7 @@ import pytest_asyncio
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -110,8 +110,9 @@ async def test_single_completion(client: openai.AsyncOpenAI):
     choice = completion.choices[0]
     assert len(choice.text) >= 5
     assert choice.finish_reason == "length"
+    # When using Qwen3-0.6B, prompt tokens=[9707, 11, 847, 829, 374]
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11
+        completion_tokens=5, prompt_tokens=5, total_tokens=10
     )
 
     # test using token IDs
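The updated usage numbers follow from the Qwen3 tokenizer producing 5 prompt tokens where zephyr produced 6. A quick check of the token ids cited in the new comment, assuming the test prompt is "Hello, my name is":

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    ids = tok("Hello, my name is")["input_ids"]
    print(ids)       # expected per the comment: [9707, 11, 847, 829, 374]
    print(len(ids))  # 5 prompt tokens + 5 completion tokens = 10 total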
@@ -11,11 +11,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files):
+def default_server_args(qwen3_lora_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -28,7 +28,7 @@ def default_server_args(zephyr_lora_files):
         # lora config
         "--enable-lora",
         "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
+        f"qwen3-lora={qwen3_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -10,7 +10,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")
@@ -10,7 +10,7 @@ from vllm.version import __version__ as VLLM_VERSION
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
 @pytest.fixture(scope="module")