[Bugfix] Validate lora adapters to avoid crashing server (#11727)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
269
tests/entrypoints/openai/test_lora_adapters.py
Normal file
269
tests/entrypoints/openai/test_lora_adapters.py
Normal file
@@ -0,0 +1,269 @@
|
||||
import asyncio
|
||||
import json
|
||||
import shutil
|
||||
from contextlib import suppress
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
|
||||
# generation quality here
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the ``LORA_NAME`` adapter snapshot once for the whole module.

    Returns:
        The value produced by ``snapshot_download`` for ``LORA_NAME``; the
        tests in this module use it as the adapter's filesystem path.
    """
    adapter_snapshot = snapshot_download(repo_id=LORA_NAME)
    return adapter_snapshot
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
    """Run one ``RemoteOpenAIServer`` per module with two static LoRA modules.

    The adapters ``zephyr-lora`` and ``zephyr-lora2`` are passed to
    ``--lora-modules`` in JSON form and both point at ``zephyr_lora_files``.
    Runtime adapter loading is switched on through the environment.

    Yields:
        The running ``RemoteOpenAIServer`` instance.
    """
    # Define the json format LoRA module configurations
    static_lora_specs = [
        json.dumps({
            "name": adapter_name,
            "path": zephyr_lora_files,
            "base_model_name": MODEL_NAME
        }) for adapter_name in ("zephyr-lora", "zephyr-lora2")
    ]

    # use half precision for speed and memory savings in CI environment
    engine_args = [
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
    ]
    # lora config below
    lora_args = [
        "--enable-lora",
        "--lora-modules",
        *static_lora_specs,
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]
    args = engine_args + lora_args

    # Enable the /v1/load_lora_adapter endpoint
    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server_with_lora_modules_json):
    """Yield an async client obtained from the shared LoRA test server.

    The client is created via ``get_async_client()`` and is closed by the
    ``async with`` block once the requesting test has finished.
    """
    client_ctx = server_with_lora_modules_json.get_async_client()
    async with client_ctx as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_static_lora_lineage(client: openai.AsyncOpenAI,
                                   zephyr_lora_files):
    """Check the model listing for the base model and the static adapters.

    The first listed entry must be the base model (no parent); every entry
    after it must be rooted at the adapter files with the base model as its
    parent, and the first two adapters must carry the configured names.
    """
    listing = await client.models.list()
    entries = listing.data
    served_model = entries[0]
    lora_models = entries[1:]

    # Base model: its own root, no parent.
    assert served_model.id == MODEL_NAME
    assert served_model.root == MODEL_NAME
    assert served_model.parent is None

    # Adapters: rooted at the adapter files, parented by the base model.
    for lora_model in lora_models:
        assert lora_model.root == zephyr_lora_files
    for lora_model in lora_models:
        assert lora_model.parent == MODEL_NAME

    assert lora_models[0].id == "zephyr-lora"
    assert lora_models[1].id == "zephyr-lora2"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
                                    zephyr_lora_files):
    """Load an adapter at runtime and check its lineage in the model list.

    After ``load_lora_adapter`` reports success, the last listed model must
    be the new adapter, rooted at the adapter files with the base model as
    its parent.
    """
    load_request = {
        "lora_name": "zephyr-lora-3",
        "lora_path": zephyr_lora_files
    }
    response = await client.post("load_lora_adapter",
                                 cast_to=str,
                                 body=load_request)
    # Ensure adapter loads before querying /models
    assert "success" in response

    listing = await client.models.list()
    dynamic_lora_model = listing.data[-1]
    assert dynamic_lora_model.root == zephyr_lora_files
    assert dynamic_lora_model.parent == MODEL_NAME
    assert dynamic_lora_model.id == "zephyr-lora-3"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
    """Loading an adapter from a nonexistent path must raise NotFoundError."""
    missing_adapter_request = {
        "lora_name": "notfound",
        "lora_path": "/not/an/adapter"
    }
    with pytest.raises(openai.NotFoundError):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body=missing_adapter_request)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
                                          tmp_path):
    """Loading an adapter whose config is not JSON must raise BadRequestError.

    A directory holding only a malformed ``adapter_config.json`` is created
    under ``tmp_path`` and offered to the ``load_lora_adapter`` endpoint.
    """
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    config_file = invalid_files / "adapter_config.json"
    config_file.write_text("this is not json")

    bad_config_request = {
        "lora_name": "invalid-json",
        "lora_path": str(invalid_files)
    }
    with pytest.raises(openai.BadRequestError):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body=bad_config_request)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI,
                                              tmp_path, zephyr_lora_files):
    """Loading an adapter whose rank exceeds the server limit must fail.

    The real adapter is copied into ``tmp_path`` and its ``r`` value is
    rewritten to 1024, well above the ``--max-lora-rank 64`` the server
    fixture is started with. The load must be rejected with a
    ``BadRequestError`` whose message mentions ``max_lora_rank``.
    """
    invalid_rank = tmp_path / "invalid_rank"

    # Copy adapter from zephyr_lora_files to invalid_rank. copytree's default
    # (symlinks=False) copies file contents rather than links, so editing the
    # copy below leaves the downloaded snapshot untouched.
    shutil.copytree(zephyr_lora_files, invalid_rank)

    config_path = invalid_rank / "adapter_config.json"
    with open(config_path) as f:
        adapter_config = json.load(f)

    # Change rank to invalid value
    adapter_config["r"] = 1024
    with open(config_path, "w") as f:
        json.dump(adapter_config, f)

    with pytest.raises(openai.BadRequestError,
                       match="is greater than max_lora_rank"):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body={
                              "lora_name": "invalid-rank",
                              "lora_path": str(invalid_rank)
                          })
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
                                      zephyr_lora_files):
    """Validate that many loras can be dynamically registered and inferenced
    with concurrently"""

    # This test file configures the server with --max-cpu-loras=2 and this test
    # will concurrently load 10 adapters, so it should flex the LRU cache
    async def load_and_run_adapter(adapter_name: str):
        """Register ``adapter_name`` and run three completion batches on it."""
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body={
                              "lora_name": adapter_name,
                              "lora_path": str(zephyr_lora_files)
                          })
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
            )

    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))
        for i in range(10)
    ]

    # asyncio.wait() returns Task objects, which are never Exception
    # instances, so checking them would let every failure pass unnoticed.
    # gather(return_exceptions=True) returns each task's actual outcome
    # (result or raised exception), which is what the assertion inspects.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)

    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_loading_invalid_adapters_does_not_break_others(
        client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files):
    """Failed adapter loads must not disturb requests on a working adapter.

    A background task keeps issuing completions against the statically
    registered ``zephyr-lora`` adapter while 50 invalid load requests are
    sent (25 with a missing path, 25 with a malformed config). Every
    background request must succeed, and afterwards a fresh valid adapter
    must still load and serve a completion.
    """
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    (invalid_files / "adapter_config.json").write_text("this is not json")

    stop_good_requests_event = asyncio.Event()

    async def run_good_requests(client):
        """Issue completions until the stop event is set; collect outcomes.

        Exceptions are recorded rather than raised so that the caller can
        report every failure after the loop has ended.
        """
        results = []

        while not stop_good_requests_event.is_set():
            try:
                batch = await client.completions.create(
                    model="zephyr-lora",
                    prompt=["Hello there", "Foo bar bazz buzz"],
                    max_tokens=5,
                )
                results.append(batch)
            except Exception as e:
                results.append(e)

        return results

    # Create task to run good requests
    good_task = asyncio.create_task(run_good_requests(client))

    try:
        # Run a bunch of bad adapter loads
        for _ in range(25):
            with suppress(openai.NotFoundError):
                await client.post("load_lora_adapter",
                                  cast_to=str,
                                  body={
                                      "lora_name": "notfound",
                                      "lora_path": "/not/an/adapter"
                                  })
        for _ in range(25):
            with suppress(openai.BadRequestError):
                await client.post("load_lora_adapter",
                                  cast_to=str,
                                  body={
                                      "lora_name": "invalid",
                                      "lora_path": str(invalid_files)
                                  })
    finally:
        # Always stop and drain the background task, even when a load above
        # raised an unexpected error; otherwise it would keep sending
        # requests to the server and be left pending when the test exits.
        stop_good_requests_event.set()
        results = await good_task

    # Ensure all the running requests with lora adapters succeeded
    for r in results:
        assert not isinstance(r, Exception), f"Got exception {r}"

    # Ensure we can load another adapter and run it
    await client.post("load_lora_adapter",
                      cast_to=str,
                      body={
                          "lora_name": "valid",
                          "lora_path": zephyr_lora_files
                      })
    await client.completions.create(
        model="valid",
        prompt=["Hello there", "Foo bar bazz buzz"],
        max_tokens=5,
    )
|
||||
@@ -1,109 +0,0 @@
|
||||
import json
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
# downloading lora to test lora requests
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
|
||||
# generation quality here
|
||||
LORA_NAME = "typeof/zephyr-7b-beta-lora"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Download the ``LORA_NAME`` adapter once and share it module-wide.

    Returns:
        The result of ``snapshot_download`` for ``LORA_NAME``, used by the
        lineage tests as the adapter path.
    """
    downloaded = snapshot_download(repo_id=LORA_NAME)
    return downloaded
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
    """Start a module-scoped server with two JSON-configured LoRA modules.

    Both modules (``zephyr-lora`` and ``zephyr-lora2``) point at
    ``zephyr_lora_files`` and name ``MODEL_NAME`` as their base model.

    Yields:
        The running ``RemoteOpenAIServer`` instance.
    """

    def lora_module_json(module_name):
        # Define the json format LoRA module configuration for one adapter
        return json.dumps({
            "name": module_name,
            "path": zephyr_lora_files,
            "base_model_name": MODEL_NAME
        })

    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        lora_module_json("zephyr-lora"),
        lora_module_json("zephyr-lora2"),
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]

    # Enable the /v1/load_lora_adapter endpoint
    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client_for_lora_lineage(server_with_lora_modules_json):
    """Yield an async client from the LoRA server for the lineage tests.

    The client comes from ``get_async_client()`` and is closed when the
    ``async with`` block exits.
    """
    lineage_client_ctx = server_with_lora_modules_json.get_async_client()
    async with lineage_client_ctx as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI,
                                   zephyr_lora_files):
    """Verify root/parent lineage of the base model and static adapters.

    The first listed model is expected to be the base model with no parent;
    all later entries must be rooted at the adapter files and parented by
    the base model, with the first two named as configured.
    """
    model_page = await client_for_lora_lineage.models.list()
    listed = model_page.data
    served_model = listed[0]
    lora_models = listed[1:]

    assert served_model.id == MODEL_NAME
    assert served_model.root == MODEL_NAME
    assert served_model.parent is None

    adapter_roots = [lora_model.root for lora_model in lora_models]
    assert all(root == zephyr_lora_files for root in adapter_roots)
    adapter_parents = [lora_model.parent for lora_model in lora_models]
    assert all(parent == MODEL_NAME for parent in adapter_parents)

    assert lora_models[0].id == "zephyr-lora"
    assert lora_models[1].id == "zephyr-lora2"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dynamic_lora_lineage(
        client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files):
    """Verify lineage of an adapter registered through ``load_lora_adapter``.

    Once the load call reports success, the last entry of the model list
    must be ``zephyr-lora-3``, rooted at the adapter files with the base
    model as its parent.
    """
    payload = {"lora_name": "zephyr-lora-3", "lora_path": zephyr_lora_files}
    response = await client_for_lora_lineage.post("load_lora_adapter",
                                                  cast_to=str,
                                                  body=payload)
    # Ensure adapter loads before querying /models
    assert "success" in response

    model_page = await client_for_lora_lineage.models.list()
    dynamic_lora_model = model_page.data[-1]
    assert dynamic_lora_model.root == zephyr_lora_files
    assert dynamic_lora_model.parent == MODEL_NAME
    assert dynamic_lora_model.id == "zephyr-lora-3"
|
||||
@@ -52,7 +52,7 @@ async def _async_serving_chat_init():
|
||||
engine = MockEngine()
|
||||
model_config = await engine.get_model_config()
|
||||
|
||||
models = OpenAIServingModels(model_config, BASE_MODEL_PATHS)
|
||||
models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
|
||||
serving_completion = OpenAIServingChat(engine,
|
||||
model_config,
|
||||
models,
|
||||
@@ -73,7 +73,8 @@ def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
|
||||
models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=MockModelConfig())
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
MockModelConfig(),
|
||||
@@ -116,7 +117,8 @@ def test_serving_chat_could_load_correct_generation_config():
|
||||
mock_engine.errored = False
|
||||
|
||||
# Initialize the serving chat
|
||||
models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
|
||||
models = OpenAIServingModels(engine_client=mock_engine,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config)
|
||||
serving_chat = OpenAIServingChat(mock_engine,
|
||||
mock_model_config,
|
||||
|
||||
@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.protocol import (ErrorResponse,
|
||||
LoadLoraAdapterRequest,
|
||||
UnloadLoraAdapterRequest)
|
||||
@@ -21,13 +22,16 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
|
||||
|
||||
async def _async_serving_models_init() -> OpenAIServingModels:
|
||||
mock_model_config = MagicMock(spec=ModelConfig)
|
||||
mock_engine_client = MagicMock(spec=EngineClient)
|
||||
# Set the max_model_len attribute to avoid missing attribute
|
||||
mock_model_config.max_model_len = 2048
|
||||
|
||||
serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS,
|
||||
serving_models = OpenAIServingModels(engine_client=mock_engine_client,
|
||||
base_model_paths=BASE_MODEL_PATHS,
|
||||
model_config=mock_model_config,
|
||||
lora_modules=None,
|
||||
prompt_adapters=None)
|
||||
await serving_models.init_static_loras()
|
||||
|
||||
return serving_models
|
||||
|
||||
@@ -113,5 +117,5 @@ async def test_unload_lora_adapter_not_found():
|
||||
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
|
||||
response = await serving_models.unload_lora_adapter(request)
|
||||
assert isinstance(response, ErrorResponse)
|
||||
assert response.type == "InvalidUserInput"
|
||||
assert response.code == HTTPStatus.BAD_REQUEST
|
||||
assert response.type == "NotFoundError"
|
||||
assert response.code == HTTPStatus.NOT_FOUND
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
|
||||
@@ -10,16 +7,7 @@ MODEL_NAME = "meta-llama/Llama-3.2-1B"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_shutdown_on_engine_failure(tmp_path):
|
||||
# Use a bad adapter to crash the engine
|
||||
# (This test will fail when that bug is fixed)
|
||||
adapter_path = tmp_path / "bad_adapter"
|
||||
os.mkdir(adapter_path)
|
||||
with open(adapter_path / "adapter_model_config.json", "w") as f:
|
||||
json.dump({"not": "real"}, f)
|
||||
with open(adapter_path / "adapter_model.safetensors", "wb") as f:
|
||||
f.write(b"this is fake")
|
||||
|
||||
async def test_shutdown_on_engine_failure():
|
||||
# dtype, max-len etc set so that this can run in CI
|
||||
args = [
|
||||
"--dtype",
|
||||
@@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path):
|
||||
"--enforce-eager",
|
||||
"--max-num-seqs",
|
||||
"128",
|
||||
"--enable-lora",
|
||||
"--lora-modules",
|
||||
f"bad-adapter={tmp_path / 'bad_adapter'}",
|
||||
]
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
@@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path):
|
||||
|
||||
with pytest.raises(
|
||||
(openai.APIConnectionError, openai.InternalServerError)):
|
||||
# This crashes the engine
|
||||
await client.completions.create(model="bad-adapter",
|
||||
prompt="Hello, my name is")
|
||||
# Asking for lots of prompt logprobs will currently crash the
|
||||
# engine. This may change in the future when that bug is fixed
|
||||
prompt = "Hello " * 4000
|
||||
await client.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=prompt,
|
||||
extra_body={"prompt_logprobs": 10})
|
||||
|
||||
# Now the server should shut down
|
||||
return_code = remote_server.proc.wait(timeout=8)
|
||||
|
||||
Reference in New Issue
Block a user