[Core] Remove tokenizer group in vLLM (#24078)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
@@ -29,11 +29,7 @@ def monkeypatch_module():
 
 
 @pytest.fixture(scope="module", params=[False, True])
-def server(
-        request,
-        monkeypatch_module,
-        zephyr_lora_files,  #noqa: F811
-        zephyr_lora_added_tokens_files):  # noqa: F811
+def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
 
     use_v1 = request.param
     monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
@@ -49,7 +45,6 @@ def server(
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -79,7 +74,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
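A minimal sketch (not part of this commit; helper names assumed) of the fixture pattern the hunk above edits: pytest's built-in monkeypatch fixture is function-scoped, so these tests build a module-scoped MonkeyPatch by hand, and the params=[False, True] fixture starts one server per engine version by toggling VLLM_USE_V1.

import pytest


# Hypothetical sketch, not part of this commit.
@pytest.fixture(scope="module")
def monkeypatch_module():
    mpatch = pytest.MonkeyPatch()
    yield mpatch
    mpatch.undo()  # restore the environment once the module's tests finish


@pytest.fixture(scope="module", params=[False, True])
def server(request, monkeypatch_module):
    use_v1 = request.param
    # Each param value spins up a fresh server process under V0 or V1.
    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
    ...  # launch the OpenAI-compatible server with the args above and yield it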
@@ -27,7 +27,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
 
 
 @pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
+def default_server_args(zephyr_lora_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -41,7 +41,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -87,7 +86,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
@@ -115,20 +114,6 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     assert completion.choices[0].prompt_logprobs is None
 
 
-@pytest.mark.asyncio
-async def test_added_lora_tokens(client: openai.AsyncOpenAI):
-    # test using token IDs
-    completion = await client.completions.create(
-        model="zephyr-lora2",
-        prompt=[0, 0, 32000, 32001, 32002],
-        echo=True,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    # Added tokens should appear in tokenized prompt
-    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
-
-
 @pytest.mark.asyncio
 async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
     # test using token IDs
@@ -147,7 +132,7 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -713,7 +698,7 @@ async def test_guided_grammar(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
 async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
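For reference, a minimal standalone sketch (not part of this commit; base_url and api_key are assumed test-server defaults) of the request shape the deleted test_added_lora_tokens exercised: vLLM's completions endpoint accepts a prompt given as raw token IDs, and with echo=True the detokenized prompt is prepended to the generated text, which is what the removed assertion inspected.

import asyncio

import openai


# Hypothetical sketch, not part of this commit.
async def main():
    client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",  # assumed local test server
        api_key="EMPTY",
    )
    completion = await client.completions.create(
        model="zephyr-lora2",
        prompt=[0, 0, 32000, 32001, 32002],  # token IDs, incl. added tokens
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    # With per-adapter tokenizers, IDs 32000-32002 decoded to the adapter's
    # added tokens ("vllm1vllm2vllm3" in the fixture's vocabulary).
    print(completion.choices[0].text)


asyncio.run(main())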
@@ -21,10 +21,7 @@ CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
 
 
 @pytest.fixture(scope="module")
-def default_server_args(
-    zephyr_lora_files,
-    zephyr_lora_added_tokens_files,
-) -> list[str]:
+def default_server_args() -> list[str]:
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -67,12 +67,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
         "base_model_name": MODEL_NAME
     }
 
-    lora_module_2 = {
-        "name": "zephyr-lora2",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -84,7 +78,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
         "--enable-lora",
         "--lora-modules",
         json.dumps(lora_module_1),
-        json.dumps(lora_module_2),
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -121,7 +114,6 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI,
                for lora_model in lora_models)
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"
 
 
 @pytest.mark.asyncio
@@ -209,7 +201,7 @@ async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
 @pytest.mark.asyncio
 async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
                                       zephyr_lora_files):
-    """Validate that many loras can be dynamically registered and inferenced
+    """Validate that many loras can be dynamically registered and inferenced
     with concurrently"""
 
     # This test file configures the server with --max-cpu-loras=2 and this test
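A short sketch (not part of this commit; the path and model name are placeholders) of how the tests above assemble --lora-modules arguments: each adapter can be given either as "name=path" shorthand or as a JSON object, and the JSON form additionally records base_model_name, which test_static_lora_lineage checks via each model's parent field.

import json

# Hypothetical sketch, not part of this commit.
lora_module_1 = {
    "name": "zephyr-lora",
    "path": "/path/to/zephyr_lora_files",  # placeholder path
    "base_model_name": "HuggingFaceH4/zephyr-7b-beta",
}

args = [
    "--enable-lora",
    "--lora-modules",
    json.dumps(lora_module_1),  # JSON form, carries lineage metadata
    # equivalent shorthand: "zephyr-lora=/path/to/zephyr_lora_files"
]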
@@ -26,7 +26,6 @@ def server(zephyr_lora_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -56,4 +55,3 @@ async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
     assert all(lora_model.root == zephyr_lora_files
                for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"
@@ -14,7 +14,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
+def server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -24,12 +24,6 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
         "--enforce-eager",
         "--max-num-seqs",
         "128",
-        # lora config
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
-        "--max-lora-rank",
-        "64",
         "--enable-tokenizer-info-endpoint",
     ]
 
@@ -38,10 +32,8 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
 
 
 @pytest.fixture(scope="module")
-def tokenizer_name(model_name: str,
-                   zephyr_lora_added_tokens_files: str):  # noqa: F811
-    return zephyr_lora_added_tokens_files if (
-        model_name == "zephyr-lora2") else model_name
+def tokenizer_name(model_name: str):
+    return model_name
 
 
 @pytest_asyncio.fixture
@@ -53,7 +45,7 @@ async def client(server):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_completions(
@@ -86,7 +78,7 @@ async def test_tokenize_completions(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_chat(
@@ -148,7 +140,7 @@ async def test_tokenize_chat(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_chat_with_tools(
@@ -225,7 +217,7 @@ async def test_tokenize_chat_with_tools(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name, tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_with_return_token_strs(
@@ -260,7 +252,7 @@ async def test_tokenize_with_return_token_strs(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_detokenize(
@@ -287,7 +279,7 @@ async def test_detokenize(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenizer_info_basic(
@@ -384,4 +376,4 @@ async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
     if chat_template:
         assert isinstance(chat_template,
                           str), ("Chat template should be a string")
-        assert chat_template.strip(), "Chat template should not be empty"
+        assert chat_template.strip(), "Chat template should not be empty"
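A minimal sketch (not part of this commit; test and fixture names are illustrative) of the indirect parametrization the hunks above edit: with indirect=["tokenizer_name"], pytest does not pass the parametrized value straight to the test but routes it through the tokenizer_name fixture, which reads it from request.param.

import pytest


# Hypothetical sketch, not part of this commit.
@pytest.fixture
def tokenizer_name(request):
    # request.param is the second element of each (model_name, tokenizer_name)
    # tuple; after this commit it always resolves to the plain model name.
    return request.param


@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [("HuggingFaceH4/zephyr-7b-beta", "HuggingFaceH4/zephyr-7b-beta")],
    indirect=["tokenizer_name"],
)
def test_example(model_name, tokenizer_name):
    # model_name arrives directly; tokenizer_name went through the fixture.
    assert tokenizer_name == model_name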
@@ -18,6 +18,8 @@ SERVER_ARGS = [
     "--enable-lora",
     "--lora-modules",
     f"{LORA_MODEL}={LORA_MODEL}",
+    "--tokenizer",
+    f"{LORA_MODEL}",
 ]
 
 TOOLS = [{
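Note (an inference, not stated in the commit message): with the per-LoRA tokenizer group removed, the server uses a single tokenizer, so a test that depends on the adapter's tokenizer now points --tokenizer at the adapter repository explicitly, as the two added lines above do.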