[V1] V1 Enablement Oracle (#13726)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
tests/detokenizer/test_stop_strings.py (new file, 141 lines added)
@@ -0,0 +1,141 @@
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional

import pytest

from vllm import LLM, SamplingParams, envs

MODEL = "meta-llama/Llama-2-7b-hf"
MAX_TOKENS = 200

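# Note: the expected outputs below assume the deterministic greedy
# (temperature=0.0) continuation of the prompt for this particular
# checkpoint; a different model or revision may produce different text.
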
def _test_stopping(llm: LLM,
                   expected_output: str,
                   expected_reason: Any,
                   stop: Optional[list[str]] = None,
                   stop_token_ids: Optional[list[int]] = None,
                   include_in_output: bool = False) -> None:
    output = llm.generate(
        "A story about vLLM:\n",
        SamplingParams(
            temperature=0.0,
            max_tokens=MAX_TOKENS,
            stop=stop,
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_in_output,
        ))[0].outputs[0]

    assert output is not None
    assert output.text == expected_output
    assert output.stop_reason == expected_reason

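# `stop_reason` reflects what triggered the stop: the matched stop string
# for `stop`, or the matching token id for `stop_token_ids`, which is why
# the expected reasons below are strings and ints respectively.
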
def _set_async_mode(llm, is_async):
    llm.llm_engine.scheduler[0].use_async_output_proc = is_async

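# Reaching into `llm_engine.scheduler[0]` is a test-only shortcut to flip
# the V0 engine's async output processing on and off, so each case below
# runs under both output-processing paths; the V1 branches run each case
# once, with no such toggle.
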
def _stop_basic(llm):
    _test_stopping(llm,
                   stop=["."],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=".")

    _test_stopping(llm,
                   stop=["."],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization.",
                   expected_reason=".")

def _stop_multi_tokens(llm):
    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
        expected_reason="group of peo")

    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output=
        "VLLM is a 100% volunteer organization. We are a group of peo",
        expected_reason="group of peo")

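# Whichever stop string appears first in the generated text wins: here
# "group of peo" is produced before "short" could ever match, so it is
# the reported stop reason in both cases.
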
def _stop_partial_token(llm):
    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer or",
                   expected_reason="gani")

    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organi",
                   expected_reason="gani")

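# "gani" falls mid-token inside "organization", so the detokenizer must
# truncate within a single decoded token rather than at a token boundary.
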
def _stop_token_id(llm):
    # token id 13013 => " organization"

    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer",
                   expected_reason=13013)

    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=13013)

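# With `stop_token_ids`, include_in_output controls whether the text of
# the stopping token (" organization") is appended to the output, per the
# two cases above; the FIXME below notes that V1 did not yet honor the
# include_in_output=False case here.
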
@pytest.mark.skip_global_cleanup
def test_stop_strings():
    # If V0, must set enforce_eager=False since we use
    # async output processing below.
    vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

    if envs.VLLM_USE_V1:
        _stop_basic(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_basic(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_basic(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_multi_tokens(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_multi_tokens(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_multi_tokens(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_partial_token(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_partial_token(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_partial_token(vllm_model)

    if envs.VLLM_USE_V1:
        # FIXME: this does not respect include_in_output=False
        # _stop_token_id(vllm_model)
        pass
    else:
        _set_async_mode(vllm_model, True)
        _stop_token_id(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_token_id(vllm_model)
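For context, a minimal user-level sketch of the behavior this test exercises; the model name and prompt mirror the constants above, and the attribute names are the ones the test itself relies on:

from vllm import LLM, SamplingParams

llm = LLM("meta-llama/Llama-2-7b-hf")

# Stop at the first "." and omit it from the text, as in _stop_basic.
out = llm.generate(
    "A story about vLLM:\n",
    SamplingParams(temperature=0.0, max_tokens=200, stop=["."]),
)[0].outputs[0]

# stop_reason is the matched stop string here; with stop_token_ids it
# would be the matching token id instead.
print(repr(out.text), out.stop_reason)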