[V1] V1 Enablement Oracle (#13726)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
tests/detokenizer/test_stop_strings.py (new file, 141 lines added)
@@ -0,0 +1,141 @@
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Optional

import pytest

from vllm import LLM, SamplingParams, envs

MODEL = "meta-llama/Llama-2-7b-hf"
MAX_TOKENS = 200

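# Note: the expected outputs below assume the deterministic greedy
# (temperature=0.0) continuation of the prompt for this particular
# checkpoint; a different model or revision may produce different text.
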
def _test_stopping(llm: LLM,
                   expected_output: str,
                   expected_reason: Any,
                   stop: Optional[list[str]] = None,
                   stop_token_ids: Optional[list[int]] = None,
                   include_in_output: bool = False) -> None:
    output = llm.generate(
        "A story about vLLM:\n",
        SamplingParams(
            temperature=0.0,
            max_tokens=MAX_TOKENS,
            stop=stop,
            stop_token_ids=stop_token_ids,
            include_stop_str_in_output=include_in_output,
        ))[0].outputs[0]

    assert output is not None
    assert output.text == expected_output
    assert output.stop_reason == expected_reason

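# `stop_reason` reflects what triggered the stop: the matched stop string
# for `stop`, or the matching token id for `stop_token_ids`, which is why
# the expected reasons below are strings and ints respectively.
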
def _set_async_mode(llm, is_async):
    llm.llm_engine.scheduler[0].use_async_output_proc = is_async

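# Reaching into `llm_engine.scheduler[0]` is a test-only shortcut to flip
# the V0 engine's async output processing on and off, so each case below
# runs under both output-processing paths; the V1 branches run each case
# once, with no such toggle.
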
def _stop_basic(llm):
    _test_stopping(llm,
                   stop=["."],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=".")

    _test_stopping(llm,
                   stop=["."],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization.",
                   expected_reason=".")

def _stop_multi_tokens(llm):
    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=False,
        expected_output="VLLM is a 100% volunteer organization. We are a ",
        expected_reason="group of peo")

    _test_stopping(
        llm,
        stop=["group of peo", "short"],
        include_in_output=True,
        expected_output=
        "VLLM is a 100% volunteer organization. We are a group of peo",
        expected_reason="group of peo")

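# Whichever stop string appears first in the generated text wins: here
# "group of peo" is produced before "short" could ever match, so it is
# the reported stop reason in both cases.
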
def _stop_partial_token(llm):
    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer or",
                   expected_reason="gani")

    _test_stopping(llm,
                   stop=["gani"],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organi",
                   expected_reason="gani")

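# "gani" falls mid-token inside "organization", so the detokenizer must
# truncate within a single decoded token rather than at a token boundary.
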
def _stop_token_id(llm):
    # token id 13013 => " organization"

    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=False,
                   expected_output="VLLM is a 100% volunteer",
                   expected_reason=13013)

    _test_stopping(llm,
                   stop_token_ids=[13013],
                   include_in_output=True,
                   expected_output="VLLM is a 100% volunteer organization",
                   expected_reason=13013)

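# With `stop_token_ids`, include_in_output controls whether the text of
# the stopping token (" organization") is appended to the output, per the
# two cases above; the FIXME below notes that V1 did not yet honor the
# include_in_output=False case here.
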
@pytest.mark.skip_global_cleanup
def test_stop_strings():
    # If V0, must set enforce_eager=False since we use
    # async output processing below.
    vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

    if envs.VLLM_USE_V1:
        _stop_basic(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_basic(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_basic(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_multi_tokens(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_multi_tokens(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_multi_tokens(vllm_model)

    if envs.VLLM_USE_V1:
        _stop_partial_token(vllm_model)
    else:
        _set_async_mode(vllm_model, True)
        _stop_partial_token(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_partial_token(vllm_model)

    if envs.VLLM_USE_V1:
        # FIXME: this does not respect include_in_output=False
        # _stop_token_id(vllm_model)
        pass
    else:
        _set_async_mode(vllm_model, True)
        _stop_token_id(vllm_model)

        _set_async_mode(vllm_model, False)
        _stop_token_id(vllm_model)
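For context, a minimal user-level sketch of the behavior this test exercises; the model name and prompt mirror the constants above, and the attribute names are the ones the test itself relies on:

from vllm import LLM, SamplingParams

llm = LLM("meta-llama/Llama-2-7b-hf")

# Stop at the first "." and omit it from the text, as in _stop_basic.
out = llm.generate(
    "A story about vLLM:\n",
    SamplingParams(temperature=0.0, max_tokens=200, stop=["."]),
)[0].outputs[0]

# stop_reason is the matched stop string here; with stop_token_ids it
# would be the matching token id instead.
print(repr(out.text), out.stop_reason)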