[Frontend] Use new Renderer for Completions and Tokenize API (#32863)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-31 20:51:15 +08:00
committed by GitHub
parent 8980001c93
commit f0a1c8453a
64 changed files with 2116 additions and 2003 deletions

View File

@@ -556,19 +556,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
request_logger=None,
)
async def _fake_process_inputs(
request_id,
engine_prompt,
sampling_params,
*,
lora_request,
trace_headers,
priority,
data_parallel_rank,
):
return dict(engine_prompt), {}
serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
return serving_chat
@@ -784,7 +771,7 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
resp = await serving_chat.create_chat_completion(req)
assert isinstance(resp, ErrorResponse)
assert "max_tokens" in resp.error.message
assert "context length is only" in resp.error.message
@pytest.mark.asyncio
@@ -824,7 +811,7 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
resp = await serving_chat.create_chat_completion(req)
assert isinstance(resp, ErrorResponse)
assert "maximum context length" in resp.error.message
assert "context length is only" in resp.error.message
@pytest.mark.asyncio
@@ -890,6 +877,20 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
serving_chat = _build_serving_chat(mock_engine)
orig_render_chat_request = serving_chat.render_chat_request
captured_prompts = []
async def render_chat_request(request):
    # Wrap the original render_chat_request so the test can observe the
    # engine prompts produced for each request (used below to check
    # whether cache_salt was propagated into the rendered prompt).
    result = await orig_render_chat_request(request)
    # The renderer returns a (conversation, engine_prompts) pair.
    assert isinstance(result, tuple)
    conversation, engine_prompts = result
    # Record every rendered engine prompt for later assertions.
    captured_prompts.extend(engine_prompts)
    return result
serving_chat.render_chat_request = render_chat_request
# Test cache_salt
req = ChatCompletionRequest(
model=MODEL_NAME,
@@ -899,15 +900,19 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
# By default, cache_salt in the engine prompt is not set
with suppress(Exception):
await serving_chat.create_chat_completion(req)
engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
assert "cache_salt" not in engine_prompt
assert len(captured_prompts) == 1
assert "cache_salt" not in captured_prompts[0]
captured_prompts.clear()
# Test with certain cache_salt
req.cache_salt = "test_salt"
with suppress(Exception):
await serving_chat.create_chat_completion(req)
engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
assert engine_prompt.get("cache_salt") == "test_salt"
assert len(captured_prompts) == 1
assert captured_prompts[0]["cache_salt"] == "test_salt"
@pytest.mark.asyncio
@@ -1007,11 +1012,11 @@ class TestServingChatWithHarmony:
@pytest.fixture()
def mock_engine(self) -> AsyncLLM:
    # Build a mocked AsyncLLM for the Harmony serving-chat tests:
    # a spec'd MagicMock with just enough surface for the serving layer.
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    # Renderer constructed from the mock model config — the new
    # Renderer-based path this commit migrates the frontend to.
    mock_engine.renderer = _build_renderer(mock_engine.model_config)
    return mock_engine
@pytest.fixture()
@@ -1618,11 +1623,11 @@ async def test_tool_choice_validation_without_parser():
"""Test that tool_choice='required' or named tool without tool_parser
returns an appropriate error message."""
mock_engine = MagicMock(spec=AsyncLLM)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
mock_engine.renderer = _build_renderer(mock_engine.model_config)
models = OpenAIServingModels(
engine_client=mock_engine,