[Frontend] Use new Renderer for Completions and Tokenize API (#32863)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -556,19 +556,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||
request_logger=None,
|
||||
)
|
||||
|
||||
async def _fake_process_inputs(
|
||||
request_id,
|
||||
engine_prompt,
|
||||
sampling_params,
|
||||
*,
|
||||
lora_request,
|
||||
trace_headers,
|
||||
priority,
|
||||
data_parallel_rank,
|
||||
):
|
||||
return dict(engine_prompt), {}
|
||||
|
||||
serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
|
||||
return serving_chat
|
||||
|
||||
|
||||
@@ -784,7 +771,7 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
|
||||
|
||||
resp = await serving_chat.create_chat_completion(req)
|
||||
assert isinstance(resp, ErrorResponse)
|
||||
assert "max_tokens" in resp.error.message
|
||||
assert "context length is only" in resp.error.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -824,7 +811,7 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
|
||||
|
||||
resp = await serving_chat.create_chat_completion(req)
|
||||
assert isinstance(resp, ErrorResponse)
|
||||
assert "maximum context length" in resp.error.message
|
||||
assert "context length is only" in resp.error.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -890,6 +877,20 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
|
||||
|
||||
serving_chat = _build_serving_chat(mock_engine)
|
||||
|
||||
orig_render_chat_request = serving_chat.render_chat_request
|
||||
captured_prompts = []
|
||||
|
||||
async def render_chat_request(request):
|
||||
result = await orig_render_chat_request(request)
|
||||
|
||||
assert isinstance(result, tuple)
|
||||
conversation, engine_prompts = result
|
||||
captured_prompts.extend(engine_prompts)
|
||||
|
||||
return result
|
||||
|
||||
serving_chat.render_chat_request = render_chat_request
|
||||
|
||||
# Test cache_salt
|
||||
req = ChatCompletionRequest(
|
||||
model=MODEL_NAME,
|
||||
@@ -899,15 +900,19 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
|
||||
# By default, cache_salt in the engine prompt is not set
|
||||
with suppress(Exception):
|
||||
await serving_chat.create_chat_completion(req)
|
||||
engine_prompt = serving_chat._process_inputs.await_args_list[0].args[1]
|
||||
assert "cache_salt" not in engine_prompt
|
||||
|
||||
assert len(captured_prompts) == 1
|
||||
assert "cache_salt" not in captured_prompts[0]
|
||||
|
||||
captured_prompts.clear()
|
||||
|
||||
# Test with certain cache_salt
|
||||
req.cache_salt = "test_salt"
|
||||
with suppress(Exception):
|
||||
await serving_chat.create_chat_completion(req)
|
||||
engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
|
||||
assert engine_prompt.get("cache_salt") == "test_salt"
|
||||
|
||||
assert len(captured_prompts) == 1
|
||||
assert captured_prompts[0]["cache_salt"] == "test_salt"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -1007,11 +1012,11 @@ class TestServingChatWithHarmony:
|
||||
@pytest.fixture()
|
||||
def mock_engine(self) -> AsyncLLM:
|
||||
mock_engine = MagicMock(spec=AsyncLLM)
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = MockModelConfig()
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
mock_engine.renderer = _build_renderer(mock_engine.model_config)
|
||||
return mock_engine
|
||||
|
||||
@pytest.fixture()
|
||||
@@ -1618,11 +1623,11 @@ async def test_tool_choice_validation_without_parser():
|
||||
"""Test that tool_choice='required' or named tool without tool_parser
|
||||
returns an appropriate error message."""
|
||||
mock_engine = MagicMock(spec=AsyncLLM)
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = MockModelConfig()
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
mock_engine.renderer = _build_renderer(mock_engine.model_config)
|
||||
|
||||
models = OpenAIServingModels(
|
||||
engine_client=mock_engine,
|
||||
|
||||
Reference in New Issue
Block a user