Extend renderer with embedding support and integrate completion endpoint (#24405)

Signed-off-by: sfeng33 <4florafeng@gmail.com>
This commit is contained in:
Flora Feng
2025-09-09 10:46:46 -07:00
committed by GitHub
parent 9ad0688e43
commit 15cb047e25
9 changed files with 410 additions and 309 deletions

View File

@@ -10,7 +10,7 @@ import pytest
import regex as re
import torch
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.renderer import BaseRenderer
from ...utils import RemoteOpenAIServer
@@ -27,12 +27,16 @@ async def test_empty_prompt():
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError,
match="decoder prompt cannot be empty"):
with pytest.raises(
openai.BadRequestError,
match=
"Either prompt or prompt_embeds must be provided and non-empty."
):
await client.completions.create(model=model_name,
prompt="",
max_tokens=5,
temperature=0.0)
temperature=0.0,
extra_body={"prompt_embeds": []})
@pytest.mark.asyncio
@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
buffer.seek(0)
encoded_tensor = pybase64.b64encode(buffer.getvalue())
loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor)
loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
assert len(loaded_prompt_embeds) == 1
loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
assert loaded_tensor.device.type == "cpu"