[FEATURE] Enables /score endpoint for embedding models (#12846)

This commit is contained in:
Gabriel Marinho
2025-02-21 03:09:47 -03:00
committed by GitHub
parent 1cdc88614a
commit 1c3c975766
11 changed files with 590 additions and 513 deletions

View File

@@ -26,7 +26,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.entrypoints.openai.serving_score import OpenAIServingScores
from vllm.entrypoints.openai.serving_score import ServingScores
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser, random_uuid
from vllm.version import __version__ as VLLM_VERSION
@@ -342,7 +342,7 @@ async def main(args):
chat_template=None,
chat_template_content_format="auto",
) if model_config.task == "embed" else None
openai_serving_scores = (OpenAIServingScores(
openai_serving_scores = (ServingScores(
engine,
model_config,
openai_serving_models,
@@ -364,9 +364,9 @@ async def main(args):
# Determine the type of request and run it.
if request.url == "/v1/chat/completions":
handler_fn = (None if openai_serving_chat is None else
openai_serving_chat.create_chat_completion)
if handler_fn is None:
chat_handler_fn = (None if openai_serving_chat is None else
openai_serving_chat.create_chat_completion)
if chat_handler_fn is None:
response_futures.append(
make_async_error_request_output(
request,
@@ -375,12 +375,13 @@ async def main(args):
))
continue
response_futures.append(run_request(handler_fn, request, tracker))
response_futures.append(
run_request(chat_handler_fn, request, tracker))
tracker.submitted()
elif request.url == "/v1/embeddings":
handler_fn = (None if openai_serving_embedding is None else
openai_serving_embedding.create_embedding)
if handler_fn is None:
embed_handler_fn = (None if openai_serving_embedding is None else
openai_serving_embedding.create_embedding)
if embed_handler_fn is None:
response_futures.append(
make_async_error_request_output(
request,
@@ -388,12 +389,13 @@ async def main(args):
))
continue
response_futures.append(run_request(handler_fn, request, tracker))
response_futures.append(
run_request(embed_handler_fn, request, tracker))
tracker.submitted()
elif request.url == "/v1/score":
handler_fn = (None if openai_serving_scores is None else
openai_serving_scores.create_score)
if handler_fn is None:
score_handler_fn = (None if openai_serving_scores is None else
openai_serving_scores.create_score)
if score_handler_fn is None:
response_futures.append(
make_async_error_request_output(
request,
@@ -401,7 +403,8 @@ async def main(args):
))
continue
response_futures.append(run_request(handler_fn, request, tracker))
response_futures.append(
run_request(score_handler_fn, request, tracker))
tracker.submitted()
else:
response_futures.append(