[Frontend][4/n] Make pooling entrypoints request schema consensus | ScoreRequest (#33060)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
@@ -89,6 +89,29 @@ def main(args):
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
"documents": {"content": [documents[0], documents[1]]},
|
||||
}
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: list")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
"documents": [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
],
|
||||
}
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
@@ -92,6 +92,44 @@ def main(args):
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": query,
|
||||
"documents": {"content": [documents[0], documents[1]]},
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: list")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": query,
|
||||
"documents": [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
],
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: list & Document: list")
|
||||
data = [
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
]
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": data,
|
||||
"documents": data,
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
@@ -90,6 +90,35 @@ class TestModel:
|
||||
for i in range(len(vllm_outputs)):
|
||||
assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
|
||||
|
||||
def test_queries_str_items_str(
    self, server: RemoteOpenAIServer, model: dict[str, Any], runner
):
    """Score one query string against one item string via ``queries``/``items``.

    Posts the pair to the server's ``score`` route, checks the response
    envelope, and compares every returned score with the output of
    ``run_transformers`` for the same text pair (1% relative tolerance).
    """
    query_text = "What is the capital of France?"
    item_text = "The capital of France is Paris."

    payload = {
        "model": model["name"],
        "queries": query_text,
        "items": item_text,
    }
    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1

    vllm_outputs = [entry.score for entry in score.data]
    hf_outputs = run_transformers(runner, model, [[query_text, item_text]])

    # Index into hf_outputs by position so a short reference list still fails.
    for position, vllm_score in enumerate(vllm_outputs):
        assert hf_outputs[position] == pytest.approx(vllm_score, rel=0.01)
|
||||
|
||||
def test_text_1_str_text_2_str(
|
||||
self, server: RemoteOpenAIServer, model: dict[str, Any], runner
|
||||
):
|
||||
|
||||
@@ -5,7 +5,7 @@ import pytest
|
||||
import requests
|
||||
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.score.protocol import ScoreResponse
|
||||
from vllm.entrypoints.pooling.score.protocol import RerankResponse, ScoreResponse
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen3-VL-Reranker-2B"
|
||||
@@ -16,11 +16,12 @@ HF_OVERRIDES = {
|
||||
}
|
||||
|
||||
query = "A cat standing in the snow."
|
||||
document = "This product was excellent and exceeded my expectations."
|
||||
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
|
||||
documents = [
|
||||
{
|
||||
"type": "text",
|
||||
"text": query,
|
||||
"text": document,
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
@@ -32,6 +33,11 @@ documents = [
|
||||
},
|
||||
]
|
||||
|
||||
TEXT_VS_TEXT = 0.10040374100208282
|
||||
TEXT_VS_IMAGE = 0.7423753142356873
|
||||
TEXT_VS_TEXT_PLUS_IMAGE = 0.5298863053321838
|
||||
TOL = 0.05
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
@@ -50,15 +56,12 @@ def server():
|
||||
|
||||
|
||||
def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
|
||||
queries = "What is the capital of France?"
|
||||
documents = "The capital of France is Paris."
|
||||
|
||||
score_response = requests.post(
|
||||
server.url_for("score"),
|
||||
json={
|
||||
"model": MODEL_NAME,
|
||||
"queries": queries,
|
||||
"documents": documents,
|
||||
"queries": query,
|
||||
"documents": document,
|
||||
},
|
||||
)
|
||||
score_response.raise_for_status()
|
||||
@@ -67,6 +70,8 @@ def test_score_api_queries_str_documents_str(server: RemoteOpenAIServer):
|
||||
assert score.id is not None
|
||||
assert score.data is not None
|
||||
assert len(score.data) == 1
|
||||
assert score.usage.prompt_tokens == 81
|
||||
assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
|
||||
|
||||
|
||||
def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer):
|
||||
@@ -84,6 +89,8 @@ def test_score_api_queries_str_documents_text_content(server: RemoteOpenAIServer
|
||||
assert score.id is not None
|
||||
assert score.data is not None
|
||||
assert len(score.data) == 1
|
||||
assert score.usage.prompt_tokens == 81
|
||||
assert score.data[0].score == pytest.approx(TEXT_VS_TEXT, rel=TOL)
|
||||
|
||||
|
||||
def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIServer):
|
||||
@@ -101,6 +108,8 @@ def test_score_api_queries_str_documents_image_url_content(server: RemoteOpenAIS
|
||||
assert score.id is not None
|
||||
assert score.data is not None
|
||||
assert len(score.data) == 1
|
||||
assert score.usage.prompt_tokens == 98
|
||||
assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
|
||||
|
||||
|
||||
def test_score_api_queries_str_documents_image_base64_content(
|
||||
@@ -120,3 +129,111 @@ def test_score_api_queries_str_documents_image_base64_content(
|
||||
assert score.id is not None
|
||||
assert score.data is not None
|
||||
assert len(score.data) == 1
|
||||
assert score.usage.prompt_tokens == 98
|
||||
assert score.data[0].score == pytest.approx(TEXT_VS_IMAGE, rel=TOL)
|
||||
|
||||
|
||||
def test_score_api_queries_str_documents_image_url_plus_text_content(
    server: RemoteOpenAIServer,
):
    """Score a text query against one document that holds two content parts.

    The single document's ``content`` list carries both ``documents[0]`` and
    ``documents[1]``, so exactly one score is expected back.
    """
    combined_document = {"content": [documents[0], documents[1]]}
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": combined_document,
    }

    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1
    assert score.usage.prompt_tokens == 108
    assert score.data[0].score == pytest.approx(TEXT_VS_TEXT_PLUS_IMAGE, rel=TOL)
|
||||
|
||||
|
||||
def test_score_api_queries_str_documents_list(server: RemoteOpenAIServer):
    """Score one query string against a mixed list of four documents.

    The list holds a plain string, a ``content`` dict with ``documents[0]``,
    a ``content`` dict with ``documents[1]``, and a ``content`` dict with
    both parts. Scores are checked position by position.
    """
    candidate_documents = [
        document,
        {"content": [documents[0]]},
        {"content": [documents[1]]},
        {"content": [documents[0], documents[1]]},
    ]
    payload = {
        "model": MODEL_NAME,
        "queries": query,
        "documents": candidate_documents,
    }

    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 4
    assert score.usage.prompt_tokens == 368

    # Expected score for each document, in request order.
    expected_scores = (
        TEXT_VS_TEXT,
        TEXT_VS_TEXT,
        TEXT_VS_IMAGE,
        TEXT_VS_TEXT_PLUS_IMAGE,
    )
    for position, expected in enumerate(expected_scores):
        assert score.data[position].score == pytest.approx(expected, rel=TOL)
|
||||
|
||||
|
||||
def test_rerank_api_queries_str_documents_list(server: RemoteOpenAIServer):
    """Rerank a mixed list of four documents against one query string.

    The request goes to the ``rerank`` route with a ``query`` field. The
    results are sorted by their ``index`` before the relevance scores are
    compared position by position.
    """
    candidate_documents = [
        document,
        {"content": [documents[0]]},
        {"content": [documents[1]]},
        {"content": [documents[0], documents[1]]},
    ]
    payload = {
        "model": MODEL_NAME,
        "query": query,
        "documents": candidate_documents,
    }

    rerank_response = requests.post(server.url_for("rerank"), json=payload)
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.model is not None
    assert rerank.usage is not None
    assert len(rerank.results) == 4

    # Order by each result's index so positions line up with the request.
    rerank.results.sort(key=lambda result: result.index)

    expected_scores = (
        TEXT_VS_TEXT,
        TEXT_VS_TEXT,
        TEXT_VS_IMAGE,
        TEXT_VS_TEXT_PLUS_IMAGE,
    )
    for position, expected in enumerate(expected_scores):
        assert rerank.results[position].relevance_score == pytest.approx(
            expected, rel=TOL
        )
|
||||
|
||||
|
||||
def test_score_api_queries_list_documents_list(server: RemoteOpenAIServer):
    """Score a list of four queries against a list of four documents.

    ``queries`` is the same query string repeated four times; ``documents``
    is the mixed list (plain string, ``documents[0]`` only,
    ``documents[1]`` only, both parts). Four scores are expected.
    """
    candidate_documents = [
        document,
        {"content": [documents[0]]},
        {"content": [documents[1]]},
        {"content": [documents[0], documents[1]]},
    ]
    payload = {
        "model": MODEL_NAME,
        "queries": [query] * 4,
        "documents": candidate_documents,
    }

    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 4
    assert score.usage.prompt_tokens == 368

    # Expected score for each (query, document) pair, in request order.
    expected_scores = (
        TEXT_VS_TEXT,
        TEXT_VS_TEXT,
        TEXT_VS_IMAGE,
        TEXT_VS_TEXT_PLUS_IMAGE,
    )
    for position, expected in enumerate(expected_scores):
        assert score.data[position].score == pytest.approx(expected, rel=TOL)
|
||||
|
||||
@@ -40,12 +40,12 @@ from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.utils import (
|
||||
ScoreContentPartParam,
|
||||
ScoreData,
|
||||
ScoreMultiModalParam,
|
||||
_cosine_similarity,
|
||||
_validate_score_input_lens,
|
||||
compress_token_type_ids,
|
||||
get_score_prompt,
|
||||
validate_score_input,
|
||||
)
|
||||
from vllm.entrypoints.utils import log_non_default_args
|
||||
from vllm.inputs import (
|
||||
@@ -1326,8 +1326,8 @@ class LLM:
|
||||
|
||||
def _embedding_score(
|
||||
self,
|
||||
text_1: list[SingletonPrompt],
|
||||
text_2: list[SingletonPrompt],
|
||||
data_1: list[ScoreData],
|
||||
data_2: list[ScoreData],
|
||||
*,
|
||||
use_tqdm: bool | Callable[..., tqdm],
|
||||
pooling_params: PoolingParams | None,
|
||||
@@ -1336,8 +1336,16 @@ class LLM:
|
||||
) -> list[ScoringRequestOutput]:
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
input_texts: list[str] = []
|
||||
for text in data_1 + data_2:
|
||||
if not isinstance(text, str):
|
||||
raise NotImplementedError(
|
||||
"Embedding scores currently do not support multimodal input."
|
||||
)
|
||||
input_texts.append(text)
|
||||
|
||||
encoded_output = self.encode(
|
||||
text_1 + text_2,
|
||||
input_texts,
|
||||
use_tqdm=use_tqdm,
|
||||
lora_request=lora_request,
|
||||
pooling_params=pooling_params,
|
||||
@@ -1345,8 +1353,8 @@ class LLM:
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
encoded_output_1 = encoded_output[0 : len(text_1)]
|
||||
encoded_output_2 = encoded_output[len(text_1) :]
|
||||
encoded_output_1 = encoded_output[0 : len(data_1)]
|
||||
encoded_output_2 = encoded_output[len(data_1) :]
|
||||
|
||||
if len(encoded_output_1) == 1:
|
||||
encoded_output_1 = encoded_output_1 * len(encoded_output_2)
|
||||
@@ -1362,8 +1370,8 @@ class LLM:
|
||||
|
||||
def _cross_encoding_score(
|
||||
self,
|
||||
data_1: list[str] | list[ScoreContentPartParam],
|
||||
data_2: list[str] | list[ScoreContentPartParam],
|
||||
data_1: list[ScoreData],
|
||||
data_2: list[ScoreData],
|
||||
*,
|
||||
use_tqdm: bool | Callable[..., tqdm],
|
||||
pooling_params: PoolingParams | None,
|
||||
@@ -1424,8 +1432,14 @@ class LLM:
|
||||
|
||||
def score(
|
||||
self,
|
||||
data_1: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
|
||||
data_2: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
|
||||
data_1: SingletonPrompt
|
||||
| Sequence[SingletonPrompt]
|
||||
| ScoreMultiModalParam
|
||||
| list[ScoreMultiModalParam],
|
||||
data_2: SingletonPrompt
|
||||
| Sequence[SingletonPrompt]
|
||||
| ScoreMultiModalParam
|
||||
| list[ScoreMultiModalParam],
|
||||
/,
|
||||
*,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
@@ -1501,73 +1515,23 @@ class LLM:
|
||||
"chat_template is only supported for cross-encoder models."
|
||||
)
|
||||
|
||||
# the tokenizer for models such as
|
||||
# "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
|
||||
# lists of tokens to the `text` and `text_pair` kwargs
|
||||
tokenizer = self.get_tokenizer()
|
||||
is_multimodal_model = model_config.is_multimodal_model
|
||||
architecture = model_config.architecture
|
||||
|
||||
if not model_config.is_multimodal_model:
|
||||
|
||||
def check_data_type(
|
||||
data: SingletonPrompt
|
||||
| Sequence[SingletonPrompt]
|
||||
| ScoreMultiModalParam,
|
||||
):
|
||||
if isinstance(data, dict) and "content" in data:
|
||||
raise ValueError(
|
||||
"ScoreMultiModalParam is not supported "
|
||||
f"for {model_config.architecture}"
|
||||
)
|
||||
|
||||
check_data_type(data_1)
|
||||
check_data_type(data_2)
|
||||
|
||||
def ensure_str(prompt: SingletonPrompt):
|
||||
if isinstance(prompt, dict):
|
||||
if "multi_modal_data" in prompt:
|
||||
raise ValueError(
|
||||
"Multi-modal prompt is not supported for scoring"
|
||||
)
|
||||
elif "prompt_token_ids" in prompt:
|
||||
prompt = tokenizer.decode(
|
||||
cast(TokensPrompt, prompt)["prompt_token_ids"]
|
||||
)
|
||||
elif "prompt" in prompt:
|
||||
prompt = cast(TextPrompt, prompt)["prompt"]
|
||||
assert type(prompt) is str
|
||||
return prompt
|
||||
|
||||
if isinstance(data_1, (str, dict)):
|
||||
# Convert a single prompt to a list.
|
||||
data_1 = [data_1] # type: ignore[list-item]
|
||||
|
||||
data_1 = [ensure_str(t) for t in data_1]
|
||||
|
||||
if isinstance(data_2, (str, dict)):
|
||||
# Convert a single prompt to a list.
|
||||
data_2 = [data_2] # type: ignore[list-item]
|
||||
|
||||
data_2 = [ensure_str(t) for t in data_2]
|
||||
|
||||
if isinstance(data_1, dict) and "content" in data_1:
|
||||
data_1 = data_1.get("content") # type: ignore[assignment]
|
||||
elif isinstance(data_1, str):
|
||||
data_1 = [data_1]
|
||||
|
||||
if isinstance(data_2, dict) and "content" in data_2:
|
||||
data_2 = data_2.get("content") # type: ignore[assignment]
|
||||
elif isinstance(data_2, str):
|
||||
data_2 = [data_2]
|
||||
|
||||
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
|
||||
score_data_1, score_data_2 = validate_score_input(
|
||||
data_1, # type: ignore[arg-type]
|
||||
data_2, # type: ignore[arg-type]
|
||||
is_multimodal_model=is_multimodal_model,
|
||||
architecture=architecture,
|
||||
)
|
||||
|
||||
tok_params = self._get_cmpl_tok_params(tokenization_kwargs)
|
||||
encode_kwargs = tok_params.get_encode_kwargs()
|
||||
|
||||
if model_config.is_cross_encoder:
|
||||
return self._cross_encoding_score(
|
||||
data_1, # type: ignore[arg-type]
|
||||
data_2, # type: ignore[arg-type]
|
||||
score_data_1,
|
||||
score_data_2,
|
||||
use_tqdm=use_tqdm,
|
||||
pooling_params=pooling_params,
|
||||
lora_request=lora_request,
|
||||
@@ -1576,8 +1540,8 @@ class LLM:
|
||||
)
|
||||
else:
|
||||
return self._embedding_score(
|
||||
data_1, # type: ignore[arg-type]
|
||||
data_2, # type: ignore[arg-type]
|
||||
score_data_1,
|
||||
score_data_2,
|
||||
use_tqdm=use_tqdm,
|
||||
pooling_params=pooling_params,
|
||||
lora_request=lora_request,
|
||||
|
||||
@@ -14,7 +14,8 @@ from vllm.entrypoints.pooling.base.protocol import (
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.utils import (
|
||||
ScoreContentPartParam,
|
||||
ScoreMultiModalParam,
|
||||
ScoreInput,
|
||||
ScoreInputs,
|
||||
)
|
||||
from vllm.renderers import TokenizeParams
|
||||
from vllm.utils import random_uuid
|
||||
@@ -47,13 +48,13 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
|
||||
|
||||
class ScoreDataRequest(ScoreRequestMixin):
|
||||
data_1: list[str] | str | ScoreMultiModalParam
|
||||
data_2: list[str] | str | ScoreMultiModalParam
|
||||
data_1: ScoreInputs
|
||||
data_2: ScoreInputs
|
||||
|
||||
|
||||
class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
|
||||
queries: list[str] | str | ScoreMultiModalParam
|
||||
documents: list[str] | str | ScoreMultiModalParam
|
||||
queries: ScoreInputs
|
||||
documents: ScoreInputs
|
||||
|
||||
@property
|
||||
def data_1(self):
|
||||
@@ -64,9 +65,22 @@ class ScoreQueriesDocumentsRequest(ScoreRequestMixin):
|
||||
return self.documents
|
||||
|
||||
|
||||
# Score request variant whose two sides are named ``queries`` and ``items``.
# ``data_1`` / ``data_2`` are read-only aliases for those two fields.
class ScoreQueriesItemsRequest(ScoreRequestMixin):
    # First side of each scored pair.
    queries: ScoreInputs
    # Second side of each scored pair.
    items: ScoreInputs

    @property
    def data_1(self):
        """Return ``queries``."""
        return self.queries

    @property
    def data_2(self):
        """Return ``items``."""
        return self.items
|
||||
|
||||
|
||||
class ScoreTextRequest(ScoreRequestMixin):
|
||||
text_1: list[str] | str | ScoreMultiModalParam
|
||||
text_2: list[str] | str | ScoreMultiModalParam
|
||||
text_1: ScoreInputs
|
||||
text_2: ScoreInputs
|
||||
|
||||
@property
|
||||
def data_1(self):
|
||||
@@ -78,13 +92,16 @@ class ScoreTextRequest(ScoreRequestMixin):
|
||||
|
||||
|
||||
ScoreRequest: TypeAlias = (
|
||||
ScoreQueriesDocumentsRequest | ScoreDataRequest | ScoreTextRequest
|
||||
ScoreQueriesDocumentsRequest
|
||||
| ScoreQueriesItemsRequest
|
||||
| ScoreDataRequest
|
||||
| ScoreTextRequest
|
||||
)
|
||||
|
||||
|
||||
class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
query: str | ScoreMultiModalParam
|
||||
documents: list[str] | ScoreMultiModalParam
|
||||
query: ScoreInput
|
||||
documents: ScoreInputs
|
||||
top_n: int = Field(default_factory=lambda: 0)
|
||||
|
||||
# --8<-- [start:rerank-extra-params]
|
||||
@@ -108,7 +125,7 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
|
||||
class RerankDocument(BaseModel):
|
||||
text: str | None = None
|
||||
multi_modal: ScoreContentPartParam | None = None
|
||||
multi_modal: list[ScoreContentPartParam] | None = None
|
||||
|
||||
|
||||
class RerankResult(BaseModel):
|
||||
|
||||
@@ -27,12 +27,12 @@ from vllm.entrypoints.pooling.score.protocol import (
|
||||
ScoreResponseData,
|
||||
)
|
||||
from vllm.entrypoints.pooling.score.utils import (
|
||||
ScoreContentPartParam,
|
||||
ScoreMultiModalParam,
|
||||
ScoreData,
|
||||
ScoreInputs,
|
||||
_cosine_similarity,
|
||||
_validate_score_input_lens,
|
||||
compress_token_type_ids,
|
||||
get_score_prompt,
|
||||
validate_score_input,
|
||||
)
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
@@ -65,15 +65,32 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
self.is_cross_encoder = self.model_config.is_cross_encoder
|
||||
self.is_multimodal_model = self.model_config.is_multimodal_model
|
||||
self.architecture = self.model_config.architecture
|
||||
|
||||
if self.is_cross_encoder:
|
||||
self._score_func = self._cross_encoding_score
|
||||
else:
|
||||
self._score_func = self._embedding_score
|
||||
|
||||
async def _embedding_score(
|
||||
self,
|
||||
data_1: list[str],
|
||||
data_2: list[str],
|
||||
data_1: list[ScoreData],
|
||||
data_2: list[ScoreData],
|
||||
request: RerankRequest | ScoreRequest,
|
||||
request_id: str,
|
||||
lora_request: LoRARequest | None | None = None,
|
||||
trace_headers: Mapping[str, str] | None = None,
|
||||
) -> list[PoolingRequestOutput] | ErrorResponse:
|
||||
input_texts: list[str] = []
|
||||
for text in data_1 + data_2:
|
||||
if not isinstance(text, str):
|
||||
raise NotImplementedError(
|
||||
"Embedding scores currently do not support multimodal input."
|
||||
)
|
||||
input_texts.append(text)
|
||||
|
||||
model_config = self.model_config
|
||||
tokenizer = self.renderer.get_tokenizer()
|
||||
|
||||
@@ -82,8 +99,6 @@ class ServingScores(OpenAIServing):
|
||||
executor=self._tokenizer_executor,
|
||||
)
|
||||
|
||||
input_texts = data_1 + data_2
|
||||
|
||||
tokenization_kwargs = request.build_tok_params(model_config).get_encode_kwargs()
|
||||
tokenized_prompts = await asyncio.gather(
|
||||
*(encode_async(t, **tokenization_kwargs) for t in input_texts)
|
||||
@@ -157,60 +172,30 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
return final_res_batch
|
||||
|
||||
def _preprocess_score(
|
||||
self,
|
||||
request: RerankRequest | ScoreRequest,
|
||||
tokenizer: TokenizerLike,
|
||||
tokenization_kwargs: dict[str, Any],
|
||||
data_1: str | ScoreContentPartParam,
|
||||
data_2: str | ScoreContentPartParam,
|
||||
) -> tuple[str, TokensPrompt]:
|
||||
model_config = self.model_config
|
||||
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
model_config=model_config,
|
||||
data_1=data_1,
|
||||
data_2=data_2,
|
||||
tokenizer=tokenizer,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
score_template=self.score_template,
|
||||
)
|
||||
self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)
|
||||
if request.mm_processor_kwargs is not None:
|
||||
engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs
|
||||
|
||||
return full_prompt, engine_prompt
|
||||
|
||||
async def _cross_encoding_score(
|
||||
self,
|
||||
data_1: list[str] | list[ScoreContentPartParam],
|
||||
data_2: list[str] | list[ScoreContentPartParam],
|
||||
data_1: list[ScoreData],
|
||||
data_2: list[ScoreData],
|
||||
request: RerankRequest | ScoreRequest,
|
||||
request_id: str,
|
||||
lora_request: LoRARequest | None | None = None,
|
||||
trace_headers: Mapping[str, str] | None = None,
|
||||
) -> list[PoolingRequestOutput] | ErrorResponse:
|
||||
model_config = self.model_config
|
||||
tokenizer = self.renderer.get_tokenizer()
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
raise ValueError("MistralTokenizer not supported for cross-encoding")
|
||||
|
||||
request_prompts: list[str] = []
|
||||
engine_prompts: list[TokensPrompt] = []
|
||||
model_config = self.model_config
|
||||
|
||||
if len(data_1) == 1:
|
||||
data_1 = data_1 * len(data_2)
|
||||
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
raise ValueError("MistralTokenizer not supported for cross-encoding")
|
||||
|
||||
tok_kwargs = request.build_tok_params(model_config).get_encode_kwargs()
|
||||
|
||||
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
|
||||
|
||||
preprocess_async = make_async(
|
||||
self._preprocess_score,
|
||||
executor=self._tokenizer_executor,
|
||||
)
|
||||
|
||||
preprocessed_prompts = await asyncio.gather(
|
||||
*(
|
||||
preprocess_async(
|
||||
@@ -224,6 +209,8 @@ class ServingScores(OpenAIServing):
|
||||
)
|
||||
)
|
||||
|
||||
request_prompts: list[str] = []
|
||||
engine_prompts: list[TokensPrompt] = []
|
||||
for full_prompt, engine_prompt in preprocessed_prompts:
|
||||
request_prompts.append(full_prompt)
|
||||
engine_prompts.append(engine_prompt)
|
||||
@@ -278,10 +265,33 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
return [out for out in final_res_batch if out is not None]
|
||||
|
||||
def _preprocess_score(
    self,
    request: RerankRequest | ScoreRequest,
    tokenizer: TokenizerLike,
    tokenization_kwargs: dict[str, Any],
    data_1: ScoreData,
    data_2: ScoreData,
) -> tuple[str, TokensPrompt]:
    """Build and validate the engine prompt for one ``(data_1, data_2)`` pair.

    Delegates prompt construction to ``get_score_prompt`` using this
    server's model config and score template, validates the resulting
    token ids with ``_validate_input``, and copies the request's
    ``mm_processor_kwargs`` onto the engine prompt when they are set.

    Returns:
        The ``(full_prompt, engine_prompt)`` pair from ``get_score_prompt``.
    """
    full_prompt, engine_prompt = get_score_prompt(
        model_config=self.model_config,
        data_1=data_1,
        data_2=data_2,
        tokenizer=tokenizer,
        tokenization_kwargs=tokenization_kwargs,
        score_template=self.score_template,
    )
    self._validate_input(request, engine_prompt["prompt_token_ids"], full_prompt)

    mm_kwargs = request.mm_processor_kwargs
    if mm_kwargs is not None:
        engine_prompt["mm_processor_kwargs"] = mm_kwargs

    return full_prompt, engine_prompt
|
||||
|
||||
async def _run_scoring(
|
||||
self,
|
||||
data_1: list[str] | str | ScoreMultiModalParam,
|
||||
data_2: list[str] | str | ScoreMultiModalParam,
|
||||
data_1: ScoreInputs,
|
||||
data_2: ScoreInputs,
|
||||
request: ScoreRequest | RerankRequest,
|
||||
request_id: str,
|
||||
raw_request: Request | None = None,
|
||||
@@ -294,44 +304,21 @@ class ServingScores(OpenAIServing):
|
||||
else await self._get_trace_headers(raw_request.headers)
|
||||
)
|
||||
|
||||
if not self.model_config.is_multimodal_model and (
|
||||
isinstance(data_1, dict) or isinstance(data_2, dict)
|
||||
):
|
||||
raise ValueError(
|
||||
f"MultiModalParam is not supported for {self.model_config.architecture}" # noqa: E501
|
||||
)
|
||||
score_data_1, score_data_2 = validate_score_input(
|
||||
data_1,
|
||||
data_2,
|
||||
is_multimodal_model=self.is_multimodal_model,
|
||||
architecture=self.architecture,
|
||||
)
|
||||
|
||||
if isinstance(data_1, str):
|
||||
data_1 = [data_1]
|
||||
elif isinstance(data_1, dict):
|
||||
data_1 = data_1.get("content") # type: ignore[assignment]
|
||||
|
||||
if isinstance(data_2, str):
|
||||
data_2 = [data_2]
|
||||
elif isinstance(data_2, dict):
|
||||
data_2 = data_2.get("content") # type: ignore[assignment]
|
||||
|
||||
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
|
||||
|
||||
if self.model_config.is_cross_encoder:
|
||||
return await self._cross_encoding_score(
|
||||
data_1=data_1, # type: ignore[arg-type]
|
||||
data_2=data_2, # type: ignore[arg-type]
|
||||
request=request,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
else:
|
||||
return await self._embedding_score(
|
||||
data_1=data_1, # type: ignore[arg-type]
|
||||
data_2=data_2, # type: ignore[arg-type]
|
||||
request=request,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
return await self._score_func(
|
||||
data_1=score_data_1,
|
||||
data_2=score_data_2,
|
||||
request=request,
|
||||
request_id=request_id,
|
||||
lora_request=lora_request,
|
||||
trace_headers=trace_headers,
|
||||
)
|
||||
|
||||
async def create_score(
|
||||
self,
|
||||
@@ -391,15 +378,6 @@ class ServingScores(OpenAIServing):
|
||||
|
||||
request_id = f"rerank-{self._base_request_id(raw_request)}"
|
||||
documents = request.documents
|
||||
top_n = (
|
||||
request.top_n
|
||||
if request.top_n > 0
|
||||
else (
|
||||
len(documents)
|
||||
if isinstance(documents, list)
|
||||
else len(documents["content"])
|
||||
)
|
||||
)
|
||||
|
||||
try:
|
||||
final_res_batch = await self._run_scoring(
|
||||
@@ -412,6 +390,8 @@ class ServingScores(OpenAIServing):
|
||||
if isinstance(final_res_batch, ErrorResponse):
|
||||
return final_res_batch
|
||||
|
||||
top_n = request.top_n if request.top_n > 0 else len(final_res_batch)
|
||||
|
||||
return self.request_output_to_rerank_response(
|
||||
final_res_batch,
|
||||
request_id,
|
||||
@@ -465,22 +445,32 @@ class ServingScores(OpenAIServing):
|
||||
final_res_batch: list[PoolingRequestOutput],
|
||||
request_id: str,
|
||||
model_name: str,
|
||||
documents: list[str] | ScoreMultiModalParam,
|
||||
documents: ScoreInputs,
|
||||
top_n: int,
|
||||
) -> RerankResponse:
|
||||
"""
|
||||
Convert the output of do_rank to a RerankResponse
|
||||
"""
|
||||
|
||||
if not isinstance(documents, list):
|
||||
documents = [documents]
|
||||
|
||||
results: list[RerankResult] = []
|
||||
num_prompt_tokens = 0
|
||||
for idx, final_res in enumerate(final_res_batch):
|
||||
classify_res = ScoringRequestOutput.from_base(final_res)
|
||||
|
||||
document = documents[idx]
|
||||
if isinstance(document, str):
|
||||
rerank_document = RerankDocument(text=document)
|
||||
else:
|
||||
rerank_document = RerankDocument(
|
||||
multi_modal=document.get("content", [])
|
||||
)
|
||||
|
||||
result = RerankResult(
|
||||
index=idx,
|
||||
document=RerankDocument(text=documents[idx])
|
||||
if isinstance(documents, list)
|
||||
else RerankDocument(multi_modal=documents["content"][idx]),
|
||||
document=rerank_document,
|
||||
relevance_score=classify_res.outputs.score,
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, TypeAlias, cast
|
||||
|
||||
from torch.nn import CosineSimilarity
|
||||
@@ -10,12 +11,13 @@ from vllm.entrypoints.chat_utils import (
|
||||
BaseMultiModalItemTracker,
|
||||
ChatCompletionContentPartImageEmbedsParam,
|
||||
ChatCompletionContentPartImageParam,
|
||||
ChatCompletionContentPartParam,
|
||||
ChatCompletionContentPartTextParam,
|
||||
ChatCompletionContentPartVideoParam,
|
||||
ChatTemplateResolutionError,
|
||||
ConversationMessage,
|
||||
MultiModalItemTracker,
|
||||
_ContentPart,
|
||||
_parse_chat_message_content_part,
|
||||
_parse_chat_message_content_parts,
|
||||
)
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.model_executor.models.interfaces import supports_score_template
|
||||
@@ -46,6 +48,13 @@ class ScoreMultiModalParam(TypedDict, total=False):
|
||||
"""The multimodal contents"""
|
||||
|
||||
|
||||
# Raw input data with content key in ScoreMultiModalParam.
|
||||
ScoreInput = str | ScoreMultiModalParam
|
||||
ScoreInputs = ScoreInput | list[ScoreInput]
|
||||
# Score data without content key.
|
||||
ScoreData = str | list[ScoreContentPartParam]
|
||||
|
||||
|
||||
def _cosine_similarity(
|
||||
tokenizer: TokenizerLike,
|
||||
embed_1: list[PoolingRequestOutput],
|
||||
@@ -77,8 +86,8 @@ def _cosine_similarity(
|
||||
|
||||
|
||||
def _validate_score_input_lens(
|
||||
data_1: list[str] | list[ScoreContentPartParam],
|
||||
data_2: list[str] | list[ScoreContentPartParam],
|
||||
data_1: list[ScoreData],
|
||||
data_2: list[ScoreData],
|
||||
):
|
||||
len_1 = len(data_1)
|
||||
len_2 = len(data_2)
|
||||
@@ -91,19 +100,56 @@ def _validate_score_input_lens(
|
||||
raise ValueError("At least one text_pair element must be given")
|
||||
|
||||
|
||||
def _validate_mm_score_input(
|
||||
data: list[ScoreInput],
|
||||
is_multimodal_model: bool,
|
||||
architecture: str,
|
||||
) -> list[ScoreData]:
|
||||
out: list[ScoreData] = []
|
||||
for d in data:
|
||||
if isinstance(d, str):
|
||||
out.append(d)
|
||||
else:
|
||||
if not is_multimodal_model:
|
||||
raise ValueError(f"MultiModalParam is not supported for {architecture}")
|
||||
content = cast(list[ScoreContentPartParam], d.get("content", []))
|
||||
out.append(content)
|
||||
return out
|
||||
|
||||
|
||||
def validate_score_input(
    data_1: ScoreInputs,
    data_2: ScoreInputs,
    is_multimodal_model: bool,
    architecture: str,
) -> tuple[list[ScoreData], list[ScoreData]]:
    """Normalize both sides of a score request and validate them.

    Each side may be a single input or a batch of inputs. A single input is
    wrapped in a one-element list; both sides then go through
    ``_validate_mm_score_input`` and their lengths are checked with
    ``_validate_score_input_lens``.

    Args:
        data_1: First side of the pairs (single input or batch).
        data_2: Second side of the pairs (single input or batch).
        is_multimodal_model: Forwarded to ``_validate_mm_score_input``.
        architecture: Forwarded to ``_validate_mm_score_input``.

    Returns:
        ``(score_input_1, score_input_2)`` as lists, one entry per input.

    Raises:
        ValueError: Propagated from the two validation helpers.
    """

    def _as_batch(data):
        # A list is already a batch. A tuple is also treated as a batch so
        # it is not mistaken for one (invalid) input; anything else is a
        # single input.
        if isinstance(data, list):
            return data
        if isinstance(data, tuple):
            return list(data)
        return [data]

    score_input_1 = _validate_mm_score_input(
        _as_batch(data_1), is_multimodal_model, architecture
    )
    score_input_2 = _validate_mm_score_input(
        _as_batch(data_2), is_multimodal_model, architecture
    )
    _validate_score_input_lens(score_input_1, score_input_2)
    return score_input_1, score_input_2
|
||||
|
||||
|
||||
def parse_score_data(
|
||||
data_1: str | ScoreContentPartParam,
|
||||
data_2: str | ScoreContentPartParam,
|
||||
data_1: ScoreData,
|
||||
data_2: ScoreData,
|
||||
model_config: ModelConfig,
|
||||
) -> tuple[str, str, MultiModalDataDict | None, MultiModalUUIDDict | None]:
|
||||
mm_tracker = MultiModalItemTracker(model_config)
|
||||
|
||||
content_1 = _parse_score_content(data_1, mm_tracker)
|
||||
content_2 = _parse_score_content(data_2, mm_tracker)
|
||||
content_1 = _parse_score_content("query", data_1, mm_tracker)
|
||||
content_2 = _parse_score_content("document", data_2, mm_tracker)
|
||||
|
||||
def ensure_str(content: _ContentPart | None) -> str:
|
||||
if content is not None and isinstance(content, str):
|
||||
return cast(str, content)
|
||||
def ensure_str(content: list[ConversationMessage]) -> str:
|
||||
assert len(content) == 1
|
||||
prompt = content[0]["content"]
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
return cast(str, prompt)
|
||||
else:
|
||||
raise ValueError(f"Only string content is supported, but got {content}.")
|
||||
|
||||
@@ -115,19 +161,22 @@ def parse_score_data(
|
||||
|
||||
|
||||
def _parse_score_content(
|
||||
data: str | ScoreContentPartParam,
|
||||
role: str,
|
||||
data: ScoreData,
|
||||
mm_tracker: BaseMultiModalItemTracker,
|
||||
) -> _ContentPart | None:
|
||||
) -> list[ConversationMessage]:
|
||||
parts: Iterable[ChatCompletionContentPartParam]
|
||||
if isinstance(data, str):
|
||||
part = ChatCompletionContentPartTextParam(type="text", text=data)
|
||||
parts = [ChatCompletionContentPartTextParam(type="text", text=data)]
|
||||
else:
|
||||
part = data
|
||||
parts = cast(Iterable[ChatCompletionContentPartParam], data)
|
||||
|
||||
mm_parser = mm_tracker.create_parser()
|
||||
|
||||
parse_res = _parse_chat_message_content_part(
|
||||
part,
|
||||
mm_parser,
|
||||
parse_res = _parse_chat_message_content_parts(
|
||||
role=role,
|
||||
parts=parts,
|
||||
mm_tracker=mm_tracker,
|
||||
wrap_dicts=False,
|
||||
interleave_strings=False,
|
||||
)
|
||||
@@ -184,8 +233,8 @@ def get_score_prompt(
|
||||
model_config: ModelConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
tokenization_kwargs: dict[str, Any],
|
||||
data_1: str | ScoreContentPartParam,
|
||||
data_2: str | ScoreContentPartParam,
|
||||
data_1: ScoreData,
|
||||
data_2: ScoreData,
|
||||
score_template: str | None = None,
|
||||
) -> tuple[str, TokensPrompt]:
|
||||
prompt_1, prompt_2, mm_data, mm_uuids = parse_score_data(
|
||||
|
||||
Reference in New Issue
Block a user