[Frontend] Use matryoshka_dimensions to control the allowed output dimensions. (#16970)
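For context, the knob this change validates is the standard "dimensions" field of the OpenAI embeddings API: when a model has matryoshka_dimensions configured, only the listed output sizes are allowed. A minimal client-side sketch follows; the model name and the value 256 are placeholders, and whether a given value is accepted depends entirely on that model's configured matryoshka_dimensions.

import openai

# Sketch: request a reduced-dimension embedding from a vLLM OpenAI-compatible
# server. The model name and dimensions=256 are placeholders; the request is
# only expected to succeed if 256 is among the model's matryoshka_dimensions.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.embeddings.create(
    model="my-matryoshka-embedding-model",  # placeholder served model name
    input=["The chef prepared a delicious meal."],
    dimensions=256,  # must be an allowed output dimension for this model
)
print(len(resp.data[0].embedding))  # 256 when the dimension is allowed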
@@ -11,11 +11,12 @@ import requests
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...models.embedding.utils import check_embeddings_close
+from ...models.embedding.utils import correctness_test
 from ...utils import RemoteOpenAIServer
 
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
+DTYPE = "bfloat16"
 
 
 @pytest.fixture(scope="module")
@@ -25,7 +26,7 @@ def server():
         "embed",
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "bfloat16",
+        DTYPE,
         "--enforce-eager",
         "--max-model-len",
         "512",
@@ -43,9 +44,17 @@ async def client(server):
         yield async_client
 
 
+@pytest.fixture(scope="module")
+def hf_model(hf_runner):
+    with hf_runner(MODEL_NAME, dtype=DTYPE,
+                   is_sentence_transformer=True) as hf_model:
+        yield hf_model
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
+async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
+                                model_name: str):
     input_texts = [
         "The chef prepared a delicious meal.",
     ]
@@ -66,6 +75,9 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
     assert embeddings.usage.prompt_tokens == 11
     assert embeddings.usage.total_tokens == 11
 
+    vllm_outputs = [d.embedding for d in embeddings.data]
+    correctness_test(hf_model, input_texts, vllm_outputs)
+
     # test using token IDs
     input_tokens = [1, 1, 1, 1, 1]
     embedding_response = await client.embeddings.create(
@@ -86,7 +98,8 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
+async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
+                               model_name: str):
     # test list[str]
     input_texts = [
         "The cat sat on the mat.", "A feline was resting on a rug.",
@@ -107,6 +120,9 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
     assert embeddings.usage.prompt_tokens == 33
     assert embeddings.usage.total_tokens == 33
 
+    vllm_outputs = [d.embedding for d in embeddings.data]
+    correctness_test(hf_model, input_texts, vllm_outputs)
+
     # test list[list[int]]
     input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                     [25, 32, 64, 77]]
@@ -181,7 +197,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
+async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
                                       model_name: str):
     input_texts = [
         "Hello my name is",
@@ -192,6 +208,7 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
                                                      model=model_name,
                                                      encoding_format="float")
     float_data = [d.embedding for d in responses_float.data]
+    correctness_test(hf_model, input_texts, float_data)
 
     responses_base64 = await client.embeddings.create(input=input_texts,
                                                       model=model_name,
@@ -202,24 +219,13 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
             np.frombuffer(base64.b64decode(data.embedding),
                           dtype="float32").tolist())
 
-    check_embeddings_close(
-        embeddings_0_lst=float_data,
-        embeddings_1_lst=base64_data,
-        name_0="float",
-        name_1="base64",
-    )
+    correctness_test(hf_model, input_texts, base64_data)
 
     # Default response is float32 decoded from base64 by OpenAI Client
     responses_default = await client.embeddings.create(input=input_texts,
                                                         model=model_name)
     default_data = [d.embedding for d in responses_default.data]
-
-    check_embeddings_close(
-        embeddings_0_lst=float_data,
-        embeddings_1_lst=default_data,
-        name_0="float",
-        name_1="default",
-    )
+    correctness_test(hf_model, input_texts, default_data)
 
 
 @pytest.mark.asyncio
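The new hf_model fixture and the correctness_test calls compare the embeddings returned by the server against the same model run directly through sentence-transformers (hf_runner(..., is_sentence_transformer=True)). The actual helper lives in ...models.embedding.utils; purely as an illustration of that kind of check, a hypothetical stand-alone equivalent (cosine-similarity threshold assumed) could look like:

import numpy as np
from sentence_transformers import SentenceTransformer

# Hypothetical stand-in for a check like correctness_test: embeddings served
# by vLLM should closely match the model run directly via sentence-transformers.
def check_against_hf(model_name, input_texts, vllm_outputs, min_cosine=0.99):
    hf_outputs = SentenceTransformer(model_name).encode(input_texts)
    for hf_vec, vllm_vec in zip(hf_outputs, vllm_outputs):
        hf_vec = np.asarray(hf_vec, dtype=np.float32)
        vllm_vec = np.asarray(vllm_vec, dtype=np.float32)
        cos = float(hf_vec @ vllm_vec /
                    (np.linalg.norm(hf_vec) * np.linalg.norm(vllm_vec)))
        assert cos >= min_cosine, f"cosine similarity too low: {cos:.4f}"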