[Frontend] Using matryoshka_dimensions to control the allowed output dimensions. (#16970)

This commit is contained in:
wang.yuqi
2025-04-24 22:06:28 +08:00
committed by GitHub
parent b724afe343
commit 67309a1cb5
8 changed files with 172 additions and 76 deletions

View File

@@ -3,73 +3,121 @@
Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`.
"""
from typing import Optional
import openai
import pytest
from vllm.entrypoints.openai.protocol import EmbeddingResponse
from ...models.embedding.utils import EmbedModelInfo
from ...conftest import HfRunner
from ...models.embedding.utils import EmbedModelInfo, correctness_test
from ...utils import RemoteOpenAIServer
MODELS = [
EmbedModelInfo(name="BAAI/bge-m3", is_matryoshka=False),
EmbedModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True),
EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
matryoshka_dimensions=[256]),
]
input_texts = [
"The chef prepared a delicious meal.",
] * 3
]
@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
async def test_validating_dimensions(model: EmbedModelInfo):
@pytest.fixture(scope="module", params=MODELS)
def model_info(request):
    """Module-scoped fixture parametrized over the entries of ``MODELS``.

    Returns the parameter pytest selected for the current run
    (``request.param``), i.e. one element of ``MODELS``.
    """
    selected = request.param
    return selected
@pytest.fixture(scope="module", params=["bfloat16"])
def dtype(request):
    """Module-scoped fixture parametrized over the dtype strings to test.

    Returns the parameter pytest selected for the current run
    (``request.param``); the only configured value is ``"bfloat16"``.
    """
    selected = request.param
    return selected
@pytest.fixture(scope="module")
def server(model_info, dtype: str):
args = [
"--task",
"embed",
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
dtype,
"--enforce-eager",
"--max-model-len",
"512",
"--trust_remote_code"
"512"
]
with RemoteOpenAIServer(model.name, args) as remote_server:
client = remote_server.get_async_client()
async def make_request(dimensions):
embedding_response = await client.embeddings.create(
model=model.name,
input=input_texts,
dimensions=dimensions,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5":
# Manually enable Matryoshka Embeddings
args.extend([
"--trust_remote_code", "--hf_overrides",
'{"matryoshka_dimensions":[256]}'
])
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) > 0
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens > 0
assert embeddings.usage.total_tokens > 0
with RemoteOpenAIServer(model_info.name, args) as remote_server:
yield remote_server
if dimensions is not None:
assert len(embeddings.data[0].embedding) == dimensions
if model.is_matryoshka:
for dimensions in [None, 16]:
await make_request(dimensions)
@pytest.fixture(scope="module")
def hf_model(hf_runner, model_info, dtype: str):
    """Module-scoped fixture yielding an entered ``hf_runner`` context.

    The runner is built from ``model_info.name`` with the requested
    ``dtype`` and ``is_sentence_transformer=True``. It is yielded from
    inside the ``with`` block, so the context manager's exit runs when
    the fixture is torn down.
    """
    runner_ctx = hf_runner(
        model_info.name,
        dtype=dtype,
        is_sentence_transformer=True,
    )
    with runner_ctx as runner:
        yield runner
@pytest.mark.asyncio
async def test_matryoshka(model_info: EmbedModelInfo,
server: RemoteOpenAIServer, hf_model: HfRunner):
client = server.get_async_client()
async def make_request_and_correctness_test(dimensions):
prompts = input_texts * 3
embedding_response = await client.embeddings.create(
model=model_info.name,
input=prompts,
dimensions=dimensions,
encoding_format="float",
)
embeddings = EmbeddingResponse.model_validate(
embedding_response.model_dump(mode="json"))
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) > 0
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens > 0
assert embeddings.usage.total_tokens > 0
if dimensions is not None:
assert len(embeddings.data[0].embedding) == dimensions
vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, prompts, vllm_outputs, dimensions)
if model_info.is_matryoshka:
valid_dimensions: list[Optional[int]] = [None]
if model_info.matryoshka_dimensions is not None:
valid_dimensions += model_info.matryoshka_dimensions[:2]
for dimensions in valid_dimensions:
await make_request_and_correctness_test(dimensions)
invalid_dimensions: list[Optional[int]] = [-1]
if model_info.matryoshka_dimensions is not None:
assert 5 not in model_info.matryoshka_dimensions
invalid_dimensions.append(5)
for dimensions in invalid_dimensions:
with pytest.raises(openai.BadRequestError):
for dimensions in [-1]:
await make_request(dimensions)
await make_request_and_correctness_test(dimensions)
else:
for dimensions in [None]:
await make_request(dimensions)
else:
for dimensions in [None]:
await make_request_and_correctness_test(dimensions)
for dimensions in [-1, 16]:
with pytest.raises(openai.BadRequestError):
for dimensions in [-1, 16]:
await make_request(dimensions)
await make_request_and_correctness_test(dimensions)