# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Parity test between Cohere /v2/embed and OpenAI /v1/embeddings.

Verifies that both endpoints produce identical float embeddings when no
prompt prefix is applied (input_type omitted for Cohere /v2/embed).
"""

import numpy as np
import pytest
import requests

from tests.utils import ROCM_EXTRA_ARGS, RemoteOpenAIServer

MODEL_NAME = "BAAI/bge-base-en-v1.5"
DTYPE = "bfloat16"

# `requests` has no default timeout: without one, a wedged server would hang
# the whole suite forever. Generous bound so slow CI machines still pass.
REQUEST_TIMEOUT_S = 120.0


@pytest.fixture(scope="module")
def server():
    """Launch one pooling-runner vLLM server for all tests in this module."""
    args = [
        "--runner",
        "pooling",
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--max-model-len",
        "512",
        "--gpu-memory-utilization",
        "0.02",
    ] + ROCM_EXTRA_ARGS

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def _cohere_embed(
    server: RemoteOpenAIServer,
    texts: list[str],
) -> list[list[float]]:
    """Embed *texts* via the Cohere-compatible /v2/embed endpoint.

    Returns one float embedding per input text, in input order. Raises
    ``requests.HTTPError`` on a non-2xx response.
    """
    body = {
        "model": MODEL_NAME,
        "texts": texts,
        "embedding_types": ["float"],
    }
    resp = requests.post(
        server.url_for("/v2/embed"), json=body, timeout=REQUEST_TIMEOUT_S
    )
    resp.raise_for_status()
    return resp.json()["embeddings"]["float"]


def _openai_embed(
    server: RemoteOpenAIServer,
    texts: list[str],
) -> list[list[float]]:
    """Embed *texts* via the OpenAI-compatible /v1/embeddings endpoint.

    Returns one float embedding per input text, in input order. Raises
    ``requests.HTTPError`` on a non-2xx response.
    """
    body = {"model": MODEL_NAME, "input": texts, "encoding_format": "float"}
    resp = requests.post(
        server.url_for("/v1/embeddings"), json=body, timeout=REQUEST_TIMEOUT_S
    )
    resp.raise_for_status()
    return [item["embedding"] for item in resp.json()["data"]]


def _cosine_sim(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity between vectors *a* and *b*."""
    va, vb = np.array(a), np.array(b)
    return float(np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb)))


def test_single_text_parity(server: RemoteOpenAIServer):
    """A single text should produce equivalent embeddings via both APIs."""
    texts = ["the quick brown fox jumps over the lazy dog"]

    v2 = _cohere_embed(server, texts)
    v1 = _openai_embed(server, texts)

    # Full-suite BF16 runs can introduce tiny numerical drift even when both
    # endpoints are functionally equivalent, so compare semantic equivalence
    # instead of exact elementwise equality.
    cos = _cosine_sim(v2[0], v1[0])
    assert cos > 0.9999, f"single-text parity failed, cosine={cos}"


def test_batch_parity(server: RemoteOpenAIServer):
    """A batch of texts should produce equivalent embeddings via both APIs,
    in the same order."""
    texts = [
        "machine learning",
        "deep learning",
        "natural language processing",
    ]

    v2 = _cohere_embed(server, texts)
    v1 = _openai_embed(server, texts)

    assert len(v2) == len(v1) == 3

    # Cross-similarity matrix: row i = v2 embedding i vs every v1 embedding.
    similarities = np.array(
        [[_cosine_sim(v2_emb, v1_emb) for v1_emb in v1] for v2_emb in v2]
    )

    for i in range(3):
        # The diagonal must dominate its row (order preserved) and be ~1.0
        # (the two endpoints agree on the embedding itself).
        assert int(np.argmax(similarities[i])) == i, (
            f"batch parity order mismatch at index {i}: "
            f"similarities={similarities[i].tolist()}"
        )
        assert similarities[i, i] > 0.9999, (
            f"batch parity failed at index {i}, cosine={similarities[i, i]}"
        )


def test_token_count_parity(server: RemoteOpenAIServer):
    """Both APIs should report the same prompt token count."""
    texts = ["hello world"]

    # Raw responses are needed here (not the helpers) because the token
    # counts live in endpoint-specific metadata, not in the embeddings.
    v2_resp = requests.post(
        server.url_for("/v2/embed"),
        json={
            "model": MODEL_NAME,
            "texts": texts,
            "embedding_types": ["float"],
        },
        timeout=REQUEST_TIMEOUT_S,
    )
    v1_resp = requests.post(
        server.url_for("/v1/embeddings"),
        json={"model": MODEL_NAME, "input": texts, "encoding_format": "float"},
        timeout=REQUEST_TIMEOUT_S,
    )
    v2_resp.raise_for_status()
    v1_resp.raise_for_status()

    v2_tokens = v2_resp.json()["meta"]["billed_units"]["input_tokens"]
    v1_tokens = v1_resp.json()["usage"]["prompt_tokens"]
    assert v2_tokens == v1_tokens