136 lines
3.8 KiB
Python
136 lines
3.8 KiB
Python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""Tests for the Cohere /v2/embed API with a multimodal model (SigLIP).

Validates image embedding, batching, normalisation, and embedding type
conversions through the /v2/embed endpoint.
"""
|
|
|
|
import struct
|
|
import zlib
|
|
|
|
import numpy as np
|
|
import pybase64 as base64
|
|
import pytest
|
|
import requests
|
|
|
|
from tests.utils import RemoteOpenAIServer
|
|
|
|
# Model launched by the ``server`` fixture and named in every request body.
MODEL_NAME = "google/siglip-so400m-patch14-384"
# Value passed to the server's ``--dtype`` flag.
DTYPE = "bfloat16"
|
|
|
|
|
|
@pytest.fixture(scope="module")
def server():
    """Start one ``RemoteOpenAIServer`` for the module and yield it.

    The server is launched for ``MODEL_NAME`` with the command-line flags
    assembled below and stays alive until the ``with`` block exits, i.e.
    after the last test in this module that uses the fixture has finished.
    """
    # Each tuple is one flag together with its value (if it takes one);
    # flattening keeps the exact token order expected on the command line.
    flag_groups = (
        ("--runner", "pooling"),
        ("--dtype", DTYPE),
        ("--enforce-eager",),
        ("--max-model-len", "64"),
        ("--gpu-memory-utilization", "0.3"),
    )
    cli_args = [token for group in flag_groups for token in group]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
|
|
|
|
|
|
def _make_tiny_png(r: int, g: int, b: int, w: int = 2, h: int = 2) -> str:
    """Build a solid-colour PNG in memory and return it as a data URI.

    The image is an 8-bit-per-channel RGB PNG (colour type 2, no
    interlacing) in which every pixel has the colour ``(r, g, b)``.

    Args:
        r: Red channel value, 0-255.
        g: Green channel value, 0-255.
        b: Blue channel value, 0-255.
        w: Image width in pixels.
        h: Image height in pixels.

    Returns:
        A ``data:image/png;base64,...`` string containing the encoded PNG.

    Raises:
        ValueError: If a channel value is outside ``range(256)``.
        struct.error: If ``w`` or ``h`` does not fit an unsigned 32-bit int.
    """
    # One scanline = filter-type byte (0: no filtering) + ``w`` RGB pixels.
    scanline = b"\x00" + bytes((r, g, b)) * w
    # Every row is identical, so repeat the scanline instead of growing a
    # bytes object with ``+=`` inside a loop.
    compressed = zlib.compress(scanline * h)

    def chunk(ctype: bytes, cdata: bytes) -> bytes:
        """Frame ``cdata`` as a PNG chunk: length, type, data, CRC-32."""
        typed_payload = ctype + cdata
        return b"".join(
            (
                struct.pack(">I", len(cdata)),
                typed_payload,
                # The CRC covers the chunk type and data, not the length.
                struct.pack(">I", zlib.crc32(typed_payload) & 0xFFFFFFFF),
            )
        )

    # IHDR: width, height, bit depth 8, colour type 2 (RGB), then
    # compression method, filter method and interlace method all 0.
    ihdr = struct.pack(">IIBBBBB", w, h, 8, 2, 0, 0, 0)
    png = b"".join(
        (
            b"\x89PNG\r\n\x1a\n",  # fixed 8-byte PNG signature
            chunk(b"IHDR", ihdr),
            chunk(b"IDAT", compressed),
            chunk(b"IEND", b""),
        )
    )
    return "data:image/png;base64," + base64.b64encode(png).decode()
|
|
|
|
|
|
def _cohere_embed(
    server: RemoteOpenAIServer,
    texts: list[str] | None = None,
    images: list[str] | None = None,
    embedding_types: list[str] | None = None,
) -> dict:
    """POST an embed request to ``/v2/embed`` and return the decoded JSON.

    The request body always carries ``model``; ``texts``, ``images`` and
    ``embedding_types`` are included only when the corresponding argument
    is not ``None``.

    Args:
        server: Running server whose ``url_for`` resolves the endpoint URL.
        texts: Optional value for the ``texts`` field.
        images: Optional value for the ``images`` field.
        embedding_types: Optional value for the ``embedding_types`` field.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    optional_fields = {
        "texts": texts,
        "images": images,
        "embedding_types": embedding_types,
    }
    payload: dict = {"model": MODEL_NAME}
    # Omit unset fields entirely rather than sending explicit nulls.
    payload.update(
        {key: value for key, value in optional_fields.items() if value is not None}
    )

    response = requests.post(server.url_for("/v2/embed"), json=payload)
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def test_image_embed(server: RemoteOpenAIServer):
    """One image yields one non-empty float vector billed as image tokens."""
    response = _cohere_embed(
        server,
        images=[_make_tiny_png(255, 0, 0)],
        embedding_types=["float"],
    )

    assert "embeddings" in response
    float_vectors = response["embeddings"]["float"]
    assert len(float_vectors) == 1
    assert len(float_vectors[0]) > 0

    # Image-only input should be billed as image tokens, not input tokens.
    billed = response["meta"]["billed_units"]
    assert billed["image_tokens"] > 0
    assert billed["input_tokens"] == 0
|
|
|
|
|
|
def test_image_batch(server: RemoteOpenAIServer):
    """Two images in one request produce two float embeddings."""
    colours = ((255, 0, 0), (0, 0, 255))  # red, blue
    image_uris = [_make_tiny_png(*rgb) for rgb in colours]

    response = _cohere_embed(
        server,
        images=image_uris,
        embedding_types=["float"],
    )

    assert len(response["embeddings"]["float"]) == 2
|
|
|
|
|
|
def test_image_l2_normalized(server: RemoteOpenAIServer):
    """The float embedding of an image has an L2 norm within 0.01 of 1."""
    response = _cohere_embed(
        server,
        images=[_make_tiny_png(0, 255, 0)],
        embedding_types=["float"],
    )

    vector = np.array(response["embeddings"]["float"][0])
    l2_norm = float(np.linalg.norm(vector))
    deviation = abs(l2_norm - 1.0)
    assert deviation < 0.01
|
|
|
|
|
|
def test_image_embedding_types(server: RemoteOpenAIServer):
    """Binary and ubinary embeddings are one eighth the float dimension."""
    response = _cohere_embed(
        server,
        images=[_make_tiny_png(128, 128, 128)],
        embedding_types=["float", "binary", "ubinary"],
    )

    embeddings = response["embeddings"]
    # The packed variants are expected to hold dim // 8 entries each.
    expected_packed_len = len(embeddings["float"][0]) // 8
    for packed_type in ("binary", "ubinary"):
        assert len(embeddings[packed_type][0]) == expected_packed_len
|
|
|
|
|
|
def test_text_embed_on_multimodal(server: RemoteOpenAIServer):
    """SigLIP also supports text-only embedding via /v2/embed."""
    response = _cohere_embed(
        server,
        texts=["hello world"],
        embedding_types=["float"],
    )

    assert "embeddings" in response
    float_vectors = response["embeddings"]["float"]
    # One input text should give exactly one non-empty vector.
    assert len(float_vectors) == 1
    assert len(float_vectors[0]) > 0
|