Files
vllm/tests/entrypoints/pooling/embed/test_cohere_online_vision.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

136 lines
3.8 KiB
Python
Raw Normal View History

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the Cohere /v2/embed API with a multimodal model (SigLIP).
Validates image embedding, batching, normalisation, and embedding type
conversions through the /v2/embed endpoint.
"""
import struct
import zlib
import numpy as np
import pybase64 as base64
import pytest
import requests
from tests.utils import RemoteOpenAIServer
# Model launched by the `server` fixture; also sent as the "model" field of
# every /v2/embed request body built by `_cohere_embed`.
MODEL_NAME = "google/siglip-so400m-patch14-384"
# Value passed to the server's --dtype flag by the `server` fixture.
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
    """Launch one server for the whole module and yield its handle.

    The server runs ``MODEL_NAME`` with the pooling runner; it is torn down
    when the ``RemoteOpenAIServer`` context manager exits after the last test
    in the module has finished.
    """
    launch_args = ["--runner", "pooling"]
    launch_args += ["--dtype", DTYPE]
    launch_args += ["--enforce-eager"]
    # Small context window and GPU memory fraction, presumably to keep the
    # test server lightweight.
    launch_args += ["--max-model-len", "64"]
    launch_args += ["--gpu-memory-utilization", "0.3"]

    with RemoteOpenAIServer(MODEL_NAME, launch_args) as running_server:
        yield running_server
def _make_tiny_png(r: int, g: int, b: int, w: int = 2, h: int = 2) -> str:
    """Build a solid-colour PNG in memory and return it as a data URI.

    Every pixel of the ``w`` x ``h`` image is ``(r, g, b)``. The file consists
    of the PNG signature followed by IHDR, IDAT and IEND chunks, and the
    result is ``"data:image/png;base64,"`` plus the base64 of those bytes.
    """

    def _png_chunk(tag: bytes, payload: bytes) -> bytes:
        # Chunk layout: big-endian payload length, 4-byte tag, payload, then
        # a CRC-32 computed over tag + payload.
        checked = tag + payload
        length_field = struct.pack(">I", len(payload))
        crc_field = struct.pack(">I", zlib.crc32(checked) & 0xFFFFFFFF)
        return length_field + checked + crc_field

    # Each scanline is a zero filter byte followed by ``w`` RGB triples.
    scanlines = b"".join(b"\x00" + bytes([r, g, b]) * w for _ in range(h))

    # IHDR: width, height, bit depth 8, colour type 2 (truecolour), then
    # compression, filter and interlace methods all 0.
    header = struct.pack(">IIBBBBB", w, h, 8, 2, 0, 0, 0)

    chunks = (
        _png_chunk(b"IHDR", header),
        _png_chunk(b"IDAT", zlib.compress(scanlines)),
        _png_chunk(b"IEND", b""),
    )
    encoded = base64.b64encode(b"\x89PNG\r\n\x1a\n" + b"".join(chunks))
    return "data:image/png;base64," + encoded.decode()
def _cohere_embed(
    server: RemoteOpenAIServer,
    texts: list[str] | None = None,
    images: list[str] | None = None,
    embedding_types: list[str] | None = None,
    *,
    timeout: float = 120.0,
) -> dict:
    """POST an embedding request to ``/v2/embed`` and return the parsed JSON.

    Args:
        server: Running test server; its ``url_for`` builds the endpoint URL.
        texts: Text inputs; the ``"texts"`` field is omitted when ``None``.
        images: Image inputs (the tests pass base64 data URIs); the
            ``"images"`` field is omitted when ``None``.
        embedding_types: Requested output encodings; the
            ``"embedding_types"`` field is omitted when ``None``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled server fails the test instead of hanging the run.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    optional_fields = {
        "texts": texts,
        "images": images,
        "embedding_types": embedding_types,
    }
    body: dict = {"model": MODEL_NAME}
    # Only send the fields the caller actually supplied.
    body.update(
        {name: value for name, value in optional_fields.items() if value is not None}
    )

    resp = requests.post(server.url_for("/v2/embed"), json=body, timeout=timeout)
    resp.raise_for_status()
    return resp.json()
def test_image_embed(server: RemoteOpenAIServer):
    """A single image yields one non-empty float embedding, billed as image tokens."""
    response = _cohere_embed(
        server,
        images=[_make_tiny_png(255, 0, 0)],
        embedding_types=["float"],
    )

    assert "embeddings" in response
    float_embeddings = response["embeddings"]["float"]
    assert len(float_embeddings) == 1
    assert len(float_embeddings[0]) > 0

    # An image-only request should bill image tokens and no text input tokens.
    billed = response["meta"]["billed_units"]
    assert billed["image_tokens"] > 0
    assert billed["input_tokens"] == 0
def test_image_batch(server: RemoteOpenAIServer):
    """Two images in one request produce two float embeddings."""
    batch = [
        _make_tiny_png(255, 0, 0),  # red
        _make_tiny_png(0, 0, 255),  # blue
    ]
    response = _cohere_embed(server, images=batch, embedding_types=["float"])

    float_embeddings = response["embeddings"]["float"]
    assert len(float_embeddings) == 2
def test_image_l2_normalized(server: RemoteOpenAIServer):
    """The float embedding of an image has an L2 norm within 0.01 of 1."""
    response = _cohere_embed(
        server,
        images=[_make_tiny_png(0, 255, 0)],
        embedding_types=["float"],
    )

    vector = np.array(response["embeddings"]["float"][0])
    deviation = abs(float(np.linalg.norm(vector)) - 1.0)
    assert deviation < 0.01
def test_image_embedding_types(server: RemoteOpenAIServer):
    """Binary and ubinary embeddings are one eighth the float dimension."""
    response = _cohere_embed(
        server,
        images=[_make_tiny_png(128, 128, 128)],
        embedding_types=["float", "binary", "ubinary"],
    )

    embeddings = response["embeddings"]
    # Presumably eight dimensions are packed per element; the check itself is
    # simply float_dim // 8.
    packed_len = len(embeddings["float"][0]) // 8
    for packed_type in ("binary", "ubinary"):
        assert len(embeddings[packed_type][0]) == packed_len
def test_text_embed_on_multimodal(server: RemoteOpenAIServer):
    """SigLIP also supports text-only embedding via /v2/embed."""
    response = _cohere_embed(
        server,
        texts=["hello world"],
        embedding_types=["float"],
    )

    assert "embeddings" in response
    float_embeddings = response["embeddings"]["float"]
    assert len(float_embeddings) == 1
    assert len(float_embeddings[0]) > 0