Add support for Prithvi in Online serving mode (#21518)

Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-25 15:01:27 +01:00
parent 136d750f5f
commit e189b50f53
5 changed files with 128 additions and 10 deletions
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import io
+
+import numpy as np
+import pytest
+import requests
+import torch
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+DTYPE = "float16"
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task",
+        "embed",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--skip-tokenizer-init",
+        "--max-num-seqs",
+        "32"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(server: RemoteOpenAIServer, model_name: str):
+
+    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
+    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
+
+    buffer_tiff = io.BytesIO()
+    torch.save(pixel_values, buffer_tiff)
+    buffer_tiff.seek(0)
+    binary_data = buffer_tiff.read()
+    base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8')
+
+    buffer_coord = io.BytesIO()
+    torch.save(location_coords, buffer_coord)
+    buffer_coord.seek(0)
+    binary_data = buffer_coord.read()
+    base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')
+
+    prompt = {
+        "model":
+        model_name,
+        "additional_data": {
+            "prompt_token_ids": [1]
+        },
+        "encoding_format":
+        "base64",
+        "messages": [{
+            "role":
+            "user",
+            "content": [{
+                "type": "image_embeds",
+                "image_embeds": {
+                    "pixel_values": base64_tensor_embedding,
+                    "location_coords": base64_coord_embedding,
+                },
+            }],
+        }]
+    }
+
+    # test single pooling
+    response = requests.post(server.url_for("pooling"), json=prompt)
+    response.raise_for_status()
+
+    output = response.json()["data"][0]['data']
+
+    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+
+    assert len(np_response) == 524288