diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 4656ee43e..2b25dc766 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se ??? code ```python + from vllm.utils.serial_utils import tensor2base64 + image_embedding = torch.load(...) grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct - buffer = io.BytesIO() - torch.save(image_embedding, buffer) - buffer.seek(0) - binary_data = buffer.read() - base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') + base64_image_embedding = tensor2base64(image_embedding) client = OpenAI( # defaults to os.environ.get("OPENAI_API_KEY") diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py index 0bbe4b8f5..889be6820 100644 --- a/examples/online_serving/prompt_embed_inference_with_openai_client.py +++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py @@ -28,13 +28,11 @@ Dependencies: - openai """ -import base64 -import io - -import torch import transformers from openai import OpenAI +from vllm.utils.serial_utils import tensor2base64 + def main(): client = OpenAI( @@ -58,11 +56,7 @@ def main(): prompt_embeds = embedding_layer(token_ids).squeeze(0) # Prompt embeddings - buffer = io.BytesIO() - torch.save(prompt_embeds, buffer) - buffer.seek(0) - binary_data = buffer.read() - encoded_embeds = base64.b64encode(binary_data).decode("utf-8") + encoded_embeds = tensor2base64(prompt_embeds) completion = client.completions.create( model=model_name, diff --git a/tests/entrypoints/openai/test_vision_embeds.py b/tests/entrypoints/openai/test_vision_embeds.py index a6593c5b0..42d9fe484 100644 --- a/tests/entrypoints/openai/test_vision_embeds.py +++ b/tests/entrypoints/openai/test_vision_embeds.py @@ -2,64 +2,47 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 -import io import numpy as np import pytest import requests import torch +from vllm.utils.serial_utils import tensor2base64 + from ...utils import RemoteOpenAIServer -MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11" -DTYPE = "float16" - -def _terratorch_dummy_inputs(model_name: str): +def _terratorch_dummy_messages(): pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16) location_coords = torch.full((1, 2), 1.0, dtype=torch.float16) - buffer_tiff = io.BytesIO() - torch.save(pixel_values, buffer_tiff) - buffer_tiff.seek(0) - binary_data = buffer_tiff.read() - base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8") - - buffer_coord = io.BytesIO() - torch.save(location_coords, buffer_coord) - buffer_coord.seek(0) - binary_data = buffer_coord.read() - base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8") - - return { - "model": model_name, - "additional_data": {"prompt_token_ids": [1]}, - "encoding_format": "base64", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "image_embeds", - "image_embeds": { - "pixel_values": base64_tensor_embedding, - "location_coords": base64_coord_embedding, - }, - } - ], - } - ], - } + return [ + { + "role": "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": { + "pixel_values": tensor2base64(pixel_values), + "location_coords": tensor2base64(location_coords), + }, + } + ], + } + ] -@pytest.mark.parametrize("model_name", 
[MODEL_NAME]) -async def test_single_request(model_name: str): +@pytest.mark.parametrize( + "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] +) +def test_single_request(model_name: str): args = [ "--runner", "pooling", # use half precision for speed and memory savings in CI environment "--dtype", - DTYPE, + "float16", "--enforce-eager", "--trust-remote-code", "--max-num-seqs", @@ -70,11 +53,15 @@ async def test_single_request(model_name: str): "--enable-mm-embeds", ] - with RemoteOpenAIServer(MODEL_NAME, args) as server: - prompt = _terratorch_dummy_inputs(model_name) - - # test single pooling - response = requests.post(server.url_for("pooling"), json=prompt) + with RemoteOpenAIServer(model_name, args) as server: + response = requests.post( + server.url_for("pooling"), + json={ + "model": model_name, + "messages": _terratorch_dummy_messages(), + "encoding_format": "base64", + }, + ) response.raise_for_status() output = response.json()["data"][0]["data"] diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 03a0c058e..75be34820 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -29,6 +29,7 @@ from vllm.multimodal.utils import ( encode_video_base64, ) from vllm.tokenizers import MistralTokenizer, get_tokenizer +from vllm.utils.serial_utils import tensor2base64 from ..models.registry import HF_EXAMPLE_MODELS from ..utils import VLLM_PATH @@ -85,11 +86,6 @@ def phi3v_model_config_image_embeds(): ) -@pytest.fixture(scope="module") -def phi3v_tokenizer(): - return get_tokenizer(PHI3V_MODEL_ID) - - @pytest.fixture(scope="function") def qwen2_audio_model_config(): return ModelConfig( @@ -115,11 +111,6 @@ def audio_embeds_model_config(): ) -@pytest.fixture(scope="module") -def qwen2_audio_tokenizer(): - return get_tokenizer(QWEN2AUDIO_MODEL_ID) - - @pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): return ModelConfig( @@ -134,11 +125,6 @@ def qwen25omni_model_config_mm_interleaved(): ) -@pytest.fixture(scope="module") -def qwen25omni_tokenizer(): - return get_tokenizer(QWEN25OMNI_MODEL_ID) - - @pytest.fixture(scope="function") def mistral_model_config(): return ModelConfig( @@ -150,11 +136,6 @@ def mistral_model_config(): ) -@pytest.fixture(scope="module") -def mistral_tokenizer(): - return get_tokenizer(MISTRAL_MODEL_ID) - - @pytest.fixture(scope="module") def image_url(): image = ImageAsset("cherry_blossom") @@ -239,7 +220,6 @@ def _assert_mm_data_inputs( def test_parse_chat_messages_single_image( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -253,7 +233,6 @@ def test_parse_chat_messages_single_image( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -266,7 +245,6 @@ def test_parse_chat_messages_single_image( def test_parse_chat_messages_single_image_with_uuid( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -287,7 +265,6 @@ def test_parse_chat_messages_single_image_with_uuid( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -300,7 +277,6 @@ def test_parse_chat_messages_single_image_with_uuid( def test_parse_chat_messages_single_empty_image_with_uuid( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -319,7 +295,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -332,7 +307,6 @@ 
def test_parse_chat_messages_single_empty_image_with_uuid( def test_parse_chat_messages_single_image_with_bad_uuid_format( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -354,7 +328,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -367,7 +340,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format( def test_parse_chat_messages_multiple_images_with_uuids( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid1 = "my_uuid_1" @@ -397,7 +369,6 @@ def test_parse_chat_messages_multiple_images_with_uuids( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -413,7 +384,6 @@ def test_parse_chat_messages_multiple_images_with_uuids( def test_parse_chat_messages_multiple_empty_images_with_uuids( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid1 = "my_uuid_1" @@ -439,7 +409,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -455,7 +424,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids( def test_parse_chat_messages_mixed_empty_images_with_uuids( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid1 = "my_uuid_1" @@ -483,7 +451,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -500,7 +467,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids( @pytest.mark.asyncio async def test_parse_chat_messages_single_image_with_uuid_async( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -519,7 +485,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -533,7 +498,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async( @pytest.mark.asyncio async def test_parse_chat_messages_empty_image_with_uuid_async( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -552,7 +516,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -566,7 +529,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async( @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_async( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid1 = "my_uuid_1" @@ -592,7 +554,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -609,7 +570,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( @pytest.mark.asyncio async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid1 = "my_uuid_1" @@ -635,7 +595,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -652,7 +611,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid2 = "my_uuid_2" @@ -676,7 +634,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( } ], phi3v_model_config, - 
phi3v_tokenizer, content_format="string", ) @@ -692,7 +649,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( def test_parse_chat_messages_empty_system( mistral_model_config, - mistral_tokenizer, ): # Test string format conversation, _, _ = parse_chat_messages( @@ -704,7 +660,6 @@ def test_parse_chat_messages_empty_system( }, ], mistral_model_config, - mistral_tokenizer, content_format="string", ) assert conversation == [ @@ -722,7 +677,6 @@ def test_parse_chat_messages_empty_system( }, ], mistral_model_config, - mistral_tokenizer, content_format="openai", ) assert conversation == [ @@ -734,7 +688,6 @@ def test_parse_chat_messages_empty_system( @pytest.mark.asyncio async def test_parse_chat_messages_single_image_async( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_future, mm_uuids = parse_chat_messages_futures( @@ -748,7 +701,6 @@ async def test_parse_chat_messages_single_image_async( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -761,7 +713,6 @@ async def test_parse_chat_messages_single_image_async( def test_parse_chat_messages_multiple_images( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -779,7 +730,6 @@ def test_parse_chat_messages_multiple_images( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -795,7 +745,6 @@ def test_parse_chat_messages_multiple_images( def test_parse_chat_messages_empty_pil_image_with_uuid( phi3v_model_config, - phi3v_tokenizer, ): uuid = "abcd" conversation, mm_data, mm_uuids = parse_chat_messages( @@ -809,7 +758,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -825,7 +773,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid( def test_parse_chat_messages_empty_image_embeds_with_uuid( phi3v_model_config_image_embeds, - phi3v_tokenizer, ): uuid = "abcd" conversation, mm_data, mm_uuids = parse_chat_messages( @@ -839,7 +786,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( } ], phi3v_model_config_image_embeds, - phi3v_tokenizer, content_format="string", ) @@ -857,7 +803,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( def test_parse_chat_messages_empty_audio_embeds_with_uuid( audio_embeds_model_config, - qwen2_audio_tokenizer, ): """Test audio_embeds with UUID (no actual embeds data).""" uuid = "test-audio-uuid-123" @@ -873,7 +818,6 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( } ], audio_embeds_model_config, - qwen2_audio_tokenizer, content_format="string", ) @@ -889,11 +833,8 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( def test_parse_chat_messages_audio_embeds_with_string( audio_embeds_model_config, - qwen2_audio_tokenizer, ): """Test audio_embeds with base64 string embedding data.""" - import base64 - import io import torch @@ -901,11 +842,7 @@ def test_parse_chat_messages_audio_embeds_with_string( audio_embedding = torch.randn(1, 128, 768) # Encode it as base64 - buffer = io.BytesIO() - torch.save(audio_embedding, buffer) - buffer.seek(0) - binary_data = buffer.read() - base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8") + base64_audio_embedding = tensor2base64(audio_embedding) conversation, mm_data, mm_uuids = parse_chat_messages( [ @@ -921,7 +858,6 @@ def test_parse_chat_messages_audio_embeds_with_string( } ], audio_embeds_model_config, - qwen2_audio_tokenizer, content_format="string", ) @@ -939,11 +875,8 @@ def 
test_parse_chat_messages_audio_embeds_with_string( @pytest.mark.asyncio async def test_parse_chat_messages_audio_embeds_async( audio_embeds_model_config, - qwen2_audio_tokenizer, ): """Test audio_embeds with async futures.""" - import base64 - import io import torch @@ -951,11 +884,7 @@ async def test_parse_chat_messages_audio_embeds_async( audio_embedding = torch.randn(1, 128, 768) # Encode it as base64 - buffer = io.BytesIO() - torch.save(audio_embedding, buffer) - buffer.seek(0) - binary_data = buffer.read() - base64_audio_embedding = base64.b64encode(binary_data).decode("utf-8") + base64_audio_embedding = tensor2base64(audio_embedding) conversation, mm_future, mm_uuids = parse_chat_messages_futures( [ @@ -971,7 +900,6 @@ async def test_parse_chat_messages_audio_embeds_async( } ], audio_embeds_model_config, - qwen2_audio_tokenizer, content_format="string", ) @@ -990,7 +918,6 @@ async def test_parse_chat_messages_audio_embeds_async( @pytest.mark.asyncio async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( phi3v_model_config_image_embeds, - phi3v_tokenizer, ): uuid = "abcd" conversation, mm_future, mm_uuids = parse_chat_messages_futures( @@ -1004,7 +931,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( } ], phi3v_model_config_image_embeds, - phi3v_tokenizer, content_format="string", ) @@ -1024,7 +950,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_async( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_future, mm_uuids = parse_chat_messages_futures( @@ -1042,7 +967,6 @@ async def test_parse_chat_messages_multiple_images_async( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -1058,7 +982,6 @@ async def test_parse_chat_messages_multiple_images_async( def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -1076,7 +999,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) assert conversation == [ @@ -1091,7 +1013,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt( def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -1110,7 +1031,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -1127,7 +1047,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( def test_parse_chat_messages_multiple_images_across_messages( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -1149,7 +1068,6 @@ def test_parse_chat_messages_multiple_images_across_messages( }, ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -1164,7 +1082,6 @@ def test_parse_chat_messages_multiple_images_across_messages( def test_parse_chat_messages_multiple_images_with_uuids_across_messages( phi3v_model_config, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -1195,7 +1112,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages( }, ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -1210,7 +1126,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages( def 
test_parse_chat_messages_context_text_format( phi3v_model_config, - phi3v_tokenizer, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ @@ -1222,7 +1137,6 @@ def test_parse_chat_messages_context_text_format( {"role": "user", "content": "What about this one?"}, ], phi3v_model_config, - phi3v_tokenizer, content_format="openai", ) @@ -1246,7 +1160,6 @@ def test_parse_chat_messages_context_text_format( def test_parse_chat_messages_rejects_too_many_images_in_one_message( phi3v_model_config, - phi3v_tokenizer, image_url, ): with warnings.catch_warnings(): @@ -1277,14 +1190,12 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) def test_parse_chat_messages_rejects_too_many_images_across_messages( phi3v_model_config, - phi3v_tokenizer, image_url, ): with warnings.catch_warnings(): @@ -1322,14 +1233,12 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( }, ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_model_config, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -1344,7 +1253,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input( } ], phi3v_model_config, - phi3v_tokenizer, content_format="string", ) @@ -1360,7 +1268,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input( def test_parse_chat_messages_multiple_images_interleave( phi3v_model_config_mm_interleaved, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -1380,7 +1287,6 @@ def test_parse_chat_messages_multiple_images_interleave( } ], phi3v_model_config_mm_interleaved, - phi3v_tokenizer, content_format="string", ) @@ -1398,7 +1304,6 @@ def test_parse_chat_messages_multiple_images_interleave( @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_interleave_async( phi3v_model_config_mm_interleaved, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages_futures( @@ -1418,7 +1323,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async( } ], phi3v_model_config_mm_interleaved, - phi3v_tokenizer, content_format="string", ) @@ -1436,7 +1340,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async( @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( phi3v_model_config_mm_interleaved, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -1465,7 +1368,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( } ], phi3v_model_config_mm_interleaved, - phi3v_tokenizer, content_format="string", ) @@ -1482,7 +1384,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( def test_parse_chat_messages_multiple_images_multiple_messages_interleave( phi3v_model_config_mm_interleaved, - phi3v_tokenizer, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( @@ -1505,7 +1406,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( }, ], phi3v_model_config_mm_interleaved, - phi3v_tokenizer, content_format="string", ) @@ -1523,7 +1423,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( phi3v_model_config_mm_interleaved, - phi3v_tokenizer, image_url, ): image_uuid = str(hash(image_url)) @@ -1555,7 +1454,6 @@ def 
test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl }, ], phi3v_model_config_mm_interleaved, - phi3v_tokenizer, content_format="string", ) @@ -1573,7 +1471,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, image_url, video_url, audio_url, @@ -1601,7 +1498,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( }, ], qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, content_format="string", ) @@ -1627,7 +1523,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, image_url, video_url, audio_url, @@ -1671,7 +1566,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl }, ], qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, content_format="string", ) @@ -1699,7 +1593,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, image_url, video_url, audio_url, @@ -1743,7 +1636,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes }, ], qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, content_format="string", ) @@ -1775,7 +1667,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, image_url, video_url, audio_url, @@ -1811,7 +1702,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message }, ], qwen25omni_model_config_mm_interleaved, - qwen25omni_tokenizer, content_format="string", ) @@ -1837,7 +1727,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message def test_parse_chat_messages_multiple_images_interleave_with_placeholders( phi3v_model_config_mm_interleaved, - phi3v_tokenizer, image_url, ): with pytest.raises( @@ -1861,7 +1750,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( } ], phi3v_model_config_mm_interleaved, - phi3v_tokenizer, content_format="string", ) @@ -2237,9 +2125,7 @@ def test_resolve_content_format_examples(template_path, expected_format): assert resolved_format == expected_format -def test_parse_chat_messages_include_thinking_chunk( - mistral_model_config, mistral_tokenizer -): +def test_parse_chat_messages_include_thinking_chunk(mistral_model_config): messages = [ { "role": "system", @@ -2269,7 +2155,6 @@ def test_parse_chat_messages_include_thinking_chunk( conversation_with_thinking, _, _ = parse_chat_messages( messages, mistral_model_config, - mistral_tokenizer, content_format="openai", ) @@ -2353,7 +2238,6 @@ def test_apply_mistral_chat_template_thinking_chunk(): def test_parse_chat_messages_single_empty_audio_with_uuid( qwen2_audio_model_config, - qwen2_audio_tokenizer, ): audio_uuid = "abcd" conversation, mm_data, mm_uuids = parse_chat_messages( @@ -2371,7 +2255,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( } ], qwen2_audio_model_config, - qwen2_audio_tokenizer, 
content_format="string", ) @@ -2389,7 +2272,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( @pytest.mark.asyncio async def test_parse_chat_messages_single_empty_audio_with_uuid_async( qwen2_audio_model_config, - qwen2_audio_tokenizer, ): audio_uuid = "abcd" conversation, mm_future, mm_uuids = parse_chat_messages_futures( @@ -2407,7 +2289,6 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( } ], qwen2_audio_model_config, - qwen2_audio_tokenizer, content_format="string", ) diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py index 276de2ff8..b30556fbc 100644 --- a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py +++ b/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import base64 -import io import json import openai # use the official client for correctness check @@ -13,6 +11,7 @@ from transformers import AutoConfig from tests.conftest import ImageTestAssets from tests.utils import RemoteOpenAIServer +from vllm.utils.serial_utils import tensor2base64 # any model with a chat template should work here MODEL_NAME = "llava-hf/llava-1.5-7b-hf" @@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds): yield async_client -def encode_image_embedding_to_base64(image_embedding) -> str: - """ - Encode image embedding to base64 string - """ - buffer = io.BytesIO() - torch.save(image_embedding, buffer) - buffer.seek(0) - binary_data = buffer.read() - base64_image_embedding = base64.b64encode(binary_data).decode("utf-8") - return base64_image_embedding - - @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("dtype", [torch.half, torch.float16, torch.float32]) @@ -73,7 +60,7 @@ async def test_completions_with_image_embeds( ): # Test case: Single image embeds input image_embeds = image_assets[0].image_embeds.to(dtype=dtype) - base64_image_embedding = encode_image_embedding_to_base64(image_embeds) + base64_image_embedding = tensor2base64(image_embeds) chat_completion = await client_with_image_embeds.chat.completions.create( messages=[ {"role": "system", "content": "You are a helpful assistant."}, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1b3a7d266..077fe681b 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -536,7 +536,7 @@ def resolve_hf_chat_template( def _resolve_chat_template_content_format( chat_template: str | None, tools: list[dict[str, Any]] | None, - tokenizer: TokenizerLike, + tokenizer: TokenizerLike | None, *, model_config: ModelConfig, ) -> _ChatTemplateContentFormat: @@ -593,7 +593,7 @@ def resolve_chat_template_content_format( chat_template: str | None, tools: list[dict[str, Any]] | None, given_format: ChatTemplateContentFormatOption, - tokenizer: TokenizerLike, + tokenizer: TokenizerLike | None, *, model_config: ModelConfig, ) -> _ChatTemplateContentFormat: @@ -627,11 +627,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): maximum per prompt. 
""" - def __init__(self, model_config: ModelConfig, tokenizer: TokenizerLike): + def __init__(self, model_config: ModelConfig): super().__init__() self._model_config = model_config - self._tokenizer = tokenizer self._items_by_modality = defaultdict[str, list[_T | None]](list) self._uuids_by_modality = defaultdict[str, list[str | None]](list) @@ -1612,7 +1611,6 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: def parse_chat_messages( messages: list[ChatCompletionMessageParam], model_config: ModelConfig, - tokenizer: TokenizerLike, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], @@ -1620,7 +1618,7 @@ def parse_chat_messages( MultiModalUUIDDict | None, ]: conversation: list[ConversationMessage] = [] - mm_tracker = MultiModalItemTracker(model_config, tokenizer) + mm_tracker = MultiModalItemTracker(model_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1644,7 +1642,6 @@ def parse_chat_messages( def parse_chat_messages_futures( messages: list[ChatCompletionMessageParam], model_config: ModelConfig, - tokenizer: TokenizerLike, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], @@ -1652,7 +1649,7 @@ def parse_chat_messages_futures( MultiModalUUIDDict | None, ]: conversation: list[ConversationMessage] = [] - mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) + mm_tracker = AsyncMultiModalItemTracker(model_config) for msg in messages: sub_messages = _parse_chat_message_content( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index c121fa71f..481a47a97 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -834,7 +834,6 @@ class LLM: conversation, mm_data, mm_uuids = parse_chat_messages( msgs, model_config, - tokenizer, content_format=resolved_content_format, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9642024dd..bfa98f29a 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1088,11 +1088,6 @@ class OpenAIServing: Sequence[RequestPrompt], list[EngineTokensPrompt], ]: - if tokenizer is None: - raise ValueError( - "Unable to get tokenizer because `skip_tokenizer_init=True`" - ) - model_config = self.model_config resolved_content_format = resolve_chat_template_content_format( @@ -1105,7 +1100,6 @@ class OpenAIServing: conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, model_config, - tokenizer, content_format=resolved_content_format, ) diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 8819c85af..072ddd4c9 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -89,12 +89,10 @@ def parse_score_data( data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, model_config: ModelConfig, - tokenizer: TokenizerLike, ) -> tuple[str, str, MultiModalDataDict | None]: - mm_tracker = MultiModalItemTracker(model_config, tokenizer) + mm_tracker = MultiModalItemTracker(model_config) content_1 = _parse_score_content(data_1, mm_tracker) - content_2 = _parse_score_content(data_2, mm_tracker) def ensure_str(content: _ContentPart | None) -> str: @@ -188,7 +186,6 @@ def get_score_prompt( data_1, data_2, model_config, - tokenizer, ) from vllm.model_executor.model_loader import get_model_cls diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index 6537b6df8..5aef09ca9 100644 --- 
a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -62,6 +62,7 @@ from vllm.multimodal.inputs import ( from vllm.multimodal.parse import ( DictEmbeddingItems, ImageSize, + ModalityDataItems, MultiModalDataItems, MultiModalDataParser, ) @@ -570,7 +571,7 @@ class HunYuanVLMultiModalDataParser(MultiModalDataParser): def _parse_image_data( self, data: dict[str, torch.Tensor] | ModalityData[ImageItem], - ): + ) -> ModalityDataItems[Any, Any] | None: if isinstance(data, dict): return DictEmbeddingItems( data, diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 881760155..09acf8372 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1000,7 +1000,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser): def _parse_image_data( self, data: dict[str, torch.Tensor] | ModalityData[ImageItem], - ) -> ModalityDataItems[Any, Any]: + ) -> ModalityDataItems[Any, Any] | None: if isinstance(data, dict): return DictEmbeddingItems( data, @@ -1017,7 +1017,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser): def _parse_video_data( self, data: dict[str, torch.Tensor] | ModalityData[VideoItem], - ) -> ModalityDataItems[Any, Any]: + ) -> ModalityDataItems[Any, Any] | None: if isinstance(data, dict): return DictEmbeddingItems( data, diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py index 124e9c2af..2b04e3bd4 100644 --- a/vllm/model_executor/models/keye_vl1_5.py +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -333,7 +333,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser): def _parse_image_data( self, data: dict[str, torch.Tensor] | ModalityData[ImageItem], - ) -> ModalityDataItems[Any, Any]: + ) -> ModalityDataItems[Any, Any] | None: if isinstance(data, dict): return DictEmbeddingItems( data, @@ -350,7 +350,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser): def _parse_video_data( self, data: dict[str, torch.Tensor] | ModalityData[VideoItem], - ) -> ModalityDataItems[Any, Any]: + ) -> ModalityDataItems[Any, Any] | None: if isinstance(data, dict): return DictEmbeddingItems( data, diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index b93a42ffd..062547401 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -11,6 +11,7 @@ import pybase64 import torch from vllm.utils.import_utils import PlaceholderModule +from vllm.utils.serial_utils import tensor2base64 from .base import MediaIO @@ -135,8 +136,4 @@ class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]): return torch.load(filepath, weights_only=True) def encode_base64(self, media: torch.Tensor) -> str: - buffer = BytesIO() - torch.save(media, buffer) - buffer.seek(0) - binary_data = buffer.read() - return pybase64.b64encode(binary_data).decode("utf-8") + return tensor2base64(media) diff --git a/vllm/utils/serial_utils.py b/vllm/utils/serial_utils.py index b89fa6ce4..a6d717e03 100644 --- a/vllm/utils/serial_utils.py +++ b/vllm/utils/serial_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 +import io import sys from dataclasses import dataclass from typing import Literal @@ -52,6 +53,15 @@ Endianness = Literal["native", "big", "little"] EncodingFormat = Literal["float", "base64", "bytes"] +def tensor2base64(x: torch.Tensor) -> str: + with io.BytesIO() as buf: + torch.save(x, buf) + buf.seek(0) + binary_data = buf.read() + + return 
base64.b64encode(binary_data).decode("utf-8") + + def tensor2binary( tensor: torch.Tensor, embed_dtype: EmbedDType, endianness: Endianness ) -> bytes:
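
Note on the new helper: the sketch below is a round-trip check for the `tensor2base64` utility added in the final hunk above. The decoder shown here, `base64_to_tensor`, is a hypothetical counterpart written for illustration only — this patch adds just the encoder, and the serving side reconstructs tensors with `torch.load` (as in `AudioEmbeddingMediaIO.load_file`).

```python
import base64
import io

import torch

from vllm.utils.serial_utils import tensor2base64


def base64_to_tensor(data: str) -> torch.Tensor:
    # Hypothetical inverse of tensor2base64, for illustration only:
    # decode the base64 payload, then torch.load from an in-memory buffer.
    return torch.load(io.BytesIO(base64.b64decode(data)), weights_only=True)


# Round trip: encode an embedding as it would be placed in a JSON request
# body, then recover the original tensor.
embedding = torch.randn(1, 128, 768)
encoded = tensor2base64(embedding)  # plain str, safe to embed in JSON
assert torch.equal(base64_to_tensor(encoded), embedding)
```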