[Refactor] Relocate entrypoint tests to match serving code structure (#37593)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
This commit is contained in:
223
tests/multimodal/test_embedding_shape_validation.py
Normal file
223
tests/multimodal/test_embedding_shape_validation.py
Normal file
@@ -0,0 +1,223 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Embedding shape validation in multimodal APIs.
|
||||
|
||||
Tests verify that embeddings with correct ndim but incorrect hidden_size
|
||||
are rejected before they can cause crashes during model inference.
|
||||
|
||||
Validation is performed by the parser (MultiModalDataParser) and EmbeddingItems
|
||||
classes, not by MediaIO classes.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.parse import (
|
||||
AudioEmbeddingItems,
|
||||
ImageEmbeddingItems,
|
||||
MultiModalDataParser,
|
||||
VideoEmbeddingItems,
|
||||
)
|
||||
|
||||
|
||||
class TestMultiModalParserShapeValidation:
    """Hidden-size checks exercised through MultiModalDataParser.parse_mm_data."""

    def test_image_embeddings_correct_hidden_size_accepted(self):
        """A batched image tensor with the expected last dim parses to 2 items."""
        hidden = 768
        mm_parser = MultiModalDataParser(expected_hidden_size=hidden)

        matching = torch.randn(2, 100, hidden)
        parsed = mm_parser.parse_mm_data({"image": matching})

        assert "image" in parsed
        image_items = parsed["image"]
        assert isinstance(image_items, ImageEmbeddingItems)
        assert image_items.get_count() == 2

    def test_image_embeddings_wrong_hidden_size_rejected(self):
        """Image tensor with last dim 4096 (768 expected) raises ValueError."""
        mm_parser = MultiModalDataParser(expected_hidden_size=768)
        mismatched = torch.randn(2, 100, 4096)

        with pytest.raises(ValueError) as raised:
            mm_parser.parse_mm_data({"image": mismatched})

        # The message is compared case-insensitively.
        lowered = str(raised.value).lower()
        assert "image" in lowered
        assert "hidden dimension mismatch" in lowered

    def test_audio_embeddings_wrong_hidden_size_rejected(self):
        """Audio tensor with last dim 2048 (768 expected) raises ValueError."""
        mm_parser = MultiModalDataParser(expected_hidden_size=768)
        mismatched = torch.randn(2, 100, 2048)

        with pytest.raises(ValueError) as raised:
            mm_parser.parse_mm_data({"audio": mismatched})

        # The message is compared case-insensitively.
        lowered = str(raised.value).lower()
        assert "audio" in lowered
        assert "hidden dimension mismatch" in lowered

    def test_video_embeddings_wrong_hidden_size_rejected(self):
        """Video tensor with last dim 512 (768 expected) raises ValueError."""
        mm_parser = MultiModalDataParser(expected_hidden_size=768)
        mismatched = torch.randn(2, 100, 512)

        with pytest.raises(ValueError) as raised:
            mm_parser.parse_mm_data({"video": mismatched})

        # The message is compared case-insensitively.
        lowered = str(raised.value).lower()
        assert "video" in lowered
        assert "hidden dimension mismatch" in lowered

    def test_list_of_embeddings_validates_each(self):
        """A list whose second tensor is mismatched raises, citing "[1]"."""
        hidden, wrong = 768, 1024
        mm_parser = MultiModalDataParser(expected_hidden_size=hidden)

        # Only the tensor at index 1 carries the unexpected last dimension.
        per_item = [
            torch.randn(100, hidden),
            torch.randn(100, wrong),
        ]

        with pytest.raises(ValueError) as raised:
            mm_parser.parse_mm_data({"image": per_item})

        # The error text is expected to point at the offending list entry.
        assert "[1]" in str(raised.value)

    def test_validation_disabled_allows_any_size(self):
        """With expected_hidden_size=None an arbitrary last dim is accepted."""
        mm_parser = MultiModalDataParser(expected_hidden_size=None)
        arbitrary = torch.randn(2, 100, 12345)

        # No exception is expected here.
        parsed = mm_parser.parse_mm_data({"image": arbitrary})

        assert "image" in parsed
        assert isinstance(parsed["image"], ImageEmbeddingItems)
|
||||
|
||||
|
||||
class TestEmbeddingItemsDirectValidation:
    """Hidden-size checks exercised by constructing *EmbeddingItems directly."""

    def test_image_embedding_items_validates_batched_tensor(self):
        """Batched (3D) image tensors: matching dim accepted, mismatch raises."""
        good_dim, bad_dim = 768, 1024

        # Matching last dimension: construction succeeds and reports 2 items.
        matching = torch.randn(2, 100, good_dim)
        accepted = ImageEmbeddingItems(matching, expected_hidden_size=good_dim)
        assert accepted.get_count() == 2

        # Mismatching last dimension: construction must fail.
        mismatched = torch.randn(2, 100, bad_dim)
        with pytest.raises(ValueError) as raised:
            ImageEmbeddingItems(mismatched, expected_hidden_size=good_dim)

        # Both the received and the expected sizes should appear in the text.
        message = str(raised.value)
        assert str(bad_dim) in message
        assert str(good_dim) in message

    def test_image_embedding_items_validates_list_of_tensors(self):
        """Lists of 2D image tensors: uniform dims accepted, a stray one raises."""
        good_dim, bad_dim = 768, 512

        # Every tensor matches: construction succeeds and reports 2 items.
        uniform = [torch.randn(100, good_dim), torch.randn(50, good_dim)]
        accepted = ImageEmbeddingItems(uniform, expected_hidden_size=good_dim)
        assert accepted.get_count() == 2

        # The tensor at index 1 is mismatched: construction must fail.
        mixed = [torch.randn(100, good_dim), torch.randn(50, bad_dim)]
        with pytest.raises(ValueError) as raised:
            ImageEmbeddingItems(mixed, expected_hidden_size=good_dim)

        assert "[1]" in str(raised.value)

    def test_audio_embedding_items_validates(self):
        """AudioEmbeddingItems raises a ValueError mentioning "audio"."""
        mismatched = torch.randn(2, 100, 256)

        with pytest.raises(ValueError) as raised:
            AudioEmbeddingItems(mismatched, expected_hidden_size=768)

        # The modality name is matched case-insensitively.
        assert "audio" in str(raised.value).lower()

    def test_video_embedding_items_validates(self):
        """VideoEmbeddingItems raises a ValueError mentioning "video"."""
        mismatched = torch.randn(2, 100, 384)

        with pytest.raises(ValueError) as raised:
            VideoEmbeddingItems(mismatched, expected_hidden_size=768)

        # The modality name is matched case-insensitively.
        assert "video" in str(raised.value).lower()
|
||||
|
||||
|
||||
class TestShapeValidationIntegration:
    """Scenario tests: mismatched tensors must be rejected by the parser."""

    def test_attack_scenario_multimodal_image(self):
        """
        Image embeddings with an unexpected hidden size are rejected.

        A single-item tensor with last dim 4096 is fed to a parser expecting
        768; parse_mm_data must raise ValueError.
        """
        mm_parser = MultiModalDataParser(expected_hidden_size=768)
        oversized = torch.randn(1, 100, 4096)

        with pytest.raises(ValueError):
            mm_parser.parse_mm_data({"image": oversized})

    def test_attack_scenario_multimodal_audio(self):
        """
        Audio embeddings with an unexpected hidden size are rejected.

        A single-item tensor with last dim 2048 is fed to a parser expecting
        768; parse_mm_data must raise ValueError.
        """
        mm_parser = MultiModalDataParser(expected_hidden_size=768)
        oversized = torch.randn(1, 100, 2048)

        with pytest.raises(ValueError):
            mm_parser.parse_mm_data({"audio": oversized})

    def test_attack_scenario_multimodal_video(self):
        """
        Video embeddings with an unexpected hidden size are rejected.

        A single-item tensor with last dim 1024 is fed to a parser expecting
        768; parse_mm_data must raise ValueError.
        """
        mm_parser = MultiModalDataParser(expected_hidden_size=768)
        oversized = torch.randn(1, 100, 1024)

        with pytest.raises(ValueError):
            mm_parser.parse_mm_data({"video": oversized})
|
||||
Reference in New Issue
Block a user