[Model] Support nested structures for TensorSchema (#26212)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -6,37 +6,39 @@ import torch
|
||||
|
||||
from vllm.model_executor.models.glm4_1v import Glm4vImageEmbeddingInputs
|
||||
from vllm.model_executor.models.granite_speech import GraniteSpeechAudioInputs
|
||||
from vllm.model_executor.models.hyperclovax_vision import (
|
||||
HCXVisionVideoPixelInputs)
|
||||
from vllm.model_executor.models.phi3v import Phi3VImagePixelInputs
|
||||
|
||||
|
||||
def test_tensor_schema_valid_tensor():
|
||||
Phi3VImagePixelInputs(
|
||||
data=torch.randn(16, 64, 3, 32, 32),
|
||||
pixel_values=torch.randn(16, 64, 3, 32, 32),
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_optional_fields():
|
||||
Phi3VImagePixelInputs(
|
||||
data=torch.randn(16, 64, 3, 32, 32),
|
||||
pixel_values=torch.randn(16, 64, 3, 32, 32),
|
||||
image_sizes=None,
|
||||
)
|
||||
|
||||
Phi3VImagePixelInputs(data=torch.randn(16, 64, 3, 32, 32), )
|
||||
Phi3VImagePixelInputs(pixel_values=torch.randn(16, 64, 3, 32, 32))
|
||||
|
||||
|
||||
def test_tensor_schema_constant_dim_failure():
|
||||
with pytest.raises(ValueError, match="dim\\[2\\] expected 3, got 4"):
|
||||
Phi3VImagePixelInputs(
|
||||
data=torch.randn(16, 64, 4, 32, 32), # dim[2] = 4
|
||||
pixel_values=torch.randn(16, 64, 4, 32, 32), # dim[2] = 4
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_invalid_types_in_list():
|
||||
with pytest.raises(ValueError, match="is not a torch.Tensor"):
|
||||
with pytest.raises(TypeError, match="is not one of the expected types"):
|
||||
Phi3VImagePixelInputs(
|
||||
data=[
|
||||
pixel_values=[
|
||||
torch.randn(64, 3, 32, 32),
|
||||
"not_a_tensor",
|
||||
torch.randn(64, 3, 32, 32),
|
||||
@@ -48,27 +50,28 @@ def test_tensor_schema_invalid_types_in_list():
|
||||
def test_tensor_schema_rank_mismatch():
|
||||
with pytest.raises(ValueError, match="has rank 3 but expected 5"):
|
||||
Phi3VImagePixelInputs(
|
||||
data=torch.randn(16, 64, 3),
|
||||
pixel_values=torch.randn(16, 64, 3),
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_missing_required_field():
|
||||
with pytest.raises(ValueError, match="Required field 'data' is missing"):
|
||||
with pytest.raises(ValueError,
|
||||
match="Required field 'pixel_values' is missing"):
|
||||
Phi3VImagePixelInputs(image_sizes=torch.randint(0, 256, (16, 2)), )
|
||||
|
||||
|
||||
def test_tensor_schema_symbolic_dim_mismatch():
|
||||
with pytest.raises(ValueError, match="expected 'bn'=12, got 16"):
|
||||
Phi3VImagePixelInputs(
|
||||
data=torch.randn(12, 64, 3, 32, 32),
|
||||
pixel_values=torch.randn(12, 64, 3, 32, 32),
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_list_tensor_valid():
|
||||
Phi3VImagePixelInputs(
|
||||
data=[torch.randn(64, 3, 32, 32) for _ in range(16)],
|
||||
pixel_values=[torch.randn(64, 3, 32, 32) for _ in range(16)],
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
)
|
||||
|
||||
@@ -76,39 +79,46 @@ def test_tensor_schema_list_tensor_valid():
|
||||
def test_tensor_schema_variable_patch_counts_valid():
|
||||
# Each image has a different number of patches (p)
|
||||
# Each tensor has shape (p, 3, 32, 32)
|
||||
data = [
|
||||
torch.randn(16, 3, 32, 32), # p = 16
|
||||
torch.randn(32, 3, 32, 32), # p = 32
|
||||
torch.randn(64, 3, 32, 32), # p = 64
|
||||
]
|
||||
image_sizes = torch.randint(0, 256, (3, 2)) # bn = 3
|
||||
Phi3VImagePixelInputs(
|
||||
data=data,
|
||||
image_sizes=image_sizes,
|
||||
pixel_values=[
|
||||
torch.randn(16, 3, 32, 32), # p = 16
|
||||
torch.randn(32, 3, 32, 32), # p = 32
|
||||
torch.randn(64, 3, 32, 32), # p = 64
|
||||
],
|
||||
image_sizes=torch.randint(0, 256, (3, 2)), # bn = 3
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_tuple_tensor_valid():
|
||||
Phi3VImagePixelInputs(
|
||||
data=tuple(torch.randn(64, 3, 32, 32) for _ in range(16)),
|
||||
pixel_values=tuple(torch.randn(64, 3, 32, 32) for _ in range(16)),
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_double_nested_tensors():
|
||||
x = torch.rand(4, 3, 32, 32)
|
||||
y = torch.rand(2, 3, 32, 32)
|
||||
|
||||
HCXVisionVideoPixelInputs(pixel_values_videos=([x, y, x], [y], [x, y]))
|
||||
|
||||
|
||||
def test_tensor_schema_inconsistent_shapes_in_list():
|
||||
with pytest.raises(ValueError, match="contains inconsistent shapes"):
|
||||
Phi3VImagePixelInputs(
|
||||
data=[torch.randn(64, 3, 32, 32),
|
||||
torch.randn(64, 3, 16, 16)] +
|
||||
[torch.randn(64, 3, 32, 32) for _ in range(14)],
|
||||
pixel_values=[
|
||||
torch.randn(64, 3, 32, 32),
|
||||
torch.randn(64, 3, 16, 16),
|
||||
*(torch.randn(64, 3, 32, 32) for _ in range(14)),
|
||||
],
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_empty_list():
|
||||
with pytest.raises(ValueError, match="is an empty list"):
|
||||
with pytest.raises(ValueError, match="is an empty sequence"):
|
||||
Phi3VImagePixelInputs(
|
||||
data=[],
|
||||
pixel_values=[],
|
||||
image_sizes=torch.randint(0, 256, (0, 2)),
|
||||
)
|
||||
|
||||
@@ -117,18 +127,18 @@ def test_tensor_schema_validation_disabled_skips_shape_check():
|
||||
# This should NOT raise, because validation is turned off
|
||||
# This would normally fail (dim[2] should be 3, not 4)
|
||||
Phi3VImagePixelInputs(
|
||||
data=torch.randn(16, 64, 4, 32, 32),
|
||||
pixel_values=torch.randn(16, 64, 4, 32, 32),
|
||||
image_sizes=torch.randint(0, 256, (16, 2)),
|
||||
validate=False,
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_schema_with_valid_resolve_binding_dims():
|
||||
data = torch.randn(16, 64, 3, 336, 336) # h=336, w=336
|
||||
pixel_values = torch.randn(16, 64, 3, 336, 336) # h=336, w=336
|
||||
image_sizes = torch.randint(0, 256, (16, 2))
|
||||
|
||||
Phi3VImagePixelInputs(
|
||||
data=data,
|
||||
pixel_values=pixel_values,
|
||||
image_sizes=image_sizes,
|
||||
resolve_bindings={
|
||||
"h": 336,
|
||||
@@ -138,13 +148,13 @@ def test_tensor_schema_with_valid_resolve_binding_dims():
|
||||
|
||||
|
||||
def test_tensor_schema_with_invalid_resolve_binding_dims():
|
||||
data = torch.randn(16, 64, 3, 36, 36) # h=36, w=36
|
||||
pixel_values = torch.randn(16, 64, 3, 36, 36) # h=36, w=36
|
||||
image_sizes = torch.randint(0, 256, (16, 2))
|
||||
|
||||
# Should raise because 'h' and 'w' don't match resolve bindings
|
||||
with pytest.raises(ValueError, match="dim\\[3\\] expected 336, got 36"):
|
||||
Phi3VImagePixelInputs(
|
||||
data=data,
|
||||
pixel_values=pixel_values,
|
||||
image_sizes=image_sizes,
|
||||
resolve_bindings={
|
||||
"h": 336,
|
||||
|
||||
Reference in New Issue
Block a user