135 lines
4.8 KiB
Python
135 lines
4.8 KiB
Python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""
Regression test for DeepSeek-OCR TensorSchema validation with empty images_crop.

When using the Gundam preset (BASE_SIZE=1024, IMAGE_SIZE=640, CROP_MODE=True),
images that are small enough to not require cropping produce an empty
images_crop tensor with shape (0, 3, 640, 640). The _parse_and_validate_image_input
method must correctly read image_size from this tensor's shape rather than
falling back to base_size, which would cause a TensorSchema mismatch.

Run with:
    pytest tests/models/multimodal/processing/test_deepseek_ocr.py -v
"""
|
|
|
|
import pytest
|
|
from PIL import Image
|
|
from transformers import AutoTokenizer
|
|
|
|
from vllm.model_executor.models.deepseek_ocr import DeepseekOCRImagePixelInputs
|
|
from vllm.transformers_utils.processors.deepseek_ocr import DeepseekOCRProcessor
|
|
|
|
MODEL_ID = "deepseek-ai/DeepSeek-OCR"
|
|
|
|
|
|
@pytest.fixture(scope="module")
def processor():
    """Provide a DeepseekOCRProcessor shared by every test in this module.

    The tokenizer is obtained with ``AutoTokenizer.from_pretrained(MODEL_ID)``
    and handed to the processor through its ``tokenizer`` keyword argument.
    """
    return DeepseekOCRProcessor(
        tokenizer=AutoTokenizer.from_pretrained(MODEL_ID),
    )
|
|
|
|
|
|
class TestDeepseekOCREmptyImagesCrop:
    """Exercise DeepseekOCRImagePixelInputs validation on processor output,
    with particular attention to an images_crop tensor that holds no tiles."""

    @staticmethod
    def _run_processor(processor, image):
        """Run a single image through ``processor`` with the fixed test prompt.

        Returns the ``pixel_values``, ``images_crop`` and
        ``images_spatial_crop`` entries of the processor result, in that order.
        """
        outputs = processor(
            prompt="<image>\nDescribe this image.",
            images=[image],
        )
        return (
            outputs["pixel_values"],
            outputs["images_crop"],
            outputs["images_spatial_crop"],
        )

    def test_empty_images_crop_small_image(self, processor):
        """An image no larger than 640px yields an empty images_crop, and the
        TensorSchema validation must accept it.

        The earlier implementation only read image_size from the tensor shape
        when ``numel() > 0``. With numel()==0 it fell back to base_size=1024,
        which disagrees with the tensor's real trailing dim of 640.
        """
        # 100x100 is within IMAGE_SIZE (640) on both axes → nothing is cropped
        tiny = Image.new("RGB", (100, 100), color="red")
        pixel_values, crops, spatial_crop = self._run_processor(processor, tiny)

        # The processor is expected to hand back zero crop tiles here
        assert crops.shape[0] == 0

        base_size = pixel_values.shape[-1]
        if crops is not None:
            image_size = crops.shape[-1]
        else:
            image_size = base_size

        # Building the schema must NOT raise ValueError
        bindings = {"base_size": base_size, "image_size": image_size}
        validated = DeepseekOCRImagePixelInputs(
            type="pixel_values",
            data=pixel_values,
            images_crop=crops,
            images_spatial_crop=spatial_crop,
            resolve_bindings=bindings,
        )

        assert validated.data.shape == (1, 3, 1024, 1024)
        assert validated.images_crop.shape == (0, 3, 640, 640)

    def test_populated_images_crop_large_image(self, processor):
        """An image larger than 640px yields a non-empty images_crop."""
        # 1200x800 exceeds IMAGE_SIZE (640) → dynamic crop tiles are produced
        big = Image.new("RGB", (1200, 800), color="blue")
        pixel_values, crops, spatial_crop = self._run_processor(processor, big)

        assert crops.shape[0] > 0

        bindings = {
            "base_size": pixel_values.shape[-1],
            "image_size": crops.shape[-1],
        }
        validated = DeepseekOCRImagePixelInputs(
            type="pixel_values",
            data=pixel_values,
            images_crop=crops,
            images_spatial_crop=spatial_crop,
            resolve_bindings=bindings,
        )

        assert validated.data.shape == (1, 3, 1024, 1024)
        assert validated.images_crop.shape[-1] == 640

    def test_mismatched_image_size_raises(self, processor):
        """TensorSchema validation must still reject an image_size binding
        that is deliberately wrong."""
        tiny = Image.new("RGB", (100, 100), color="green")
        pixel_values, crops, spatial_crop = self._run_processor(processor, tiny)

        # image_size is intentionally wrong: the crop tensor's dim is 640
        wrong_bindings = {"base_size": 1024, "image_size": 1024}

        with pytest.raises(ValueError, match="images_crop"):
            DeepseekOCRImagePixelInputs(
                type="pixel_values",
                data=pixel_values,
                images_crop=crops,
                images_spatial_crop=spatial_crop,
                resolve_bindings=wrong_bindings,
            )
|