diff --git a/tests/models/multimodal/processing/test_deepseek_ocr.py b/tests/models/multimodal/processing/test_deepseek_ocr.py
new file mode 100644
index 000000000..7bdfbc083
--- /dev/null
+++ b/tests/models/multimodal/processing/test_deepseek_ocr.py
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Regression test for DeepSeek-OCR TensorSchema validation with empty images_crop.
+
+When using the Gundam preset (BASE_SIZE=1024, IMAGE_SIZE=640, CROP_MODE=True),
+images that are small enough to not require cropping produce an empty
+images_crop tensor with shape (0, 3, 640, 640). The _parse_and_validate_image_input
+method must correctly read image_size from this tensor's shape rather than
+falling back to base_size, which would cause a TensorSchema mismatch.
+
+Run with:
+    pytest tests/models/multimodal/processing/test_deepseek_ocr.py -v
+"""
+
+import pytest
+from PIL import Image
+from transformers import AutoTokenizer
+
+from vllm.model_executor.models.deepseek_ocr import DeepseekOCRImagePixelInputs
+from vllm.transformers_utils.processors.deepseek_ocr import DeepseekOCRProcessor
+
+MODEL_ID = "deepseek-ai/DeepSeek-OCR"
+
+
+@pytest.fixture(scope="module")
+def processor():
+    """Load the DeepseekOCRProcessor with tokenizer from HuggingFace."""
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    return DeepseekOCRProcessor(tokenizer=tokenizer)
+
+
+class TestDeepseekOCREmptyImagesCrop:
+    """Verify TensorSchema validation handles empty images_crop correctly."""
+
+    def test_empty_images_crop_small_image(self, processor):
+        """A small image (<=640px) produces empty images_crop and should
+        not crash the TensorSchema validation.
+
+        Previously, the code used ``numel() > 0`` to decide whether to read
+        image_size from the tensor shape. When numel()==0, it fell back to
+        base_size=1024, mismatching the actual tensor dim of 640.
+        """
+        # Small image: both dims <= IMAGE_SIZE (640) → no crops
+        small_image = Image.new("RGB", (100, 100), color="red")
+
+        result = processor(
+            prompt="<image>\nDescribe this image.",  # "<image>" is the multimodal placeholder
+            images=[small_image],
+        )
+
+        pixel_values = result["pixel_values"]
+        images_crop = result["images_crop"]
+        images_spatial_crop = result["images_spatial_crop"]
+
+        # Processor must produce an empty crop tensor for a small image
+        assert images_crop.shape[0] == 0
+
+        base_size = pixel_values.shape[-1]
+        image_size = images_crop.shape[-1] if images_crop is not None else base_size
+
+        # This should NOT raise ValueError
+        schema = DeepseekOCRImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            images_crop=images_crop,
+            images_spatial_crop=images_spatial_crop,
+            resolve_bindings={
+                "base_size": base_size,
+                "image_size": image_size,
+            },
+        )
+
+        assert schema.data.shape == (1, 3, 1024, 1024)
+        assert schema.images_crop.shape == (0, 3, 640, 640)
+
+    def test_populated_images_crop_large_image(self, processor):
+        """A large image (>640px) produces populated images_crop."""
+        # Large image: exceeds IMAGE_SIZE (640) → dynamic crop tiles
+        large_image = Image.new("RGB", (1200, 800), color="blue")
+
+        result = processor(
+            prompt="<image>\nDescribe this image.",
+            images=[large_image],
+        )
+
+        pixel_values = result["pixel_values"]
+        images_crop = result["images_crop"]
+        images_spatial_crop = result["images_spatial_crop"]
+
+        assert images_crop.shape[0] > 0
+
+        base_size = pixel_values.shape[-1]
+        image_size = images_crop.shape[-1]
+
+        schema = DeepseekOCRImagePixelInputs(
+            type="pixel_values",
+            data=pixel_values,
+            images_crop=images_crop,
+            images_spatial_crop=images_spatial_crop,
+            resolve_bindings={
+                "base_size": base_size,
+                "image_size": image_size,
+            },
+        )
+
+        assert schema.data.shape == (1, 3, 1024, 1024)
+        assert schema.images_crop.shape[-1] == 640
+
+    def test_mismatched_image_size_raises(self, processor):
+        """Deliberately wrong image_size binding should still be caught
+        by TensorSchema validation."""
+        small_image = Image.new("RGB", (100, 100), color="green")
+
+        result = processor(
+            prompt="<image>\nDescribe this image.",
+            images=[small_image],
+        )
+
+        pixel_values = result["pixel_values"]
+        images_crop = result["images_crop"]
+        images_spatial_crop = result["images_spatial_crop"]
+
+        with pytest.raises(ValueError, match="images_crop"):
+            DeepseekOCRImagePixelInputs(
+                type="pixel_values",
+                data=pixel_values,
+                images_crop=images_crop,
+                images_spatial_crop=images_spatial_crop,
+                resolve_bindings={
+                    "base_size": 1024,
+                    "image_size": 1024,  # Wrong! Tensor has 640
+                },
+            )
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index b0fba01a4..caf4dbee7 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -452,10 +452,7 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, Supports
         # support arbitrary resolutions via pos-encoding interpolation,
         # so Tiny/Small/Base/Large variants all work with the same weights.
         base_size = pixel_values.shape[-1]
-        if images_crop is not None and images_crop.numel() > 0:
-            image_size = images_crop.shape[-1]
-        else:
-            image_size = base_size
+        image_size = images_crop.shape[-1] if images_crop is not None else base_size
 
         return DeepseekOCRImagePixelInputs(
             type="pixel_values",