[Bugfix][Model] Fix DeepSeek-OCR TensorSchema crash on empty images_crop (#36670)

Signed-off-by: István Ketykó <istvan.ketyko@gmail.com>
This commit is contained in:
István Ketykó
2026-03-12 08:35:54 +01:00
committed by GitHub
parent 9fe404ed04
commit 00726c74c9
2 changed files with 135 additions and 4 deletions

View File

@@ -0,0 +1,134 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Regression test for DeepSeek-OCR TensorSchema validation with empty images_crop.
When using the Gundam preset (BASE_SIZE=1024, IMAGE_SIZE=640, CROP_MODE=True),
images that are small enough to not require cropping produce an empty
images_crop tensor with shape (0, 3, 640, 640). The _parse_and_validate_image_input
method must correctly read image_size from this tensor's shape rather than
falling back to base_size, which would cause a TensorSchema mismatch.
Run with:
pytest tests/models/multimodal/processing/test_deepseek_ocr.py -v
"""
import pytest
from PIL import Image
from transformers import AutoTokenizer
from vllm.model_executor.models.deepseek_ocr import DeepseekOCRImagePixelInputs
from vllm.transformers_utils.processors.deepseek_ocr import DeepseekOCRProcessor
MODEL_ID = "deepseek-ai/DeepSeek-OCR"
@pytest.fixture(scope="module")
def processor():
    """Module-scoped DeepseekOCRProcessor wired to the HF tokenizer for MODEL_ID.

    Loaded once per test module to avoid repeated tokenizer downloads.
    """
    return DeepseekOCRProcessor(tokenizer=AutoTokenizer.from_pretrained(MODEL_ID))
class TestDeepseekOCREmptyImagesCrop:
    """Verify TensorSchema validation handles empty images_crop correctly.

    Covers the regression described in the module docstring: a small image
    produces an empty crop tensor of shape (0, 3, 640, 640), and the
    image_size binding must be read from that tensor's trailing dim rather
    than falling back to base_size.
    """

    @staticmethod
    def _process(processor, image):
        """Run *processor* on a single image.

        Returns the (pixel_values, images_crop, images_spatial_crop)
        tensors needed to construct a DeepseekOCRImagePixelInputs schema.
        Shared by all three tests to keep the processor invocation in one
        place.
        """
        result = processor(
            prompt="<image>\nDescribe this image.",
            images=[image],
        )
        return (
            result["pixel_values"],
            result["images_crop"],
            result["images_spatial_crop"],
        )

    def test_empty_images_crop_small_image(self, processor):
        """A small image (<=640px) produces empty images_crop and should
        not crash the TensorSchema validation.

        Previously, the code used ``numel() > 0`` to decide whether to read
        image_size from the tensor shape. When numel()==0, it fell back to
        base_size=1024, mismatching the actual tensor dim of 640.
        """
        # Small image: both dims <= IMAGE_SIZE (640) → no crops
        small_image = Image.new("RGB", (100, 100), color="red")
        pixel_values, images_crop, images_spatial_crop = self._process(
            processor, small_image
        )
        # Processor must produce an empty crop tensor for a small image
        assert images_crop.shape[0] == 0
        base_size = pixel_values.shape[-1]
        # Read image_size from the crop tensor's trailing dim: even when
        # empty, (0, 3, 640, 640) still carries the correct 640 here.
        # NOTE: the previous `if images_crop is not None else base_size`
        # fallback was dead code (the shape access above would already have
        # failed on None) and would have reintroduced the very base_size
        # fallback bug this test guards against.
        image_size = images_crop.shape[-1]
        # This should NOT raise ValueError
        schema = DeepseekOCRImagePixelInputs(
            type="pixel_values",
            data=pixel_values,
            images_crop=images_crop,
            images_spatial_crop=images_spatial_crop,
            resolve_bindings={
                "base_size": base_size,
                "image_size": image_size,
            },
        )
        assert schema.data.shape == (1, 3, 1024, 1024)
        assert schema.images_crop.shape == (0, 3, 640, 640)

    def test_populated_images_crop_large_image(self, processor):
        """A large image (>640px) produces populated images_crop."""
        # Large image: exceeds IMAGE_SIZE (640) → dynamic crop tiles
        large_image = Image.new("RGB", (1200, 800), color="blue")
        pixel_values, images_crop, images_spatial_crop = self._process(
            processor, large_image
        )
        assert images_crop.shape[0] > 0
        base_size = pixel_values.shape[-1]
        image_size = images_crop.shape[-1]
        schema = DeepseekOCRImagePixelInputs(
            type="pixel_values",
            data=pixel_values,
            images_crop=images_crop,
            images_spatial_crop=images_spatial_crop,
            resolve_bindings={
                "base_size": base_size,
                "image_size": image_size,
            },
        )
        assert schema.data.shape == (1, 3, 1024, 1024)
        assert schema.images_crop.shape[-1] == 640

    def test_mismatched_image_size_raises(self, processor):
        """Deliberately wrong image_size binding should still be caught
        by TensorSchema validation."""
        small_image = Image.new("RGB", (100, 100), color="green")
        pixel_values, images_crop, images_spatial_crop = self._process(
            processor, small_image
        )
        with pytest.raises(ValueError, match="images_crop"):
            DeepseekOCRImagePixelInputs(
                type="pixel_values",
                data=pixel_values,
                images_crop=images_crop,
                images_spatial_crop=images_spatial_crop,
                resolve_bindings={
                    "base_size": 1024,
                    "image_size": 1024,  # Wrong! Tensor has 640
                },
            )