Fix DeepSeek-OCR tensor validation for all size variants (#34085)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-11 22:50:23 -08:00
parent 136b0bfa59
commit 80f2ba6ea6
1 changed file with 11 additions and 1 deletion
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -448,7 +448,16 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, Supports
        if pixel_values is None or torch.sum(pixel_values).item() == 0:
            return None

-        base_size = self.vision_config.image_size
+        # Derive base_size from the tensor's last (spatial) dimension rather
+        # than the hardcoded vision_config.image_size (1024). The vision
+        # encoders (SAM & CLIP) interpolate their positional encodings, so the
+        # Tiny/Small/Base/Large variants all work with the same weights.
+        base_size = pixel_values.shape[-1]
+        if images_crop is not None and images_crop.numel() > 0:
+            image_size = images_crop.shape[-1]
+        else:
+            image_size = base_size
+
        return DeepseekOCRImagePixelInputs(
            type="pixel_values",
            data=pixel_values,
@@ -456,6 +465,7 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, Supports
            images_spatial_crop=images_spatial_crop,
            resolve_bindings={
                "base_size": base_size,
+                "image_size": image_size,
            },
        )