[Fix] handle PaddleOCR-VL image processor max_pixels across Transformers v4/v5 (#38629)

Signed-off-by: zhangyue66 <zhangyue66@baidu.com>
2026-03-31 23:50:41 +08:00
parent f1ff50c86c
commit b6e636c12c
1 changed file with 7 additions and 1 deletion
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -200,7 +200,13 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
        merge_size = hf_config.vision_config.spatial_merge_size
        patch_size = hf_config.vision_config.patch_size
        factor = merge_size * patch_size
-        max_num_tokens = image_processor.max_pixels // (factor**2)
+        if self.ctx.model_config.trust_remote_code:
+            # Image processor class comes from the HF Hub repo (remote code) and exposes `max_pixels`
+            max_pixels = image_processor.max_pixels
+        else:
+            # Image processor class comes from the Transformers library (requires v5.0 or above); the pixel cap is read from `size.longest_edge`
+            max_pixels = image_processor.size.longest_edge
+        max_num_tokens = max_pixels // (factor**2)
        # Find factors of max_num_tokens close to its square root
        # to create a dummy image with a reasonable aspect ratio.
        h_patches = int(math.sqrt(max_num_tokens))