diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 8440c3946..23f27db3c 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -869,9 +869,28 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
 
         return preprocessed_size, num_vision_tokens
 
+    def _get_image_max_pixels(self) -> int:
+        """Read max_pixels from the HF image processor config.
+
+        Despite the name, ``longest_edge`` is a pixel **area** (total pixel
+        count), not an edge length. The HF processor passes it directly to
+        ``smart_resize`` as the ``max_pixels`` argument, which constrains
+        ``t_bar * h_bar * w_bar <= max_pixels``.
+        """
+        return self.get_image_processor().size["longest_edge"]
+
     def get_image_size_with_most_features(self) -> ImageSize:
+        # Use num_frames=1 for single-image budget estimation.
+        # _get_vision_info defaults to num_frames=16 (video), which
+        # makes smart_resize constrain 16*H*W <= max_pixels, vastly
+        # underestimating the spatial budget for a single image and
+        # causing encoder cache overflow for large images
+        # (see https://github.com/vllm-project/vllm/issues/34040).
         max_image_size, _ = self._get_vision_info(
-            image_width=9999999, image_height=9999999
+            image_width=9999999,
+            image_height=9999999,
+            num_frames=1,
+            max_image_pixels=self._get_image_max_pixels(),
         )
         return max_image_size
@@ -884,7 +903,8 @@ class Glm4vProcessingInfo(BaseProcessingInfo):
         _, num_image_tokens = self._get_vision_info(
             image_width=image_width,
             image_height=image_height,
-            max_image_pixels=28 * 28 * 2 * 6144,
+            num_frames=1,
+            max_image_pixels=self._get_image_max_pixels(),
         )
         return num_image_tokens
 