[Bugfix] Schedule failure due to wrong get_image_size_with_most_features (#29692)
This commit is contained in:
@@ -25,6 +25,7 @@
|
||||
# limitations under the License.
|
||||
"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
|
||||
|
||||
import math
|
||||
from collections.abc import Callable, Iterable, Mapping, Sequence
|
||||
from functools import partial
|
||||
from typing import Annotated, Any, Literal, TypeAlias
|
||||
@@ -959,13 +960,42 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
|
||||
return num_video_tokens
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
    """Return the image size that yields the most (merged) image patches.

    The size is built directly from the processor's pixel budget rather
    than by resizing an oversized dummy image, so that the result really
    maximizes the number of merged patches.

    Returns:
        An ``ImageSize`` whose width and height are both multiples of
        ``patch_size * spatial_merge_size`` and whose aspect ratio
        (width / height) does not exceed 200.
    """
    # NOTE: Simply processing a huge size with _get_vision_info might not give a
    # size that maximizes the number of features, i.e., the number of (merged)
    # patches. This is because the number of patches limits the allowed aspect
    # ratios. For example, suppose the maximum number of patches is 1280. A square
    # image cannot be broken down into 1280 patches, so feeding a giant square image
    # into _get_vision_info will not yield a size that maximizes the number of
    # patches. Therefore, we directly factorize the maximum number of patches into
    # height and width. The tricky part is to avoid extreme aspect ratios (>200 for
    # qwen2-vl). If we can't find a suitable aspect ratio, we decrease the number of
    # patches and retry. This is safe because the processor does not accept extreme
    # aspect ratios, so there is no valid post-resize image with the number of
    # patches that yields extreme aspect ratios.
    max_aspect_ratio = 200

    hf_config = self.get_hf_config()
    vision_config = hf_config.vision_config
    patch_size = vision_config.patch_size
    merge_size = vision_config.spatial_merge_size

    image_processor = self.get_image_processor()
    max_pixels = image_processor.max_pixels or image_processor.size["longest_edge"]

    # Side length (in pixels) of one merged patch.
    unit = patch_size * merge_size
    # Clamp to one merged patch so that a pixel budget smaller than a single
    # unit cannot produce a zero-width image size.
    # NOTE(review): assumes the processor never emits an image smaller than
    # one unit per side -- confirm against the HF image processor in use.
    max_seq_len = max(1, max_pixels // (unit * unit))

    def closest_factor_pair(n: int) -> tuple[int, int]:
        """Return the factor pair (left, right) of n closest to sqrt(n)."""
        # left <= right
        for d in range(math.isqrt(n), 0, -1):
            if n % d == 0:
                return d, n // d
        # Only reachable for n < 1; d == 1 divides every positive n.
        return 1, n

    height_factor, width_factor = 1, max_seq_len
    # Walk down from the budget until the most-square factorization is within
    # the aspect-ratio limit. seq_len == 1 always qualifies, so this terminates
    # with a valid pair.
    for seq_len in range(max_seq_len, 0, -1):
        height_factor, width_factor = closest_factor_pair(seq_len)
        # Integer form of width_factor / height_factor <= max_aspect_ratio.
        if width_factor <= max_aspect_ratio * height_factor:
            break

    return ImageSize(width=unit * width_factor, height=unit * height_factor)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
Reference in New Issue
Block a user