fix: align lfm2 thumbnail token counting with HF (#36707)
This commit is contained in:
committed by
GitHub
parent
a5d06dc557
commit
741f4e046b
@@ -324,7 +324,25 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
|
||||
)
|
||||
tile_size = mm_kwargs.get("tile_size", image_processor.tile_size)
|
||||
|
||||
num_thumbnail_tokens = spatial_shapes[-1].prod() // (downsample_factor**2)
|
||||
thumbnail_height_patches = int(spatial_shapes[-1][0].item())
|
||||
thumbnail_width_patches = int(spatial_shapes[-1][1].item())
|
||||
# HF computes thumbnail tokens as
|
||||
# ceil(h_patches / downsample_factor) * ceil(w_patches / downsample_factor).
|
||||
# We assert divisibility here so any processor/model drift is surfaced
|
||||
# immediately instead of being hidden by floor division.
|
||||
assert thumbnail_height_patches % downsample_factor == 0, (
|
||||
"LFM2-VL thumbnail height patch grid must be divisible by "
|
||||
f"downsample_factor, got height_patches={thumbnail_height_patches}, "
|
||||
f"downsample_factor={downsample_factor}"
|
||||
)
|
||||
assert thumbnail_width_patches % downsample_factor == 0, (
|
||||
"LFM2-VL thumbnail width patch grid must be divisible by "
|
||||
f"downsample_factor, got width_patches={thumbnail_width_patches}, "
|
||||
f"downsample_factor={downsample_factor}"
|
||||
)
|
||||
num_thumbnail_tokens = math.ceil(
|
||||
thumbnail_height_patches / downsample_factor
|
||||
) * math.ceil(thumbnail_width_patches / downsample_factor)
|
||||
num_patches_tile = tile_size // encoder_patch_size
|
||||
dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)
|
||||
num_tiles_tokens = dwn_num_patches_tile * dwn_num_patches_tile
|
||||
|
||||
Reference in New Issue
Block a user