fix: align lfm2 thumbnail token counting with HF (#36707)

This commit is contained in:
tianshu-Michael-yu
2026-03-11 10:28:38 -07:00
committed by GitHub
parent a5d06dc557
commit 741f4e046b

View File

@@ -324,7 +324,25 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
)
tile_size = mm_kwargs.get("tile_size", image_processor.tile_size)
num_thumbnail_tokens = spatial_shapes[-1].prod() // (downsample_factor**2)
thumbnail_height_patches = int(spatial_shapes[-1][0].item())
thumbnail_width_patches = int(spatial_shapes[-1][1].item())
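# HF computes thumbnail tokens as
# ceil(h_patches / downsample_factor) * ceil(w_patches / downsample_factor).
# We assert that both patch-grid dimensions are divisible by
# downsample_factor so that any processor/model drift is surfaced
# immediately, rather than being silently absorbed by rounding (the earlier
# computation floor-divided the patch-count product by downsample_factor**2).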
assert thumbnail_height_patches % downsample_factor == 0, (
"LFM2-VL thumbnail height patch grid must be divisible by "
f"downsample_factor, got height_patches={thumbnail_height_patches}, "
f"downsample_factor={downsample_factor}"
)
assert thumbnail_width_patches % downsample_factor == 0, (
"LFM2-VL thumbnail width patch grid must be divisible by "
f"downsample_factor, got width_patches={thumbnail_width_patches}, "
f"downsample_factor={downsample_factor}"
)
num_thumbnail_tokens = math.ceil(
thumbnail_height_patches / downsample_factor
) * math.ceil(thumbnail_width_patches / downsample_factor)
num_patches_tile = tile_size // encoder_patch_size
dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)
num_tiles_tokens = dwn_num_patches_tile * dwn_num_patches_tile