From 741f4e046bb7e5c5a6093d9fc294865ad7a8e721 Mon Sep 17 00:00:00 2001 From: tianshu-Michael-yu <101950379+tianshu-Michael-yu@users.noreply.github.com> Date: Wed, 11 Mar 2026 10:28:38 -0700 Subject: [PATCH] fix: align lfm2 thumbnail token counting with HF (#36707) --- vllm/model_executor/models/lfm2_vl.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py index 86cd5546b..63f546c5a 100644 --- a/vllm/model_executor/models/lfm2_vl.py +++ b/vllm/model_executor/models/lfm2_vl.py @@ -324,7 +324,25 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): ) tile_size = mm_kwargs.get("tile_size", image_processor.tile_size) - num_thumbnail_tokens = spatial_shapes[-1].prod() // (downsample_factor**2) + thumbnail_height_patches = int(spatial_shapes[-1][0].item()) + thumbnail_width_patches = int(spatial_shapes[-1][1].item()) + # HF computes thumbnail tokens as + # ceil(h_patches / downsample_factor) * ceil(w_patches / downsample_factor). + # We assert divisibility here so any processor/model drift is surfaced + # immediately instead of being hidden by floor division. + assert thumbnail_height_patches % downsample_factor == 0, ( + "LFM2-VL thumbnail height patch grid must be divisible by " + f"downsample_factor, got height_patches={thumbnail_height_patches}, " + f"downsample_factor={downsample_factor}" + ) + assert thumbnail_width_patches % downsample_factor == 0, ( + "LFM2-VL thumbnail width patch grid must be divisible by " + f"downsample_factor, got width_patches={thumbnail_width_patches}, " + f"downsample_factor={downsample_factor}" + ) + num_thumbnail_tokens = math.ceil( + thumbnail_height_patches / downsample_factor + ) * math.ceil(thumbnail_width_patches / downsample_factor) num_patches_tile = tile_size // encoder_patch_size dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor) num_tiles_tokens = dwn_num_patches_tile * dwn_num_patches_tile