fix: align lfm2 thumbnail token counting with HF (#36707)

2026-03-11 10:28:38 -07:00
parent a5d06dc557
commit 741f4e046b
1 changed files with 19 additions and 1 deletions
--- a/vllm/model_executor/models/lfm2_vl.py
+++ b/vllm/model_executor/models/lfm2_vl.py
@@ -324,7 +324,25 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo):
        )
        tile_size = mm_kwargs.get("tile_size", image_processor.tile_size)

-        num_thumbnail_tokens = spatial_shapes[-1].prod() // (downsample_factor**2)
+        thumbnail_height_patches = int(spatial_shapes[-1][0].item())
+        thumbnail_width_patches = int(spatial_shapes[-1][1].item())
+        # HF computes thumbnail tokens as
+        # ceil(h_patches / downsample_factor) * ceil(w_patches / downsample_factor).
+        # We assert divisibility here so any processor/model drift is surfaced
+        # immediately instead of being hidden by floor division.
+        assert thumbnail_height_patches % downsample_factor == 0, (
+            "LFM2-VL thumbnail height patch grid must be divisible by "
+            f"downsample_factor, got height_patches={thumbnail_height_patches}, "
+            f"downsample_factor={downsample_factor}"
+        )
+        assert thumbnail_width_patches % downsample_factor == 0, (
+            "LFM2-VL thumbnail width patch grid must be divisible by "
+            f"downsample_factor, got width_patches={thumbnail_width_patches}, "
+            f"downsample_factor={downsample_factor}"
+        )
+        num_thumbnail_tokens = math.ceil(
+            thumbnail_height_patches / downsample_factor
+        ) * math.ceil(thumbnail_width_patches / downsample_factor)
        num_patches_tile = tile_size // encoder_patch_size
        dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor)
        num_tiles_tokens = dwn_num_patches_tile * dwn_num_patches_tile