[Misc] Remove redundant num_embeds (#15443)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung, 2025-03-25 18:27:57 +08:00 (committed by GitHub)
parent a9e879b316, commit 5994430b84
5 changed files with 25 additions and 64 deletions
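The argument is redundant because the per-image embedding count is already encoded in the shape of `embed_is_patch`. A minimal sketch of the replacement (mirroring the diffed lines below; mask values are illustrative):

    import torch

    # One image with 8 embedding slots (values illustrative).
    embed_is_patch = torch.tensor(
        [[False, True, True, False, True, True, False, False]])

    # Previously passed in as a separate `num_embeds` tensor; now derived from the mask.
    num_images, num_embeds = embed_is_patch.shape
    num_embeds_per_image = [num_embeds] * num_images  # replaces num_embeds.tolist()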


@@ -155,7 +155,6 @@ def resolve_visual_encoder_outputs(
 def scatter_patch_features(
     features: torch.Tensor,
-    num_embeds: torch.Tensor,
     embed_is_patch: torch.Tensor,
 ) -> tuple[torch.Tensor, ...]:
     """
@@ -168,13 +167,35 @@ def scatter_patch_features(
     Args:
         features: The patch features, concatenated across each image.
             Shape: `(num_patch, feature_depth)`
-        num_embeds: The number of image embeddings for each image.
-            Shape: `(num_images,)`
         embed_is_patch: A boolean mask indicating which image embeddings
             correspond to patch tokens for each image.
             Shape: `(num_images, num_embeds)`
+
+    Note:
+        The original code only considers patch tokens as feature
+        tokens, but our processor considers all image-related tokens
+        as feature tokens because the feature tokens need to be
+        consecutive in `input_ids`.
+
+    Example:
+        A simplified example for one image:
+
+        .. code-block::
+
+            Embedding tokens (from HF processor):
+            [<start> <patch> <patch> <col>  <patch> <patch> <col>  <end> ]
+
+            embed_is_patch (from HF processor):
+            [ False   True    True   False   True    True   False  False ]
+
+            Encoder outputs (from model):
+            [          p1      p2            p3      p4                  ]
+
+            The resulting embedding tensor is:
+            [  nan     p1      p2    nan     p3      p4     nan    nan   ]
     """
-    num_embeds_per_image: list[int] = num_embeds.tolist()
+    num_images, num_embeds = embed_is_patch.shape
+    num_embeds_per_image = [num_embeds] * num_images
 
     embeds_flat = features.new_full(
         (sum(num_embeds_per_image), features.shape[-1]),
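A self-contained sketch of the scatter behaviour described in the docstring example above, using the updated signature's logic (this re-implements only the lines shown in the diff rather than importing the real helper; input values are illustrative):

    import torch

    # One image: 4 patch features of depth 2, 8 embedding slots in total.
    features = torch.arange(8, dtype=torch.float32).reshape(4, 2)
    embed_is_patch = torch.tensor(
        [[False, True, True, False, True, True, False, False]])

    # num_embeds is now derived from the mask instead of being passed in.
    num_images, num_embeds = embed_is_patch.shape
    num_embeds_per_image = [num_embeds] * num_images

    # Non-patch positions are filled with NaN, patch positions with the features.
    embeds_flat = features.new_full(
        (sum(num_embeds_per_image), features.shape[-1]),
        fill_value=torch.nan,
    )
    embeds_flat[embed_is_patch.view(-1)] = features

    print(torch.isnan(embeds_flat[:, 0]).tolist())
    # [True, False, False, True, False, False, True, True]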