[Misc] Remove redundant num_embeds (#15443)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -155,7 +155,6 @@ def resolve_visual_encoder_outputs(
|
||||
|
||||
def scatter_patch_features(
|
||||
features: torch.Tensor,
|
||||
num_embeds: torch.Tensor,
|
||||
embed_is_patch: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, ...]:
|
||||
"""
|
||||
@@ -168,13 +167,35 @@ def scatter_patch_features(
|
||||
Args:
|
||||
features: The patch features, concatenated across each image.
|
||||
Shape: `(num_patch, feature_depth)`
|
||||
num_embeds: The number of image embeddings for each image.
|
||||
Shape: `(num_images,)`
|
||||
embed_is_patch: A boolean mask indicating which image embeddings
|
||||
correspond to patch tokens for each image.
|
||||
Shape: `(num_images, num_embeds)`
|
||||
|
||||
Note:
|
||||
The original code only considers patch tokens as feature
|
||||
tokens, but our processor considers all image-related tokens
|
||||
as feature tokens because the feature tokens need to be
|
||||
consecutive in `input_ids`.
|
||||
|
||||
Example:
|
||||
A simplified example for one image:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Embedding tokens (from HF processor):
|
||||
[<start> <patch> <patch> <col> <patch> <patch> <col> <end> ]
|
||||
|
||||
embed_is_patch (from HF processor):
|
||||
[ False True True False True True False False ]
|
||||
|
||||
Encoder outputs (from model):
|
||||
[ p1 p2 p3 p4 ]
|
||||
|
||||
The resulting embedding tensor is:
|
||||
[ nan p1 p2 nan p3 p4 nan nan ]
|
||||
"""
|
||||
num_embeds_per_image: list[int] = num_embeds.tolist()
|
||||
num_images, num_embeds = embed_is_patch.shape
|
||||
num_embeds_per_image = [num_embeds] * num_images
|
||||
|
||||
embeds_flat = features.new_full(
|
||||
(sum(num_embeds_per_image), features.shape[-1]),
|
||||
|
||||
Reference in New Issue
Block a user