Revert "[V1] Scatter and gather placeholders in the model runner" (#16075)

2025-04-04 14:50:57 -07:00
parent f5722a5052
commit af51d80fa1
42 changed files with 942 additions and 496 deletions
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -40,6 +40,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                    maybe_prefix, merge_multimodal_embeddings)
+from .vision import scatter_patch_features, select_patch_features

 IMG_START = '<img>'
 IMG_END = '</img>'
@@ -60,6 +61,14 @@ class SkyworkR1VImagePixelInputs(TypedDict):
    num_patches: torch.Tensor
    """Shape: `(batch_size * num_images)`"""

+    embed_is_patch: Union[torch.Tensor, list[torch.Tensor]]
+    """
+    A boolean mask indicating which image embeddings correspond
+    to patch tokens.
+
+    Shape: `(batch_size * num_images, num_embeds)`
+    """
+

 class SkyworkR1VImageEmbeddingInputs(TypedDict):
    type: Literal["image_embeds"]
@@ -410,13 +419,24 @@ class BaseSkyworkR1VProcessor(ABC):
                torch.tensor([len(item) for item in pixel_values_lst]),
            }

+            tokenizer = self.tokenizer
+            image_token_id = self.image_token_id
+
+            embed_is_patch = list[torch.Tensor]()
+
            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token

                image_repl = self.get_image_repl(feature_size, num_patches)
+                feature_tokens = tokenizer.encode(image_repl.features,
+                                                  add_special_tokens=False)

                text = [t.replace('<image>', image_repl.full, 1) for t in text]
+                embed_is_patch.append(
+                    torch.tensor(feature_tokens) == image_token_id)
+
+            image_inputs["embed_is_patch"] = embed_is_patch

        text_inputs = self.tokenizer(text)

@@ -440,7 +460,7 @@ class SkyworkR1VProcessor(BaseSkyworkR1VProcessor):
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
+        return PromptUpdateDetails(full=repl_full, features=repl_features)


 class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo):
@@ -579,6 +599,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]):
            pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
                "image", image_num_patches),
            image_num_patches=MultiModalFieldConfig.batched("image"),
+            embed_is_patch=MultiModalFieldConfig.batched("image"),
            image_embeds=MultiModalFieldConfig.batched("image"),
            image_token_id=MultiModalFieldConfig.shared("image", num_images),
        )
@@ -814,6 +835,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
            self, **kwargs: object) -> Optional[SkyworkR1VImageInputs]:
        pixel_values_flat = kwargs.pop("pixel_values_flat", None)
        image_num_patches = kwargs.pop("image_num_patches", None)
+        embed_is_patch = kwargs.pop("embed_is_patch", None)
        image_embeds = kwargs.pop("image_embeds", None)

        if pixel_values_flat is None and image_embeds is None:
@@ -842,14 +864,20 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
                raise ValueError("Incorrect type of image_num_patches. "
                                 f"Got type: {type(image_num_patches)}")

+            if not isinstance(embed_is_patch, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of embed_is_patch. "
+                                 f"Got type: {type(embed_is_patch)}")
+
            pixel_values_flat = flatten_bn(pixel_values_flat, concat=True)
            image_num_patches = flatten_bn(image_num_patches, concat=True)
+            embed_is_patch = flatten_bn(embed_is_patch)

            return SkyworkR1VImagePixelInputs(
                type="pixel_values",
                pixel_values_flat=self._validate_pixel_values(
                    pixel_values_flat),
                num_patches=image_num_patches,
+                embed_is_patch=embed_is_patch,
            )

        raise AssertionError("This line should be unreachable.")
@@ -895,7 +923,15 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
        if image_input is None:
            return None

-        return self._process_image_input(image_input)
+        image_features = self._process_image_input(image_input)
+
+        if image_input["type"] != "pixel_values":
+            return image_features
+
+        return scatter_patch_features(
+            image_features,
+            image_input["embed_is_patch"],
+        )

    def get_input_embeddings(
        self,
@@ -909,7 +945,7 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
            inputs_embeds = merge_multimodal_embeddings(
                input_ids,
                inputs_embeds,
-                multimodal_embeddings,
+                select_patch_features(multimodal_embeddings),
                self.img_context_token_id,
            )
        return inputs_embeds