[V1][VLM] V1 support for selected single-image models. (#11632)

Signed-off-by: Roger Wang <ywang@roblox.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Isotr0py <2037008807@qq.com>
2024-12-31 13:17:22 -08:00
parent 8c3230d8c1
commit e7c7c5e822
19 changed files with 575 additions and 621 deletions
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -528,10 +528,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
        stacked_image_features = self._image_pixels_to_features(
            self.vision_tower, stacked_pixel_values)

-        return [
-            self.multi_modal_projector(image_features) for image_features in
-            torch.split(stacked_image_features, num_patches_per_batch)
-        ]
+        return torch.split(self.multi_modal_projector(stacked_image_features),
+                           num_patches_per_batch)

    def _process_image_input(
        self,