diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 39e965c1d..e1287bdb4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -716,7 +716,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | | `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | | `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I+ | `PaddlePaddle/PaddleOCR-VL`, etc. | | | -| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | +| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | | `PixtralForConditionalGeneration` | Ministral 3 (Mistral format), Mistral 3 (Mistral format), Mistral Large 3 (Mistral format), Pixtral (Mistral format) | T + I+ | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistralai/Mistral-Large-3-675B-Instruct-2512` `mistralai/Pixtral-12B-2409` etc. | | ✅︎ | diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 67240c6e7..8671bbd5c 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -35,7 +35,13 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import ( + MultiModalEmbeddings, + SupportsLoRA, + SupportsMultiModal, + SupportsPP, +) +from .module_mapping import MultiModelKeys from .siglip import SiglipVisionModel from .utils import ( AutoWeightsLoader, @@ -250,7 +256,9 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn info=PaliGemmaProcessingInfo, dummy_inputs=PaliGemmaDummyInputsBuilder, ) -class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): +class PaliGemmaForConditionalGeneration( + nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP +): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -406,3 +414,16 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="multi_modal_projector", + tower_model="vision_tower", + ) + + def get_num_mm_encoder_tokens(self, num_image_tokens: int) -> int: + return num_image_tokens + + def get_num_mm_connector_tokens(self, num_vision_tokens: int) -> int: + return num_vision_tokens