[VLM] Move supported limits and max tokens to merged multi-modal processor (#11669)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-01-01 23:44:42 +08:00
parent 73001445fb
commit a115ac46b5
16 changed files with 351 additions and 361 deletions
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -119,6 +119,12 @@ def get_max_llava_image_tokens(ctx: InputContext):

 class LlavaMultiModalProcessor(BaseMultiModalProcessor):

+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+        return {"image": get_max_llava_image_tokens(self.ctx)}
+
    def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]:
        return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor))

@@ -324,7 +330,6 @@ def init_vision_tower_for_llava(
    raise NotImplementedError(msg)


-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
    # BitandBytes specific attributes
@@ -649,7 +654,6 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):

 # To use this model, please use
 # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'`
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
@MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor)
 class MantisForConditionalGeneration(LlavaForConditionalGeneration):
    pass