[Core] Dynamic image size support for VLMs (#5276)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: ywang96 <ywang@roblox.com> Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-07-03 11:34:00 +08:00
parent 482045ee77
commit 9831aec49f
38 changed files with 1453 additions and 664 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -5,7 +5,7 @@ from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple,
                    Union)

 import torch
-from transformers import PretrainedConfig, PreTrainedTokenizerBase
+from transformers import PretrainedConfig

 import vllm.envs as envs
 from vllm.logger import init_logger
@@ -1303,16 +1303,6 @@ class VisionLanguageConfig:
    image_input_shape: tuple
    image_feature_size: int

-    #TODO(ywang96): make this a cached property once we refactor the
-    # VisionLanguageConfig class.
-    def get_image_token_text(
-            self, tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]:
-        """Get the image token placeholder text to be inserted into the 
-        text prompt and the string representation of the image token id.
-        """
-        image_token_str = tokenizer.decode(self.image_token_id)
-        return image_token_str * self.image_feature_size, image_token_str
-
    def as_cli_args_dict(self) -> Dict[str, Any]:
        """Flatten vision language config to pure args.