[Core] Dynamic image size support for VLMs (#5276)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: ywang96 <ywang@roblox.com>
Co-authored-by: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
This commit is contained in:
@@ -5,7 +5,7 @@ from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple,
|
||||
Union)
|
||||
|
||||
import torch
|
||||
from transformers import PretrainedConfig, PreTrainedTokenizerBase
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
@@ -1303,16 +1303,6 @@ class VisionLanguageConfig:
|
||||
image_input_shape: tuple
|
||||
image_feature_size: int
|
||||
|
||||
#TODO(ywang96): make this a cached property once we refactor the
|
||||
# VisionLanguageConfig class.
|
||||
def get_image_token_text(
        self, tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]:
    """Build the image placeholder text for a text prompt.

    Decodes ``self.image_token_id`` with ``tokenizer`` and repeats the
    decoded string ``self.image_feature_size`` times.

    Args:
        tokenizer: Tokenizer whose ``decode`` is called on
            ``self.image_token_id``.

    Returns:
        A ``(placeholder_text, token_text)`` pair: the repeated token
        string to insert into the prompt, followed by the single
        decoded token string.
    """
    token_text = tokenizer.decode(self.image_token_id)
    placeholder_text = token_text * self.image_feature_size
    return placeholder_text, token_text
|
||||
|
||||
def as_cli_args_dict(self) -> Dict[str, Any]:
|
||||
"""Flatten vision language config to pure args.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user