[VLM] Merged multi-modal processor for InternVL-based models (#12553)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com>
This commit is contained in:
@@ -334,12 +334,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
def __init__(self, hf_runner: HfRunner):
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
self.dtype = hf_runner.model.dtype
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.use_msac = self.config.use_msac
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
@@ -348,18 +348,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
**kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values(image,
|
||||
self.image_size,
|
||||
self.min_num,
|
||||
self.max_num,
|
||||
self.use_thumbnail,
|
||||
use_MSAC=self.config.use_msac).to(
|
||||
self.dtype) for image in images
|
||||
image_to_pixel_values_h2ovl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
use_msac=self.use_msac,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
@@ -394,7 +395,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
def __init__(self, hf_runner: HfRunner):
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
self.dtype = hf_runner.model.dtype
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
@@ -407,13 +407,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
def __call__(self, text: str, images: Union[Image, List[Image]],
|
||||
**kwargs):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_internvl)
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values(image, self.image_size, self.min_num,
|
||||
self.max_num,
|
||||
self.use_thumbnail).to(self.dtype)
|
||||
for image in images
|
||||
image_to_pixel_values_internvl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
@@ -448,7 +452,8 @@ def _internvl_generate(
|
||||
) -> torch.LongTensor:
|
||||
"""Generate method for InternVL2 model without fixed use_cache."""
|
||||
assert self.img_context_token_id is not None
|
||||
vit_embeds = self.extract_feature(pixel_values)
|
||||
target_dtype = next(self.parameters()).dtype
|
||||
vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
|
||||
input_embeds = self.language_model.get_input_embeddings()(input_ids)
|
||||
B, N, C = input_embeds.shape
|
||||
input_embeds = input_embeds.reshape(B * N, C)
|
||||
|
||||
Reference in New Issue
Block a user