Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
for manipulating the input / output of HF & vLLM test runners, which are
|
||||
typically specific to a small subset of models.
|
||||
"""
|
||||
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
from typing import Optional, Union
|
||||
@@ -15,8 +16,13 @@ import pytest
|
||||
import regex as re
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig, GenerationMixin)
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
BatchFeature,
|
||||
GenerationConfig,
|
||||
GenerationMixin,
|
||||
)
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
@@ -27,8 +33,7 @@ from .types import RunnerOutput
|
||||
|
||||
|
||||
####### vLLM output processors functions
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
def qwen_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
|
||||
|
||||
|
||||
def qwen2_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
|
||||
|
||||
|
||||
def kimiv_vl_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_image_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.image_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def llava_video_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.video_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
mm_token_id: int) -> RunnerOutput:
|
||||
def _llava_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str, mm_token_id: int
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
|
||||
]
|
||||
|
||||
@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
|
||||
return config.to_dict()
|
||||
|
||||
|
||||
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_onevision_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
||||
]
|
||||
|
||||
@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [mantis] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
####### Post-processors for HF outputs
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|end▁of▁sentence|>"):
|
||||
output_str = output_str.split("<|end▁of▁sentence|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_utterance>"):
|
||||
output_str = output_str.split("<end_of_utterance>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
# Based on Idefics3
|
||||
return idefics3_trunc_hf_output(hf_output, model)
|
||||
|
||||
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|eot_id|>"):
|
||||
output_str = output_str.split("<|eot_id|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_sentence>"):
|
||||
output_str = output_str.split("<end_of_sentence>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
|
||||
|
||||
####### Prompt path encoders for models that need models on disk
|
||||
def qwen_prompt_path_encoder(
|
||||
tmp_path: PosixPath, prompt: str,
|
||||
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
|
||||
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
|
||||
) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace its contents with the local path to the string so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return BatchFeature(data=inputs, tensor_type="pt")
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language.model.embed_tokens
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language.model.embed_tokens
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
assert len(contents) == len(images)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": image,
|
||||
"content": content
|
||||
} for image, content in zip(images, contents)],
|
||||
[
|
||||
{"role": "user", "image": image, "content": content}
|
||||
for image, content in zip(images, contents)
|
||||
],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.transformer.output_layer
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.transformer.output_layer
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
else:
|
||||
video_metadata = None
|
||||
|
||||
return hf_processor(*args,
|
||||
videos=videos,
|
||||
video_metadata=video_metadata,
|
||||
**kwargs)
|
||||
return hf_processor(
|
||||
*args, videos=videos, video_metadata=video_metadata, **kwargs
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
return hf_model
|
||||
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.use_msac = self.config.use_msac
|
||||
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_h2ovl,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
use_msac=self.use_msac,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = H2OVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
from vllm.model_executor.models.skyworkr1v import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_skyworkr1v)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_skyworkr1v,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = SkyworkR1VProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
**kwargs,
|
||||
):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_internvl,
|
||||
video_to_pixel_values_internvl,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
if images is not None:
|
||||
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_images = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_images
|
||||
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=1,
|
||||
max_num=1,
|
||||
use_thumbnail=False,
|
||||
) for video in videos
|
||||
)
|
||||
for video in videos
|
||||
]
|
||||
num_patches_videos = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_videos
|
||||
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
while ("<image>" in text) or ("<video>" in text):
|
||||
image_index = text.find("<image>")
|
||||
video_index = text.find("<video>")
|
||||
if image_index == -1 or (video_index > -1
|
||||
and video_index < image_index):
|
||||
if image_index == -1 or (
|
||||
video_index > -1 and video_index < image_index
|
||||
):
|
||||
num_patches = num_patches_videos.pop(0)
|
||||
pixel_values.append(pixel_values_videos.pop(0))
|
||||
context_tokens = IMG_START + \
|
||||
IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
video_tokens = ''.join([
|
||||
f'Frame{i+1}: {context_tokens}'
|
||||
for i in range(num_patches)
|
||||
])
|
||||
text = text.replace('<video>', video_tokens, 1)
|
||||
context_tokens = (
|
||||
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
)
|
||||
video_tokens = "".join(
|
||||
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
|
||||
)
|
||||
text = text.replace("<video>", video_tokens, 1)
|
||||
else:
|
||||
num_patches = num_patches_images.pop(0)
|
||||
pixel_values.append(pixel_values_images.pop(0))
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = InternVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -631,7 +641,7 @@ def _internvl_generate(
|
||||
input_embeds = input_embeds.reshape(B * N, C)
|
||||
|
||||
input_ids = input_ids.reshape(B * N)
|
||||
selected = (input_ids == self.img_context_token_id)
|
||||
selected = input_ids == self.img_context_token_id
|
||||
assert selected.sum() != 0
|
||||
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
|
||||
|
||||
@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
text_tokenizer = hf_model.model.get_text_tokenizer()
|
||||
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
break
|
||||
|
||||
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
|
||||
text_or_conversations=text, images=images)
|
||||
text_or_conversations=text, images=images
|
||||
)
|
||||
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
|
||||
|
||||
inputs = {
|
||||
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, videos=None, **kwargs):
|
||||
if images is None:
|
||||
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
videos = []
|
||||
else:
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid]
|
||||
for vid in videos]
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
images_message = [{"type": "image", "image": img} for img in images]
|
||||
videos_message = [{"type": "video", "video": vid} for vid in videos]
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{
|
||||
"type": "text",
|
||||
"text": text
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{"type": "text", "text": text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
|
||||
messages=messages, enable_thinking=True)
|
||||
messages=messages, enable_thinking=True
|
||||
)
|
||||
inputs = {
|
||||
"inputs": input_ids,
|
||||
"pixel_values": pixel_values,
|
||||
|
||||
Reference in New Issue
Block a user