[VLM] Remove input processor from clip and siglip (#13165)
This commit is contained in:
@@ -3,18 +3,15 @@
|
||||
within a vision language model."""
|
||||
|
||||
import math
|
||||
from typing import Iterable, List, Optional, Set, Tuple, Union
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torch import nn
|
||||
from transformers import SiglipVisionConfig
|
||||
|
||||
from vllm.attention.layer import MultiHeadAttention
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.distributed import divide, get_tensor_model_parallel_world_size
|
||||
from vllm.inputs import DecoderOnlyInputs, token_inputs
|
||||
from vllm.model_executor.layers.activation import get_act_fn
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
@@ -23,9 +20,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.multimodal.utils import (cached_get_tokenizer,
|
||||
consecutive_placeholder_ranges,
|
||||
repeat_and_pad_placeholder_tokens)
|
||||
from vllm.multimodal.utils import consecutive_placeholder_ranges
|
||||
from vllm.sequence import SequenceData
|
||||
|
||||
from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs
|
||||
@@ -93,71 +88,6 @@ def dummy_image_for_siglip(
|
||||
return {"image": image if num_images == 1 else [image] * num_images}
|
||||
|
||||
|
||||
def dummy_video_for_siglip(
    hf_config: SiglipVisionConfig,
    num_frames: int,
    num_videos: int = 1,
    *,
    image_width_override: Optional[int] = None,
    image_height_override: Optional[int] = None,
):
    """Build dummy video multi-modal data from a single dummy SigLIP image.

    One dummy image is requested from ``dummy_image_for_siglip`` and
    converted to a NumPy array. That array is repeated ``num_frames`` times
    along a new leading axis to form one video, and the same video array is
    listed ``num_videos`` times.

    Args:
        hf_config: SigLIP vision config forwarded to
            ``dummy_image_for_siglip``.
        num_frames: Number of times the frame is repeated per video.
        num_videos: Number of entries in the returned video list.
        image_width_override: Forwarded unchanged to
            ``dummy_image_for_siglip``.
        image_height_override: Forwarded unchanged to
            ``dummy_image_for_siglip``.

    Returns:
        A dict ``{"video": [...]}`` whose list holds ``num_videos``
        references to the same per-video array.
    """
    # Ask for exactly one image so the "image" entry is a single image
    # rather than a list of images.
    single_image = dummy_image_for_siglip(
        hf_config,
        num_images=1,
        image_width_override=image_width_override,
        image_height_override=image_height_override,
    )["image"]

    frame = np.array(single_image)
    # Wrapping the frame in a list adds the leading (frame) axis that
    # np.repeat then expands to ``num_frames`` entries.
    frames = np.repeat([frame], num_frames, axis=0)

    return {"video": [frames] * num_videos}
|
||||
|
||||
|
||||
def input_processor_for_siglip(
    model_config: ModelConfig,
    hf_config: SiglipVisionConfig,
    inputs: DecoderOnlyInputs,
    *,
    image_token_id: int,
    image_feature_size_override: Optional[Union[int, List[int]]] = None,
):
    """Rewrite the prompt so image placeholders match the image feature size.

    ``inputs`` is returned untouched when it carries no ``"image"``
    multi-modal data, or when it already has ``"image"`` placeholders.
    Otherwise the prompt text and token ids are passed through
    ``repeat_and_pad_placeholder_tokens`` with ``image_token_id`` as the
    placeholder and the image feature size as the repeat count, and a new
    inputs object is built from the result.

    Args:
        model_config: Supplies ``tokenizer``, which is handed to
            ``cached_get_tokenizer``.
        hf_config: SigLIP vision config used to derive the feature size for
            PIL image inputs.
        inputs: Decoder-only inputs; ``"prompt_token_ids"`` is required on
            the rewriting path, ``"prompt"`` is optional.
        image_token_id: Token id of the image placeholder.
        image_feature_size_override: When not ``None``, used as the repeat
            count instead of inspecting the image data.

    Returns:
        Either the original ``inputs`` (early-exit cases) or the result of
        ``token_inputs`` with the new prompt, token ids, the original
        multi-modal data and ``{"image": ranges}`` placeholders.

    Raises:
        TypeError: If no override is given and the image data is neither a
            ``PIL.Image.Image`` nor a ``torch.Tensor``.
    """
    mm_data = inputs.get("multi_modal_data")
    if mm_data is None or "image" not in mm_data:
        return inputs

    # The inputs already have image placeholders; nothing left to expand.
    if "image" in inputs.get("multi_modal_placeholders", {}):
        return inputs

    tokenizer = cached_get_tokenizer(model_config.tokenizer)

    if image_feature_size_override is not None:
        image_feature_size = image_feature_size_override
    else:
        image_data = mm_data["image"]
        if isinstance(image_data, Image.Image):
            image_feature_size = get_siglip_image_feature_size(hf_config)
        elif isinstance(image_data, torch.Tensor):
            # The tensor must be 3-D; only the middle dimension is used.
            # NOTE(review): presumably (num_images, feature_size,
            # hidden_size) - confirm against the caller.
            _, image_feature_size, _ = image_data.shape
        else:
            raise TypeError(f"Invalid image type: {type(image_data)}")

    expanded = repeat_and_pad_placeholder_tokens(
        tokenizer,
        inputs.get("prompt"),
        inputs["prompt_token_ids"],
        placeholder_token_id=image_token_id,
        repeat_count=image_feature_size,
    )
    new_prompt, new_token_ids, ranges = expanded

    # NOTE: Build a new inputs object instead of mutating the caller's one.
    return token_inputs(
        prompt_token_ids=new_token_ids,
        prompt=new_prompt,
        multi_modal_data=mm_data,
        multi_modal_placeholders={"image": ranges},
    )
|
||||
|
||||
|
||||
class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):
|
||||
|
||||
def get_num_image_tokens(
|
||||
|
||||
Reference in New Issue
Block a user