[VLM][Core] Support profiling with multiple multi-modal inputs per prompt (#7126)
@@ -16,7 +16,7 @@
 # limitations under the License.
 """ PyTorch Fuyu model."""
 import math
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -29,8 +29,7 @@ from vllm.config import CacheConfig, MultiModalConfig
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import ColumnParallelLinear
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -94,27 +93,33 @@ def get_max_fuyu_image_tokens(ctx: InputContext):
     return (ncol + 1) * nrow
 
 
-def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int):
+def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int):
     ncol, nrow = get_max_fuyu_image_feature_size()
     image_feature_size = get_max_fuyu_image_tokens(ctx)
 
-    token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow
-    token_ids += [0] * (seq_len - image_feature_size)
+    image_token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow
+    token_ids = image_token_ids * num_images
+    token_ids += [0] * (seq_len - image_feature_size * num_images)
     return SequenceData(token_ids)
 
 
 def dummy_image_for_fuyu(
+    num_images: int,
+    *,
     image_width: int,
     image_height: int,
 ):
     image = Image.new("RGB", (image_width, image_height), color=0)
-    return {"image": image}
+    return {"image": image if num_images == 1 else [image] * num_images}
 
 
-def dummy_data_for_fuyu(ctx: InputContext, seq_len: int):
-    seq_data = dummy_seq_data_for_fuyu(ctx, seq_len)
-    mm_data = dummy_image_for_fuyu(MAX_IMAGE_FEATURE_SIZE_WIDTH,
-                                   MAX_IMAGE_FEATURE_SIZE_HEIGHT)
+def dummy_data_for_fuyu(ctx: InputContext, seq_len: int,
+                        mm_counts: Mapping[str, int]):
+    num_images = mm_counts["image"]
+    seq_data = dummy_seq_data_for_fuyu(ctx, seq_len, num_images)
+    mm_data = dummy_image_for_fuyu(num_images,
+                                   image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+                                   image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT)
     return seq_data, mm_data
 
 
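For reviewers, a standalone sketch (not vLLM code) of the padding arithmetic the new dummy_seq_data_for_fuyu performs. The ncol/nrow defaults are made-up small values rather than the real ones from get_max_fuyu_image_feature_size(), and the two token-ID constants are illustrative stand-ins for the module-level ones in fuyu.py:

# Illustrative stand-ins for fuyu.py's module-level constants.
_IMAGE_TOKEN_ID = 71011
_NEWLINE_TOKEN_ID = 71019

def dummy_token_ids(seq_len: int, num_images: int,
                    ncol: int = 4, nrow: int = 3) -> list:
    # One dummy image is nrow rows of ncol image tokens, each row ended
    # by a newline token, so it occupies (ncol + 1) * nrow positions --
    # the same quantity get_max_fuyu_image_tokens returns.
    image_token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow
    image_feature_size = (ncol + 1) * nrow
    # Repeat the per-image block once per image, then pad the rest of
    # the profiled sequence with token 0.
    token_ids = image_token_ids * num_images
    token_ids += [0] * (seq_len - image_feature_size * num_images)
    return token_ids

# Two images consume 2 * (4 + 1) * 3 = 30 positions; padding fills the rest.
assert len(dummy_token_ids(seq_len=64, num_images=2)) == 64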
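And a quick shape check for the updated dummy_image_for_fuyu, copied from the hunk above into a self-contained snippet (assumes Pillow is installed; the 1920x1080 arguments stand in for MAX_IMAGE_FEATURE_SIZE_WIDTH/HEIGHT):

from PIL import Image

def dummy_image_for_fuyu(num_images, *, image_width, image_height):
    # One black dummy image, shared across all num_images slots.
    image = Image.new("RGB", (image_width, image_height), color=0)
    # Bare image for single-image prompts, a list of images otherwise.
    return {"image": image if num_images == 1 else [image] * num_images}

single = dummy_image_for_fuyu(1, image_width=1920, image_height=1080)
multi = dummy_image_for_fuyu(2, image_width=1920, image_height=1080)
assert isinstance(single["image"], Image.Image)
assert isinstance(multi["image"], list) and len(multi["image"]) == 2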