[Core][VLM] Add precise multi-modal placeholder tracking (#8346)
Signed-off-by: Peter Salas <peter@fixie.ai>
This commit is contained in:
@@ -28,8 +28,8 @@ from transformers import CLIPVisionConfig, PretrainedConfig
 from vllm.attention import AttentionMetadata
 from vllm.config import (CacheConfig, ModelConfig, MultiModalConfig,
                          PoolerConfig)
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
@@ -380,7 +380,7 @@ def dummy_data_for_phi3v(ctx: InputContext,
 
     image_feature_size = get_max_phi3v_image_tokens(ctx, num_crops=num_crops)
 
-    seq_data = dummy_seq_data_for_clip(
+    seq_data, ranges = dummy_seq_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         seq_len,
         num_images,
@@ -394,7 +394,7 @@ def dummy_data_for_phi3v(ctx: InputContext,
         image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
     )
 
-    return seq_data, mm_data
+    return DummyData(seq_data, mm_data, ranges)
 
 
 @lru_cache
Reference in New Issue
Block a user