[Multi Modal] Configurable MM Profiling (#25631)
Signed-off-by: wwl2755 <wangwenlong2755@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -12,6 +12,8 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
|
||||
ImageDummyOptions, VideoDummyOptions)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
|
||||
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
|
||||
from vllm.multimodal.inputs import MultiModalInputs
|
||||
@@ -112,12 +114,26 @@ def _test_processing_correctness(
|
||||
|
||||
processing_info = factories.info(ctx)
|
||||
supported_mm_limits = processing_info.get_supported_mm_limits()
|
||||
limit_mm_per_prompt = {
|
||||
# Keep integer limits for local data generation
|
||||
limit_mm_per_prompt_ints = {
|
||||
modality: 3 if limit is None else limit
|
||||
for modality, limit in supported_mm_limits.items()
|
||||
}
|
||||
|
||||
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
|
||||
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
    """Build the dummy-options object that corresponds to ``modality``.

    "video", "image" and "audio" are mapped to their dedicated option
    classes; any other modality falls back to ``BaseDummyOptions``.
    ``count`` is forwarded unchanged as the ``count`` keyword argument.
    """
    # Table-driven lookup: one entry per modality with a dedicated class.
    options_by_modality = {
        "video": VideoDummyOptions,
        "image": ImageDummyOptions,
        "audio": AudioDummyOptions,
    }
    options_cls = options_by_modality.get(modality, BaseDummyOptions)
    return options_cls(count=count)
|
||||
|
||||
# Assign normalized DummyOptions to the model config
|
||||
model_config.get_multimodal_config().limit_per_prompt = {
|
||||
modality: _to_dummy_options(modality, count)
|
||||
for modality, count in limit_mm_per_prompt_ints.items()
|
||||
}
|
||||
|
||||
baseline_processor = factories.build_processor(ctx, cache=None)
|
||||
cached_processor = factories.build_processor(ctx, cache=cache)
|
||||
@@ -150,7 +166,7 @@ def _test_processing_correctness(
|
||||
k:
|
||||
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
|
||||
for _ in range(rng.randint(limit + 1))]
|
||||
for k, limit in limit_mm_per_prompt.items()
|
||||
for k, limit in limit_mm_per_prompt_ints.items()
|
||||
}
|
||||
|
||||
mm_counts = {k: len(vs) for k, vs in mm_data.items()}
|
||||
|
||||
@@ -17,23 +17,23 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
model_config_kwargs = {
|
||||
"max_model_len": max_model_len,
|
||||
}
|
||||
mm_counts = {"image": 1}
|
||||
ctx = build_model_context(
|
||||
model_id,
|
||||
model_config_kwargs=model_config_kwargs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
limit_mm_per_prompt=mm_counts,
|
||||
)
|
||||
|
||||
mm_config = ctx.get_mm_config()
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
|
||||
decoder_dummy_data = profiler.get_decoder_dummy_data(
|
||||
max_model_len,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
mm_counts=mm_counts,
|
||||
)
|
||||
dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
|
||||
max_model_len,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
mm_counts=mm_counts,
|
||||
)
|
||||
|
||||
hf_config = ctx.get_hf_config(Llama4Config)
|
||||
@@ -58,7 +58,7 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
|
||||
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
|
||||
max_model_len,
|
||||
mm_counts=mm_config.limit_per_prompt,
|
||||
mm_counts=mm_counts,
|
||||
)
|
||||
|
||||
assert total_tokens == profiled_tokens["image"]
|
||||
|
||||
@@ -15,6 +15,8 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
|
||||
ImageDummyOptions, VideoDummyOptions)
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
@@ -236,7 +238,20 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
modality: 3 if limit is None else limit
|
||||
for modality, limit in supported_mm_limits.items()
|
||||
}
|
||||
model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt
|
||||
|
||||
def _to_dummy_options(modality: str, count: int) -> BaseDummyOptions:
    """Wrap an integer per-prompt limit in a dummy-options object.

    The option class is picked from the modality name ("video", "image"
    or "audio"); unrecognised modalities use ``BaseDummyOptions``. The
    chosen class is instantiated with ``count=count``.
    """
    # Select the class first so construction happens in exactly one place.
    if modality == "video":
        selected_cls = VideoDummyOptions
    elif modality == "image":
        selected_cls = ImageDummyOptions
    elif modality == "audio":
        selected_cls = AudioDummyOptions
    else:
        selected_cls = BaseDummyOptions
    return selected_cls(count=count)
|
||||
|
||||
model_config.get_multimodal_config().limit_per_prompt = {
|
||||
modality: _to_dummy_options(modality, count)
|
||||
for modality, count in limit_mm_per_prompt.items()
|
||||
}
|
||||
processor = factories.build_processor(ctx, cache=None)
|
||||
|
||||
with initialize_dummy_model(model_cls, model_config) as model:
|
||||
|
||||
Reference in New Issue
Block a user