[V1] Initial support of multimodal models for V1 re-arch (#10699)

Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
Roger Wang
2024-12-08 04:50:51 -08:00
committed by GitHub
parent fd57d2b534
commit a11f326528
11 changed files with 283 additions and 68 deletions

View File

@@ -1050,9 +1050,12 @@ class EngineArgs:
# long context (> 32K) models. This is to avoid OOM errors in the
# initial memory profiling phase.
# Chunked prefill is currently disabled for multimodal models by
# default.
if use_long_context and not model_config.is_multimodal_model:
# For multimodal models, chunked prefill is disabled by default in
# V0, but enabled by design in V1
if model_config.is_multimodal_model:
self.enable_chunked_prefill = bool(envs.VLLM_USE_V1)
elif use_long_context:
is_gpu = device_config.device_type == "cuda"
use_sliding_window = (model_config.get_sliding_window()
is not None)
@@ -1241,12 +1244,9 @@ class EngineArgs:
Override the EngineConfig's configs based on the usage context for V1.
"""
assert envs.VLLM_USE_V1, "V1 is not enabled"
# TODO (ywang96): Enable APC by default when VLM supports it.
if engine_config.model_config.is_multimodal_model:
logger.warning(
"Prefix caching is currently not supported for multimodal "
"models and has been disabled.")
engine_config.cache_config.enable_prefix_caching = False
# TODO (ywang96): Enable APC by default when VLM supports it.
assert not engine_config.cache_config.enable_prefix_caching
@dataclass