[V1] Initial support of multimodal models for V1 re-arch (#10699)
Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@@ -1050,9 +1050,12 @@ class EngineArgs:
|
||||
# long context (> 32K) models. This is to avoid OOM errors in the
|
||||
# initial memory profiling phase.
|
||||
|
||||
# Chunked prefill is currently disabled for multimodal models by
|
||||
# default.
|
||||
if use_long_context and not model_config.is_multimodal_model:
|
||||
# For multimodal models, chunked prefill is disabled by default in
|
||||
# V0, but enabled by design in V1
|
||||
if model_config.is_multimodal_model:
|
||||
self.enable_chunked_prefill = bool(envs.VLLM_USE_V1)
|
||||
|
||||
elif use_long_context:
|
||||
is_gpu = device_config.device_type == "cuda"
|
||||
use_sliding_window = (model_config.get_sliding_window()
|
||||
is not None)
|
||||
@@ -1241,12 +1244,9 @@ class EngineArgs:
|
||||
Override the EngineConfig's configs based on the usage context for V1.
|
||||
"""
|
||||
assert envs.VLLM_USE_V1, "V1 is not enabled"
|
||||
# TODO (ywang96): Enable APC by default when VLM supports it.
|
||||
if engine_config.model_config.is_multimodal_model:
|
||||
logger.warning(
|
||||
"Prefix caching is currently not supported for multimodal "
|
||||
"models and has been disabled.")
|
||||
engine_config.cache_config.enable_prefix_caching = False
|
||||
# TODO (ywang96): Enable APC by default when VLM supports it.
|
||||
assert not engine_config.cache_config.enable_prefix_caching
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
Reference in New Issue
Block a user