[V1] Prefix caching for vision language models (#11187)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Author: Cody Yu
Date:   2024-12-17 16:37:59 -08:00
Committed-by: GitHub
Parent: c77eb8a33c
Commit: bf8717ebae
14 changed files with 341 additions and 97 deletions

vllm/engine/arg_utils.py

@@ -205,6 +205,7 @@ class EngineArgs:
         # by user.
         if self.enable_prefix_caching is None:
             self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
+
         # Override max_num_seqs if it's not set by user.
         if self.max_num_seqs is None:
             self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
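Net effect of the two overrides above: when the user sets neither flag, V1 (VLLM_USE_V1=1) defaults to prefix caching on and max_num_seqs=1024, while v0 keeps caching off with max_num_seqs=256. A minimal stand-alone sketch of that logic (plain variables stand in for the EngineArgs fields; vLLM actually reads the flag through its envs module, so the parsing here is illustrative):

import os

# Stand-in for envs.VLLM_USE_V1 (illustrative, not vLLM's own parsing).
VLLM_USE_V1 = os.getenv("VLLM_USE_V1", "0") == "1"

enable_prefix_caching = None  # None means "not set by user"
max_num_seqs = None

if enable_prefix_caching is None:
    enable_prefix_caching = bool(VLLM_USE_V1)  # on by default under V1
if max_num_seqs is None:
    max_num_seqs = 256 if not VLLM_USE_V1 else 1024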
@@ -1026,11 +1027,11 @@ class EngineArgs:
         device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
-        if model_config.is_multimodal_model:
-            if self.enable_prefix_caching:
-                logger.warning(
-                    "--enable-prefix-caching is currently not "
-                    "supported for multimodal models and has been disabled.")
+        if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
+                and self.enable_prefix_caching):
+            logger.warning("--enable-prefix-caching is currently not "
+                           "supported for multimodal models in v0 and "
+                           "has been disabled.")
             self.enable_prefix_caching = False
         cache_config = CacheConfig(
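With the warning now scoped to v0, a multimodal model served under V1 keeps --enable-prefix-caching instead of having it silently disabled. A hedged usage sketch (the model name is illustrative, and this assumes a vLLM build that includes this commit):

import os
os.environ["VLLM_USE_V1"] = "1"  # select V1 before importing vllm

from vllm import LLM

# Under V1 this no longer logs the "not supported for multimodal models"
# warning, and the flag stays enabled; under v0 it is still forced off.
llm = LLM(model="llava-hf/llava-1.5-7b-hf",  # any VLM; illustrative
          enable_prefix_caching=True)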
@@ -1249,11 +1250,14 @@ class EngineArgs:
         # When no user override, set the default values based on the usage
         # context.
         # TODO(woosuk): Tune the default values for different hardware.
-        if self.max_num_batched_tokens is None:
-            if usage_context == UsageContext.LLM_CLASS:
-                self.max_num_batched_tokens = 8192
-            elif usage_context == UsageContext.OPENAI_API_SERVER:
-                self.max_num_batched_tokens = 2048
+        default_max_num_batched_tokens = {
+            UsageContext.LLM_CLASS: 8192,
+            UsageContext.OPENAI_API_SERVER: 2048,
+        }
+        if (self.max_num_batched_tokens is None
+                and usage_context in default_max_num_batched_tokens):
+            self.max_num_batched_tokens = default_max_num_batched_tokens[
+                usage_context]
             logger.warning(
                 "Setting max_num_batched_tokens to %d for %s usage context.",
                 self.max_num_batched_tokens, usage_context.value)
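Besides being shorter, the table-driven form tightens the fall-through behavior: with the `in` check, a usage context that has no entry leaves max_num_batched_tokens untouched and skips the log call, whereas the old version, having already entered the outer `if`, would reach the log line even when no default applied. A self-contained sketch of the lookup (the Enum here is a stand-in for vllm.usage.usage_lib.UsageContext; members are illustrative):

from enum import Enum

class UsageContext(Enum):  # stand-in for vLLM's enum
    LLM_CLASS = "LLM_CLASS"
    OPENAI_API_SERVER = "OPENAI_API_SERVER"
    UNKNOWN_CONTEXT = "UNKNOWN_CONTEXT"

default_max_num_batched_tokens = {
    UsageContext.LLM_CLASS: 8192,
    UsageContext.OPENAI_API_SERVER: 2048,
}

def resolve(max_num_batched_tokens, usage_context):
    if (max_num_batched_tokens is None
            and usage_context in default_max_num_batched_tokens):
        return default_max_num_batched_tokens[usage_context]
    return max_num_batched_tokens

assert resolve(None, UsageContext.LLM_CLASS) == 8192
assert resolve(4096, UsageContext.OPENAI_API_SERVER) == 4096  # user override wins
assert resolve(None, UsageContext.UNKNOWN_CONTEXT) is None    # no default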
@@ -1263,9 +1267,6 @@ class EngineArgs:
         Override the EngineConfig's configs based on the usage context for V1.
         """
         assert envs.VLLM_USE_V1, "V1 is not enabled"
-        if engine_config.model_config.is_multimodal_model:
-            # TODO (ywang96): Enable APC by default when VLM supports it.
-            assert not engine_config.cache_config.enable_prefix_caching
 @dataclass