[CORE] Prompt Embeddings Support for v1 Engine (#24278)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Andrew Sansom <qthequartermasterman@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Author: Andrew Sansom
Date: 2025-09-18 19:03:09 -05:00
Committed by: GitHub
Parent: 9fac6aa30b
Commit: 9a4600e4dc
20 changed files with 305 additions and 76 deletions

vllm/engine/arg_utils.py

@@ -1513,12 +1513,6 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False
 
-        # No text embedding inputs so far.
-        if self.enable_prompt_embeds:
-            _raise_or_fallback(feature_name="--enable-prompt-embeds",
-                               recommend_to_remove=False)
-            return False
-
         # No Mamba or Encoder-Decoder so far.
         if not model_config.is_v1_compatible:
             _raise_or_fallback(feature_name=model_config.architectures,
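
With this fallback removed, enabling prompt embeddings no longer forces the
engine back to V0. A minimal usage sketch, assuming the documented
`enable_prompt_embeds` engine flag and the `prompt_embeds` prompt field
(model name, shapes, and dtype are illustrative):

    # Sketch: passing precomputed prompt embeddings to the engine.
    import torch
    from vllm import LLM, SamplingParams

    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              enable_prompt_embeds=True)

    # Normally produced by the model's input embedding layer; a random
    # tensor stands in here. Shape: (num_tokens, hidden_size).
    prompt_embeds = torch.rand(16, 2048, dtype=torch.bfloat16)

    outputs = llm.generate({"prompt_embeds": prompt_embeds},
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)
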
@@ -1651,6 +1645,13 @@ class EngineArgs:
                 "models in V0 and has been disabled.")
             self.enable_prefix_caching = False
 
+        if self.enable_prompt_embeds:
+            logger.warning(
+                "--enable-prompt-embeds and --enable-prefix-caching "
+                "are not supported together in V0. Prefix caching has "
+                "been disabled.")
+            self.enable_prefix_caching = False
+
         # Set max_num_seqs to 256 for VLLM_V0.
         if self.max_num_seqs is None:
             self.max_num_seqs = 256
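
The V0 and V1 paths share this pattern: rather than erroring out, the
conflict is logged and prefix caching is switched off. The two features
clash because prefix caching identifies reusable KV-cache blocks by hashing
their token IDs, and embedding inputs carry no meaningful token IDs to key
on. A simplified sketch of that keying scheme (illustrative only, not
vLLM's actual hashing code):

    import hashlib

    # Each block's key is a rolling hash over the whole prefix of token
    # IDs, so identical prefixes can share cached KV blocks. With prompt
    # embeddings there are no token IDs, so no sound cache key exists.
    def block_key(parent_key: bytes, block_token_ids: list[int]) -> bytes:
        payload = parent_key + b"," + ",".join(map(str, block_token_ids)).encode()
        return hashlib.sha256(payload).digest()

    key = b""
    for block in ([1, 2, 3, 4], [5, 6, 7, 8]):  # two blocks of token IDs
        key = block_key(key, block)  # each key covers the prefix so far
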
@@ -1664,6 +1665,17 @@ class EngineArgs:
                 # For pooling tasks the default is False
                 if model_config.runner_type != "pooling":
                     self.enable_chunked_prefill = True
+
+                # TODO: When prefix caching supports prompt embeds inputs, this
+                # check can be removed.
+                if (self.enable_prompt_embeds
+                        and self.enable_prefix_caching is not False):
+                    logger.warning(
+                        "--enable-prompt-embeds and --enable-prefix-caching "
+                        "are not supported together in V1. Prefix caching has "
+                        "been disabled.")
+                    self.enable_prefix_caching = False
+
             if self.enable_prefix_caching is None:
                 self.enable_prefix_caching = True
         else:
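
Note the `is not False` guard: `enable_prefix_caching` is tri-state (`None`
means unset), so the warning fires when the flag was left unset or
explicitly enabled, but stays quiet if the user already disabled prefix
caching. A standalone sketch of the resolution order (hypothetical helper,
not the vLLM internals):

    import logging

    logger = logging.getLogger(__name__)

    def resolve_prefix_caching(enable_prompt_embeds: bool,
                               enable_prefix_caching: bool | None) -> bool:
        # Hypothetical distillation of the V1 defaulting logic above.
        if enable_prompt_embeds and enable_prefix_caching is not False:
            logger.warning("Disabling prefix caching: not yet supported "
                           "with prompt embeddings.")
            return False
        if enable_prefix_caching is None:  # unset: V1 defaults it on
            return True
        return enable_prefix_caching

    assert resolve_prefix_caching(False, None) is True   # default on
    assert resolve_prefix_caching(True, None) is False   # embeds win
    assert resolve_prefix_caching(True, False) is False  # already off; quiet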