Add renderer-based prompt processing for embedding and classification endpoints (#24356)

Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-09-07 01:26:48 -07:00
parent 105d3d62ef
commit 0661cb9df3
6 changed files with 60 additions and 57 deletions
--- a/vllm/entrypoints/renderer.py
+++ b/vllm/entrypoints/renderer.py
@@ -108,10 +108,15 @@ class CompletionRenderer(BaseRenderer):
        for detailed parameter documentation.
        """
        if truncate_prompt_tokens is not None:
-            if max_length is not None:
-                assert 0 <= truncate_prompt_tokens <= max_length
            if truncate_prompt_tokens == 0:
                return []
+            if truncate_prompt_tokens < 0:
+                truncate_prompt_tokens = self.model_config.max_model_len
+            if max_length is not None and truncate_prompt_tokens > max_length:
+                raise ValueError(
+                    f"truncate_prompt_tokens ({truncate_prompt_tokens}) "
+                    f"cannot be greater than max_length ({max_length}). "
+                    f"Please select a smaller truncation size.")

        # Parse and batch the input prompts
        batch_inputs = parse_and_batch_prompt(prompt_or_prompts)