[Frontend] Add chunked processing to handle long inputs in embedding models (#22280)

Signed-off-by: x22x22 <wadeking@qq.com> Signed-off-by: Kdump <rootshellexp@gmail.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 19:14:24 +08:00
parent 0b1bdac6af
commit 653124bd46
6 changed files with 1603 additions and 3 deletions
--- a/vllm/config/init.py
+++ b/vllm/config/init.py
@@ -2598,6 +2598,25 @@ class PoolerConfig:
    ``math-shepherd-mistral-7b-prm`` model.
    """

+    enable_chunked_processing: Optional[bool] = None
+    """
+    Whether to enable chunked processing for long inputs that exceed the model's
+    maximum position embeddings. When enabled, long inputs will be split into
+    chunks, processed separately, and then aggregated using weighted averaging.
+    This allows embedding models to handle arbitrarily long text without CUDA
+    errors. Defaults to False.
+    """
+
+    max_embed_len: Optional[int] = None
+    """
+    Maximum input length allowed for embedding generation. When set, allows 
+    inputs longer than max_embed_len to be accepted for embedding models.
+    This parameter enables accepting long inputs without requiring 
+    VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
+    max_embed_len, it will be handled according to the original max_model_len
+    validation logic. Defaults to None (i.e. set to max_model_len).
+    """
+
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,