[Perf] Compute maxsim in worker side, reducing redundant copies, 2.7% E2E throughput improvement (#36159)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-03-09 23:55:58 -04:00
parent 006aea17d7
commit 7279374f91
11 changed files with 518 additions and 58 deletions
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -11,6 +11,26 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.tasks import PoolingTask


+class LateInteractionParams(
+    msgspec.Struct,
+    omit_defaults=True,  # type: ignore[call-arg]
+    array_like=True,
+):  # type: ignore[call-arg]
+    """Metadata for worker-side late-interaction scoring.
+
+    Attributes:
+        mode: selects the operation; one of:
+            - "cache_query": cache the query token embeddings.
+            - "score_doc": score a document against a cached query.
+        query_key: stable key used for both DP routing and worker cache lookup.
+        query_uses: expected number of document requests; defaults to None.
+    """
+
+    mode: str
+    query_key: str
+    query_uses: int | None = None
+
+
 class PoolingParams(
    msgspec.Struct,
    omit_defaults=True,  # type: ignore[call-arg]
@@ -46,6 +66,7 @@ class PoolingParams(
    task: PoolingTask | None = None
    requires_token_ids: bool = False
    skip_reading_prefix_cache: bool | None = None
+    late_interaction_params: LateInteractionParams | None = None
    extra_kwargs: dict[str, Any] | None = None
    output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY

@@ -193,6 +214,7 @@ class PoolingParams(
            f"returned_token_ids={self.returned_token_ids}, "
            f"requires_token_ids={self.requires_token_ids}, "
            f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, "
+            f"late_interaction_params={self.late_interaction_params}, "
            f"extra_kwargs={self.extra_kwargs})"
        )