[Perf] Compute maxsim on the worker side, reducing redundant copies, 2.7% E2E throughput improvement (#36159)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
Wentao Ye
2026-03-09 23:55:58 -04:00
committed by GitHub
parent 006aea17d7
commit 7279374f91
11 changed files with 518 additions and 58 deletions

View File

@@ -11,6 +11,26 @@ from vllm.sampling_params import RequestOutputKind
from vllm.tasks import PoolingTask
class LateInteractionParams(
    msgspec.Struct,
    omit_defaults=True,  # type: ignore[call-arg]
    array_like=True,
):  # type: ignore[call-arg]
    """Per-request instructions for worker-side late-interaction scoring.

    A request either caches a query's token embeddings or scores a document
    against a query that was cached earlier; ``mode`` selects which.

    Attributes:
        mode: Operation requested for this request.
            - "cache_query": cache query token embeddings.
            - "score_doc": score a document against a cached query.
        query_key: Stable key used for both DP routing and worker cache
            lookup.
        query_uses: Expected number of document requests (presumably the
            ones that will score against this cached query -- TODO confirm).
            Defaults to ``None``.
    """

    # NOTE: declared with ``array_like=True``, so msgspec encodes the fields
    # positionally; keep the declaration order below stable.
    mode: str
    query_key: str
    query_uses: int | None = None
class PoolingParams(
msgspec.Struct,
omit_defaults=True, # type: ignore[call-arg]
@@ -46,6 +66,7 @@ class PoolingParams(
task: PoolingTask | None = None
requires_token_ids: bool = False
skip_reading_prefix_cache: bool | None = None
late_interaction_params: LateInteractionParams | None = None
extra_kwargs: dict[str, Any] | None = None
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
@@ -193,6 +214,7 @@ class PoolingParams(
f"returned_token_ids={self.returned_token_ids}, "
f"requires_token_ids={self.requires_token_ids}, "
f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, "
f"late_interaction_params={self.late_interaction_params}, "
f"extra_kwargs={self.extra_kwargs})"
)