[Core] [Bugfix]: tensor parallel with prompt embeds (#18171)

Signed-off-by: Nan2018 <nan@protopia.ai>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
This commit is contained in:
Nan Qin
2025-05-19 22:21:27 -05:00
committed by GitHub
parent f07a673eb2
commit 9609327fa4
4 changed files with 138 additions and 64 deletions

View File

@@ -112,12 +112,12 @@ class RequestMetrics:
will include model forward, block/sync across
workers, cpu-gpu sync time and sampling time.
spec_token_acceptance_counts: number of accepted speculative tokens at
-            each position; the first token is from
+            each position; the first token is from
the target model and is always accepted;
-            e.g., when it's [10, 8, 4, 2] for a req,
+            e.g., when it's [10, 8, 4, 2] for a req,
it means there were 10 forward passes in
-            total, and there were 8, 4, 2 accepted
-            tokens at 1st, 2nd, 3rd speculation step.
+            total, and there were 8, 4, 2 accepted
+            tokens at 1st, 2nd, 3rd speculation step.
"""
arrival_time: float
last_token_time: float
@@ -714,9 +714,9 @@ class SequenceGroup:
trace_headers: OpenTelemetry trace headers.
prompt_adapter_request: Prompt Adapter request.
priority: User-defined priority of the request.
-        draft_size: The number of speculative tokens plus one from the target
+        draft_size: The number of speculative tokens plus one from the target
model; equal to max number of tokens a step can generate
-                    for single-draft speculative decoding but larger than
+                    for single-draft speculative decoding but larger than
that for multi-draft SD (currently not supported).
"""
@@ -1123,7 +1123,7 @@ class SequenceOutput(
self.output_embed.shape if self.output_embed is not None else None
return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, "
f"output_token={self.output_token}, "
-                f"output_embed.shape={output_embed_shape}"
+                f"output_embed.shape={output_embed_shape}, "
f"logprobs={self.logprobs})")
def __eq__(self, other: object) -> bool: