[Perf] API-server scaleout with many-to-many server-engine comms (#17546)
This commit is contained in:
@@ -138,7 +138,6 @@ def create_requests(num_requests: int,
|
||||
multi_modal_placeholders=mm_position,
|
||||
multi_modal_hashes=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
arrival_time=0,
|
||||
)
|
||||
requests.append(request)
|
||||
return requests
|
||||
@@ -744,7 +743,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
|
||||
assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
|
||||
|
||||
# No draft or accepted tokens counted yet
|
||||
-assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None
|
||||
+assert not engine_core_outputs or (
|
||||
+engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None)
|
||||
|
||||
# Schedule the speculated tokens for validation
|
||||
output = scheduler.schedule()
|
||||
@@ -772,7 +772,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
|
||||
engine_core_outputs = scheduler.update_from_output(output,
|
||||
model_runner_output)
|
||||
|
||||
-scheduler_stats = engine_core_outputs.scheduler_stats
|
||||
+scheduler_stats = engine_core_outputs[0].scheduler_stats \
|
||||
+if engine_core_outputs else None
|
||||
if expected[0] == 0:
|
||||
assert scheduler_stats.spec_decoding_stats is None
|
||||
else:
|
||||
@@ -843,7 +844,7 @@ def _step_until_done(
|
||||
# We should be in the decode phase now.
|
||||
assert num_scheduled_tokens == 1
|
||||
assert len(output.kv_connector_metadata.requests) == 0
|
||||
-ecos = scheduler.update_from_output(output, model_runner_output)
|
||||
+ecos = scheduler.update_from_output(output, model_runner_output)[0]
|
||||
all_done = True
|
||||
for eco in ecos.outputs:
|
||||
if eco.finish_reason is None:
|
||||
|
||||
Reference in New Issue
Block a user