[Perf] API-server scaleout with many-to-many server-engine comms (#17546)

This commit is contained in:
Nick Hill
2025-05-30 08:17:00 -07:00
committed by GitHub
parent 84ec470fca
commit 2dbe8c0774
26 changed files with 1828 additions and 436 deletions

View File

@@ -88,7 +88,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
assert len(engine_core.scheduler.running) == 4
# Loop through until they are all done.
while len(engine_core.step()[0].outputs) > 0:
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
@@ -163,11 +163,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch):
req0.request_id = req1.request_id = "test"
engine_core.add_request(req0)
while len(engine_core.step()[0].outputs) > 0:
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
engine_core.add_request(req1)
while len(engine_core.step()[0].outputs) > 0:
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
@@ -207,7 +207,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Loop through until they are all done.
while len(engine_core.step()[0].outputs) > 0:
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
@@ -327,7 +327,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
assert scheduler_output.num_scheduled_tokens[1] == 4
# Batch queue is full. Finish Batch 2. Get first token of req0.
output = engine_core.step_with_batch_queue()[0]
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
@@ -339,7 +339,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
assert scheduler_output.num_scheduled_tokens[0] == 1
# Batch queue is full. Finish Batch 3. Get first token of req1.
output = engine_core.step_with_batch_queue()[0]
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
@@ -362,7 +362,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
if step % 2 == 0:
# Even steps consumes an output.
assert output is not None
assert len(output.outputs) == 1
assert len(output[0].outputs) == 1
if req_id in engine_core.scheduler.requests:
assert engine_core.scheduler.requests[
req_id].num_tokens == expected_num_tokens[req_id]