[V1] Refactor num_computed_tokens logic (#15307)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -231,8 +231,10 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
     Test that the engine can handle multiple concurrent batches.
     """

-    def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest:
+    def make_request_with_max_tokens(req_id: int,
+                                     max_tokens: int) -> EngineCoreRequest:
         request = make_request()
+        request.request_id = req_id
         request.sampling_params.max_tokens = max_tokens
         return request

@@ -279,6 +281,8 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
+        # Avoid all requests being scheduled once.
+        enable_prefix_caching=False,
         max_num_batched_tokens=10,
         # Reduce startup time.
         enforce_eager=True,
     )
     vllm_config = engine_args.create_engine_config()
     engine_core = EngineCore(vllm_config=vllm_config,
@@ -286,13 +290,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
                              executor_class=DummyExecutor)
     assert engine_core.batch_queue is not None

-    # Add two requests in a row.
-    req = make_request_with_max_tokens(5)
-    engine_core.add_request(req)
-    req = make_request_with_max_tokens(5)
-    engine_core.add_request(req)
+    # Add two requests in a row. Each request have 12 prompt tokens.
+    req0 = make_request_with_max_tokens(0, 5)
+    engine_core.add_request(req0)
+    req1 = make_request_with_max_tokens(1, 5)
+    engine_core.add_request(req1)

     # First saturate the batch queue.
+    # Schedule Batch 1: (10, req0)
     assert engine_core.step_with_batch_queue() is None
     assert engine_core.batch_queue.qsize() == 1
     assert engine_core.step_with_batch_queue() is None
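For readers skimming the diff, below is a minimal standalone sketch (not vLLM code; the function name and structure are hypothetical) of the chunked-prefill scheduling pattern the updated test asserts: two requests with 12 prompt tokens each and a max_num_batched_tokens budget of 10, scheduled in arrival order. Only the 12-token prompts, the 10-token budget, and "Batch 1: (10, req0)" come from the diff above; the contents of the later batches are an assumption about how the remaining tokens are split.

# Hypothetical illustration only: simulates a token-budget scheduler, not vLLM's.
def schedule_chunked_prefill(prompt_lens, token_budget):
    """Yield batches as lists of (request_index, num_scheduled_tokens)."""
    remaining = list(prompt_lens)
    while any(remaining):
        budget, batch = token_budget, []
        for i, left in enumerate(remaining):
            if left == 0 or budget == 0:
                continue
            # Schedule as many of this request's remaining tokens as the budget allows.
            n = min(left, budget)
            batch.append((i, n))
            remaining[i] -= n
            budget -= n
        yield batch

batches = list(schedule_chunked_prefill([12, 12], token_budget=10))
print(batches)
# Under these assumptions:
# Batch 1: [(0, 10)]           -> matches "# Schedule Batch 1: (10, req0)" in the test
# Batch 2: [(0, 2), (1, 8)]
# Batch 3: [(1, 4)]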