[V1] Support long_prefill_token_threshold in v1 scheduler (#15419)
Signed-off-by: Lu Fang <lufang@fb.com>
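The new knob caps how many prompt tokens a single request may consume in one scheduler step, so one long prefill cannot monopolize the batch. A minimal sketch of that clamping, using hypothetical names (clamp_prefill, num_new_tokens, token_budget) rather than the scheduler's actual internals:

    def clamp_prefill(num_new_tokens: int,
                      long_prefill_token_threshold: int,
                      token_budget: int) -> int:
        # If the knob is set, cap this request's prompt chunk at the
        # threshold so other waiting requests can share the same step.
        if long_prefill_token_threshold > 0:
            num_new_tokens = min(num_new_tokens, long_prefill_token_threshold)
        # Never exceed the step's remaining token budget either.
        return min(num_new_tokens, token_budget)

With a threshold of 2 and a per-step budget of 6, three waiting prompts would each advance 2 tokens per step instead of the first prompt filling the whole budget.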
tests/v1/core/test_scheduler_e2e.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
import os

import pytest

from vllm import LLM

if os.getenv("VLLM_USE_V1", "0") != "1":
    pytest.skip("Test package requires V1", allow_module_level=True)

MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I"


@pytest.fixture(scope="module")
def model() -> LLM:
    return LLM(MODEL,
               enforce_eager=True,
               enable_prefix_caching=True,
               long_prefill_token_threshold=2,
               max_num_batched_tokens=6,
               max_num_seqs=3)


def test_concurrent_partial_prefill(model):
    outputs = model.generate([PROMPT] * 3)
    assert len(outputs) == 3
    for output in outputs:
        assert len(output.outputs) == 1
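A note on the test configuration, as a reading of the test rather than a documented guarantee: with long_prefill_token_threshold=2 and max_num_batched_tokens=6, each of the three identical prompts should be limited to a 2-token prefill chunk per step. All three requests (max_num_seqs=3) can then fill the 6-token budget together, prefilling concurrently rather than serially, and the assertions only check that each request still yields exactly one completion.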