[core] add sleep and wake up endpoint and v1 support (#12987)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: cennn <2523403608@qq.com>
Co-authored-by: cennn <2523403608@qq.com>
This commit is contained in:
@@ -118,14 +118,16 @@ def test_cumem_with_cudagraph():


 @fork_new_process_for_each_test
 @pytest.mark.parametrize(
-    "model",
+    "model, use_v1",
     [
         # sleep mode with safetensors
-        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
-        "facebook/opt-125m"
+        ("facebook/opt-125m", False),
     ])
-def test_end_to_end(model):
+def test_end_to_end(model: str, use_v1: bool):
+    import os
+    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     load_format = LoadFormat.AUTO
@@ -152,3 +154,5 @@ def test_end_to_end(model):

     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    del os.environ["VLLM_USE_V1"]
Reference in New Issue
Block a user