[core] add sleep and wake up endpoint and v1 support (#12987)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: cennn <2523403608@qq.com>
Co-authored-by: cennn <2523403608@qq.com>
This commit is contained in:
youkaichao
2025-02-20 12:41:17 +08:00
committed by GitHub
parent 0d243f2a54
commit ba81163997
13 changed files with 160 additions and 9 deletions

View File

@@ -118,14 +118,16 @@ def test_cumem_with_cudagraph():
@fork_new_process_for_each_test
@pytest.mark.parametrize(
"model",
"model, use_v1",
[
# sleep mode with safetensors
f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
(f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B", True),
# sleep mode with pytorch checkpoint
"facebook/opt-125m"
("facebook/opt-125m", False),
])
def test_end_to_end(model):
def test_end_to_end(model: str, use_v1: bool):
import os
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
load_format = LoadFormat.AUTO
@@ -152,3 +154,5 @@ def test_end_to_end(model):
# compare output
assert output[0].outputs[0].text == output2[0].outputs[0].text
del os.environ["VLLM_USE_V1"]