[core] add sleep and wake up endpoint and v1 support (#12987)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: cennn <2523403608@qq.com>
Co-authored-by: cennn <2523403608@qq.com>
This commit is contained in:
@@ -118,14 +118,16 @@ def test_cumem_with_cudagraph():


 @fork_new_process_for_each_test
 @pytest.mark.parametrize(
-    "model",
+    "model, use_v1",
     [
         # sleep mode with safetensors
-        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
-        "facebook/opt-125m"
+        ("facebook/opt-125m", False),
     ])
-def test_end_to_end(model):
+def test_end_to_end(model: str, use_v1: bool):
+    import os
+    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     load_format = LoadFormat.AUTO
@@ -152,3 +154,5 @@ def test_end_to_end(model):

     # cmp output
     assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    del os.environ["VLLM_USE_V1"]
Reference in New Issue
Block a user