[ci] Use env var to control whether to use S3 bucket in CI (#13634)

2025-02-22 19:19:45 -08:00
parent 322d2a27d6
commit 2c5e637b57
30 changed files with 222 additions and 231 deletions
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -4,11 +4,9 @@ import pytest
 import torch

 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes

-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test


@@ -121,7 +119,7 @@ def test_cumem_with_cudagraph():
    "model, use_v1",
    [
        # sleep mode with safetensors
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
+        ("meta-llama/Llama-3.2-1B", True),
        # sleep mode with pytorch checkpoint
        ("facebook/opt-125m", False),
    ])
@@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
    os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
    free, total = torch.cuda.mem_get_info()
    used_bytes_baseline = total - free  # in case other process is running
-    load_format = LoadFormat.AUTO
-    if "Llama" in model:
-        load_format = LoadFormat.RUNAI_STREAMER
-    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
+    llm = LLM(model, enable_sleep_mode=True)
    prompt = "How are you?"
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    output = llm.generate(prompt, sampling_params)