[1/n][CI] Load models in CI from S3 instead of HF (#13205)

Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
Author: Kevin H. Luu
Date: 2025-02-18 23:34:59 -08:00
Committed by: GitHub
Parent: fd84857f64
Commit: d5d214ac7f
43 changed files with 225 additions and 76 deletions


@@ -4,9 +4,11 @@ import pytest
 import torch
 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes
+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test
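
Note on the new imports: LoadFormat (from vllm.config) selects the weight loader, and MODEL_WEIGHTS_S3_BUCKET is a constant defined in the tests' conftest.py that points at the CI mirror of the model weights. A minimal sketch of how that constant might look and be consumed; the bucket name below is an assumption for illustration, the real value is whatever conftest.py defines:

    # tests/conftest.py (sketch; bucket name is an assumption for illustration)
    MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

    # in a test module
    from ..conftest import MODEL_WEIGHTS_S3_BUCKET
    model_id = f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B"  # CI-hosted copy instead of HF
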
@@ -118,13 +120,18 @@ def test_cumem_with_cudagraph():
 @pytest.mark.parametrize(
     "model",
     [
-        "meta-llama/Llama-3.2-1B-Instruct",  # sleep mode with safetensors
-        "facebook/opt-125m"  # sleep mode with pytorch checkpoint
+        # sleep mode with safetensors
+        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
+        # sleep mode with pytorch checkpoint
+        "facebook/opt-125m"
     ])
 def test_end_to_end(model):
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM(model, enable_sleep_mode=True)
+    load_format = LoadFormat.AUTO
+    if "Llama" in model:
+        load_format = LoadFormat.RUNAI_STREAMER
+    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)
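
The loading logic introduced in test_end_to_end follows a simple pattern: models served from the CI S3 bucket are streamed with the Run:ai Model Streamer, while regular Hugging Face style checkpoints keep the default loader. A self-contained sketch of that pattern, assuming an illustrative bucket path (in CI the prefix comes from MODEL_WEIGHTS_S3_BUCKET):

    from vllm import LLM, SamplingParams
    from vllm.config import LoadFormat

    model = "s3://vllm-ci-model-weights/Llama-3.2-1B"  # assumed CI bucket layout

    # Stream safetensors directly from object storage for the S3-hosted model;
    # fall back to the default loader for ordinary HF checkpoints.
    load_format = LoadFormat.AUTO
    if "Llama" in model:
        load_format = LoadFormat.RUNAI_STREAMER

    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
    output = llm.generate("How are you?", SamplingParams(temperature=0, max_tokens=10))
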