[1/n][CI] Load models in CI from S3 instead of HF (#13205)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
This commit is contained in:
@@ -4,9 +4,11 @@ import pytest
 import torch
 
 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes
 
+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test
 
 
@@ -118,13 +120,18 @@ def test_cumem_with_cudagraph():
 @pytest.mark.parametrize(
     "model",
     [
-        "meta-llama/Llama-3.2-1B-Instruct",  # sleep mode with safetensors
-        "facebook/opt-125m"  # sleep mode with pytorch checkpoint
+        # sleep mode with safetensors
+        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
+        # sleep mode with pytorch checkpoint
+        "facebook/opt-125m"
     ])
 def test_end_to_end(model):
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM(model, enable_sleep_mode=True)
+    load_format = LoadFormat.AUTO
+    if "Llama" in model:
+        load_format = LoadFormat.RUNAI_STREAMER
+    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)
|
||||
|
||||
Reference in New Issue
Block a user