[1/n][CI] Load models in CI from S3 instead of HF (#13205)

Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
This commit is contained in:
Kevin H. Luu
2025-02-18 23:34:59 -08:00
committed by GitHub
parent fd84857f64
commit d5d214ac7f
43 changed files with 225 additions and 76 deletions

View File

@@ -9,6 +9,7 @@ import weakref
import pytest
from vllm import LLM
from vllm.config import LoadFormat
from vllm.platforms import current_platform
from ..conftest import VllmRunner
@@ -33,7 +34,7 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
@@ -94,14 +95,14 @@ def test_models(
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
("distilbert/distilgpt2", "ray", "", "L4"),
("distilbert/distilgpt2", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("distilbert/distilgpt2", "ray", "", "A100"),
("distilbert/distilgpt2", "mp", "", "A100"),
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,

View File

@@ -4,9 +4,11 @@ import pytest
import torch
from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
from ..utils import fork_new_process_for_each_test
@@ -118,13 +120,18 @@ def test_cumem_with_cudagraph():
@pytest.mark.parametrize(
"model",
[
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
# sleep mode with safetensors
f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
# sleep mode with pytorch checkpoint
"facebook/opt-125m"
])
def test_end_to_end(model):
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
load_format = LoadFormat.AUTO
if "Llama" in model:
load_format = LoadFormat.RUNAI_STREAMER
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)

View File

@@ -17,7 +17,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from ..models.utils import check_outputs_equal
MODELS = [
"facebook/opt-125m",
"distilbert/distilgpt2",
]