[1/n][CI] Load models in CI from S3 instead of HF (#13205)
Signed-off-by: <> Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
This commit is contained in:
@@ -9,6 +9,7 @@ import weakref
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..conftest import VllmRunner
|
||||
@@ -33,7 +34,7 @@ def v1(run_with_both_engines):
|
||||
|
||||
def test_vllm_gc_ed():
|
||||
"""Verify vllm instance is GC'ed when it is deleted"""
|
||||
llm = LLM("facebook/opt-125m")
|
||||
llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
|
||||
weak_llm = weakref.ref(llm)
|
||||
del llm
|
||||
# If there's any circular reference to vllm, this fails
|
||||
@@ -94,14 +95,14 @@ def test_models(
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
|
||||
("facebook/opt-125m", "ray", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
|
||||
("distilbert/distilgpt2", "ray", "", "L4"),
|
||||
("distilbert/distilgpt2", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("distilbert/distilgpt2", "ray", "", "A100"),
|
||||
("distilbert/distilgpt2", "mp", "", "A100"),
|
||||
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
||||
])
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
|
||||
@@ -4,9 +4,11 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@@ -118,13 +120,18 @@ def test_cumem_with_cudagraph():
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
|
||||
"facebook/opt-125m" # sleep mode with pytorch checkpoint
|
||||
# sleep mode with safetensors
|
||||
f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
|
||||
# sleep mode with pytorch checkpoint
|
||||
"facebook/opt-125m"
|
||||
])
|
||||
def test_end_to_end(model):
|
||||
free, total = torch.cuda.mem_get_info()
|
||||
used_bytes_baseline = total - free # in case other process is running
|
||||
llm = LLM(model, enable_sleep_mode=True)
|
||||
load_format = LoadFormat.AUTO
|
||||
if "Llama" in model:
|
||||
load_format = LoadFormat.RUNAI_STREAMER
|
||||
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
|
||||
prompt = "How are you?"
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=10)
|
||||
output = llm.generate(prompt, sampling_params)
|
||||
|
||||
@@ -17,7 +17,7 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
|
||||
from ..models.utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
"distilbert/distilgpt2",
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user