[ci] Use env var to control whether to use S3 bucket in CI (#13634)
@@ -9,7 +9,6 @@ import weakref
 import pytest
 
 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.platforms import current_platform
 
 from ..conftest import VllmRunner
@@ -34,7 +33,7 @@ def v1(run_with_both_engines):
 
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM("distilbert/distilgpt2")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
@@ -43,10 +42,10 @@ def test_vllm_gc_ed():
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
     hf_runner,
     model: str,
@@ -97,8 +96,8 @@ def test_models(
     "test_suite", [
         ("distilbert/distilgpt2", "ray", "", "L4"),
         ("distilbert/distilgpt2", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
         ("distilbert/distilgpt2", "ray", "", "A100"),
         ("distilbert/distilgpt2", "mp", "", "A100"),
         ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
@@ -4,11 +4,9 @@ import pytest
 import torch
 
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test
 
 
@@ -121,7 +119,7 @@ def test_cumem_with_cudagraph():
     "model, use_v1",
     [
         # sleep mode with safetensors
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
+        ("meta-llama/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
         ("facebook/opt-125m", False),
     ])
@@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
     os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
-    load_format = LoadFormat.AUTO
-    if "Llama" in model:
-        load_format = LoadFormat.RUNAI_STREAMER
-    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
+    llm = LLM(model, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)
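For context, the removed lines hardcoded the S3 model mirror and the RunAI streamer load format inside each test body; the commit title says this choice is now driven by an environment variable instead. Below is a minimal sketch of what such env-var gating could look like as a shared test helper. The variable name VLLM_CI_USE_S3 and the helper maybe_use_s3 are illustrative assumptions, not the PR's exact implementation; MODEL_WEIGHTS_S3_BUCKET, LoadFormat.AUTO, and LoadFormat.RUNAI_STREAMER are names taken from the removed lines.

# Hypothetical sketch (names are assumptions, not this PR's exact code):
# when the CI env var is set, rewrite a Hugging Face model ID to its S3 mirror
# and switch the weight loader to the RunAI streamer; otherwise leave it alone.
import os

from vllm.config import LoadFormat

MODEL_WEIGHTS_S3_BUCKET = "s3://..."  # placeholder; the real bucket lives in conftest


def maybe_use_s3(model: str) -> tuple[str, LoadFormat]:
    """Return (model_path, load_format) depending on the CI env var."""
    if os.getenv("VLLM_CI_USE_S3", "0") == "1":
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", LoadFormat.RUNAI_STREAMER
    return model, LoadFormat.AUTO

A test could then do `model, load_format = maybe_use_s3("distilbert/distilgpt2")` and pass both to `LLM(...)`, rather than embedding the S3 path and streamer load format in every test as the removed lines did.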