[ci] Use env var to control whether to use S3 bucket in CI (#13634)

Kevin H. Luu
2025-02-22 19:19:45 -08:00
committed by GitHub
parent 322d2a27d6
commit 2c5e637b57
30 changed files with 222 additions and 231 deletions
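The diff below drops the hard-coded `s3://vllm-ci-model-weights/...` paths and the `RUNAI_STREAMER` load format from individual tests so the S3 mirror can be toggled in one place via an environment variable. As a minimal sketch of what such an env-var switch in the shared conftest could look like (the variable name `VLLM_CI_USE_S3` and the `maybe_s3_model` helper are illustrative assumptions, not taken from this diff):

# Sketch only: a hypothetical helper in tests/conftest.py that maps a
# Hugging Face model id to the CI S3 mirror when an env var is set.
# VLLM_CI_USE_S3 and maybe_s3_model are assumed names, not part of this commit.
import os

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

def maybe_s3_model(hf_model_id: str) -> str:
    """Return the S3 mirror path when the CI flag is set, else the HF id."""
    if os.environ.get("VLLM_CI_USE_S3", "0") == "1":
        # "meta-llama/Llama-3.2-1B-Instruct" -> "s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct"
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{hf_model_id.split('/')[-1]}"
    return hf_model_id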

View File

@@ -5,17 +5,12 @@ from typing import List
import pytest
from vllm import LLM
-from vllm.config import LoadFormat
-from ...conftest import MODEL_WEIGHTS_S3_BUCKET
from ..openai.test_vision import TEST_IMAGE_URLS
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
def test_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
    prompt1 = "Explain the concept of entropy."
    messages = [
@@ -33,8 +28,7 @@ def test_chat():
def test_multi_chat():
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
@@ -71,8 +65,7 @@ def test_multi_chat():
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
    llm = LLM(
-        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
+        model="microsoft/Phi-3.5-vision-instruct",
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,

View File

@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
    def echo_rank(self):
        return self.rank
-    llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              load_format="dummy",
              tensor_parallel_size=tp_size,

View File

@@ -6,10 +6,9 @@ from typing import List
import pytest
from vllm import LLM, PoolingParams, PoolingRequestOutput
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
-MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct"
+MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
PROMPTS = [
    "Hello, my name is",
@@ -33,7 +32,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,

View File

@@ -6,10 +6,9 @@ from typing import List
import pytest
from vllm import LLM, RequestOutput, SamplingParams
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
-MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2"
+MODEL_NAME = "distilbert/distilgpt2"
PROMPTS = [
    "Hello, my name is",
@@ -31,7 +30,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.10,

View File

@@ -7,11 +7,10 @@ import pytest
from huggingface_hub import snapshot_download
from vllm import LLM
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
-MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta"
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
PROMPTS = [
    "Hello, my name is",
@@ -28,7 +27,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              tensor_parallel_size=1,
              max_model_len=8192,
              enable_lora=True,

View File

@@ -7,13 +7,12 @@ import weakref
import jsonschema
import pytest
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
-              max_model_len=1024)
+    llm = LLM(model=MODEL_NAME, max_model_len=1024)
    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

View File

@@ -6,7 +6,6 @@ from contextlib import nullcontext
from vllm_test_utils import BlameResult, blame
from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
@@ -44,8 +43,7 @@ def run_normal():
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM without guided decoding as a baseline.
-    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              enforce_eager=True,
              gpu_memory_utilization=0.3)
    outputs = llm.generate(prompts, sampling_params)
@@ -61,8 +59,7 @@ def run_normal():
def run_lmfe(sample_regex):
    # Create an LLM with guided decoding enabled.
-    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
              gpu_memory_utilization=0.3)

View File

@@ -3,7 +3,6 @@
import pytest
from vllm import LLM
-from vllm.config import LoadFormat
@pytest.fixture(autouse=True)
@@ -15,17 +14,13 @@ def v1(run_with_both_engines):
def test_empty_prompt():
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'):
llm.generate([""])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='out of vocabulary'):
llm.generate({"prompt_token_ids": [999999]})