[ci] Use env var to control whether to use S3 bucket in CI (#13634)

Kevin H. Luu
2025-02-22 19:19:45 -08:00
committed by GitHub
parent 322d2a27d6
commit 2c5e637b57
30 changed files with 222 additions and 231 deletions
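The diff below drops the hard-coded `s3://vllm-ci-model-weights/...` paths and the `RUNAI_STREAMER` load format from individual tests so the S3 mirror can be toggled in one place via an environment variable. As a minimal sketch of what such an env-var switch in the shared conftest could look like (the variable name `VLLM_CI_USE_S3` and the `maybe_s3_model` helper are illustrative assumptions, not taken from this diff):

# Sketch only: a hypothetical helper in tests/conftest.py that maps a
# Hugging Face model id to the CI S3 mirror when an env var is set.
# VLLM_CI_USE_S3 and maybe_s3_model are assumed names, not part of this commit.
import os

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

def maybe_s3_model(hf_model_id: str) -> str:
    """Return the S3 mirror path when the CI flag is set, else the HF id."""
    if os.environ.get("VLLM_CI_USE_S3", "0") == "1":
        # "meta-llama/Llama-3.2-1B-Instruct" -> "s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct"
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{hf_model_id.split('/')[-1]}"
    return hf_model_id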

View File

@@ -5,17 +5,12 @@ from typing import List
import pytest
from vllm import LLM
-from vllm.config import LoadFormat
-from ...conftest import MODEL_WEIGHTS_S3_BUCKET
from ..openai.test_vision import TEST_IMAGE_URLS
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
def test_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
    prompt1 = "Explain the concept of entropy."
    messages = [
@@ -33,8 +28,7 @@ def test_chat():
def test_multi_chat():
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
@@ -71,8 +65,7 @@ def test_multi_chat():
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
    llm = LLM(
-        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
+        model="microsoft/Phi-3.5-vision-instruct",
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,

View File

@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
    def echo_rank(self):
        return self.rank
-    llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              load_format="dummy",
              tensor_parallel_size=tp_size,

View File

@@ -6,10 +6,9 @@ from typing import List
import pytest
from vllm import LLM, PoolingParams, PoolingRequestOutput
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
-MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct"
+MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
PROMPTS = [
    "Hello, my name is",
@@ -33,7 +32,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=32768,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.75,

View File

@@ -6,10 +6,9 @@ from typing import List
import pytest
from vllm import LLM, RequestOutput, SamplingParams
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
-MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2"
+MODEL_NAME = "distilbert/distilgpt2"
PROMPTS = [
    "Hello, my name is",
@@ -31,7 +30,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1,
              gpu_memory_utilization=0.10,

View File

@@ -7,11 +7,10 @@ import pytest
from huggingface_hub import snapshot_download
from vllm import LLM
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
-MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta"
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
PROMPTS = [
    "Hello, my name is",
@@ -28,7 +27,6 @@ def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
              tensor_parallel_size=1,
              max_model_len=8192,
              enable_lora=True,

View File

@@ -7,13 +7,12 @@ import weakref
import jsonschema
import pytest
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.entrypoints.llm import LLM
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
def llm():
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
-    llm = LLM(model=MODEL_NAME,
-              load_format=LoadFormat.RUNAI_STREAMER,
-              max_model_len=1024)
+    llm = LLM(model=MODEL_NAME, max_model_len=1024)
    with llm.deprecate_legacy_api():
        yield weakref.proxy(llm)

View File

@@ -6,7 +6,6 @@ from contextlib import nullcontext
from vllm_test_utils import BlameResult, blame
from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
@@ -44,8 +43,7 @@ def run_normal():
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    # Create an LLM without guided decoding as a baseline.
-    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              enforce_eager=True,
              gpu_memory_utilization=0.3)
    outputs = llm.generate(prompts, sampling_params)
@@ -61,8 +59,7 @@ def run_normal():
def run_lmfe(sample_regex):
    # Create an LLM with guided decoding enabled.
-    llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
-              load_format=LoadFormat.RUNAI_STREAMER,
+    llm = LLM(model="distilbert/distilgpt2",
              enforce_eager=True,
              guided_decoding_backend="lm-format-enforcer",
              gpu_memory_utilization=0.3)

View File

@@ -3,7 +3,6 @@
import pytest
from vllm import LLM
-from vllm.config import LoadFormat
@pytest.fixture(autouse=True)
@@ -15,17 +14,13 @@ def v1(run_with_both_engines):
def test_empty_prompt():
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'):
llm.generate([""])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
with pytest.raises(ValueError, match='out of vocabulary'):
llm.generate({"prompt_token_ids": [999999]})