[1/n][CI] Load models in CI from S3 instead of HF (#13205)

Signed-off-by: <> Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-18 23:34:59 -08:00
parent fd84857f64
commit d5d214ac7f
43 changed files with 225 additions and 76 deletions
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -10,6 +10,9 @@ import gc
 import torch

 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
+
+from .conftest import MODEL_WEIGHTS_S3_BUCKET


 def test_duplicated_ignored_sequence_group():
@@ -18,7 +21,8 @@ def test_duplicated_ignored_sequence_group():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=256)
-    llm = LLM(model="facebook/opt-125m",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
@@ -31,7 +35,8 @@ def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
-    llm = LLM(model="facebook/opt-125m",
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["Just say hello!"]
@@ -41,7 +46,9 @@ def test_max_tokens_none():


 def test_gc():
-    llm = LLM("facebook/opt-125m", enforce_eager=True)
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True)
    del llm

    gc.collect()