[1/n][CI] Load models in CI from S3 instead of HF (#13205)

Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
Kevin H. Luu
2025-02-18 23:34:59 -08:00
committed by GitHub
parent fd84857f64
commit d5d214ac7f
43 changed files with 225 additions and 76 deletions


@@ -7,12 +7,13 @@ import weakref
 import jsonschema
 import pytest
+from vllm.config import LoadFormat
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
-MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
+MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
 GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@@ -20,7 +21,9 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
     # enable garbage collection
-    llm = LLM(model=MODEL_NAME, max_model_len=1024)
+    llm = LLM(model=MODEL_NAME,
+              load_format=LoadFormat.RUNAI_STREAMER,
+              max_model_len=1024)
     with llm.deprecate_legacy_api():
         yield weakref.proxy(llm)
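
For reference, a minimal standalone sketch of the load path this change exercises: streaming weights directly from S3 with the RunAI streamer instead of downloading from the Hugging Face Hub. This is not part of the diff; it assumes AWS credentials for the CI bucket are available in the environment, and reuses the bucket path shown above.

# Sketch (assumption): load a model from S3 via the RunAI streamer.
from vllm import LLM, SamplingParams
from vllm.config import LoadFormat

llm = LLM(
    model="s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct",
    load_format=LoadFormat.RUNAI_STREAMER,
    max_model_len=1024,
)

# Quick smoke test: generate a short completion to confirm the weights loaded.
outputs = llm.generate(["San Francisco is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)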