[ci] Use env var to control whether to use S3 bucket in CI (#13634)
This commit is contained in:
@@ -5,17 +5,12 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
|
||||
from ...conftest import MODEL_WEIGHTS_S3_BUCKET
|
||||
from ..openai.test_vision import TEST_IMAGE_URLS
|
||||
|
||||
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
|
||||
|
||||
|
||||
def test_chat():
|
||||
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT)
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
|
||||
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
messages = [
|
||||
@@ -33,8 +28,7 @@ def test_chat():
|
||||
|
||||
|
||||
def test_multi_chat():
|
||||
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT)
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
|
||||
|
||||
prompt1 = "Explain the concept of entropy."
|
||||
prompt2 = "Explain what among us is."
|
||||
@@ -71,8 +65,7 @@ def test_multi_chat():
|
||||
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
|
||||
def test_chat_multi_image(image_urls: List[str]):
|
||||
llm = LLM(
|
||||
model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
|
||||
load_format=RUNAI_STREAMER_LOAD_FORMAT,
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
dtype="bfloat16",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
|
||||
@@ -28,7 +28,7 @@ def test_collective_rpc(tp_size, backend):
|
||||
def echo_rank(self):
|
||||
return self.rank
|
||||
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
|
||||
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
enforce_eager=True,
|
||||
load_format="dummy",
|
||||
tensor_parallel_size=tp_size,
|
||||
|
||||
@@ -6,10 +6,9 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, PoolingParams, PoolingRequestOutput
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/e5-mistral-7b-instruct"
|
||||
MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
@@ -33,7 +32,6 @@ def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
max_num_batched_tokens=32768,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.75,
|
||||
|
||||
@@ -6,10 +6,9 @@ from typing import List
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, RequestOutput, SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/distilgpt2"
|
||||
MODEL_NAME = "distilbert/distilgpt2"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
@@ -31,7 +30,6 @@ def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
max_num_batched_tokens=4096,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.10,
|
||||
|
||||
@@ -7,11 +7,10 @@ import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/zephyr-7b-beta"
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
PROMPTS = [
|
||||
"Hello, my name is",
|
||||
@@ -28,7 +27,6 @@ def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
tensor_parallel_size=1,
|
||||
max_model_len=8192,
|
||||
enable_lora=True,
|
||||
|
||||
@@ -7,13 +7,12 @@ import weakref
|
||||
import jsonschema
|
||||
import pytest
|
||||
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.entrypoints.llm import LLM
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
MODEL_NAME = "s3://vllm-ci-model-weights/Qwen2.5-1.5B-Instruct"
|
||||
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
|
||||
|
||||
@@ -21,9 +20,7 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
def llm():
|
||||
# pytest caches the fixture so we use weakref.proxy to
|
||||
# enable garbage collection
|
||||
llm = LLM(model=MODEL_NAME,
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
max_model_len=1024)
|
||||
llm = LLM(model=MODEL_NAME, max_model_len=1024)
|
||||
|
||||
with llm.deprecate_legacy_api():
|
||||
yield weakref.proxy(llm)
|
||||
|
||||
@@ -6,7 +6,6 @@ from contextlib import nullcontext
|
||||
from vllm_test_utils import BlameResult, blame
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import LoadFormat
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
|
||||
@@ -44,8 +43,7 @@ def run_normal():
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM without guided decoding as a baseline.
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.3)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
@@ -61,8 +59,7 @@ def run_normal():
|
||||
|
||||
def run_lmfe(sample_regex):
|
||||
# Create an LLM with guided decoding enabled.
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
llm = LLM(model="distilbert/distilgpt2",
|
||||
enforce_eager=True,
|
||||
guided_decoding_backend="lm-format-enforcer",
|
||||
gpu_memory_utilization=0.3)
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
import pytest
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import LoadFormat
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@@ -15,17 +14,13 @@ def v1(run_with_both_engines):
|
||||
|
||||
|
||||
def test_empty_prompt():
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
enforce_eager=True)
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='Prompt cannot be empty'):
|
||||
llm.generate([""])
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
def test_out_of_vocab_token():
|
||||
llm = LLM(model="s3://vllm-ci-model-weights/gpt2",
|
||||
load_format=LoadFormat.RUNAI_STREAMER,
|
||||
enforce_eager=True)
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='out of vocabulary'):
|
||||
llm.generate({"prompt_token_ids": [999999]})
|
||||
|
||||
Reference in New Issue
Block a user