[Model] Add user-configurable task for models that support both generation and embedding (#9424)
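The user-facing change is a task option for checkpoints whose architecture supports both text generation and embedding: "generate", "embedding", or "auto", which resolves the task from the model (see test_auto_task and test_incorrect_task below). A minimal sketch of how the option is used, based on the test changes in this diff; everything beyond the task argument itself is an assumption about the surrounding API:

    # Sketch only: mirrors how VllmRunner forwards `task` to LLM in the hunks below.
    from vllm import LLM

    # "auto" (the default) infers the task from the model architecture.
    embedder = LLM(model="intfloat/e5-mistral-7b-instruct", task="embedding")
    generator = LLM(model="facebook/opt-125m", task="generate")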
@@ -25,7 +25,7 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel,

@@ -619,6 +619,7 @@ class VllmRunner:
     def __init__(
         self,
         model_name: str,
+        task: TaskOption = "auto",
         tokenizer_name: Optional[str] = None,
         # Use smaller max model length, otherwise bigger model cannot run due
         # to kv cache size limit.

@@ -634,6 +635,7 @@ class VllmRunner:
     ) -> None:
         self.model = LLM(
             model=model_name,
+            task=task,
             tokenizer=tokenizer_name,
             trust_remote_code=True,
             dtype=dtype,
@@ -33,7 +33,8 @@ def test_simple():
     num_seq_group = 4
     max_model_len = 16
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                        num_seq_group,
                                        max_model_len,
                                        enable_chunked_prefill=True)

@@ -78,6 +79,7 @@ def test_chunk():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -126,6 +128,7 @@ def test_complex():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -196,6 +199,7 @@ def test_maximal_decoding():
     max_model_len = 8
     max_num_batched_tokens = 2
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -289,6 +293,7 @@ def test_prompt_limit():
     max_model_len = 64
     max_num_batched_tokens = 32
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
     max_seqs = 64
     max_model_len = 32
     max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                        max_seqs,
                                        max_model_len,
                                        enable_chunked_prefill=True)

@@ -348,6 +354,7 @@ def test_swap():
     max_model_len = 200
     max_num_batched_tokens = 30
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
     max_model_len = 200
     max_num_batched_tokens = 30
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
     max_model_len = 200
     max_num_batched_tokens = 30
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,

@@ -617,6 +627,7 @@ def test_perfix_caching():
     max_model_len = 80
     max_num_batched_tokens = 64
     scheduler_config = SchedulerConfig(
+        "generate",
         max_num_batched_tokens,
         max_seqs,
         max_model_len,
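Taken together, the scheduler tests all follow the same pattern: SchedulerConfig now receives the task ahead of the existing limits. A sketch of the updated call shape, mirroring the keyword form used in the scheduler tests that follow (argument names are taken from those calls, not from the SchedulerConfig definition itself):

    # Sketch of the constructor shape exercised by these tests.
    from vllm.config import SchedulerConfig

    scheduler_config = SchedulerConfig(
        "generate",                  # the task now comes first
        max_num_batched_tokens=64,   # per-step token budget
        max_num_seqs=4,
        max_model_len=16,
        enable_chunked_prefill=True,
    )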
@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
 def test_scheduler_add_seq_group():
     block_size = 4
     scheduler_config = SchedulerConfig(
-        100,
-        64,
-        1,
+        "generate",
+        max_num_batched_tokens=100,
+        max_num_seqs=64,
+        max_model_len=1,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
     cache_config.num_cpu_blocks = 4

@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
 def test_scheduler_abort_seq_group():
     block_size = 4
     scheduler_config = SchedulerConfig(
-        100,
-        64,
-        1,
+        "generate",
+        max_num_batched_tokens=100,
+        max_num_seqs=64,
+        max_model_len=1,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 4

@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
     num_seq_group = 4
     max_model_len = 16
     scheduler_config = SchedulerConfig(
-        64,
-        num_seq_group,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8

@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
     max_model_len = 30
     max_batched_num_tokens = 30
     scheduler_config = SchedulerConfig(
-        max_batched_num_tokens,
-        2,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=max_batched_num_tokens,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16

@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
     block_size = 4
     max_model_len = 16
     scheduler_config = SchedulerConfig(
-        64,
-        2,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 2

@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
     max_seq_group = 2
     max_model_len = 16
     scheduler_config = SchedulerConfig(
-        64,
-        max_seq_group,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=max_seq_group,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8

@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
 def test_scheduler_delay_factor():
     block_size = 4
     scheduler_config = SchedulerConfig(
-        100,
-        64,
-        16,
+        "generate",
+        max_num_batched_tokens=100,
+        max_num_seqs=64,
+        max_model_len=16,
         delay_factor=0.5,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")

@@ -350,9 +357,10 @@ def initialize_scheduler(
 ):
     block_size = block_size
     scheduler_config = SchedulerConfig(
-        max_token_budget,
-        max_num_seqs,
-        max_model_len,
+        "generate",
+        max_num_batched_tokens=max_token_budget,
+        max_num_seqs=max_num_seqs,
+        max_model_len=max_model_len,
     )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = num_cpu_blocks
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
     block_size = 4
     num_seq_group = 4
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    scheduler_config = SchedulerConfig(
+        task="generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
+    )
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
     cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group
@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional

 import pytest

+from vllm.config import TaskOption
 from vllm.logger import init_logger

 from ..utils import compare_two_settings, fork_new_process_for_each_test

@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple):
 class PPTestSettings:
     parallel_setups: List[ParallelSetup]
     distributed_backends: List[str]
+    task: TaskOption
     trust_remote_code: bool
     tokenizer_mode: Optional[str]

@@ -39,6 +41,7 @@ class PPTestSettings:
         *,
         tp_base: int = 1,
         pp_base: int = 2,
+        task: TaskOption = "auto",
         trust_remote_code: bool = False,
         tokenizer_mode: Optional[str] = None,
     ):

@@ -66,6 +69,7 @@ class PPTestSettings:
                               chunked_prefill=False),
             ],
             distributed_backends=["mp", "ray"],
+            task=task,
             trust_remote_code=trust_remote_code,
             tokenizer_mode=tokenizer_mode,
         )

@@ -75,6 +79,7 @@ class PPTestSettings:
         *,
         tp_base: int = 1,
         pp_base: int = 2,
+        task: TaskOption = "auto",
         trust_remote_code: bool = False,
         tokenizer_mode: Optional[str] = None,
     ):

@@ -86,6 +91,7 @@ class PPTestSettings:
                               chunked_prefill=False),
             ],
             distributed_backends=["mp"],
+            task=task,
             trust_remote_code=trust_remote_code,
             tokenizer_mode=tokenizer_mode,
         )

@@ -94,7 +100,7 @@ class PPTestSettings:
         for parallel_setup in self.parallel_setups:
             for distributed_backend in self.distributed_backends:
                 yield (model_name, parallel_setup, distributed_backend,
-                       self.trust_remote_code, self.tokenizer_mode)
+                       self.task, self.trust_remote_code, self.tokenizer_mode)


 # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU

@@ -213,6 +219,7 @@ def _compare_tp(
     model_name: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
+    task: TaskOption,
     trust_remote_code: bool,
     tokenizer_mode: Optional[str],
     num_gpus_available: int,

@@ -240,6 +247,8 @@ def _compare_tp(
         common_args.append("--enable-chunked-prefill")
     if eager_mode:
         common_args.append("--enforce-eager")
+    if task != "auto":
+        common_args.extend(["--task", task])
     if trust_remote_code:
         common_args.append("--trust-remote-code")
     if tokenizer_mode:

@@ -297,7 +306,7 @@ def _compare_tp(


 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
      "trust_remote_code", "tokenizer_mode"),
     [
         params for model_name, settings in GENERATION_MODEL_SETTINGS.items()

@@ -310,6 +319,7 @@ def test_tp_language_generation(
     model_name: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
+    task: TaskOption,
     trust_remote_code: bool,
     tokenizer_mode: Optional[str],
     num_gpus_available,

@@ -317,6 +327,7 @@ def test_tp_language_generation(
     _compare_tp(model_name,
                 parallel_setup,
                 distributed_backend,
+                task,
                 trust_remote_code,
                 tokenizer_mode,
                 num_gpus_available,

@@ -324,7 +335,7 @@ def test_tp_language_generation(


 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
      "trust_remote_code", "tokenizer_mode"),
     [
         params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()

@@ -337,6 +348,7 @@ def test_tp_language_embedding(
     model_name: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
+    task: TaskOption,
     trust_remote_code: bool,
     tokenizer_mode: Optional[str],
     num_gpus_available,

@@ -344,6 +356,7 @@ def test_tp_language_embedding(
     _compare_tp(model_name,
                 parallel_setup,
                 distributed_backend,
+                task,
                 trust_remote_code,
                 tokenizer_mode,
                 num_gpus_available,

@@ -351,7 +364,7 @@ def test_tp_language_embedding(


 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
      "trust_remote_code", "tokenizer_mode"),
     [
         params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()

@@ -364,6 +377,7 @@ def test_tp_multimodal_generation(
     model_name: str,
     parallel_setup: ParallelSetup,
     distributed_backend: str,
+    task: TaskOption,
     trust_remote_code: bool,
     tokenizer_mode: Optional[str],
     num_gpus_available,

@@ -371,6 +385,7 @@ def test_tp_multimodal_generation(
     _compare_tp(model_name,
                 parallel_setup,
                 distributed_backend,
+                task,
                 trust_remote_code,
                 tokenizer_mode,
                 num_gpus_available,
tests/entrypoints/llm/test_chat.py (new file, 92 lines)
@@ -0,0 +1,92 @@
+from typing import List
+
+import pytest
+
+from vllm import LLM
+
+from ..openai.test_vision import TEST_IMAGE_URLS
+
+
+def test_chat():
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+
+    prompt1 = "Explain the concept of entropy."
+    messages = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+    outputs = llm.chat(messages)
+    assert len(outputs) == 1
+
+
+def test_multi_chat():
+    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
+
+    prompt1 = "Explain the concept of entropy."
+    prompt2 = "Explain what among us is."
+
+    conversation1 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt1
+        },
+    ]
+
+    conversation2 = [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant"
+        },
+        {
+            "role": "user",
+            "content": prompt2
+        },
+    ]
+
+    messages = [conversation1, conversation2]
+
+    outputs = llm.chat(messages)
+    assert len(outputs) == 2
+
+
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    llm = LLM(
+        model="microsoft/Phi-3.5-vision-instruct",
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) >= 0
@@ -6,7 +6,6 @@ import pytest
 from vllm import LLM, RequestOutput, SamplingParams

 from ...conftest import cleanup
-from ..openai.test_vision import TEST_IMAGE_URLS

 MODEL_NAME = "facebook/opt-125m"

@@ -104,90 +103,3 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
-
-
-def test_chat():
-
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-
-    prompt1 = "Explain the concept of entropy."
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
-    ]
-    outputs = llm.chat(messages)
-    assert len(outputs) == 1
-
-
-def test_multi_chat():
-
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
-
-    prompt1 = "Explain the concept of entropy."
-    prompt2 = "Explain what among us is."
-
-    conversation1 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt1
-        },
-    ]
-
-    conversation2 = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": prompt2
-        },
-    ]
-
-    messages = [conversation1, conversation2]
-
-    outputs = llm.chat(messages)
-    assert len(outputs) == 2
-
-
-@pytest.mark.parametrize("image_urls",
-                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
-def test_chat_multi_image(image_urls: List[str]):
-    llm = LLM(
-        model="microsoft/Phi-3.5-vision-instruct",
-        dtype="bfloat16",
-        max_model_len=4096,
-        max_num_seqs=5,
-        enforce_eager=True,
-        trust_remote_code=True,
-        limit_mm_per_prompt={"image": 2},
-    )
-
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *({
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            } for image_url in image_urls),
-            {
-                "type": "text",
-                "text": "What's in this image?"
-            },
-        ],
-    }]
-    outputs = llm.chat(messages)
-    assert len(outputs) >= 0
tests/entrypoints/llm/test_init.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+import pytest
+
+from vllm import LLM
+
+from ...utils import error_on_warning
+
+MODEL_NAME = "facebook/opt-125m"
+
+
+def test_pos_args_deprecated():
+    with error_on_warning(DeprecationWarning):
+        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)
+
+    with error_on_warning(DeprecationWarning):
+        LLM(MODEL_NAME, tokenizer=MODEL_NAME)
+
+    with pytest.warns(DeprecationWarning, match="'tokenizer'"):
+        LLM(MODEL_NAME, MODEL_NAME)
+
+    with pytest.warns(DeprecationWarning,
+                      match="'tokenizer', 'tokenizer_mode'"):
+        LLM(MODEL_NAME, MODEL_NAME, "auto")
@@ -22,12 +22,12 @@ class MockHFConfig:

 @dataclass
 class MockModelConfig:
+    task = "generate"
     tokenizer = MODEL_NAME
     trust_remote_code = False
     tokenizer_mode = "auto"
     max_model_len = 100
     tokenizer_revision = None
-    embedding_mode = False
     multimodal_config = MultiModalConfig()
     hf_config = MockHFConfig()
@@ -23,6 +23,8 @@ TEST_IMAGE_URLS = [
 @pytest.fixture(scope="module")
 def server():
     args = [
+        "--task",
+        "generate",
         "--dtype",
         "bfloat16",
         "--max-model-len",
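On the serving side, the same option is exposed as a --task command-line flag: _compare_tp above appends it for non-"auto" values, and the OpenAI-compatible server fixture here starts the server with --task generate. A sketch of that fixture pattern (the helper class, model name, and --max-model-len value are illustrative assumptions, not taken verbatim from this diff):

    import pytest

    from ...utils import RemoteOpenAIServer  # helper used by these entrypoint tests; path assumed

    MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"  # illustrative model

    @pytest.fixture(scope="module")
    def server():
        args = [
            "--task", "generate",       # new flag added in this PR
            "--dtype", "bfloat16",
            "--max-model-len", "4096",  # illustrative; the value is elided in the diff
        ]
        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
            yield remote_server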
@@ -18,7 +18,8 @@ PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
 @pytest.fixture(scope="module")
 def phi3v_model_config():
     return ModelConfig(PHI3V_MODEL_ID,
-                       PHI3V_MODEL_ID,
+                       task="generate",
+                       tokenizer=PHI3V_MODEL_ID,
                        tokenizer_mode="auto",
                        trust_remote_code=True,
                        dtype="bfloat16",
@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
     worker = Worker(
         model_config=ModelConfig(
             "meta-llama/Llama-2-7b-hf",
-            "meta-llama/Llama-2-7b-hf",
+            task="auto",
+            tokenizer="meta-llama/Llama-2-7b-hf",
             tokenizer_mode="auto",
             trust_remote_code=False,
             seed=0,

@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
             load_format="dummy",
         ),
         parallel_config=ParallelConfig(1, 1, False),
-        scheduler_config=SchedulerConfig(32, 32, 32),
+        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(block_size=16,
                                  gpu_memory_utilization=1.,
@@ -89,6 +89,7 @@ def run_test(

     # max_model_len should be greater than image_feature_size
     with vllm_runner(model,
+                     task="generate",
                      max_model_len=4096,
                      max_num_seqs=2,
                      dtype=dtype,
@@ -28,6 +28,7 @@ def test_models(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
     with vllm_runner(model,
+                     task="embedding",
                      max_model_len=4096,
                      max_num_seqs=2,
                      dtype=dtype,
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union

 import torch

-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.utils import is_cpu

@@ -248,6 +248,7 @@ def check_logprobs_close(


 def build_model_context(model_name: str,
+                        task: TaskOption = "auto",
                         tokenizer_name: Optional[str] = None,
                         trust_remote_code: bool = False,
                         dtype: Optional[Union[str, torch.dtype]] = None,

@@ -273,7 +274,8 @@ def build_model_context(model_name: str,

     model_config = ModelConfig(
         model_name,
-        tokenizer_name,
+        task=task,
+        tokenizer=tokenizer_name,
         tokenizer_mode="auto",
         trust_remote_code=trust_remote_code,
         dtype=dtype,
@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):

     model_config = ModelConfig(
         model=MODEL_NAME,
+        task="auto",
         tokenizer=MODEL_NAME,
         tokenizer_mode="auto",
         trust_remote_code=False,

@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,

     model_config = ModelConfig(
         model=MODEL_NAME,
+        task="auto",
         tokenizer=MODEL_NAME,
         tokenizer_mode="auto",
         trust_remote_code=False,

@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):

     model_config = ModelConfig(
         model=MODEL_NAME,
+        task="auto",
         tokenizer=MODEL_NAME,
         tokenizer_mode="auto",
         trust_remote_code=False,

@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):

     model_config = ModelConfig(
         model=MODEL_NAME,
+        task="auto",
         tokenizer=MODEL_NAME,
         tokenizer_mode="auto",
         trust_remote_code=False,
@@ -221,6 +221,7 @@ def test_max_tokens_kwarg_overrides(num_crops):
     expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops

     ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                               trust_remote_code=True,
                               mm_processor_kwargs=mm_processor_kwargs,
                               limit_mm_per_prompt={"image": 1})

@@ -256,6 +257,7 @@ def test_max_tokens_kwarg_overrides(num_crops):
 def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):
     """Ensure that max token calcs filters out invalid mm_processor_kwargs"""
     ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                               trust_remote_code=True,
                               mm_processor_kwargs=mm_processor_kwargs,
                               limit_mm_per_prompt={"image": 1})

@@ -278,12 +280,13 @@ def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs):

 ### Test overrides for the mapper
 @pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE])
-def test_default_mapper_with_processer_kwargs(image_assets, num_crops):
+def test_default_mapper_with_processor_kwargs(image_assets, num_crops):
     """Ensure that the mapper processor kwargs can fall back to HF models."""
     # NOTE - we don't validate bad inputs for the default mapper, because it's
     # through the automodel interface in transformers, so we can't easily
     # inspect what kwargs are or are not allowed.
     ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                               trust_remote_code=True,
                               mm_processor_kwargs={"num_crops": num_crops},
                               limit_mm_per_prompt={"image": 1})

@@ -311,6 +314,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops,
                                            init_num_crops, inference_num_crops)

     ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                               trust_remote_code=True,
                               mm_processor_kwargs=init_kwargs,
                               limit_mm_per_prompt={"image": 1})

@@ -348,6 +352,7 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets,
     """Ensure that custom mappers filters out invalid mm_processor_kwargs"""
     # Should filter out the init time kwargs
     ctx = build_model_context(MULTIMODAL_MODEL_ID,
+                              task="generate",
                               trust_remote_code=True,
                               mm_processor_kwargs=mm_processor_kwargs,
                               limit_mm_per_prompt={"image": 1})
@@ -57,7 +57,8 @@ def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:

     try:
         model_config = ModelConfig(model_path,
-                                   model_path,
+                                   task="auto",
+                                   tokenizer=model_path,
                                    tokenizer_mode="auto",
                                    trust_remote_code=False,
                                    seed=0,
@@ -2,6 +2,42 @@ import pytest

 from vllm.config import ModelConfig

+
+@pytest.mark.parametrize(("model_id", "expected_task"), [
+    ("facebook/opt-125m", "generate"),
+    ("intfloat/e5-mistral-7b-instruct", "embedding"),
+])
+def test_auto_task(model_id, expected_task):
+    config = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+    )
+
+    assert config.task == expected_task
+
+
+@pytest.mark.parametrize(("model_id", "bad_task"), [
+    ("facebook/opt-125m", "embedding"),
+    ("intfloat/e5-mistral-7b-instruct", "generate"),
+])
+def test_incorrect_task(model_id, bad_task):
+    with pytest.raises(ValueError, match=r"does not support the .* task"):
+        ModelConfig(
+            model_id,
+            task=bad_task,
+            tokenizer=model_id,
+            tokenizer_mode="auto",
+            trust_remote_code=False,
+            seed=0,
+            dtype="float16",
+        )
+
+
 MODEL_IDS_EXPECTED = [
     ("Qwen/Qwen1.5-7B", 32768),
     ("mistralai/Mistral-7B-v0.1", 4096),

@@ -14,7 +50,8 @@ def test_disable_sliding_window(model_id_expected):
     model_id, expected = model_id_expected
     model_config = ModelConfig(
         model_id,
-        model_id,
+        task="auto",
+        tokenizer=model_id,
         tokenizer_mode="auto",
         trust_remote_code=False,
         seed=0,

@@ -32,7 +69,8 @@ def test_get_sliding_window():
     # when use_sliding_window is False.
     qwen2_model_config = ModelConfig(
         "Qwen/Qwen1.5-7B",
-        "Qwen/Qwen1.5-7B",
+        task="auto",
+        tokenizer="Qwen/Qwen1.5-7B",
         tokenizer_mode="auto",
         trust_remote_code=False,
         seed=0,

@@ -49,7 +87,8 @@ def test_get_sliding_window():

     mistral_model_config = ModelConfig(
         "mistralai/Mistral-7B-v0.1",
-        "mistralai/Mistral-7B-v0.1",
+        task="auto",
+        tokenizer="mistralai/Mistral-7B-v0.1",
         tokenizer_mode="auto",
         trust_remote_code=False,
         seed=0,

@@ -70,7 +109,8 @@ def test_rope_customization():

     llama_model_config = ModelConfig(
         "meta-llama/Meta-Llama-3-8B-Instruct",
-        "meta-llama/Meta-Llama-3-8B-Instruct",
+        task="auto",
+        tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
         tokenizer_mode="auto",
         trust_remote_code=False,
         dtype="float16",

@@ -82,7 +122,8 @@ def test_rope_customization():

     llama_model_config = ModelConfig(
         "meta-llama/Meta-Llama-3-8B-Instruct",
-        "meta-llama/Meta-Llama-3-8B-Instruct",
+        task="auto",
+        tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
         tokenizer_mode="auto",
         trust_remote_code=False,
         dtype="float16",

@@ -98,7 +139,8 @@ def test_rope_customization():

     longchat_model_config = ModelConfig(
         "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
+        task="auto",
+        tokenizer="lmsys/longchat-13b-16k",
         tokenizer_mode="auto",
         trust_remote_code=False,
         dtype="float16",

@@ -112,7 +154,8 @@ def test_rope_customization():

     longchat_model_config = ModelConfig(
         "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
+        task="auto",
+        tokenizer="lmsys/longchat-13b-16k",
         tokenizer_mode="auto",
         trust_remote_code=False,
         dtype="float16",
@@ -59,7 +59,7 @@ def test_deprecate_kwargs_always():
     with pytest.warns(DeprecationWarning, match="'old_arg'"):
         dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
         dummy(new_arg=1)


@@ -69,10 +69,10 @@ def test_deprecate_kwargs_never():
     def dummy(*, old_arg: object = None, new_arg: object = None):
         pass

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
         dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
         dummy(new_arg=1)


@@ -86,15 +86,15 @@ def test_deprecate_kwargs_dynamic():
     with pytest.warns(DeprecationWarning, match="'old_arg'"):
         dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
         dummy(new_arg=1)

     is_deprecated = False

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
         dummy(old_arg=1)

-    with error_on_warning():
+    with error_on_warning(DeprecationWarning):
         dummy(new_arg=1)
@@ -8,7 +8,7 @@ import time
 import warnings
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union

 import openai
 import pytest

@@ -454,13 +454,13 @@ def multi_process_parallel(


 @contextmanager
-def error_on_warning():
+def error_on_warning(category: Type[Warning] = Warning):
     """
     Within the scope of this context manager, tests will fail if any warning
-    is emitted.
+    of the given category is emitted.
     """
     with warnings.catch_warnings():
-        warnings.simplefilter("error")
+        warnings.filterwarnings("error", category=category)

         yield
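With the category parameter, error_on_warning only escalates the requested warning class, which is what lets the new test_pos_args_deprecated assert that no DeprecationWarning is raised while ignoring unrelated warnings. A small usage sketch (the noisy function and the import path of the helper are hypothetical):

    import warnings

    from tests.utils import error_on_warning  # module path assumed

    def noisy():
        # hypothetical callee that emits an unrelated warning
        warnings.warn("not a deprecation", UserWarning)

    with error_on_warning(DeprecationWarning):
        noisy()  # passes: only DeprecationWarning is turned into an error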