Improve-mm-and-pooler-and-decoding-configs (#16789)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -20,11 +20,12 @@ from vllm.config import (CacheConfig, CompilationConfig, Config, ConfigFormat,
|
||||
DecodingConfig, Device, DeviceConfig,
|
||||
DistributedExecutorBackend, HfOverrides,
|
||||
KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig,
|
||||
ModelConfig, ModelImpl, ObservabilityConfig,
|
||||
ParallelConfig, PoolerConfig, PoolType,
|
||||
PromptAdapterConfig, SchedulerConfig, SchedulerPolicy,
|
||||
SpeculativeConfig, TaskOption, TokenizerPoolConfig,
|
||||
VllmConfig, get_attr_docs, get_field)
|
||||
ModelConfig, ModelImpl, MultiModalConfig,
|
||||
ObservabilityConfig, ParallelConfig, PoolerConfig,
|
||||
PoolType, PromptAdapterConfig, SchedulerConfig,
|
||||
SchedulerPolicy, SpeculativeConfig, TaskOption,
|
||||
TokenizerPoolConfig, VllmConfig, get_attr_docs,
|
||||
get_field)
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
@@ -190,7 +191,8 @@ class EngineArgs:
|
||||
TokenizerPoolConfig.pool_type
|
||||
tokenizer_pool_extra_config: dict[str, Any] = \
|
||||
get_field(TokenizerPoolConfig, "extra_config")
|
||||
limit_mm_per_prompt: Optional[Mapping[str, int]] = None
|
||||
limit_mm_per_prompt: Mapping[str, int] = \
|
||||
get_field(MultiModalConfig, "limit_per_prompt")
|
||||
mm_processor_kwargs: Optional[Dict[str, Any]] = None
|
||||
disable_mm_preprocessor_cache: bool = False
|
||||
enable_lora: bool = False
|
||||
@@ -252,7 +254,7 @@ class EngineArgs:
|
||||
|
||||
additional_config: Optional[Dict[str, Any]] = None
|
||||
enable_reasoning: Optional[bool] = None
|
||||
reasoning_parser: Optional[str] = None
|
||||
reasoning_parser: Optional[str] = DecodingConfig.reasoning_backend
|
||||
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
|
||||
|
||||
def __post_init__(self):
|
||||
@@ -478,18 +480,22 @@ class EngineArgs:
|
||||
'Examples:\n'
|
||||
'- 1k → 1000\n'
|
||||
'- 1K → 1024\n')
|
||||
parser.add_argument(
|
||||
|
||||
# Guided decoding arguments
|
||||
guided_decoding_kwargs = get_kwargs(DecodingConfig)
|
||||
guided_decoding_group = parser.add_argument_group(
|
||||
title="DecodingConfig",
|
||||
description=DecodingConfig.__doc__,
|
||||
)
|
||||
guided_decoding_group.add_argument(
|
||||
'--guided-decoding-backend',
|
||||
type=str,
|
||||
default=DecodingConfig.guided_decoding_backend,
|
||||
help='Which engine will be used for guided decoding'
|
||||
' (JSON schema / regex etc) by default. Currently support '
|
||||
'https://github.com/mlc-ai/xgrammar and '
|
||||
'https://github.com/guidance-ai/llguidance.'
|
||||
'Valid backend values are "xgrammar", "guidance", and "auto". '
|
||||
'With "auto", we will make opinionated choices based on request '
|
||||
'contents and what the backend libraries currently support, so '
|
||||
'the behavior is subject to change in each release.')
|
||||
**guided_decoding_kwargs["guided_decoding_backend"])
|
||||
guided_decoding_group.add_argument(
|
||||
"--reasoning-parser",
|
||||
# These choices are a special case because they are not static
|
||||
choices=list(ReasoningParserManager.reasoning_parsers),
|
||||
**guided_decoding_kwargs["reasoning_backend"])
|
||||
|
||||
parser.add_argument(
|
||||
'--logits-processor-pattern',
|
||||
type=optional_str,
|
||||
@@ -697,18 +703,14 @@ class EngineArgs:
|
||||
**tokenizer_kwargs["extra_config"])
|
||||
|
||||
# Multimodal related configs
|
||||
parser.add_argument(
|
||||
'--limit-mm-per-prompt',
|
||||
type=nullable_kvs,
|
||||
default=EngineArgs.limit_mm_per_prompt,
|
||||
# The default value is given in
|
||||
# MultiModalConfig.get_default_limit_per_prompt
|
||||
help=('For each multimodal plugin, limit how many '
|
||||
'input instances to allow for each prompt. '
|
||||
'Expects a comma-separated list of items, '
|
||||
'e.g.: `image=16,video=2` allows a maximum of 16 '
|
||||
'images and 2 videos per prompt. Defaults to '
|
||||
'1 (V0) or 999 (V1) for each modality.'))
|
||||
multimodal_kwargs = get_kwargs(MultiModalConfig)
|
||||
multimodal_group = parser.add_argument_group(
|
||||
title="MultiModalConfig",
|
||||
description=MultiModalConfig.__doc__,
|
||||
)
|
||||
multimodal_group.add_argument('--limit-mm-per-prompt',
|
||||
**multimodal_kwargs["limit_per_prompt"])
|
||||
|
||||
parser.add_argument(
|
||||
'--mm-processor-kwargs',
|
||||
default=None,
|
||||
@@ -1018,16 +1020,6 @@ class EngineArgs:
|
||||
"If enabled, the model will be able to generate reasoning content."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--reasoning-parser",
|
||||
type=str,
|
||||
choices=list(ReasoningParserManager.reasoning_parsers),
|
||||
default=None,
|
||||
help=
|
||||
"Select the reasoning parser depending on the model that you're "
|
||||
"using. This is used to parse the reasoning content into OpenAI "
|
||||
"API format. Required for ``--enable-reasoning``.")
|
||||
|
||||
parser.add_argument(
|
||||
"--disable-cascade-attn",
|
||||
action="store_true",
|
||||
|
||||
Reference in New Issue
Block a user