Improve configs - ModelConfig (#17130)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
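The diff below makes ModelConfig the single source of truth for EngineArgs: immutable defaults are read directly off the config class (e.g. `model: str = ModelConfig.model`), mutable defaults go through `get_field(...)` so each instance gets its own copy, and the hand-written `parser.add_argument(...)` calls are replaced by kwargs generated from the config class via `get_kwargs(ModelConfig)`. A minimal sketch of the first two parts of that pattern, using simplified, hypothetical class and field names rather than the actual vLLM implementation:

    import dataclasses
    from dataclasses import dataclass, field, fields
    from typing import Any, Optional

    @dataclass
    class MyModelConfig:
        """Stand-in for a config class that owns the defaults (hypothetical)."""
        model: str = "facebook/opt-125m"
        seed: Optional[int] = None
        # Mutable default must live behind a factory.
        rope_scaling: dict[str, Any] = field(default_factory=dict)

    def get_field(cls, name: str) -> Any:
        """Reuse a dataclass field's default (including its factory) elsewhere."""
        f = next(f for f in fields(cls) if f.name == name)
        if f.default_factory is not dataclasses.MISSING:
            return field(default_factory=f.default_factory)
        return field(default=f.default)

    @dataclass
    class MyEngineArgs:
        """CLI-facing args whose defaults mirror the config class."""
        # Plain defaults can be read straight off the config class attribute.
        model: str = MyModelConfig.model
        seed: Optional[int] = MyModelConfig.seed
        # Mutable defaults (dict/list) have no class attribute, so copy the field.
        rope_scaling: dict[str, Any] = get_field(MyModelConfig, "rope_scaling")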
@@ -20,15 +20,16 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
DeviceConfig, DistributedExecutorBackend,
GuidedDecodingBackend, GuidedDecodingBackendV1,
HfOverrides, KVTransferConfig, LoadConfig, LoadFormat,
LoRAConfig, ModelConfig, ModelImpl, MultiModalConfig,
ObservabilityConfig, ParallelConfig, PoolerConfig,
PrefixCachingHashAlgo, PromptAdapterConfig,
SchedulerConfig, SchedulerPolicy, SpeculativeConfig,
TaskOption, TokenizerPoolConfig, VllmConfig,
get_attr_docs, get_field)
LoRAConfig, ModelConfig, ModelDType, ModelImpl,
MultiModalConfig, ObservabilityConfig, ParallelConfig,
PoolerConfig, PrefixCachingHashAlgo,
PromptAdapterConfig, SchedulerConfig, SchedulerPolicy,
SpeculativeConfig, TaskOption, TokenizerMode,
TokenizerPoolConfig, VllmConfig, get_attr_docs,
get_field)
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.plugins import load_general_plugins
from vllm.reasoning import ReasoningParserManager
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
@@ -183,6 +184,9 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
kwargs[name]["nargs"] = "+"
elif contains_type(type_hints, int):
kwargs[name]["type"] = int
# Special case for large integers
if name in {"max_model_len"}:
kwargs[name]["type"] = human_readable_int
elif contains_type(type_hints, float):
kwargs[name]["type"] = float
elif contains_type(type_hints, dict):
@@ -212,22 +216,23 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
@dataclass
class EngineArgs:
"""Arguments for vLLM engine."""
model: str = 'facebook/opt-125m'
served_model_name: Optional[Union[str, List[str]]] = None
tokenizer: Optional[str] = None
hf_config_path: Optional[str] = None
task: TaskOption = "auto"
skip_tokenizer_init: bool = False
tokenizer_mode: str = 'auto'
trust_remote_code: bool = False
allowed_local_media_path: str = ""
model: str = ModelConfig.model
served_model_name: Optional[Union[
str, List[str]]] = ModelConfig.served_model_name
tokenizer: Optional[str] = ModelConfig.tokenizer
hf_config_path: Optional[str] = ModelConfig.hf_config_path
task: TaskOption = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode
trust_remote_code: bool = ModelConfig.trust_remote_code
allowed_local_media_path: str = ModelConfig.allowed_local_media_path
download_dir: Optional[str] = LoadConfig.download_dir
load_format: str = LoadConfig.load_format
config_format: ConfigFormat = ConfigFormat.AUTO
dtype: str = 'auto'
config_format: str = ModelConfig.config_format
dtype: ModelDType = ModelConfig.dtype
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
seed: Optional[int] = None
max_model_len: Optional[int] = None
seed: Optional[int] = ModelConfig.seed
max_model_len: Optional[int] = ModelConfig.max_model_len
# Note: Specifying a custom executor backend by passing a class
# is intended for expert use only. The API may change without
# notice.
@@ -245,8 +250,8 @@ class EngineArgs:
enable_prefix_caching: Optional[bool] = CacheConfig.enable_prefix_caching
prefix_caching_hash_algo: PrefixCachingHashAlgo = \
CacheConfig.prefix_caching_hash_algo
disable_sliding_window: bool = False
disable_cascade_attn: bool = False
disable_sliding_window: bool = ModelConfig.disable_sliding_window
disable_cascade_attn: bool = ModelConfig.disable_cascade_attn
use_v2_block_manager: bool = True
swap_space: float = CacheConfig.swap_space
cpu_offload_gb: float = CacheConfig.cpu_offload_gb
@@ -258,18 +263,19 @@ class EngineArgs:
long_prefill_token_threshold: int = \
SchedulerConfig.long_prefill_token_threshold
max_num_seqs: Optional[int] = SchedulerConfig.max_num_seqs
max_logprobs: int = 20 # Default value for OpenAI Chat Completions API
max_logprobs: int = ModelConfig.max_logprobs
disable_log_stats: bool = False
revision: Optional[str] = None
code_revision: Optional[str] = None
rope_scaling: Optional[Dict[str, Any]] = None
rope_theta: Optional[float] = None
hf_token: Optional[Union[bool, str]] = None
hf_overrides: Optional[HfOverrides] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
enforce_eager: Optional[bool] = None
max_seq_len_to_capture: int = 8192
revision: Optional[str] = ModelConfig.revision
code_revision: Optional[str] = ModelConfig.code_revision
rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling")
rope_theta: Optional[float] = ModelConfig.rope_theta
hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token
hf_overrides: Optional[HfOverrides] = \
get_field(ModelConfig, "hf_overrides")
tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision
quantization: Optional[QuantizationMethods] = ModelConfig.quantization
enforce_eager: bool = ModelConfig.enforce_eager
max_seq_len_to_capture: int = ModelConfig.max_seq_len_to_capture
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
# The following three fields are deprecated and will be removed in a future
# release. Setting them will have no effect. Please remove them from your
@@ -280,8 +286,10 @@ class EngineArgs:
get_field(TokenizerPoolConfig, "extra_config")
limit_mm_per_prompt: dict[str, int] = \
get_field(MultiModalConfig, "limit_per_prompt")
mm_processor_kwargs: Optional[Dict[str, Any]] = None
disable_mm_preprocessor_cache: bool = False
mm_processor_kwargs: Optional[Dict[str, Any]] = \
MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = \
MultiModalConfig.disable_mm_preprocessor_cache
# LoRA fields
enable_lora: bool = False
enable_lora_bias: bool = LoRAConfig.bias_enabled
@@ -323,7 +331,8 @@ class EngineArgs:
DecodingConfig.disable_any_whitespace
guided_decoding_disable_additional_properties: bool = \
DecodingConfig.disable_additional_properties
logits_processor_pattern: Optional[str] = None
logits_processor_pattern: Optional[
str] = ModelConfig.logits_processor_pattern

speculative_config: Optional[Dict[str, Any]] = None

@@ -331,22 +340,25 @@ class EngineArgs:
show_hidden_metrics_for_version: Optional[str] = None
otlp_traces_endpoint: Optional[str] = None
collect_detailed_traces: Optional[str] = None
disable_async_output_proc: bool = False
disable_async_output_proc: bool = not ModelConfig.use_async_output_proc
scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls

override_neuron_config: Optional[Dict[str, Any]] = None
override_pooler_config: Optional[PoolerConfig] = None
override_neuron_config: dict[str, Any] = \
get_field(ModelConfig, "override_neuron_config")
override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
ModelConfig.override_pooler_config
compilation_config: Optional[CompilationConfig] = None
worker_cls: str = ParallelConfig.worker_cls
worker_extension_cls: str = ParallelConfig.worker_extension_cls

kv_transfer_config: Optional[KVTransferConfig] = None

generation_config: Optional[str] = "auto"
override_generation_config: Optional[Dict[str, Any]] = None
enable_sleep_mode: bool = False
model_impl: str = "auto"
generation_config: str = ModelConfig.generation_config
enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
override_generation_config: dict[str, Any] = \
get_field(ModelConfig, "override_generation_config")
model_impl: str = ModelConfig.model_impl

calculate_kv_scales: bool = CacheConfig.calculate_kv_scales

@@ -356,9 +368,6 @@ class EngineArgs:
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load

def __post_init__(self):
if not self.tokenizer:
self.tokenizer = self.model

# support `EngineArgs(compilation_config={...})`
# without having to manually construct a
# CompilationConfig object
@@ -375,80 +384,87 @@ class EngineArgs:
"""Shared CLI arguments for vLLM engine."""

# Model arguments
parser.add_argument(
'--model',
type=str,
default=EngineArgs.model,
help='Name or path of the huggingface model to use.')
parser.add_argument(
'--task',
default=EngineArgs.task,
choices=get_args(TaskOption),
help='The task to use the model for. Each vLLM instance only '
'supports one task, even if the same model can be used for '
'multiple tasks. When the model only supports one task, ``"auto"`` '
'can be used to select it; otherwise, you must specify explicitly '
'which task to use.')
parser.add_argument(
'--tokenizer',
type=optional_type(str),
default=EngineArgs.tokenizer,
help='Name or path of the huggingface tokenizer to use. '
'If unspecified, model name or path will be used.')
parser.add_argument(
"--hf-config-path",
type=optional_type(str),
default=EngineArgs.hf_config_path,
help='Name or path of the huggingface config to use. '
'If unspecified, model name or path will be used.')
parser.add_argument(
'--skip-tokenizer-init',
action='store_true',
help='Skip initialization of tokenizer and detokenizer. '
'Expects valid prompt_token_ids and None for prompt from '
'the input. The generated output will contain token ids.')
parser.add_argument(
'--revision',
type=optional_type(str),
default=None,
help='The specific model version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument(
'--code-revision',
type=optional_type(str),
default=None,
help='The specific revision to use for the model code on '
'Hugging Face Hub. It can be a branch name, a tag name, or a '
'commit id. If unspecified, will use the default version.')
parser.add_argument(
'--tokenizer-revision',
type=optional_type(str),
default=None,
help='Revision of the huggingface tokenizer to use. '
'It can be a branch name, a tag name, or a commit id. '
'If unspecified, will use the default version.')
parser.add_argument(
'--tokenizer-mode',
type=str,
default=EngineArgs.tokenizer_mode,
choices=['auto', 'slow', 'mistral', 'custom'],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
'"mistral" will always use the `mistral_common` tokenizer. \n* '
'"custom" will use --tokenizer to select the '
'preregistered tokenizer.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='Trust remote code from huggingface.')
parser.add_argument(
'--allowed-local-media-path',
type=str,
help="Allowing API requests to read local images or videos "
"from directories specified by the server file system. "
"This is a security risk. "
"Should only be enabled in trusted environments.")
model_kwargs = get_kwargs(ModelConfig)
model_group = parser.add_argument_group(
title="ModelConfig",
description=ModelConfig.__doc__,
)
model_group.add_argument("--model", **model_kwargs["model"])
model_group.add_argument("--task", **model_kwargs["task"])
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
model_group.add_argument("--tokenizer-mode",
**model_kwargs["tokenizer_mode"])
model_group.add_argument("--trust-remote-code",
**model_kwargs["trust_remote_code"])
model_group.add_argument("--dtype", **model_kwargs["dtype"])
model_group.add_argument("--seed", **model_kwargs["seed"])
model_group.add_argument("--hf-config-path",
**model_kwargs["hf_config_path"])
model_group.add_argument("--allowed-local-media-path",
**model_kwargs["allowed_local_media_path"])
model_group.add_argument("--revision", **model_kwargs["revision"])
model_group.add_argument("--code-revision",
**model_kwargs["code_revision"])
model_group.add_argument("--rope-scaling",
**model_kwargs["rope_scaling"])
model_group.add_argument("--rope-theta", **model_kwargs["rope_theta"])
model_group.add_argument("--tokenizer-revision",
**model_kwargs["tokenizer_revision"])
model_group.add_argument("--max-model-len",
**model_kwargs["max_model_len"])
model_group.add_argument("--quantization", "-q",
**model_kwargs["quantization"])
model_group.add_argument("--enforce-eager",
**model_kwargs["enforce_eager"])
model_group.add_argument("--max-seq-len-to-capture",
**model_kwargs["max_seq_len_to_capture"])
model_group.add_argument("--max-logprobs",
**model_kwargs["max_logprobs"])
model_group.add_argument("--disable-sliding-window",
**model_kwargs["disable_sliding_window"])
model_group.add_argument("--disable-cascade-attn",
**model_kwargs["disable_cascade_attn"])
model_group.add_argument("--skip-tokenizer-init",
**model_kwargs["skip_tokenizer_init"])
model_group.add_argument("--served-model-name",
**model_kwargs["served_model_name"])
# This one is a special case because it is the
# opposite of ModelConfig.use_async_output_proc
model_group.add_argument(
"--disable-async-output-proc",
action="store_true",
default=EngineArgs.disable_async_output_proc,
help="Disable async output processing. This may result in "
"lower performance.")
model_group.add_argument("--config-format",
choices=[f.value for f in ConfigFormat],
**model_kwargs["config_format"])
# This one is a special case because it can bool
# or str. TODO: Handle this in get_kwargs
model_group.add_argument("--hf-token",
type=str,
nargs="?",
const=True,
default=model_kwargs["hf_token"]["default"],
help=model_kwargs["hf_token"]["help"])
model_group.add_argument("--hf-overrides",
**model_kwargs["hf_overrides"])
model_group.add_argument("--override-neuron-config",
**model_kwargs["override_neuron_config"])
model_group.add_argument("--override-pooler-config",
**model_kwargs["override_pooler_config"])
model_group.add_argument("--logits-processor-pattern",
**model_kwargs["logits_processor_pattern"])
model_group.add_argument("--generation-config",
**model_kwargs["generation_config"])
model_group.add_argument("--override-generation-config",
**model_kwargs["override_generation_config"])
model_group.add_argument("--enable-sleep-mode",
**model_kwargs["enable_sleep_mode"])
model_group.add_argument("--model-impl",
choices=[f.value for f in ModelImpl],
**model_kwargs["model_impl"])

# Model loading arguments
load_kwargs = get_kwargs(LoadConfig)
load_group = parser.add_argument_group(
@@ -465,38 +481,6 @@ class EngineArgs:
load_group.add_argument('--use-tqdm-on-load',
**load_kwargs["use_tqdm_on_load"])

parser.add_argument(
'--config-format',
default=EngineArgs.config_format,
choices=[f.value for f in ConfigFormat],
help='The format of the model config to load.\n\n'
'* "auto" will try to load the config in hf format '
'if available else it will try to load in mistral format ')
parser.add_argument(
'--dtype',
type=str,
default=EngineArgs.dtype,
choices=[
'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
],
help='Data type for model weights and activations.\n\n'
'* "auto" will use FP16 precision for FP32 and FP16 models, and '
'BF16 precision for BF16 models.\n'
'* "half" for FP16. Recommended for AWQ quantization.\n'
'* "float16" is the same as "half".\n'
'* "bfloat16" for a balance between precision and range.\n'
'* "float" is shorthand for FP32 precision.\n'
'* "float32" for FP32 precision.')
parser.add_argument('--max-model-len',
type=human_readable_int,
default=EngineArgs.max_model_len,
help='Model context length. If unspecified, will '
'be automatically derived from the model config. '
'Supports k/m/g/K/M/G in human-readable format.\n'
'Examples:\n'
'- 1k → 1000\n'
'- 1K → 1024\n')

# Guided decoding arguments
guided_decoding_kwargs = get_kwargs(DecodingConfig)
guided_decoding_group = parser.add_argument_group(
@@ -520,26 +504,6 @@ class EngineArgs:
choices=list(ReasoningParserManager.reasoning_parsers),
**guided_decoding_kwargs["reasoning_backend"])

parser.add_argument(
'--logits-processor-pattern',
type=optional_type(str),
default=None,
help='Optional regex pattern specifying valid logits processor '
'qualified names that can be passed with the `logits_processors` '
'extra completion argument. Defaults to None, which allows no '
'processors.')
parser.add_argument(
'--model-impl',
type=str,
default=EngineArgs.model_impl,
choices=[f.value for f in ModelImpl],
help='Which implementation of the model to use.\n\n'
'* "auto" will try to use the vLLM implementation if it exists '
'and fall back to the Transformers implementation if no vLLM '
'implementation is available.\n'
'* "vllm" will use the vLLM model implementation.\n'
'* "transformers" will use the Transformers model '
'implementation.\n')
# Parallel arguments
parallel_kwargs = get_kwargs(ParallelConfig)
parallel_group = parser.add_argument_group(
@@ -592,10 +556,6 @@ class EngineArgs:
cache_group.add_argument('--calculate-kv-scales',
**cache_kwargs["calculate_kv_scales"])

parser.add_argument('--disable-sliding-window',
action='store_true',
help='Disables sliding window, '
'capping to sliding window size.')
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=True,
@@ -605,73 +565,9 @@ class EngineArgs:
'Setting this flag to True or False'
' has no effect on vLLM behavior.')

parser.add_argument('--seed',
type=int,
default=EngineArgs.seed,
help='Random seed for operations.')
parser.add_argument(
'--max-logprobs',
type=int,
default=EngineArgs.max_logprobs,
help=('Max number of log probs to return logprobs is specified in'
' SamplingParams.'))
parser.add_argument('--disable-log-stats',
action='store_true',
help='Disable logging statistics.')
# Quantization settings.
parser.add_argument('--quantization',
'-q',
type=optional_type(str),
choices=[*QUANTIZATION_METHODS, None],
default=EngineArgs.quantization,
help='Method used to quantize the weights. If '
'None, we first check the `quantization_config` '
'attribute in the model config file. If that is '
'None, we assume the model weights are not '
'quantized and use `dtype` to determine the data '
'type of the weights.')
parser.add_argument(
'--rope-scaling',
default=None,
type=json.loads,
help='RoPE scaling configuration in JSON format. '
'For example, ``{"rope_type":"dynamic","factor":2.0}``')
parser.add_argument('--rope-theta',
default=None,
type=float,
help='RoPE theta. Use with `rope_scaling`. In '
'some cases, changing the RoPE theta improves the '
'performance of the scaled model.')
parser.add_argument(
'--hf-token',
type=str,
nargs='?',
const=True,
default=None,
help='The token to use as HTTP bearer authorization'
' for remote files. If `True`, will use the token '
'generated when running `huggingface-cli login` '
'(stored in `~/.huggingface`).')
parser.add_argument('--hf-overrides',
type=json.loads,
default=EngineArgs.hf_overrides,
help='Extra arguments for the HuggingFace config. '
'This should be a JSON string that will be '
'parsed into a dictionary.')
parser.add_argument('--enforce-eager',
action='store_true',
help='Always use eager-mode PyTorch. If False, '
'will use eager mode and CUDA graph in hybrid '
'for maximal performance and flexibility.')
parser.add_argument('--max-seq-len-to-capture',
type=int,
default=EngineArgs.max_seq_len_to_capture,
help='Maximum sequence length covered by CUDA '
'graphs. When a sequence has context length '
'larger than this, we fall back to eager mode. '
'Additionally for encoder-decoder models, if the '
'sequence length of the encoder input is larger '
'than this, we fall back to the eager mode.')

# Tokenizer arguments
tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
@@ -775,20 +671,6 @@ class EngineArgs:
"Default to `original/**/*` to avoid repeated loading of llama's "
"checkpoints.")

parser.add_argument(
"--served-model-name",
nargs="+",
type=str,
default=None,
help="The model name(s) used in the API. If multiple "
"names are provided, the server will respond to any "
"of the provided names. The model name in the model "
"field of a response will be the first name in this "
"list. If not specified, the model name will be the "
"same as the ``--model`` argument. Noted that this name(s) "
"will also be used in `model_name` tag content of "
"prometheus metrics, if multiple names provided, metrics "
"tag will take the first one.")
parser.add_argument('--qlora-adapter-name-or-path',
type=str,
default=None,
@@ -822,13 +704,6 @@ class EngineArgs:
"modules. This involves use of possibly costly and or blocking "
"operations and hence might have a performance impact.")

parser.add_argument(
'--disable-async-output-proc',
action='store_true',
default=EngineArgs.disable_async_output_proc,
help="Disable async output processing. This may result in "
"lower performance.")

# Scheduler arguments
scheduler_kwargs = get_kwargs(SchedulerConfig)
scheduler_group = parser.add_argument_group(
@@ -871,19 +746,6 @@ class EngineArgs:
parser.add_argument('--scheduler-cls',
**scheduler_kwargs["scheduler_cls"])

parser.add_argument(
'--override-neuron-config',
type=json.loads,
default=None,
help="Override or set neuron device configuration. "
"e.g. ``{\"cast_logits_dtype\": \"bloat16\"}``.")
parser.add_argument(
'--override-pooler-config',
type=PoolerConfig.from_json,
default=None,
help="Override or set the pooling method for pooling models. "
"e.g. ``{\"pooling_type\": \"mean\", \"normalize\": false}``.")

parser.add_argument('--compilation-config',
'-O',
type=CompilationConfig.from_cli,
@@ -920,34 +782,6 @@ class EngineArgs:
help='The worker extension class on top of the worker cls, '
'it is useful if you just want to add new functions to the worker '
'class without changing the existing functions.')
parser.add_argument(
"--generation-config",
type=optional_type(str),
default="auto",
help="The folder path to the generation config. "
"Defaults to 'auto', the generation config will be loaded from "
"model path. If set to 'vllm', no generation config is loaded, "
"vLLM defaults will be used. If set to a folder path, the "
"generation config will be loaded from the specified folder path. "
"If `max_new_tokens` is specified in generation config, then "
"it sets a server-wide limit on the number of output tokens "
"for all requests.")

parser.add_argument(
"--override-generation-config",
type=json.loads,
default=None,
help="Overrides or sets generation config in JSON format. "
"e.g. ``{\"temperature\": 0.5}``. If used with "
"--generation-config=auto, the override parameters will be merged "
"with the default config from the model. If generation-config is "
"None, only the override parameters are used.")

parser.add_argument("--enable-sleep-mode",
action="store_true",
default=False,
help="Enable sleep mode for the engine. "
"(only cuda platform is supported)")

parser.add_argument(
"--additional-config",
@@ -966,16 +800,6 @@ class EngineArgs:
"If enabled, the model will be able to generate reasoning content."
)

parser.add_argument(
"--disable-cascade-attn",
action="store_true",
default=False,
help="Disable cascade attention for V1. While cascade attention "
"does not change the mathematical correctness, disabling it "
"could be useful for preventing potential numerical issues. "
"Note that even if this is set to False, cascade attention will be "
"only used when the heuristic tells that it's beneficial.")

return parser

@classmethod
@@ -1002,8 +826,7 @@ class EngineArgs:
model=self.model,
hf_config_path=self.hf_config_path,
task=self.task,
# We know this is not None because we set it in __post_init__
tokenizer=cast(str, self.tokenizer),
tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode,
trust_remote_code=self.trust_remote_code,
allowed_local_media_path=self.allowed_local_media_path,
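For context on the `model_kwargs = get_kwargs(ModelConfig)` lines above: the idea is to derive the argparse keyword arguments (type, default, help) for each flag from the config dataclass once, then register every flag in a group with a simple unpack. A rough, hypothetical sketch of that mechanism (the real `get_kwargs` in vllm.engine.arg_utils infers types from type hints, as the `contains_type` hunk shows, and pulls help text from attribute docstrings via `get_attr_docs`; `DemoConfig` and `get_kwargs_sketch` below are illustrative stand-ins only):

    import argparse
    from dataclasses import MISSING, dataclass, fields
    from typing import Any, Optional

    @dataclass
    class DemoConfig:
        """Tiny stand-in config; the real code passes ModelConfig, LoadConfig, etc."""
        model: str = "facebook/opt-125m"
        max_model_len: Optional[int] = None

    def get_kwargs_sketch(cls) -> dict[str, dict[str, Any]]:
        """Turn each dataclass field into kwargs for argparse.add_argument()."""
        out: dict[str, dict[str, Any]] = {}
        for f in fields(cls):
            default = None if f.default is MISSING else f.default
            # The real helper infers the CLI type from the field's type hints;
            # here we just fall back to str when there is no usable default.
            out[f.name] = {
                "type": str if default is None else type(default),
                "default": default,
                "help": f"Value for {cls.__name__}.{f.name}.",
            }
        return out

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title="DemoConfig", description=DemoConfig.__doc__)
    demo_kwargs = get_kwargs_sketch(DemoConfig)
    group.add_argument("--model", **demo_kwargs["model"])
    group.add_argument("--max-model-len", **demo_kwargs["max_model_len"])
    print(parser.parse_args(["--model", "my-model"]))

With this shape, adding or re-documenting a field on the config class automatically updates both the EngineArgs default and the generated CLI help, which is what the hand-written add_argument blocks removed in this commit used to duplicate.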