Move the last arguments in arg_utils.py to be in their final groups (#17531)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-05-01 18:31:44 +01:00
committed by GitHub
parent f2e7af9b86
commit 6768ff4a22
2 changed files with 132 additions and 135 deletions

View File

@@ -1637,7 +1637,7 @@ class ParallelConfig:
"""Use expert parallelism instead of tensor parallelism for MoE layers.""" """Use expert parallelism instead of tensor parallelism for MoE layers."""
max_parallel_loading_workers: Optional[int] = None max_parallel_loading_workers: Optional[int] = None
"""Maximum number of parallal loading workers when loading model """Maximum number of parallel loading workers when loading model
sequentially in multiple batches. To avoid RAM OOM when using tensor sequentially in multiple batches. To avoid RAM OOM when using tensor
parallel and large models.""" parallel and large models."""

View File

@@ -474,15 +474,21 @@ class EngineArgs:
title="LoadConfig", title="LoadConfig",
description=LoadConfig.__doc__, description=LoadConfig.__doc__,
) )
load_group.add_argument('--load-format', load_group.add_argument("--load-format",
choices=[f.value for f in LoadFormat], choices=[f.value for f in LoadFormat],
**load_kwargs["load_format"]) **load_kwargs["load_format"])
load_group.add_argument('--download-dir', load_group.add_argument("--download-dir",
**load_kwargs["download_dir"]) **load_kwargs["download_dir"])
load_group.add_argument('--model-loader-extra-config', load_group.add_argument("--model-loader-extra-config",
**load_kwargs["model_loader_extra_config"]) **load_kwargs["model_loader_extra_config"])
load_group.add_argument('--use-tqdm-on-load', load_group.add_argument("--ignore-patterns",
**load_kwargs["ignore_patterns"])
load_group.add_argument("--use-tqdm-on-load",
**load_kwargs["use_tqdm_on_load"]) **load_kwargs["use_tqdm_on_load"])
load_group.add_argument('--qlora-adapter-name-or-path',
type=str,
default=None,
help='Name or path of the QLoRA adapter.')
# Guided decoding arguments # Guided decoding arguments
guided_decoding_kwargs = get_kwargs(DecodingConfig) guided_decoding_kwargs = get_kwargs(DecodingConfig)
@@ -501,6 +507,14 @@ class EngineArgs:
guided_decoding_group.add_argument( guided_decoding_group.add_argument(
"--guided-decoding-disable-additional-properties", "--guided-decoding-disable-additional-properties",
**guided_decoding_kwargs["disable_additional_properties"]) **guided_decoding_kwargs["disable_additional_properties"])
guided_decoding_group.add_argument(
"--enable-reasoning",
action=argparse.BooleanOptionalAction,
help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
"of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
"parser backend instead. This flag (`--enable-reasoning`) will be "
"removed in v0.10.0. When `--reasoning-parser` is specified, "
"reasoning mode is automatically enabled.")
guided_decoding_group.add_argument( guided_decoding_group.add_argument(
"--reasoning-parser", "--reasoning-parser",
# This choices is a special case because it's not static # This choices is a special case because it's not static
@@ -514,27 +528,31 @@ class EngineArgs:
description=ParallelConfig.__doc__, description=ParallelConfig.__doc__,
) )
parallel_group.add_argument( parallel_group.add_argument(
'--distributed-executor-backend', "--distributed-executor-backend",
**parallel_kwargs["distributed_executor_backend"]) **parallel_kwargs["distributed_executor_backend"])
parallel_group.add_argument( parallel_group.add_argument(
'--pipeline-parallel-size', '-pp', "--pipeline-parallel-size", "-pp",
**parallel_kwargs["pipeline_parallel_size"]) **parallel_kwargs["pipeline_parallel_size"])
parallel_group.add_argument('--tensor-parallel-size', '-tp', parallel_group.add_argument("--tensor-parallel-size", "-tp",
**parallel_kwargs["tensor_parallel_size"]) **parallel_kwargs["tensor_parallel_size"])
parallel_group.add_argument('--data-parallel-size', '-dp', parallel_group.add_argument("--data-parallel-size", "-dp",
**parallel_kwargs["data_parallel_size"]) **parallel_kwargs["data_parallel_size"])
parallel_group.add_argument( parallel_group.add_argument(
'--enable-expert-parallel', "--enable-expert-parallel",
**parallel_kwargs["enable_expert_parallel"]) **parallel_kwargs["enable_expert_parallel"])
parallel_group.add_argument( parallel_group.add_argument(
'--max-parallel-loading-workers', "--max-parallel-loading-workers",
**parallel_kwargs["max_parallel_loading_workers"]) **parallel_kwargs["max_parallel_loading_workers"])
parallel_group.add_argument( parallel_group.add_argument(
'--ray-workers-use-nsight', "--ray-workers-use-nsight",
**parallel_kwargs["ray_workers_use_nsight"]) **parallel_kwargs["ray_workers_use_nsight"])
parallel_group.add_argument( parallel_group.add_argument(
'--disable-custom-all-reduce', "--disable-custom-all-reduce",
**parallel_kwargs["disable_custom_all_reduce"]) **parallel_kwargs["disable_custom_all_reduce"])
parallel_group.add_argument("--worker-cls",
**parallel_kwargs["worker_cls"])
parallel_group.add_argument("--worker-extension-cls",
**parallel_kwargs["worker_extension_cls"])
# KV cache arguments # KV cache arguments
cache_kwargs = get_kwargs(CacheConfig) cache_kwargs = get_kwargs(CacheConfig)
@@ -542,47 +560,34 @@ class EngineArgs:
title="CacheConfig", title="CacheConfig",
description=CacheConfig.__doc__, description=CacheConfig.__doc__,
) )
cache_group.add_argument('--block-size', **cache_kwargs["block_size"]) cache_group.add_argument("--block-size", **cache_kwargs["block_size"])
cache_group.add_argument('--gpu-memory-utilization', cache_group.add_argument("--gpu-memory-utilization",
**cache_kwargs["gpu_memory_utilization"]) **cache_kwargs["gpu_memory_utilization"])
cache_group.add_argument('--swap-space', **cache_kwargs["swap_space"]) cache_group.add_argument("--swap-space", **cache_kwargs["swap_space"])
cache_group.add_argument('--kv-cache-dtype', cache_group.add_argument("--kv-cache-dtype",
**cache_kwargs["cache_dtype"]) **cache_kwargs["cache_dtype"])
cache_group.add_argument('--num-gpu-blocks-override', cache_group.add_argument("--num-gpu-blocks-override",
**cache_kwargs["num_gpu_blocks_override"]) **cache_kwargs["num_gpu_blocks_override"])
cache_group.add_argument("--enable-prefix-caching", cache_group.add_argument("--enable-prefix-caching",
**cache_kwargs["enable_prefix_caching"]) **cache_kwargs["enable_prefix_caching"])
cache_group.add_argument("--prefix-caching-hash-algo", cache_group.add_argument("--prefix-caching-hash-algo",
**cache_kwargs["prefix_caching_hash_algo"]) **cache_kwargs["prefix_caching_hash_algo"])
cache_group.add_argument('--cpu-offload-gb', cache_group.add_argument("--cpu-offload-gb",
**cache_kwargs["cpu_offload_gb"]) **cache_kwargs["cpu_offload_gb"])
cache_group.add_argument('--calculate-kv-scales', cache_group.add_argument("--calculate-kv-scales",
**cache_kwargs["calculate_kv_scales"]) **cache_kwargs["calculate_kv_scales"])
parser.add_argument('--use-v2-block-manager',
action='store_true',
default=True,
help='[DEPRECATED] block manager v1 has been '
'removed and SelfAttnBlockSpaceManager (i.e. '
'block manager v2) is now the default. '
'Setting this flag to True or False'
' has no effect on vLLM behavior.')
parser.add_argument('--disable-log-stats',
action='store_true',
help='Disable logging statistics.')
# Tokenizer arguments # Tokenizer arguments
tokenizer_kwargs = get_kwargs(TokenizerPoolConfig) tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
tokenizer_group = parser.add_argument_group( tokenizer_group = parser.add_argument_group(
title="TokenizerPoolConfig", title="TokenizerPoolConfig",
description=TokenizerPoolConfig.__doc__, description=TokenizerPoolConfig.__doc__,
) )
tokenizer_group.add_argument('--tokenizer-pool-size', tokenizer_group.add_argument("--tokenizer-pool-size",
**tokenizer_kwargs["pool_size"]) **tokenizer_kwargs["pool_size"])
tokenizer_group.add_argument('--tokenizer-pool-type', tokenizer_group.add_argument("--tokenizer-pool-type",
**tokenizer_kwargs["pool_type"]) **tokenizer_kwargs["pool_type"])
tokenizer_group.add_argument('--tokenizer-pool-extra-config', tokenizer_group.add_argument("--tokenizer-pool-extra-config",
**tokenizer_kwargs["extra_config"]) **tokenizer_kwargs["extra_config"])
# Multimodal related configs # Multimodal related configs
@@ -591,13 +596,13 @@ class EngineArgs:
title="MultiModalConfig", title="MultiModalConfig",
description=MultiModalConfig.__doc__, description=MultiModalConfig.__doc__,
) )
multimodal_group.add_argument('--limit-mm-per-prompt', multimodal_group.add_argument("--limit-mm-per-prompt",
**multimodal_kwargs["limit_per_prompt"]) **multimodal_kwargs["limit_per_prompt"])
multimodal_group.add_argument( multimodal_group.add_argument(
'--mm-processor-kwargs', "--mm-processor-kwargs",
**multimodal_kwargs["mm_processor_kwargs"]) **multimodal_kwargs["mm_processor_kwargs"])
multimodal_group.add_argument( multimodal_group.add_argument(
'--disable-mm-preprocessor-cache', "--disable-mm-preprocessor-cache",
**multimodal_kwargs["disable_mm_preprocessor_cache"]) **multimodal_kwargs["disable_mm_preprocessor_cache"])
# LoRA related configs # LoRA related configs
@@ -607,25 +612,25 @@ class EngineArgs:
description=LoRAConfig.__doc__, description=LoRAConfig.__doc__,
) )
lora_group.add_argument( lora_group.add_argument(
'--enable-lora', "--enable-lora",
action=argparse.BooleanOptionalAction, action=argparse.BooleanOptionalAction,
help='If True, enable handling of LoRA adapters.') help="If True, enable handling of LoRA adapters.")
lora_group.add_argument('--enable-lora-bias', lora_group.add_argument("--enable-lora-bias",
**lora_kwargs["bias_enabled"]) **lora_kwargs["bias_enabled"])
lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"]) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"])
lora_group.add_argument('--max-lora-rank', lora_group.add_argument("--max-lora-rank",
**lora_kwargs["max_lora_rank"]) **lora_kwargs["max_lora_rank"])
lora_group.add_argument('--lora-extra-vocab-size', lora_group.add_argument("--lora-extra-vocab-size",
**lora_kwargs["lora_extra_vocab_size"]) **lora_kwargs["lora_extra_vocab_size"])
lora_group.add_argument( lora_group.add_argument(
'--lora-dtype', "--lora-dtype",
**lora_kwargs["lora_dtype"], **lora_kwargs["lora_dtype"],
) )
lora_group.add_argument('--long-lora-scaling-factors', lora_group.add_argument("--long-lora-scaling-factors",
**lora_kwargs["long_lora_scaling_factors"]) **lora_kwargs["long_lora_scaling_factors"])
lora_group.add_argument('--max-cpu-loras', lora_group.add_argument("--max-cpu-loras",
**lora_kwargs["max_cpu_loras"]) **lora_kwargs["max_cpu_loras"])
lora_group.add_argument('--fully-sharded-loras', lora_group.add_argument("--fully-sharded-loras",
**lora_kwargs["fully_sharded_loras"]) **lora_kwargs["fully_sharded_loras"])
# PromptAdapter related configs # PromptAdapter related configs
@@ -635,14 +640,14 @@ class EngineArgs:
description=PromptAdapterConfig.__doc__, description=PromptAdapterConfig.__doc__,
) )
prompt_adapter_group.add_argument( prompt_adapter_group.add_argument(
'--enable-prompt-adapter', "--enable-prompt-adapter",
action=argparse.BooleanOptionalAction, action=argparse.BooleanOptionalAction,
help='If True, enable handling of PromptAdapters.') help="If True, enable handling of PromptAdapters.")
prompt_adapter_group.add_argument( prompt_adapter_group.add_argument(
'--max-prompt-adapters', "--max-prompt-adapters",
**prompt_adapter_kwargs["max_prompt_adapters"]) **prompt_adapter_kwargs["max_prompt_adapters"])
prompt_adapter_group.add_argument( prompt_adapter_group.add_argument(
'--max-prompt-adapter-token', "--max-prompt-adapter-token",
**prompt_adapter_kwargs["max_prompt_adapter_token"]) **prompt_adapter_kwargs["max_prompt_adapter_token"])
# Device arguments # Device arguments
@@ -659,25 +664,11 @@ class EngineArgs:
description=SpeculativeConfig.__doc__, description=SpeculativeConfig.__doc__,
) )
speculative_group.add_argument( speculative_group.add_argument(
'--speculative-config', "--speculative-config",
type=json.loads, type=json.loads,
default=None, default=None,
help='The configurations for speculative decoding.' help="The configurations for speculative decoding. Should be a "
' Should be a JSON string.') "JSON string.")
parser.add_argument(
'--ignore-patterns',
action="append",
type=str,
default=[],
help="The pattern(s) to ignore when loading the model."
"Default to `original/**/*` to avoid repeated loading of llama's "
"checkpoints.")
parser.add_argument('--qlora-adapter-name-or-path',
type=str,
default=None,
help='Name or path of the QLoRA adapter.')
# Observability arguments # Observability arguments
observability_kwargs = get_kwargs(ObservabilityConfig) observability_kwargs = get_kwargs(ObservabilityConfig)
@@ -710,9 +701,9 @@ class EngineArgs:
description=SchedulerConfig.__doc__, description=SchedulerConfig.__doc__,
) )
scheduler_group.add_argument( scheduler_group.add_argument(
'--max-num-batched-tokens', "--max-num-batched-tokens",
**scheduler_kwargs["max_num_batched_tokens"]) **scheduler_kwargs["max_num_batched_tokens"])
scheduler_group.add_argument('--max-num-seqs', scheduler_group.add_argument("--max-num-seqs",
**scheduler_kwargs["max_num_seqs"]) **scheduler_kwargs["max_num_seqs"])
scheduler_group.add_argument( scheduler_group.add_argument(
"--max-num-partial-prefills", "--max-num-partial-prefills",
@@ -723,70 +714,78 @@ class EngineArgs:
scheduler_group.add_argument( scheduler_group.add_argument(
"--long-prefill-token-threshold", "--long-prefill-token-threshold",
**scheduler_kwargs["long_prefill_token_threshold"]) **scheduler_kwargs["long_prefill_token_threshold"])
scheduler_group.add_argument('--num-lookahead-slots', scheduler_group.add_argument("--num-lookahead-slots",
**scheduler_kwargs["num_lookahead_slots"]) **scheduler_kwargs["num_lookahead_slots"])
scheduler_group.add_argument('--scheduler-delay-factor', scheduler_group.add_argument("--scheduler-delay-factor",
**scheduler_kwargs["delay_factor"]) **scheduler_kwargs["delay_factor"])
scheduler_group.add_argument('--preemption-mode', scheduler_group.add_argument("--preemption-mode",
**scheduler_kwargs["preemption_mode"]) **scheduler_kwargs["preemption_mode"])
scheduler_group.add_argument('--num-scheduler-steps', scheduler_group.add_argument("--num-scheduler-steps",
**scheduler_kwargs["num_scheduler_steps"]) **scheduler_kwargs["num_scheduler_steps"])
scheduler_group.add_argument( scheduler_group.add_argument(
'--multi-step-stream-outputs', "--multi-step-stream-outputs",
**scheduler_kwargs["multi_step_stream_outputs"]) **scheduler_kwargs["multi_step_stream_outputs"])
scheduler_group.add_argument('--scheduling-policy', scheduler_group.add_argument("--scheduling-policy",
**scheduler_kwargs["policy"]) **scheduler_kwargs["policy"])
scheduler_group.add_argument( scheduler_group.add_argument(
'--enable-chunked-prefill', "--enable-chunked-prefill",
**scheduler_kwargs["enable_chunked_prefill"]) **scheduler_kwargs["enable_chunked_prefill"])
scheduler_group.add_argument( scheduler_group.add_argument(
"--disable-chunked-mm-input", "--disable-chunked-mm-input",
**scheduler_kwargs["disable_chunked_mm_input"]) **scheduler_kwargs["disable_chunked_mm_input"])
parser.add_argument('--scheduler-cls', scheduler_group.add_argument("--scheduler-cls",
**scheduler_kwargs["scheduler_cls"]) **scheduler_kwargs["scheduler_cls"])
parser.add_argument('--compilation-config', # Compilation arguments
'-O', # compilation_kwargs = get_kwargs(CompilationConfig)
compilation_group = parser.add_argument_group(
title="CompilationConfig",
description=CompilationConfig.__doc__,
)
compilation_group.add_argument(
"--compilation-config",
"-O",
type=CompilationConfig.from_cli, type=CompilationConfig.from_cli,
default=None, default=None,
help='torch.compile configuration for the model. ' help="torch.compile configuration for the model. "
'When it is a number (0, 1, 2, 3), it will be ' "When it is a number (0, 1, 2, 3), it will be "
'interpreted as the optimization level.\n' "interpreted as the optimization level.\n"
'NOTE: level 0 is the default level without ' "NOTE: level 0 is the default level without "
'any optimization. level 1 and 2 are for internal ' "any optimization. level 1 and 2 are for internal "
'testing only. level 3 is the recommended level ' "testing only. level 3 is the recommended level "
'for production.\n' "for production.\n"
'To specify the full compilation config, ' "To specify the full compilation config, "
'use a JSON string, e.g. ``{"level": 3, ' "use a JSON string, e.g. ``{\"level\": 3, "
'"cudagraph_capture_sizes": [1, 2, 4, 8]}``\n' "\"cudagraph_capture_sizes\": [1, 2, 4, 8]}``\n"
'Following the convention of traditional ' "Following the convention of traditional "
'compilers, using ``-O`` without space is also ' "compilers, using ``-O`` without space is also "
'supported. ``-O3`` is equivalent to ``-O 3``.') "supported. ``-O3`` is equivalent to ``-O 3``.")
parser.add_argument('--kv-transfer-config', # KVTransfer arguments
# kv_transfer_kwargs = get_kwargs(KVTransferConfig)
kv_transfer_group = parser.add_argument_group(
title="KVTransferConfig",
description=KVTransferConfig.__doc__,
)
kv_transfer_group.add_argument(
"--kv-transfer-config",
type=KVTransferConfig.from_cli, type=KVTransferConfig.from_cli,
default=None, default=None,
help='The configurations for distributed KV cache ' help="The configurations for distributed KV cache "
'transfer. Should be a JSON string.') "transfer. Should be a JSON string.")
parser.add_argument('--kv-events-config', kv_transfer_group.add_argument(
'--kv-events-config',
type=KVEventsConfig.from_cli, type=KVEventsConfig.from_cli,
default=None, default=None,
help='The configurations for event publishing.') help='The configurations for event publishing.')
parser.add_argument( # vLLM arguments
'--worker-cls', # vllm_kwargs = get_kwargs(VllmConfig)
type=str, vllm_group = parser.add_argument_group(
default="auto", title="VllmConfig",
help='The worker class to use for distributed execution.') description=VllmConfig.__doc__,
parser.add_argument( )
'--worker-extension-cls', vllm_group.add_argument(
type=str,
default="",
help='The worker extension class on top of the worker cls, '
'it is useful if you just want to add new functions to the worker '
'class without changing the existing functions.')
parser.add_argument(
"--additional-config", "--additional-config",
type=json.loads, type=json.loads,
default=None, default=None,
@@ -795,20 +794,18 @@ class EngineArgs:
"configs are valid for the platform you are using. The input format" "configs are valid for the platform you are using. The input format"
" is like '{\"config_key\":\"config_value\"}'") " is like '{\"config_key\":\"config_value\"}'")
parser.add_argument( # Other arguments
"--enable-reasoning", parser.add_argument('--use-v2-block-manager',
action="store_true", action='store_true',
default=False, default=True,
help= help='[DEPRECATED] block manager v1 has been '
"[DEPRECATED] " \ 'removed and SelfAttnBlockSpaceManager (i.e. '
"The --enable-reasoning flag is deprecated as of v0.8.6. " 'block manager v2) is now the default. '
"Use --reasoning-parser to specify " \ 'Setting this flag to True or False'
"the reasoning parser backend instead. " ' has no effect on vLLM behavior.')
"This flag (--enable-reasoning) will be " \ parser.add_argument('--disable-log-stats',
"removed in v0.10.0. " action='store_true',
"When --reasoning-parser is specified, " \ help='Disable logging statistics.')
"reasoning mode is automatically enabled."
)
return parser return parser