diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index b50b310fd..6106b9014 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -42,7 +42,6 @@ details.

 import random
 import time
-from dataclasses import fields

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -124,7 +123,7 @@ def main(args):

     # Create the LLM engine
     engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

     print("------warm up------")
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index e7759616e..0145f6b7c 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from dataclasses import fields

 from transformers import PreTrainedTokenizerBase

@@ -197,7 +196,7 @@ def main(args):

     engine_args = EngineArgs.from_cli_args(args)

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = SamplingParams(
         temperature=0,
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index d83bb7e17..1de833978 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -6,7 +6,6 @@ import argparse
 import json
 import random
 import time
-from dataclasses import fields

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -79,7 +78,7 @@
 ) -> float:
     from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     assert all(
         llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index b7e49d2c9..f384dc2bb 100755
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -9,7 +9,6 @@ on HuggingFace model repository.
 """

 import os
-from dataclasses import asdict
 from typing import Any, NamedTuple

 from huggingface_hub import snapshot_download
@@ -633,7 +632,7 @@ def main(args):
         req_data.engine_args.limit_mm_per_prompt or {}
     )

-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    engine_args = vars(req_data.engine_args) | {"seed": args.seed}
     if args.tensor_parallel_size is not None:
         engine_args["tensor_parallel_size"] = args.tensor_parallel_size
     llm = LLM(**engine_args)
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index 857767ac3..2f72b7d06 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation.
 import os
 import time
 from collections.abc import Sequence
-from dataclasses import asdict
 from typing import NamedTuple

 from vllm import LLM, EngineArgs, PromptType, SamplingParams
@@ -91,13 +90,12 @@ def main(args):
     req_data = model_example_map[model]()

     # Disable other modalities to save memory
+    engine_args = req_data.engine_args
     default_limits = {"image": 0, "video": 0, "audio": 0}
-    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {}
-    )
-
-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
-    llm = LLM(**engine_args)
+    limit_mm_per_prompt = default_limits | (engine_args.limit_mm_per_prompt or {})
+    engine_args.limit_mm_per_prompt = limit_mm_per_prompt
+    engine_args.seed = args.seed
+    llm = LLM.from_engine_args(engine_args)

     prompts = req_data.prompts
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index 7743733f8..0085e8e8e 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -20,8 +20,6 @@ python load_sharded_state.py \
     --max-tokens 50
 """

-import dataclasses
-
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.utils.argparse_utils import FlexibleArgumentParser

@@ -64,7 +62,7 @@ def main():
     print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")

     # Load the model using engine args
-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)

     # Prepare sampling parameters
     sampling_params = SamplingParams(
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 43d890465..14d472ee3 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -21,7 +21,6 @@ llm = LLM(
 )
 """

-import dataclasses
 import os
 import shutil
 from pathlib import Path
@@ -60,7 +59,7 @@ def main(args):
     if not Path(model_path).is_dir():
         raise ValueError("model path must be a local directory")
     # Create LLM instance from arguments
-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)
     # Prepare output directory
     Path(args.output).mkdir(exist_ok=True)
     # Dump worker states to output directory
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index d0122b318..56154c122 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -11,7 +11,6 @@ on HuggingFace model repository.
 import os
 import random
 from contextlib import contextmanager
-from dataclasses import asdict
 from typing import NamedTuple

 from huggingface_hub import snapshot_download
@@ -2434,13 +2433,13 @@ def main(args):
         req_data.engine_args.limit_mm_per_prompt or {}
     )

-    engine_args = asdict(req_data.engine_args) | {
-        "seed": args.seed,
-        "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
-    }
+    engine_args = req_data.engine_args
+    engine_args.seed = args.seed
+    mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
+    engine_args.mm_processor_cache_gb = mm_processor_cache_gb
     if args.tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = args.tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = args.tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

     # Don't want to check the flag multiple times, so just hijack `prompts`.
     prompts = (
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 632646956..38a34a68e 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -8,7 +8,6 @@ using the chat template defined by the model.

 import os
 from argparse import Namespace
-from dataclasses import asdict
 from typing import NamedTuple

 from huggingface_hub import snapshot_download
@@ -1481,10 +1480,11 @@ def run_generate(
 ):
     req_data = model_example_map[model](question, image_urls)

-    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    engine_args = req_data.engine_args
+    engine_args.seed = seed
     if tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = SamplingParams(
         temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
@@ -1521,10 +1521,11 @@ def run_chat(
         req_data.engine_args.limit_mm_per_prompt or {}
     )

-    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    engine_args = req_data.engine_args
+    engine_args.seed = seed
     if tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = (
         SamplingParams(
diff --git a/examples/pooling/embed/vision_embedding_offline.py b/examples/pooling/embed/vision_embedding_offline.py
index a5f0d35af..3190e00c4 100644
--- a/examples/pooling/embed/vision_embedding_offline.py
+++ b/examples/pooling/embed/vision_embedding_offline.py
@@ -10,12 +10,11 @@ on HuggingFace model repository.
 """

 import argparse
-from dataclasses import asdict
 from pathlib import Path

 from PIL.Image import Image

-from vllm import LLM, EngineArgs
+from vllm import LLM
 from vllm.multimodal.utils import fetch_image
 from vllm.utils.print_utils import print_embeddings

@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)}


 def run_clip(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="openai/clip-vit-base-patch32",
         runner="pooling",
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
     print("Text embedding output:")
     outputs = llm.embed(text, use_tqdm=False)
     print_embeddings(outputs[0].outputs.embedding)
@@ -53,15 +51,14 @@ def run_clip(seed: int):


 def run_e5_v(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="royokong/e5-v",
         runner="pooling",
         max_model_len=4096,
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
     llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501

     print("Text embedding output:")
@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int):
         multi_modal_data["image"] = post_process_image(multi_modal_data["image"])

-    engine_args = EngineArgs(
-        model="Qwen/Qwen3-VL-Embedding-2B",
-        runner="pooling",
-        max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
-        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
-    )
     default_instruction = "Represent the user's input."
     image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
     prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
     prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
     prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
+    llm = LLM(
+        model="Qwen/Qwen3-VL-Embedding-2B",
+        runner="pooling",
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": 1},
+        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
+        seed=seed,
+    )

     print("Text embedding output:")
     outputs = llm.embed(prompt_text, use_tqdm=False)
@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int):


 def run_siglip(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="google/siglip-base-patch16-224",
         runner="pooling",
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
     print("Text embedding output:")
     outputs = llm.embed(text, use_tqdm=False)
     print_embeddings(outputs[0].outputs.embedding)
@@ -174,16 +170,15 @@ def run_vlm2vec_phi3v(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="TIGER-Lab/VLM2Vec-Full",
         runner="pooling",
         max_model_len=4096,
         trust_remote_code=True,
         mm_processor_kwargs={"num_crops": 4},
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )
-
-    llm = LLM(**asdict(engine_args) | {"seed": seed})
     image_token = "<|image_1|>"

     print("Text embedding output:")
@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int):
         processor.save_pretrained(merged_path)
         print("Done!")

-    engine_args = EngineArgs(
+    llm = LLM(
         model=merged_path,
         runner="pooling",
         max_model_len=4096,
@@ -268,9 +263,8 @@
             "max_pixels": 12845056,
         },
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )
-
-    llm = LLM(**asdict(engine_args) | {"seed": seed})
     image_token = "<|image_pad|>"

     print("Text embedding output:")
diff --git a/examples/pooling/score/vision_reranker_offline.py b/examples/pooling/score/vision_reranker_offline.py
index 19bb98177..ef2b35487 100644
--- a/examples/pooling/score/vision_reranker_offline.py
+++ b/examples/pooling/score/vision_reranker_offline.py
@@ -10,7 +10,6 @@ multimodal documents (text + images/videos).

 from argparse import Namespace
 from collections.abc import Callable
-from dataclasses import asdict
 from pathlib import Path
 from typing import NamedTuple

@@ -125,7 +124,7 @@ def main(args: Namespace):
     model_request = model_example_map[args.model_name]()
     engine_args = model_request.engine_args

-    llm = LLM(**asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)

     print("Query: string & Document: string")
     outputs = llm.score(query, document)
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 6b8f3b60b..53434b0b4 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init(
         ctx,
         patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
     ):
+        kwargs = {}
+        if cudagraph_capture_sizes is not None:
+            kwargs["cudagraph_capture_sizes"] = cudagraph_capture_sizes
+        if max_cudagraph_capture_size is not None:
+            kwargs["max_cudagraph_capture_size"] = max_cudagraph_capture_size
         compilation_config = CompilationConfig(
-            cudagraph_capture_sizes=cudagraph_capture_sizes,
-            max_cudagraph_capture_size=max_cudagraph_capture_size,
             pass_config=PassConfig(
                 enable_sp=enable_sp,
                 fuse_norm_quant=True,
@@ -425,6 +428,7 @@
                 sp_min_token_num=512 if enable_sp else None,
             ),
             cudagraph_mode=cudagraph_mode,
+            **kwargs,
         )
         engine_args = EngineArgs(
             model="facebook/opt-125m",
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index 55a3b9858..8ca15c286 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for HF_HUB_OFFLINE mode"""

-import dataclasses
 import importlib
 import sys

@@ -12,7 +11,6 @@ import urllib3

 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.engine.arg_utils import EngineArgs

 MODEL_CONFIGS = [
     {
@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
         # Need to re-import huggingface_hub
         # and friends to set up offline mode
         _re_import_modules()
-        engine_args = EngineArgs(model="facebook/opt-125m")
-        LLM(**dataclasses.asdict(engine_args))
+        LLM(model="facebook/opt-125m")
     finally:
         # Reset the environment after the test
         # NB: Assuming tests are run in online mode
diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py
index d7430821d..e9d6626a2 100644
--- a/tests/models/multimodal/generation/test_keye.py
+++ b/tests/models/multimodal/generation/test_keye.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import asdict
 from typing import NamedTuple

 import pytest
@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str):
     images = [asset.pil_image for asset in image_assets]
     image_urls = [encode_image_url(image) for image in images]

-    engine_args = EngineArgs(
-        model=MODEL_NAME,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
     placeholders = [{"type": "image", "image": url} for url in image_urls]
     messages = [
         {
@@ -54,8 +45,14 @@
         messages, tokenize=False, add_generation_prompt=True
     )

-    engine_args = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_args)
+    llm = LLM(
+        model=MODEL_NAME,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        seed=42,
+    )

     sampling_params = SamplingParams(
         temperature=0.0, max_tokens=256, stop_token_ids=None
diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py
index 9310f52df..ad912067a 100644
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate
 outputs using different ViT attention backends. Tests are parametrized by
 model and backend.
 """
-from dataclasses import asdict
 from typing import Any

 import pytest
 from transformers import AutoProcessor

-from vllm import LLM, EngineArgs, SamplingParams
+from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import encode_image_url
 from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
     limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})

     # Create engine
-    engine_args = EngineArgs(
+    llm = LLM(
         model=config["model_name"],
         trust_remote_code=True,
         max_model_len=config["max_model_len"],
@@ -283,11 +282,9 @@
         mm_encoder_attn_backend=mm_encoder_attn_backend,
         hf_overrides=dummy_hf_overrides,
         load_format="dummy",
+        seed=42,
     )

-    engine_dict = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_dict)
-
     # Generate
     sampling_params = SamplingParams(**config["sampling_params"])
     outputs = llm.generate(
@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
     messages = build_dots_ocr_prompt([stop_sign_image], config)

     # Create engine
-    engine_args = EngineArgs(
+    llm = LLM(
         model=config["model_name"],
         trust_remote_code=True,
         max_model_len=config["max_model_len"],
@@ -327,11 +324,9 @@
         mm_encoder_attn_backend=mm_encoder_attn_backend,
         hf_overrides=dummy_hf_overrides,
         load_format="dummy",
+        seed=42,
     )

-    engine_dict = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_dict)
-
     # Generate using chat
     sampling_params = SamplingParams(**config["sampling_params"])
     outputs = llm.chat(messages=messages, sampling_params=sampling_params)
diff --git a/tests/models/multimodal/generation/test_voxtral_realtime.py b/tests/models/multimodal/generation/test_voxtral_realtime.py
index cac79b237..ca43e7b51 100644
--- a/tests/models/multimodal/generation/test_voxtral_realtime.py
+++ b/tests/models/multimodal/generation/test_voxtral_realtime.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
-from dataclasses import asdict

 import pytest
 import pytest_asyncio
@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer:
 @pytest.fixture
 def engine():
     engine_args = EngineArgs(**ENGINE_CONFIG)
-    llm = LLM(**asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)
     try:
         yield llm
     finally:
diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py
index 7e05a0d93..90381d47b 100644
--- a/tests/v1/kv_connector/unit/test_example_connector.py
+++ b/tests/v1/kv_connector/unit/test_example_connector.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import asdict
 from typing import NamedTuple

 import pytest
 from PIL import Image

-from vllm import LLM, EngineArgs, SamplingParams
+from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import AttentionConfig, KVTransferConfig
 from vllm.multimodal.utils import encode_image_url
@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
     # Using tmp_path as the storage path to store KV
     print(f"KV storage path at: {str(tmp_path)}")

-    # Configure the ExampleConnector
-    kv_transfer_config = KVTransferConfig(
-        kv_connector="ExampleConnector",
-        kv_role="kv_both",
-        kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
-    )
-
-    engine_args = EngineArgs(
-        model=MODEL_NAME,
-        max_model_len=8192,
-        max_num_seqs=1,
-        gpu_memory_utilization=0.4,
-        attention_config=AttentionConfig(backend=attn_backend),
-        enforce_eager=True,
-        kv_transfer_config=kv_transfer_config,
-        limit_mm_per_prompt={"image": 2},
-    )
-
     # don't put this import at the top level
     # it will call torch.accelerator.device_count()
     from transformers import AutoProcessor
@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
     assert image_1 != image_2, "The images should not be identical"

     # Create the LLM instance
-    engine_args = asdict(engine_args)
-    llm = LLM(**engine_args)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=8192,
+        max_num_seqs=1,
+        gpu_memory_utilization=0.4,
+        attention_config=AttentionConfig(backend=attn_backend),
+        enforce_eager=True,
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="ExampleConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
+        ),
+        limit_mm_per_prompt={"image": 2},
+    )

     # Prepare the input cases
     input_cases = [
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 758e5efed..66afbec1b 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -6,7 +6,6 @@ import argparse
 import json
 import os
 import time
-from dataclasses import fields
 from typing import Any

 import numpy as np
@@ -85,7 +84,7 @@ def main(args: argparse.Namespace):
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
     assert llm.llm_engine.model_config.max_model_len >= (
         args.input_len + args.output_len
     ), (
diff --git a/vllm/benchmarks/mm_processor.py b/vllm/benchmarks/mm_processor.py
index 4f31af0e0..3ce0b9119 100644
--- a/vllm/benchmarks/mm_processor.py
+++ b/vllm/benchmarks/mm_processor.py
@@ -17,7 +17,6 @@ import argparse
 import json
 import time
 from collections import defaultdict
-from dataclasses import fields
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Literal
@@ -225,7 +224,7 @@ def benchmark_multimodal_processor(
         args.seed = 0
     engine_args = EngineArgs.from_cli_args(args)

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     tokenizer = llm.get_tokenizer()
     requests = get_requests(args, tokenizer)
diff --git a/vllm/benchmarks/startup.py b/vllm/benchmarks/startup.py
index 405299938..375b8f9fa 100644
--- a/vllm/benchmarks/startup.py
+++ b/vllm/benchmarks/startup.py
@@ -16,7 +16,6 @@ import shutil
 import tempfile
 import time
 from contextlib import contextmanager
-from dataclasses import fields
 from typing import Any

 import numpy as np
@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue):
         # Measure total startup time
         start_time = time.perf_counter()

-        llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+        llm = LLM.from_engine_args(engine_args)

         total_startup_time = time.perf_counter() - start_time
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index f7cea8bdd..6f878b275 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -8,7 +8,6 @@ import os
 import random
 import time
 import warnings
-from dataclasses import fields
 from typing import Any

 import torch
@@ -53,7 +52,7 @@ def run_vllm(
 ) -> tuple[float, list[RequestOutput] | None]:
     from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
     assert all(
         llm.llm_engine.model_config.max_model_len
         >= (request.prompt_len + request.expected_output_len)
@@ -141,7 +140,7 @@ def run_vllm_chat(
     """
     from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     assert all(
         llm.llm_engine.model_config.max_model_len
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 9e4196a44..ec2f7e7f7 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -116,29 +116,29 @@ class PassConfig:
     """

     # New flags
-    fuse_norm_quant: bool | None = Field(default=None)
+    fuse_norm_quant: bool = None  # type: ignore[assignment]
     """Fuse the custom RMSNorm + quant ops."""
-    fuse_act_quant: bool | None = Field(default=None)
+    fuse_act_quant: bool = None  # type: ignore[assignment]
     """Fuse the custom SiluMul + quant ops."""
-    fuse_attn_quant: bool | None = Field(default=None)
+    fuse_attn_quant: bool = None  # type: ignore[assignment]
     """Fuse the custom attention + quant ops."""
     eliminate_noops: bool = Field(default=True)
     """Eliminate no-op ops."""
-    enable_sp: bool | None = Field(default=None)
+    enable_sp: bool = None  # type: ignore[assignment]
     """Enable sequence parallelism. Requires TP>1.
     Automatically disabled if the model's hidden_size is too small for SP
     to be beneficial (threshold is device-capability dependent)."""
-    fuse_gemm_comms: bool | None = Field(default=None)
+    fuse_gemm_comms: bool = None  # type: ignore[assignment]
     """Enable async TP."""
-    fuse_allreduce_rms: bool | None = Field(default=None)
+    fuse_allreduce_rms: bool = None  # type: ignore[assignment]
     """Enable flashinfer allreduce fusion."""
     enable_qk_norm_rope_fusion: bool = False
     """Enable fused Q/K RMSNorm + RoPE pass."""

     # ROCm/AITER specific fusions
-    fuse_act_padding: bool | None = Field(default=None)
+    fuse_act_padding: bool = None  # type: ignore[assignment]
     """Fuse the custom RMSNorm + padding ops."""
-    fuse_rope_kvcache: bool | None = Field(default=None)
+    fuse_rope_kvcache: bool = None  # type: ignore[assignment]
     """Fuse the QK rope + KV cache ops."""

     rope_kvcache_fusion_max_token_num: int = 256
@@ -405,7 +405,7 @@ class CompilationConfig:
     """

     # Top-level Compilation control
-    mode: CompilationMode = Field(default=None)  # type: ignore[assignment]
+    mode: CompilationMode = None  # type: ignore[assignment]
     """The compilation approach used for torch.compile-based compilation of
     the model.

@@ -545,7 +545,7 @@
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode = Field(default=None)  # type: ignore[assignment]
+    cudagraph_mode: CUDAGraphMode = None  # type: ignore[assignment]
     """
     The mode of the cudagraph:

@@ -586,7 +586,7 @@
     It means the first several runs will be treated as warmup runs.
     Only after that, the execution will be recorded, and the recorded cudagraph
     will be used for subsequent runs."""
-    cudagraph_capture_sizes: list[int] | None = None
+    cudagraph_capture_sizes: list[int] = None  # type: ignore[assignment]
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
@@ -607,7 +607,7 @@
     When `enable_lora` is False, this option has no effect.
     """

-    use_inductor_graph_partition: bool = Field(default=None)  # type: ignore[assignment]
+    use_inductor_graph_partition: bool = None  # type: ignore[assignment]
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps
@@ -630,7 +630,7 @@
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""

-    max_cudagraph_capture_size: int | None = field(default=None)
+    max_cudagraph_capture_size: int = None  # type: ignore[assignment]
     """The maximum cudagraph capture size.

     If cudagraph_capture_sizes is specified, this will be set to the largest
@@ -750,7 +750,7 @@
         return hash_factors(factors)

     def __repr__(self) -> str:
-        exclude = {
+        exclude: dict[str, bool | dict[str, bool]] = {
             "static_forward_context": True,
             "enabled_custom_ops": True,
             "disabled_custom_ops": True,
@@ -770,9 +770,7 @@
             exclude["pass_config"] = pass_config_exclude

         config = TypeAdapter(CompilationConfig).dump_python(
-            self,
-            exclude=exclude,  # type: ignore[arg-type]
-            exclude_unset=True,
+            self, exclude=exclude, exclude_unset=True
         )
         return str(config)

@@ -1023,7 +1021,6 @@
                         "Unrecognized size type in compile_sizes, "
                         f"expect 'cudagraph_capture_sizes', got {x}"
                     )
-                assert self.cudagraph_capture_sizes is not None
                 computed_compile_sizes.extend(self.cudagraph_capture_sizes)
             else:
                 assert isinstance(x, int)
@@ -1031,7 +1028,6 @@
         self.compile_sizes = computed_compile_sizes  # type: ignore

         # make sure the sizes are in ascending order
-        assert self.cudagraph_capture_sizes is not None
         self.cudagraph_capture_sizes.sort()
         if self.cudagraph_capture_sizes:
             assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
@@ -1123,7 +1119,6 @@
     def set_splitting_ops_for_attn_fusion(self):
         assert self.pass_config.fuse_attn_quant
-        assert self.cudagraph_mode is not None
         if self.splitting_ops is None:
             self.splitting_ops = []
         if self.cudagraph_mode.has_piecewise_cudagraphs():
diff --git a/vllm/config/device.py b/vllm/config/device.py
index bb689c9b3..c20e4d0f2 100644
--- a/vllm/config/device.py
+++ b/vllm/config/device.py
@@ -13,8 +13,8 @@ from vllm.utils.hashing import safe_hash
 Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class DeviceConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""

     device: SkipValidation[Device | torch.device | None] = "auto"
diff --git a/vllm/config/kernel.py b/vllm/config/kernel.py
index 5e1c9109a..2ec18289d 100644
--- a/vllm/config/kernel.py
+++ b/vllm/config/kernel.py
@@ -4,7 +4,7 @@
 from collections.abc import Callable
 from typing import Any, Literal

-from pydantic import Field, field_validator
+from pydantic import field_validator

 from vllm.config.utils import config
 from vllm.utils.hashing import safe_hash
@@ -26,7 +26,7 @@ MoEBackend = Literal[
 class KernelConfig:
     """Configuration for kernel selection and warmup behavior."""

-    enable_flashinfer_autotune: bool | None = Field(default=None)
+    enable_flashinfer_autotune: bool = None  # type: ignore[assignment]
     """If True, run FlashInfer autotuning during kernel warmup."""

     moe_backend: MoEBackend = "auto"
diff --git a/vllm/config/kv_events.py b/vllm/config/kv_events.py
index 77ed5fabf..d618bc9a7 100644
--- a/vllm/config/kv_events.py
+++ b/vllm/config/kv_events.py
@@ -4,8 +4,6 @@

 from typing import Literal

-from pydantic import Field
-
 from vllm.config.utils import config

@@ -18,7 +16,7 @@ class KVEventsConfig:
     Events can be published externally by zmq using the event publisher config.
     """

-    publisher: Literal["null", "zmq"] | None = Field(default=None)
+    publisher: Literal["null", "zmq"] = None  # type: ignore[assignment]
     """The publisher to use for publishing kv events. Can be "null", "zmq".
     """
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
index 696e92df7..bfef0efa3 100644
--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -25,8 +25,8 @@ MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
 LoRAExtraVocabSize = Literal[256, 512]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class LoRAConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
     """Configuration for LoRA."""

     max_lora_rank: MaxLoRARanks = 16
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 032f29c18..e51723009 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -102,8 +102,8 @@ AttnTypeStr = Literal[
 ]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class ModelConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class ModelConfig:
     """Configuration for the model."""

     model: str = "Qwen/Qwen3-0.6B"
@@ -121,7 +121,7 @@ class ModelConfig:  # type: ignore[misc]
     """Convert the model using adapters defined in
     [vllm.model_executor.models.adapters][]. The most common use case is to
     adapt a text generation model to be used for pooling tasks."""
-    tokenizer: str = Field(default=None)  # type: ignore[assignment]
+    tokenizer: str = None  # type: ignore[assignment]
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
     tokenizer_mode: TokenizerMode | str = "auto"
@@ -583,7 +583,7 @@
             self.dtype,
             is_pooling_model=self.runner_type == "pooling",
             revision=self.revision,
-            config_format=self.config_format,  # type: ignore[arg-type]
+            config_format=self.config_format,
         )

         self.original_max_model_len = self.max_model_len
@@ -733,7 +733,7 @@

     @property
     def architectures(self) -> list[str]:
-        return self.model_arch_config.architectures  # type: ignore[return-value]
+        return self.model_arch_config.architectures

     @property
     def architecture(self) -> str:
@@ -1944,7 +1944,7 @@ def _get_and_verify_dtype(
     *,
     is_pooling_model: bool,
     revision: str | None = None,
-    config_format: ConfigFormat = "hf",
+    config_format: str | ConfigFormat = "hf",
 ) -> torch.dtype:
     config_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
         config, model_id, revision=revision, config_format=config_format
diff --git a/vllm/config/model_arch.py b/vllm/config/model_arch.py
index d55e2a339..24d1baea0 100644
--- a/vllm/config/model_arch.py
+++ b/vllm/config/model_arch.py
@@ -16,7 +16,7 @@ class ModelArchitectureConfig:
     Configuration for model architecture that required by vLLM runtime
     """

-    architectures: list[str] | None
+    architectures: list[str]
     """List of model architecture class names (e.g., ['LlamaForCausalLM']).
     It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 8332b0ec7..7dd9c5bb5 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -194,7 +194,7 @@ class ParallelConfig:
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""

-    disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
+    disable_nccl_for_dp_synchronization: bool | None = None
     """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
     to use Gloo instead of NCCL for its all reduce.
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index ce30fcab4..f988c1086 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -52,7 +52,7 @@ class SchedulerConfig:
     In real usage, this should be set in `EngineArgs.create_engine_config`.
     """

-    max_num_scheduled_tokens: int | None = Field(default=None)
+    max_num_scheduled_tokens: int | None = None
     """Maximum number of tokens that the scheduler may issue in a single
     iteration. This is usually equal to max_num_batched_tokens, but can be
     smaller in cases
@@ -122,7 +122,7 @@
     # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
     # (default) or "mod.custom_class".
-    scheduler_cls: str | type[object] | None = Field(default=None)
+    scheduler_cls: str | type[object] | None = None
     """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
     the default scheduler. Can be a class directly or the path to a class of
     form "mod.custom_class"."""
@@ -141,7 +141,7 @@
     checking the first chunk. Prevents over-admission and KV cache thrashing
     with chunked prefill."""

-    async_scheduling: bool | None = Field(default=None)
+    async_scheduling: bool | None = None
     """If set to False, disable async scheduling. Async scheduling helps to
     avoid gaps in GPU utilization, leading to better latency and throughput.
     """
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 7ae9c0c24..899666048 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -11,13 +11,13 @@ import os
 import pathlib
 import textwrap
 from collections.abc import Callable, Mapping, Sequence, Set
-from dataclasses import MISSING, dataclass, field, fields, is_dataclass
+from dataclasses import MISSING, field, fields, is_dataclass
 from itertools import pairwise
-from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast, overload

 import torch
 from pydantic import ConfigDict
-from pydantic.dataclasses import dataclass as pydantic_dataclass
+from pydantic.dataclasses import dataclass
 from pydantic.fields import Field as PydanticField
 from pydantic.fields import FieldInfo
 from typing_extensions import dataclass_transform, runtime_checkable
@@ -36,6 +36,16 @@ ConfigType = type[DataclassInstance]
 ConfigT = TypeVar("ConfigT", bound=DataclassInstance)


+@overload
+def config(cls: type[ConfigT]) -> type[ConfigT]: ...
+
+
+@overload
+def config(
+    *, config: ConfigDict | None = None, **kwargs: Any
+) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
+
+
 @dataclass_transform(field_specifiers=(PydanticField,))
 def config(
@@ -59,7 +69,7 @@
         merged_config.update(config)

     def decorator(cls: type[ConfigT]) -> type[ConfigT]:
-        return pydantic_dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]
+        return dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]

     # Called with arguments: @config(config=...)
     if cls is None:
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 88942fc86..65a78f4d0 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -246,15 +246,15 @@ OPTIMIZATION_LEVEL_TO_CONFIG = {
 }


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class VllmConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class VllmConfig:
     """Dataclass which contains all vllm-related configuration. This
     simplifies passing around the distinct configurations in the codebase.
     """

     # TODO: use default_factory once default constructing ModelConfig doesn't
     # try to download a model
-    model_config: ModelConfig = Field(default=None)  # type: ignore[assignment]
+    model_config: ModelConfig = None  # type: ignore[assignment]
     """Model configuration."""
     cache_config: CacheConfig = Field(default_factory=CacheConfig)
     """Cache configuration."""
@@ -912,7 +912,8 @@
             tp_size = self.parallel_config.tensor_parallel_size
             hidden_size = self.model_config.get_hidden_size()
-            element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+            assert isinstance(self.model_config.dtype, torch.dtype)
+            element_size = self.model_config.dtype.itemsize
             pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                 hidden_size, tp_size, element_size
             )
@@ -1246,14 +1247,6 @@
             )
             self.compilation_config.debug_dump_path = env_path

-        def has_blocked_weights():  # type: ignore[no-redef]
-            if self.quant_config is not None:
-                if hasattr(self.quant_config, "weight_block_size"):
-                    return self.quant_config.weight_block_size is not None
-                elif hasattr(self.quant_config, "has_blocked_weights"):
-                    return self.quant_config.has_blocked_weights()
-            return False
-
         # Enable quant_fp8 CUDA ops (TODO disable in follow up)
         # On H100 the CUDA kernel is faster than
         # native implementation
@@ -1502,9 +1495,10 @@
         tp_size = self.parallel_config.tensor_parallel_size
         max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
         if max_size is not None:
+            assert isinstance(self.model_config.dtype, torch.dtype)
             max_token_num = max_size // (
                 self.model_config.get_hidden_size()
-                * self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                * self.model_config.dtype.itemsize
             )
             if compile_range_end is not None and max_token_num < compile_range_end:
                 computed_compile_ranges_endpoints.append(max_token_num)
@@ -1527,7 +1521,8 @@
             tp_size = self.parallel_config.tensor_parallel_size
             hidden_size = self.model_config.get_hidden_size()
-            element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+            assert isinstance(self.model_config.dtype, torch.dtype)
+            element_size = self.model_config.dtype.itemsize
             pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                 hidden_size, tp_size, element_size
             )
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 939c195b2..d4018b7ec 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1935,7 +1935,7 @@ class EngineArgs:
         )

         offload_config = OffloadConfig(
-            offload_backend=self.offload_backend,  # type: ignore[arg-type]
+            offload_backend=self.offload_backend,
             uva=UVAOffloadConfig(
                 cpu_offload_gb=self.cpu_offload_gb,
                 cpu_offload_params=self.cpu_offload_params,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 61577695a..e9e7cb91c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -409,6 +409,11 @@ class LLM:
         # Cache for __repr__ to avoid repeated collective_rpc calls
         self._cached_repr: str | None = None

+    @classmethod
+    def from_engine_args(cls, engine_args: EngineArgs) -> "LLM":
+        """Create an LLM instance from EngineArgs."""
+        return cls(**vars(engine_args))
+
     def get_tokenizer(self) -> TokenizerLike:
         return self.llm_engine.get_tokenizer()
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index f5fb290d1..3229539e3 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -28,7 +28,10 @@ class ModelArchConfigConvertorBase:
         self.hf_text_config = hf_text_config

     def get_architectures(self) -> list[str]:
-        return getattr(self.hf_config, "architectures", [])
+        # Sometimes we get here from `vllm_config.with_hf_config(text_config)` where
+        # `text_config` is a sub-config from a multi-modal model. If this is the case,
+        # the sub-config will not have `architectures` and it will explicitly be `None`
+        return getattr(self.hf_config, "architectures", None) or []

     def get_num_hidden_layers(self) -> int:
         return getattr(self.hf_text_config, "num_hidden_layers", 0)
@@ -128,7 +131,7 @@ class ModelArchConfigConvertorBase:
         hf_config: PretrainedConfig,
         model_id: str,
         revision: str | None,
-        config_format: ConfigFormat,
+        config_format: str | ConfigFormat,
     ):
         # NOTE: getattr(config, "dtype", torch.float32) is not correct
         # because config.dtype can be None.