diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py
index b50b310fd..6106b9014 100644
--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -42,7 +42,6 @@ details.

 import random
 import time
-from dataclasses import fields

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -124,7 +123,7 @@ def main(args):

     # Create the LLM engine
     engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

     print("------warm up------")
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index e7759616e..0145f6b7c 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from dataclasses import fields

 from transformers import PreTrainedTokenizerBase

@@ -197,7 +196,7 @@ def main(args):

     engine_args = EngineArgs.from_cli_args(args)

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = SamplingParams(
         temperature=0,
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
index d83bb7e17..1de833978 100644
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -6,7 +6,6 @@ import argparse
 import json
 import random
 import time
-from dataclasses import fields

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -79,7 +78,7 @@
 ) -> float:
     from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     assert all(
         llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index b7e49d2c9..f384dc2bb 100755
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -9,7 +9,6 @@ on HuggingFace model repository.
 """

 import os
-from dataclasses import asdict
 from typing import Any, NamedTuple

 from huggingface_hub import snapshot_download
@@ -633,7 +632,7 @@ def main(args):
         req_data.engine_args.limit_mm_per_prompt or {}
     )

-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    engine_args = vars(req_data.engine_args) | {"seed": args.seed}
     if args.tensor_parallel_size is not None:
         engine_args["tensor_parallel_size"] = args.tensor_parallel_size
     llm = LLM(**engine_args)
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index 857767ac3..2f72b7d06 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation.
 import os
 import time
 from collections.abc import Sequence
-from dataclasses import asdict
 from typing import NamedTuple

 from vllm import LLM, EngineArgs, PromptType, SamplingParams
@@ -91,13 +90,12 @@ def main(args):
     req_data = model_example_map[model]()

     # Disable other modalities to save memory
+    engine_args = req_data.engine_args
     default_limits = {"image": 0, "video": 0, "audio": 0}
-    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {}
-    )
-
-    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
-    llm = LLM(**engine_args)
+    limit_mm_per_prompt = default_limits | (engine_args.limit_mm_per_prompt or {})
+    engine_args.limit_mm_per_prompt = limit_mm_per_prompt
+    engine_args.seed = args.seed
+    llm = LLM.from_engine_args(engine_args)

     prompts = req_data.prompts
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index 7743733f8..0085e8e8e 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -20,8 +20,6 @@ python load_sharded_state.py \
     --max-tokens 50
 """

-import dataclasses
-
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.utils.argparse_utils import FlexibleArgumentParser

@@ -64,7 +62,7 @@ def main():
     print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")

     # Load the model using engine args
-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)

     # Prepare sampling parameters
     sampling_params = SamplingParams(
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 43d890465..14d472ee3 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -21,7 +21,6 @@ llm = LLM(
 )
 """

-import dataclasses
 import os
 import shutil
 from pathlib import Path
@@ -60,7 +59,7 @@ def main(args):
     if not Path(model_path).is_dir():
         raise ValueError("model path must be a local directory")
     # Create LLM instance from arguments
-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)
     # Prepare output directory
     Path(args.output).mkdir(exist_ok=True)
     # Dump worker states to output directory
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index d0122b318..56154c122 100755
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -11,7 +11,6 @@ on HuggingFace model repository.
 import os
 import random
 from contextlib import contextmanager
-from dataclasses import asdict
 from typing import NamedTuple

 from huggingface_hub import snapshot_download
@@ -2434,13 +2433,13 @@ def main(args):
         req_data.engine_args.limit_mm_per_prompt or {}
     )

-    engine_args = asdict(req_data.engine_args) | {
-        "seed": args.seed,
-        "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
-    }
+    engine_args = req_data.engine_args
+    engine_args.seed = args.seed
+    mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
+    engine_args.mm_processor_cache_gb = mm_processor_cache_gb
     if args.tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = args.tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = args.tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

     # Don't want to check the flag multiple times, so just hijack `prompts`.
     prompts = (
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 632646956..38a34a68e 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -8,7 +8,6 @@ using the chat template defined by the model.

 import os
 from argparse import Namespace
-from dataclasses import asdict
 from typing import NamedTuple

 from huggingface_hub import snapshot_download
@@ -1481,10 +1480,11 @@ def run_generate(
 ):
     req_data = model_example_map[model](question, image_urls)

-    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    engine_args = req_data.engine_args
+    engine_args.seed = seed
     if tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = SamplingParams(
         temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
@@ -1521,10 +1521,11 @@ def run_chat(
         req_data.engine_args.limit_mm_per_prompt or {}
     )

-    engine_args = asdict(req_data.engine_args) | {"seed": seed}
+    engine_args = req_data.engine_args
+    engine_args.seed = seed
     if tensor_parallel_size is not None:
-        engine_args["tensor_parallel_size"] = tensor_parallel_size
-    llm = LLM(**engine_args)
+        engine_args.tensor_parallel_size = tensor_parallel_size
+    llm = LLM.from_engine_args(engine_args)

     sampling_params = (
         SamplingParams(
diff --git a/examples/pooling/embed/vision_embedding_offline.py b/examples/pooling/embed/vision_embedding_offline.py
index a5f0d35af..3190e00c4 100644
--- a/examples/pooling/embed/vision_embedding_offline.py
+++ b/examples/pooling/embed/vision_embedding_offline.py
@@ -10,12 +10,11 @@ on HuggingFace model repository.
 """

 import argparse
-from dataclasses import asdict
 from pathlib import Path

 from PIL.Image import Image

-from vllm import LLM, EngineArgs
+from vllm import LLM
 from vllm.multimodal.utils import fetch_image
 from vllm.utils.print_utils import print_embeddings

@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)}


 def run_clip(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="openai/clip-vit-base-patch32",
         runner="pooling",
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
     print("Text embedding output:")
     outputs = llm.embed(text, use_tqdm=False)
     print_embeddings(outputs[0].outputs.embedding)
@@ -53,15 +51,14 @@ def run_clip(seed: int):


 def run_e5_v(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="royokong/e5-v",
         runner="pooling",
         max_model_len=4096,
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
     llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501

     print("Text embedding output:")
@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int):
         multi_modal_data["image"] = post_process_image(multi_modal_data["image"])

-    engine_args = EngineArgs(
-        model="Qwen/Qwen3-VL-Embedding-2B",
-        runner="pooling",
-        max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
-        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
-    )
     default_instruction = "Represent the user's input."
     image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
     prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
     prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
     prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
+    llm = LLM(
+        model="Qwen/Qwen3-VL-Embedding-2B",
+        runner="pooling",
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": 1},
+        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
+        seed=seed,
+    )

     print("Text embedding output:")
     outputs = llm.embed(prompt_text, use_tqdm=False)
@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int):


 def run_siglip(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="google/siglip-base-patch16-224",
         runner="pooling",
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )

-    llm = LLM(**asdict(engine_args) | {"seed": seed})
-
     print("Text embedding output:")
     outputs = llm.embed(text, use_tqdm=False)
     print_embeddings(outputs[0].outputs.embedding)
@@ -174,16 +170,15 @@ def run_vlm2vec_phi3v(seed: int):
-    engine_args = EngineArgs(
+    llm = LLM(
         model="TIGER-Lab/VLM2Vec-Full",
         runner="pooling",
         max_model_len=4096,
         trust_remote_code=True,
         mm_processor_kwargs={"num_crops": 4},
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )
-
-    llm = LLM(**asdict(engine_args) | {"seed": seed})
     image_token = "<|image_1|>"

     print("Text embedding output:")
@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int):
         processor.save_pretrained(merged_path)
         print("Done!")

-    engine_args = EngineArgs(
+    llm = LLM(
         model=merged_path,
         runner="pooling",
         max_model_len=4096,
@@ -268,9 +263,8 @@
             "max_pixels": 12845056,
         },
         limit_mm_per_prompt={"image": 1},
+        seed=seed,
     )
-
-    llm = LLM(**asdict(engine_args) | {"seed": seed})
     image_token = "<|image_pad|>"

     print("Text embedding output:")
diff --git a/examples/pooling/score/vision_reranker_offline.py b/examples/pooling/score/vision_reranker_offline.py
index 19bb98177..ef2b35487 100644
--- a/examples/pooling/score/vision_reranker_offline.py
+++ b/examples/pooling/score/vision_reranker_offline.py
@@ -10,7 +10,6 @@ multimodal documents (text + images/videos).

 from argparse import Namespace
 from collections.abc import Callable
-from dataclasses import asdict
 from pathlib import Path
 from typing import NamedTuple

@@ -125,7 +124,7 @@ def main(args: Namespace):
     model_request = model_example_map[args.model_name]()
     engine_args = model_request.engine_args

-    llm = LLM(**asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)

     print("Query: string & Document: string")
     outputs = llm.score(query, document)
diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py
index 6b8f3b60b..53434b0b4 100644
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init(
         ctx,
         patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
     ):
+        kwargs = {}
+        if cudagraph_capture_sizes is not None:
+            kwargs["cudagraph_capture_sizes"] = cudagraph_capture_sizes
+        if max_cudagraph_capture_size is not None:
+            kwargs["max_cudagraph_capture_size"] = max_cudagraph_capture_size
         compilation_config = CompilationConfig(
-            cudagraph_capture_sizes=cudagraph_capture_sizes,
-            max_cudagraph_capture_size=max_cudagraph_capture_size,
             pass_config=PassConfig(
                 enable_sp=enable_sp,
                 fuse_norm_quant=True,
@@ -425,6 +428,7 @@
                 sp_min_token_num=512 if enable_sp else None,
             ),
             cudagraph_mode=cudagraph_mode,
+            **kwargs,
         )
         engine_args = EngineArgs(
             model="facebook/opt-125m",
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index 55a3b9858..8ca15c286 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for HF_HUB_OFFLINE mode"""

-import dataclasses
 import importlib
 import sys

@@ -12,7 +11,6 @@ import urllib3

 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.engine.arg_utils import EngineArgs

 MODEL_CONFIGS = [
     {
@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
         # Need to re-import huggingface_hub
         # and friends to set up offline mode
         _re_import_modules()
-        engine_args = EngineArgs(model="facebook/opt-125m")
-        LLM(**dataclasses.asdict(engine_args))
+        LLM(model="facebook/opt-125m")
     finally:
         # Reset the environment after the test
         # NB: Assuming tests are run in online mode
diff --git a/tests/models/multimodal/generation/test_keye.py b/tests/models/multimodal/generation/test_keye.py
index d7430821d..e9d6626a2 100644
--- a/tests/models/multimodal/generation/test_keye.py
+++ b/tests/models/multimodal/generation/test_keye.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import asdict
 from typing import NamedTuple

 import pytest
@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str):
     images = [asset.pil_image for asset in image_assets]
     image_urls = [encode_image_url(image) for image in images]

-    engine_args = EngineArgs(
-        model=MODEL_NAME,
-        trust_remote_code=True,
-        max_model_len=8192,
-        max_num_seqs=5,
-        limit_mm_per_prompt={"image": len(image_urls)},
-    )
-
     placeholders = [{"type": "image", "image": url} for url in image_urls]
     messages = [
         {
@@ -54,8 +45,14 @@
         messages, tokenize=False, add_generation_prompt=True
     )

-    engine_args = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_args)
+    llm = LLM(
+        model=MODEL_NAME,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        seed=42,
+    )

     sampling_params = SamplingParams(
         temperature=0.0, max_tokens=256, stop_token_ids=None
diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py
index 9310f52df..ad912067a 100644
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate
 outputs using different ViT attention backends. Tests are parametrized by
 model and backend.
 """
-from dataclasses import asdict
 from typing import Any

 import pytest
 from transformers import AutoProcessor

-from vllm import LLM, EngineArgs, SamplingParams
+from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import encode_image_url
 from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
     limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})

     # Create engine
-    engine_args = EngineArgs(
+    llm = LLM(
         model=config["model_name"],
         trust_remote_code=True,
         max_model_len=config["max_model_len"],
@@ -283,11 +282,9 @@
         mm_encoder_attn_backend=mm_encoder_attn_backend,
         hf_overrides=dummy_hf_overrides,
         load_format="dummy",
+        seed=42,
     )

-    engine_dict = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_dict)
-
     # Generate
     sampling_params = SamplingParams(**config["sampling_params"])
     outputs = llm.generate(
@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
     messages = build_dots_ocr_prompt([stop_sign_image], config)

     # Create engine
-    engine_args = EngineArgs(
+    llm = LLM(
         model=config["model_name"],
         trust_remote_code=True,
         max_model_len=config["max_model_len"],
@@ -327,11 +324,9 @@
         mm_encoder_attn_backend=mm_encoder_attn_backend,
         hf_overrides=dummy_hf_overrides,
         load_format="dummy",
+        seed=42,
     )

-    engine_dict = asdict(engine_args) | {"seed": 42}
-    llm = LLM(**engine_dict)
-
     # Generate using chat
     sampling_params = SamplingParams(**config["sampling_params"])
     outputs = llm.chat(messages=messages, sampling_params=sampling_params)
diff --git a/tests/models/multimodal/generation/test_voxtral_realtime.py b/tests/models/multimodal/generation/test_voxtral_realtime.py
index cac79b237..ca43e7b51 100644
--- a/tests/models/multimodal/generation/test_voxtral_realtime.py
+++ b/tests/models/multimodal/generation/test_voxtral_realtime.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
-from dataclasses import asdict

 import pytest
 import pytest_asyncio
@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer:
 @pytest.fixture
 def engine():
     engine_args = EngineArgs(**ENGINE_CONFIG)
-    llm = LLM(**asdict(engine_args))
+    llm = LLM.from_engine_args(engine_args)
     try:
         yield llm
     finally:
diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py
index 7e05a0d93..90381d47b 100644
--- a/tests/v1/kv_connector/unit/test_example_connector.py
+++ b/tests/v1/kv_connector/unit/test_example_connector.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from dataclasses import asdict
 from typing import NamedTuple

 import pytest
 from PIL import Image

-from vllm import LLM, EngineArgs, SamplingParams
+from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import AttentionConfig, KVTransferConfig
 from vllm.multimodal.utils import encode_image_url
@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
     # Using tmp_path as the storage path to store KV
     print(f"KV storage path at: {str(tmp_path)}")

-    # Configure the ExampleConnector
-    kv_transfer_config = KVTransferConfig(
-        kv_connector="ExampleConnector",
-        kv_role="kv_both",
-        kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
-    )
-
-    engine_args = EngineArgs(
-        model=MODEL_NAME,
-        max_model_len=8192,
-        max_num_seqs=1,
-        gpu_memory_utilization=0.4,
-        attention_config=AttentionConfig(backend=attn_backend),
-        enforce_eager=True,
-        kv_transfer_config=kv_transfer_config,
-        limit_mm_per_prompt={"image": 2},
-    )
-
     # don't put this import at the top level
     # it will call torch.accelerator.device_count()
     from transformers import AutoProcessor
@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
     assert image_1 != image_2, "The images should not be identical"

     # Create the LLM instance
-    engine_args = asdict(engine_args)
-    llm = LLM(**engine_args)
+    llm = LLM(
+        model=MODEL_NAME,
+        max_model_len=8192,
+        max_num_seqs=1,
+        gpu_memory_utilization=0.4,
+        attention_config=AttentionConfig(backend=attn_backend),
+        enforce_eager=True,
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="ExampleConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
+        ),
+        limit_mm_per_prompt={"image": 2},
+    )

     # Prepare the input cases
     input_cases = [
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 758e5efed..66afbec1b 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -6,7 +6,6 @@ import argparse
 import json
 import os
 import time
-from dataclasses import fields
 from typing import Any

 import numpy as np
@@ -85,7 +84,7 @@ def main(args: argparse.Namespace):
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
     assert llm.llm_engine.model_config.max_model_len >= (
         args.input_len + args.output_len
     ), (
diff --git a/vllm/benchmarks/mm_processor.py b/vllm/benchmarks/mm_processor.py
index 4f31af0e0..3ce0b9119 100644
--- a/vllm/benchmarks/mm_processor.py
+++ b/vllm/benchmarks/mm_processor.py
@@ -17,7 +17,6 @@ import argparse
 import json
 import time
 from collections import defaultdict
-from dataclasses import fields
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Literal
@@ -225,7 +224,7 @@ def benchmark_multimodal_processor(
         args.seed = 0
     engine_args = EngineArgs.from_cli_args(args)

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     tokenizer = llm.get_tokenizer()
     requests = get_requests(args, tokenizer)
diff --git a/vllm/benchmarks/startup.py b/vllm/benchmarks/startup.py
index 405299938..375b8f9fa 100644
--- a/vllm/benchmarks/startup.py
+++ b/vllm/benchmarks/startup.py
@@ -16,7 +16,6 @@ import shutil
 import tempfile
 import time
 from contextlib import contextmanager
-from dataclasses import fields
 from typing import Any

 import numpy as np
@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue):
         # Measure total startup time
         start_time = time.perf_counter()

-        llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+        llm = LLM.from_engine_args(engine_args)

         total_startup_time = time.perf_counter() - start_time
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index f7cea8bdd..6f878b275 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -8,7 +8,6 @@ import os
 import random
 import time
 import warnings
-from dataclasses import fields
 from typing import Any

 import torch
@@ -53,7 +52,7 @@ def run_vllm(
 ) -> tuple[float, list[RequestOutput] | None]:
     from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)
     assert all(
         llm.llm_engine.model_config.max_model_len
         >= (request.prompt_len + request.expected_output_len)
@@ -141,7 +140,7 @@ def run_vllm_chat(
     """
     from vllm import LLM, SamplingParams

-    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
+    llm = LLM.from_engine_args(engine_args)

     assert all(
         llm.llm_engine.model_config.max_model_len
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 9e4196a44..ec2f7e7f7 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -116,29 +116,29 @@ class PassConfig:
     """

     # New flags
-    fuse_norm_quant: bool | None = Field(default=None)
+    fuse_norm_quant: bool = None  # type: ignore[assignment]
     """Fuse the custom RMSNorm + quant ops."""
-    fuse_act_quant: bool | None = Field(default=None)
+    fuse_act_quant: bool = None  # type: ignore[assignment]
     """Fuse the custom SiluMul + quant ops."""
-    fuse_attn_quant: bool | None = Field(default=None)
+    fuse_attn_quant: bool = None  # type: ignore[assignment]
     """Fuse the custom attention + quant ops."""
     eliminate_noops: bool = Field(default=True)
     """Eliminate no-op ops."""
-    enable_sp: bool | None = Field(default=None)
+    enable_sp: bool = None  # type: ignore[assignment]
     """Enable sequence parallelism. Requires TP>1.
     Automatically disabled if the model's hidden_size is too small for SP
     to be beneficial (threshold is device-capability dependent)."""
-    fuse_gemm_comms: bool | None = Field(default=None)
+    fuse_gemm_comms: bool = None  # type: ignore[assignment]
     """Enable async TP."""
-    fuse_allreduce_rms: bool | None = Field(default=None)
+    fuse_allreduce_rms: bool = None  # type: ignore[assignment]
     """Enable flashinfer allreduce fusion."""
     enable_qk_norm_rope_fusion: bool = False
     """Enable fused Q/K RMSNorm + RoPE pass."""

     # ROCm/AITER specific fusions
-    fuse_act_padding: bool | None = Field(default=None)
+    fuse_act_padding: bool = None  # type: ignore[assignment]
     """Fuse the custom RMSNorm + padding ops."""
-    fuse_rope_kvcache: bool | None = Field(default=None)
+    fuse_rope_kvcache: bool = None  # type: ignore[assignment]
     """Fuse the QK rope + KV cache ops."""

     rope_kvcache_fusion_max_token_num: int = 256
@@ -405,7 +405,7 @@ class CompilationConfig:
     """

     # Top-level Compilation control
-    mode: CompilationMode = Field(default=None)  # type: ignore[assignment]
+    mode: CompilationMode = None  # type: ignore[assignment]
     """The compilation approach used for torch.compile-based compilation of
     the model.

@@ -545,7 +545,7 @@
     constructor, e.g. `CompilationConfig(inductor_passes={"a": func})`."""

     # CudaGraph compilation
-    cudagraph_mode: CUDAGraphMode = Field(default=None)  # type: ignore[assignment]
+    cudagraph_mode: CUDAGraphMode = None  # type: ignore[assignment]
     """
     The mode of the cudagraph:

@@ -586,7 +586,7 @@
     It means the first several runs will be treated as warmup runs.
     Only after that, the execution will be recorded, and the recorded cudagraph
     will be used for subsequent runs."""
-    cudagraph_capture_sizes: list[int] | None = None
+    cudagraph_capture_sizes: list[int] = None  # type: ignore[assignment]
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
@@ -607,7 +607,7 @@
     When `enable_lora` is False, this option has no effect.
     """

-    use_inductor_graph_partition: bool = Field(default=None)  # type: ignore[assignment]
+    use_inductor_graph_partition: bool = None  # type: ignore[assignment]
     """Use inductor graph partition to split the graph at cudagraph_unsafe ops.
     This partition happens at inductor codegen time after all passes and fusions
     are finished. It generates a single `call` function which wraps
@@ -630,7 +630,7 @@
     pass_config: PassConfig = field(default_factory=PassConfig)
     """Custom inductor passes, see PassConfig for more details"""

-    max_cudagraph_capture_size: int | None = field(default=None)
+    max_cudagraph_capture_size: int = None  # type: ignore[assignment]
     """The maximum cudagraph capture size.

     If cudagraph_capture_sizes is specified, this will be set to the largest
@@ -750,7 +750,7 @@
         return hash_factors(factors)

     def __repr__(self) -> str:
-        exclude = {
+        exclude: dict[str, bool | dict[str, bool]] = {
             "static_forward_context": True,
             "enabled_custom_ops": True,
             "disabled_custom_ops": True,
@@ -770,9 +770,7 @@
             exclude["pass_config"] = pass_config_exclude

         config = TypeAdapter(CompilationConfig).dump_python(
-            self,
-            exclude=exclude,  # type: ignore[arg-type]
-            exclude_unset=True,
+            self, exclude=exclude, exclude_unset=True
         )
         return str(config)

@@ -1023,7 +1021,6 @@
                         "Unrecognized size type in compile_sizes, "
                         f"expect 'cudagraph_capture_sizes', got {x}"
                     )
-                assert self.cudagraph_capture_sizes is not None
                 computed_compile_sizes.extend(self.cudagraph_capture_sizes)
             else:
                 assert isinstance(x, int)
@@ -1031,7 +1028,6 @@
         self.compile_sizes = computed_compile_sizes  # type: ignore

         # make sure the sizes are in ascending order
-        assert self.cudagraph_capture_sizes is not None
         self.cudagraph_capture_sizes.sort()
         if self.cudagraph_capture_sizes:
             assert self.cudagraph_capture_sizes[-1] == self.max_cudagraph_capture_size
@@ -1123,7 +1119,6 @@
     def set_splitting_ops_for_attn_fusion(self):
         assert self.pass_config.fuse_attn_quant
-        assert self.cudagraph_mode is not None
         if self.splitting_ops is None:
             self.splitting_ops = []
         if self.cudagraph_mode.has_piecewise_cudagraphs():
diff --git a/vllm/config/device.py b/vllm/config/device.py
index bb689c9b3..c20e4d0f2 100644
--- a/vllm/config/device.py
+++ b/vllm/config/device.py
@@ -13,8 +13,8 @@ from vllm.utils.hashing import safe_hash
 Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class DeviceConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""

     device: SkipValidation[Device | torch.device | None] = "auto"
diff --git a/vllm/config/kernel.py b/vllm/config/kernel.py
index 5e1c9109a..2ec18289d 100644
--- a/vllm/config/kernel.py
+++ b/vllm/config/kernel.py
@@ -4,7 +4,7 @@
 from collections.abc import Callable
 from typing import Any, Literal

-from pydantic import Field, field_validator
+from pydantic import field_validator

 from vllm.config.utils import config
 from vllm.utils.hashing import safe_hash
@@ -26,7 +26,7 @@ MoEBackend = Literal[
 class KernelConfig:
     """Configuration for kernel selection and warmup behavior."""

-    enable_flashinfer_autotune: bool | None = Field(default=None)
+    enable_flashinfer_autotune: bool = None  # type: ignore[assignment]
     """If True, run FlashInfer autotuning during kernel warmup."""

     moe_backend: MoEBackend = "auto"
diff --git a/vllm/config/kv_events.py b/vllm/config/kv_events.py
index 77ed5fabf..d618bc9a7 100644
--- a/vllm/config/kv_events.py
+++ b/vllm/config/kv_events.py
@@ -4,8 +4,6 @@

 from typing import Literal

-from pydantic import Field
-
 from vllm.config.utils import config

@@ -18,7 +16,7 @@ class KVEventsConfig:
     Events can be published externally by zmq using the event publisher config.
     """

-    publisher: Literal["null", "zmq"] | None = Field(default=None)
+    publisher: Literal["null", "zmq"] = None  # type: ignore[assignment]
     """The publisher to use for publishing kv events. Can be "null", "zmq".
     """
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
index 696e92df7..bfef0efa3 100644
--- a/vllm/config/lora.py
+++ b/vllm/config/lora.py
@@ -25,8 +25,8 @@ MaxLoRARanks = Literal[1, 8, 16, 32, 64, 128, 256, 320, 512]
 LoRAExtraVocabSize = Literal[256, 512]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class LoRAConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
     """Configuration for LoRA."""

     max_lora_rank: MaxLoRARanks = 16
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 032f29c18..e51723009 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -102,8 +102,8 @@ AttnTypeStr = Literal[
 ]


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class ModelConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class ModelConfig:
     """Configuration for the model."""

     model: str = "Qwen/Qwen3-0.6B"
@@ -121,7 +121,7 @@ class ModelConfig:  # type: ignore[misc]
     """Convert the model using adapters defined in
     [vllm.model_executor.models.adapters][]. The most common use case is to
     adapt a text generation model to be used for pooling tasks."""
-    tokenizer: str = Field(default=None)  # type: ignore[assignment]
+    tokenizer: str = None  # type: ignore[assignment]
     """Name or path of the Hugging Face tokenizer to use. If unspecified, model
     name or path will be used."""
     tokenizer_mode: TokenizerMode | str = "auto"
@@ -583,7 +583,7 @@
             self.dtype,
             is_pooling_model=self.runner_type == "pooling",
             revision=self.revision,
-            config_format=self.config_format,  # type: ignore[arg-type]
+            config_format=self.config_format,
         )

         self.original_max_model_len = self.max_model_len
@@ -733,7 +733,7 @@

     @property
     def architectures(self) -> list[str]:
-        return self.model_arch_config.architectures  # type: ignore[return-value]
+        return self.model_arch_config.architectures

     @property
     def architecture(self) -> str:
@@ -1944,7 +1944,7 @@ def _get_and_verify_dtype(
     *,
     is_pooling_model: bool,
     revision: str | None = None,
-    config_format: ConfigFormat = "hf",
+    config_format: str | ConfigFormat = "hf",
 ) -> torch.dtype:
     config_dtype = ModelArchConfigConvertorBase.get_torch_dtype(
         config, model_id, revision=revision, config_format=config_format
diff --git a/vllm/config/model_arch.py b/vllm/config/model_arch.py
index d55e2a339..24d1baea0 100644
--- a/vllm/config/model_arch.py
+++ b/vllm/config/model_arch.py
@@ -16,7 +16,7 @@ class ModelArchitectureConfig:
     Configuration for model architecture that required by vLLM runtime
     """

-    architectures: list[str] | None
+    architectures: list[str]
     """List of model architecture class names (e.g., ['LlamaForCausalLM']).
     It can be None upon calling `vllm_config.with_hf_config(config.text_config)`"""
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 8332b0ec7..7dd9c5bb5 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -194,7 +194,7 @@ class ParallelConfig:
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""

-    disable_nccl_for_dp_synchronization: bool | None = Field(default=None)
+    disable_nccl_for_dp_synchronization: bool | None = None
     """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
     to use Gloo instead of NCCL for its all reduce.
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index ce30fcab4..f988c1086 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -52,7 +52,7 @@ class SchedulerConfig:
     In real usage, this should be set in `EngineArgs.create_engine_config`.
     """

-    max_num_scheduled_tokens: int | None = Field(default=None)
+    max_num_scheduled_tokens: int | None = None
     """Maximum number of tokens that the scheduler may issue in a single
     iteration. This is usually equal to max_num_batched_tokens, but can be
     smaller in cases
@@ -122,7 +122,7 @@
     # scheduler class or path. "vllm.v1.core.sched.scheduler.Scheduler"
     # (default) or "mod.custom_class".
-    scheduler_cls: str | type[object] | None = Field(default=None)
+    scheduler_cls: str | type[object] | None = None
     """The scheduler class to use. "vllm.v1.core.sched.scheduler.Scheduler" is
     the default scheduler. Can be a class directly or the path to a class of
     form "mod.custom_class"."""
@@ -141,7 +141,7 @@
     checking the first chunk. Prevents over-admission and KV cache thrashing
     with chunked prefill."""

-    async_scheduling: bool | None = Field(default=None)
+    async_scheduling: bool | None = None
     """If set to False, disable async scheduling. Async scheduling helps to
     avoid gaps in GPU utilization, leading to better latency and throughput.
     """
diff --git a/vllm/config/utils.py b/vllm/config/utils.py
index 7ae9c0c24..899666048 100644
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@@ -11,13 +11,13 @@ import os
 import pathlib
 import textwrap
 from collections.abc import Callable, Mapping, Sequence, Set
-from dataclasses import MISSING, dataclass, field, fields, is_dataclass
+from dataclasses import MISSING, field, fields, is_dataclass
 from itertools import pairwise
-from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast
+from typing import TYPE_CHECKING, Any, Protocol, TypeVar, cast, overload

 import torch
 from pydantic import ConfigDict
-from pydantic.dataclasses import dataclass as pydantic_dataclass
+from pydantic.dataclasses import dataclass
 from pydantic.fields import Field as PydanticField
 from pydantic.fields import FieldInfo
 from typing_extensions import dataclass_transform, runtime_checkable
@@ -36,6 +36,16 @@ ConfigType = type[DataclassInstance]
 ConfigT = TypeVar("ConfigT", bound=DataclassInstance)


+@overload
+def config(cls: type[ConfigT]) -> type[ConfigT]: ...
+
+
+@overload
+def config(
+    *, config: ConfigDict | None = None, **kwargs: Any
+) -> Callable[[type[ConfigT]], type[ConfigT]]: ...
+
+
 @dataclass_transform(field_specifiers=(PydanticField,))
 def config(
@@ -59,7 +69,7 @@
         merged_config.update(config)

     def decorator(cls: type[ConfigT]) -> type[ConfigT]:
-        return pydantic_dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]
+        return dataclass(cls, config=merged_config, **kwargs)  # type: ignore[return-value]

     # Called with arguments: @config(config=...)
     if cls is None:
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 88942fc86..65a78f4d0 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -246,15 +246,15 @@ OPTIMIZATION_LEVEL_TO_CONFIG = {
 }


-@config(config=ConfigDict(arbitrary_types_allowed=True))  # type: ignore[arg-type,misc]
-class VllmConfig:  # type: ignore[misc]
+@config(config=ConfigDict(arbitrary_types_allowed=True))
+class VllmConfig:
     """Dataclass which contains all vllm-related configuration. This
     simplifies passing around the distinct configurations in the codebase.
     """

     # TODO: use default_factory once default constructing ModelConfig doesn't
     # try to download a model
-    model_config: ModelConfig = Field(default=None)  # type: ignore[assignment]
+    model_config: ModelConfig = None  # type: ignore[assignment]
     """Model configuration."""
     cache_config: CacheConfig = Field(default_factory=CacheConfig)
     """Cache configuration."""
@@ -912,7 +912,8 @@
             tp_size = self.parallel_config.tensor_parallel_size
             hidden_size = self.model_config.get_hidden_size()
-            element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+            assert isinstance(self.model_config.dtype, torch.dtype)
+            element_size = self.model_config.dtype.itemsize
             pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                 hidden_size, tp_size, element_size
             )
@@ -1246,14 +1247,6 @@
             )
             self.compilation_config.debug_dump_path = env_path

-        def has_blocked_weights():  # type: ignore[no-redef]
-            if self.quant_config is not None:
-                if hasattr(self.quant_config, "weight_block_size"):
-                    return self.quant_config.weight_block_size is not None
-                elif hasattr(self.quant_config, "has_blocked_weights"):
-                    return self.quant_config.has_blocked_weights()
-            return False
-
         # Enable quant_fp8 CUDA ops (TODO disable in follow up)
         # On H100 the CUDA kernel is faster than
         # native implementation
@@ -1502,9 +1495,10 @@
         tp_size = self.parallel_config.tensor_parallel_size
         max_size = compilation_config.pass_config.flashinfer_max_size(tp_size)
         if max_size is not None:
+            assert isinstance(self.model_config.dtype, torch.dtype)
             max_token_num = max_size // (
                 self.model_config.get_hidden_size()
-                * self.model_config.dtype.itemsize  # type: ignore[union-attr]
+                * self.model_config.dtype.itemsize
             )
             if compile_range_end is not None and max_token_num < compile_range_end:
                 computed_compile_ranges_endpoints.append(max_token_num)
@@ -1527,7 +1521,8 @@
             tp_size = self.parallel_config.tensor_parallel_size
             hidden_size = self.model_config.get_hidden_size()
-            element_size = self.model_config.dtype.itemsize  # type: ignore[union-attr]
+            assert isinstance(self.model_config.dtype, torch.dtype)
+            element_size = self.model_config.dtype.itemsize
             pass_config.sp_min_token_num = get_sequence_parallelism_threshold(
                 hidden_size, tp_size, element_size
             )
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 939c195b2..d4018b7ec 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1935,7 +1935,7 @@ class EngineArgs:
         )

         offload_config = OffloadConfig(
-            offload_backend=self.offload_backend,  # type: ignore[arg-type]
+            offload_backend=self.offload_backend,
             uva=UVAOffloadConfig(
                 cpu_offload_gb=self.cpu_offload_gb,
                 cpu_offload_params=self.cpu_offload_params,
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 61577695a..e9e7cb91c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -409,6 +409,11 @@ class LLM:
         # Cache for __repr__ to avoid repeated collective_rpc calls
         self._cached_repr: str | None = None

+    @classmethod
+    def from_engine_args(cls, engine_args: EngineArgs) -> "LLM":
+        """Create an LLM instance from EngineArgs."""
+        return cls(**vars(engine_args))
+
     def get_tokenizer(self) -> TokenizerLike:
         return self.llm_engine.get_tokenizer()
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index f5fb290d1..3229539e3 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -28,7 +28,10 @@ class ModelArchConfigConvertorBase:
         self.hf_text_config = hf_text_config

     def get_architectures(self) -> list[str]:
-        return getattr(self.hf_config, "architectures", [])
+        # Sometimes we get here from `vllm_config.with_hf_config(text_config)` where
+        # `text_config` is a sub-config from a multi-modal model. If this is the case,
+        # the sub-config will not have `architectures` and it will explicitly be `None`
+        return getattr(self.hf_config, "architectures", None) or []

     def get_num_hidden_layers(self) -> int:
         return getattr(self.hf_text_config, "num_hidden_layers", 0)
@@ -128,7 +131,7 @@ class ModelArchConfigConvertorBase:
         hf_config: PretrainedConfig,
         model_id: str,
         revision: str | None,
-        config_format: ConfigFormat,
+        config_format: str | ConfigFormat,
     ):
         # NOTE: getattr(config, "dtype", torch.float32) is not correct
         # because config.dtype can be None.