Remove default values from InitVars so that they're not stored (#29859)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
(cherry picked from commit 951445a52d)
Author: Harry Mellor
Date: 2025-12-02 12:16:37 +00:00
Committed by: Kevin H. Luu
Parent: d8c6210eea
Commit: 85fb2e3120
17 changed files with 139 additions and 77 deletions
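The change behind every diff below is in vLLM's config dataclasses: as the title says, InitVar parameters lose their default values so that a fallback is never silently baked in, which is why call sites now pass values such as max_model_len and is_encoder_decoder to SchedulerConfig explicitly, derived from a ModelConfig that is constructed first. The following is a minimal, hypothetical sketch of the dataclasses.InitVar pattern involved; the class name, fields, and derivation logic are placeholders, not vLLM's actual config code.

from dataclasses import InitVar, dataclass


@dataclass
class SchedulerConfigSketch:
    # Init-only values with no defaults: callers must supply them explicitly,
    # typically from an already-constructed model config.
    max_model_len: InitVar[int]
    is_encoder_decoder: InitVar[bool]
    max_num_batched_tokens: int = 2048

    def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
        # InitVars are consumed here and never stored as fields, so they do
        # not appear when the config is serialized, hashed, or compared.
        if is_encoder_decoder:
            # Illustrative derivation only, not vLLM's real scheduling policy.
            self.max_num_batched_tokens = max(self.max_num_batched_tokens, max_model_len)


# Call sites build the model config first, then pass its derived values, e.g.
# SchedulerConfigSketch(max_model_len=4096, is_encoder_decoder=False)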

View File

@@ -185,6 +185,8 @@ def create_vllm_config(
         max_num_seqs=max_num_seqs,
         max_num_batched_tokens=max_num_batched_tokens,
         enable_chunked_prefill=enable_chunked_prefill,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     device_config = DeviceConfig()

View File

@@ -1128,7 +1128,11 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
dtype="float16",
max_model_len=max_model_len,
)
scheduler_config = SchedulerConfig(max_num_batched_tokens=32768)
scheduler_config = SchedulerConfig(
max_num_batched_tokens=32768,
max_model_len=model_config.max_model_len,
is_encoder_decoder=model_config.is_encoder_decoder,
)
vllm_config = VllmConfig(
model_config=model_config,
@@ -1163,7 +1167,10 @@ def test_get_max_concurrency_for_kv_cache_config():
         max_model_len=max_model_len,
     )
     scheduler_config = SchedulerConfig(
-        max_num_batched_tokens=1024, enable_chunked_prefill=True
+        max_num_batched_tokens=1024,
+        enable_chunked_prefill=True,
+        max_model_len=model_config.max_model_len,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     vllm_config = VllmConfig(

View File

@@ -1508,6 +1508,12 @@ def create_scheduler_with_priority(
     Returns:
       {class}`Scheduler` instance with priority scheduling
     """
+    model_config = ModelConfig(
+        model=model,
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+    )
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -1517,14 +1523,9 @@ def create_scheduler_with_priority(
         long_prefill_token_threshold=long_prefill_token_threshold,
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
+        is_encoder_decoder=model_config.is_encoder_decoder,
         policy="priority",  # Enable priority scheduling
     )
-    model_config = ModelConfig(
-        model=model,
-        trust_remote_code=True,
-        dtype="float16",
-        seed=42,
-    )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
         block_size=block_size,

View File

@@ -69,6 +69,13 @@ def create_scheduler(
     Returns:
       {class}`Scheduler` instance
     """
+    model_config = ModelConfig(
+        model=model,
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+        skip_tokenizer_init=skip_tokenizer_init,
+    )
     if max_model_len is None:
         max_model_len = max_num_batched_tokens
     scheduler_config = SchedulerConfig(
@@ -79,13 +86,7 @@ def create_scheduler(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=enable_chunked_prefill,
         async_scheduling=async_scheduling,
-    )
-    model_config = ModelConfig(
-        model=model,
-        trust_remote_code=True,
-        dtype="float16",
-        seed=42,
-        skip_tokenizer_init=skip_tokenizer_init,
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     # Cache config, optionally force APC
     cache_config = CacheConfig(

View File

@@ -40,7 +40,9 @@ def _create_vllm_config(
 ) -> MagicMock:
     mock_config = MagicMock(spec=VllmConfig)
     mock_config.compilation_config = compilation_config
-    mock_config.scheduler_config = SchedulerConfig(max_num_seqs=max_num_seqs)
+    mock_config.scheduler_config = SchedulerConfig.default_factory(
+        max_num_seqs=max_num_seqs,
+    )
     mock_config.parallel_config = ParallelConfig()
     mock_config.speculative_config = None  # No speculative decoding
     if not lora_config:
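The hunk above switches a mocked-VllmConfig test to SchedulerConfig.default_factory, since that test has no real ModelConfig from which to derive the now-required arguments. Below is a hedged sketch of what such a helper might look like, assuming it simply fills the init-only arguments with benign test defaults; the real signature and default values in vLLM may differ.

from dataclasses import InitVar, dataclass


@dataclass
class _ConfigSketch:
    max_model_len: InitVar[int]
    is_encoder_decoder: InitVar[bool]
    max_num_seqs: int = 128

    def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
        # Illustrative derived state; the init-only values themselves are not kept.
        self.max_num_batched_tokens = max_model_len
        self.chunked_prefill_enabled = not is_encoder_decoder

    @classmethod
    def default_factory(cls, **kwargs):
        # Tests that mock VllmConfig have no ModelConfig to derive values from,
        # so fill the required init-only arguments with placeholder defaults.
        kwargs.setdefault("max_model_len", 8192)
        kwargs.setdefault("is_encoder_decoder", False)
        return cls(**kwargs)


# e.g. _ConfigSketch.default_factory(max_num_seqs=4)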

View File

@@ -484,12 +484,6 @@ def test_encoder_instance_zero_kv_cache(
     vision encoder, so they don't need KV cache for text generation.
     """
     # Form vllm config
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-        disable_hybrid_kv_cache_manager=True,
-    )
     model_config = ModelConfig(
         model="llava-hf/llava-1.5-7b-hf",  # Multimodal model
         enforce_eager=True,
@@ -497,6 +491,13 @@ def test_encoder_instance_zero_kv_cache(
dtype="float16",
seed=42,
)
scheduler_config = SchedulerConfig(
max_num_seqs=10,
max_num_batched_tokens=512,
max_model_len=512,
disable_hybrid_kv_cache_manager=True,
is_encoder_decoder=model_config.is_encoder_decoder,
)
cache_config = CacheConfig(
block_size=16,
gpu_memory_utilization=gpu_memory_utilization,

View File

@@ -92,18 +92,19 @@ def create_vllm_config(
     enable_permute_local_kv: bool = False,
 ) -> VllmConfig:
     """Initialize VllmConfig For Testing."""
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_model_len=max_model_len,
-        enable_chunked_prefill=enable_chunked_prefill,
-    )
     model_config = ModelConfig(
         model=model,
         trust_remote_code=True,
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_model_len=max_model_len,
+        enable_chunked_prefill=enable_chunked_prefill,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     # Cache config, optionally force APC
     cache_config = CacheConfig(
         block_size=block_size,

View File

@@ -66,7 +66,10 @@ def _create_proposer(
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )
     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)

View File

@@ -51,7 +51,10 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
         device_config=DeviceConfig(device=current_platform.device_type),
         parallel_config=ParallelConfig(),
         load_config=LoadConfig(),
-        scheduler_config=SchedulerConfig(),
+        scheduler_config=SchedulerConfig(
+            max_model_len=model_config.max_model_len,
+            is_encoder_decoder=model_config.is_encoder_decoder,
+        ),
     )
     return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)

View File

@@ -26,16 +26,17 @@ from vllm.v1.worker.tpu_model_runner import (
 def get_vllm_config():
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-    )
     model_config = ModelConfig(
         model="facebook/opt-125m",
         dtype="bfloat16",  # TPUs typically use bfloat16
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=16,
         gpu_memory_utilization=0.9,

View File

@@ -79,16 +79,17 @@ def initialize_kv_cache(runner: GPUModelRunner):
 def get_vllm_config():
-    scheduler_config = SchedulerConfig(
-        max_num_seqs=10,
-        max_num_batched_tokens=512,
-        max_model_len=512,
-    )
     model_config = ModelConfig(
         model="facebook/opt-125m",
         dtype="float16",
         seed=42,
     )
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=10,
+        max_num_batched_tokens=512,
+        max_model_len=512,
+        is_encoder_decoder=model_config.is_encoder_decoder,
+    )
     cache_config = CacheConfig(
         block_size=BLOCK_SIZE,
         gpu_memory_utilization=0.9,
@@ -784,14 +785,15 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
     initialize_model_parallel(tensor_model_parallel_size=1)
     torch.set_default_dtype(torch.float16)
+    model_config = ModelConfig(
+        model="ibm-granite/granite-4.0-tiny-preview",
+        dtype="float16",
+    )
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
         max_model_len=512,
-    )
-    model_config = ModelConfig(
-        model="ibm-granite/granite-4.0-tiny-preview",
-        dtype="float16",
+        is_encoder_decoder=model_config.is_encoder_decoder,
     )
     cache_config = CacheConfig(
         block_size=BLOCK_SIZE,