[CI] Revert PRs 34818 and 33600 (#34979)
@@ -19,6 +19,7 @@ else:
 
 logger = init_logger(__name__)
 
+BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
 CacheDType = Literal[
     "auto",
     "bfloat16",
@@ -38,11 +39,13 @@ KVOffloadingBackend = Literal["native", "lmcache"]
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: SkipValidation[int] = None  # type: ignore[assignment]
-    """Size of a contiguous cache block in number of tokens.
+    block_size: SkipValidation[BlockSize] = None  # type: ignore[assignment]
+    """Size of a contiguous cache block in number of tokens. On CUDA devices,
+    only block sizes up to 32 are supported.
 
-    This is None until the platform sets it. Always an int by the time
-    the engine starts."""
+    This config has no static default. If left unspecified by the user, it will
+    be set in `Platform.check_and_update_config()` based on the current
+    platform."""
     gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
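For reference, a minimal standalone sketch (not vLLM code) of the pattern the restored hunk describes: block_size is typed as a Literal of allowed values, starts as a None sentinel, and is only filled in later by a platform hook, with CUDA accepting sizes up to 32. ToyCacheConfig and pick_block_size are illustrative names.

from dataclasses import dataclass
from typing import Literal, Optional, get_args

BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]


@dataclass
class ToyCacheConfig:
    # None sentinel until a platform hook picks a concrete value.
    block_size: Optional[BlockSize] = None


def pick_block_size(cfg: ToyCacheConfig, is_cuda: bool) -> None:
    # Stand-in for the platform hook that fills in the default.
    if cfg.block_size is None:
        cfg.block_size = 16 if is_cuda else 128
    # Per the restored docstring, CUDA only supports block sizes up to 32.
    if is_cuda and cfg.block_size > 32:
        raise ValueError(f"block_size={cfg.block_size} is not supported on CUDA")
    assert cfg.block_size in get_args(BlockSize)


cfg = ToyCacheConfig()
pick_block_size(cfg, is_cuda=True)
print(cfg.block_size)  # -> 16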
@@ -915,6 +915,32 @@ class VllmConfig:
         )
         current_platform.check_and_update_config(self)
 
+        # If DCP, ensure the block size is right.
+        if self.parallel_config.decode_context_parallel_size > 1:
+            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
+                self.parallel_config.cp_kv_cache_interleave_size
+                != self.parallel_config.dcp_kv_cache_interleave_size
+            ):
+                self.parallel_config.cp_kv_cache_interleave_size = (
+                    self.parallel_config.dcp_kv_cache_interleave_size
+                )
+                logger.warning_once(
+                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
+                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
+                    "deprecated when PCP is fully supported."
+                )
+            assert (
+                self.parallel_config.cp_kv_cache_interleave_size
+                <= self.cache_config.block_size
+                and self.cache_config.block_size
+                % self.parallel_config.cp_kv_cache_interleave_size
+                == 0
+            ), (
+                f"Block_size({self.cache_config.block_size}) should be greater "
+                "than or equal to and divisible by cp_kv_cache_interleave_size "
+                f"({self.parallel_config.cp_kv_cache_interleave_size})."
+            )
+
         # Do this after all the updates to compilation_config.mode
         effective_dp_size = (
             self.parallel_config.data_parallel_size
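The assertion re-added above encodes two requirements when decode context parallel is enabled: block_size must be at least cp_kv_cache_interleave_size and be an exact multiple of it, and a dcp_kv_cache_interleave_size greater than 1 takes precedence over the cp value. A hedged standalone restatement of the same logic (the helper name is illustrative, not a vLLM API):

def check_dcp_interleave(
    block_size: int,
    cp_interleave: int,
    dcp_interleave: int,
    dcp_size: int,
) -> int:
    # Mirrors the re-added check: only relevant when DCP is in use.
    if dcp_size > 1:
        if dcp_interleave > 1 and cp_interleave != dcp_interleave:
            # In vLLM this override is accompanied by a warning_once.
            cp_interleave = dcp_interleave
        if not (cp_interleave <= block_size and block_size % cp_interleave == 0):
            raise ValueError(
                f"block_size ({block_size}) must be >= and divisible by "
                f"cp_kv_cache_interleave_size ({cp_interleave})"
            )
    return cp_interleave


# block_size=16 with interleave 8 passes; interleave 3 would fail (16 % 3 != 0).
assert check_dcp_interleave(16, cp_interleave=1, dcp_interleave=8, dcp_size=2) == 8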
@@ -1082,6 +1108,26 @@ class VllmConfig:
             # Default to enable HMA if not explicitly disabled by user or logic above.
             self.scheduler_config.disable_hybrid_kv_cache_manager = False
 
+        if self.cache_config.mamba_cache_mode == "align":
+            assert (
+                self.cache_config.block_size
+                <= self.scheduler_config.max_num_batched_tokens
+            ), (
+                "In Mamba cache align mode, block_size "
+                f"({self.cache_config.block_size}) must be <= "
+                "max_num_batched_tokens "
+                f"({self.scheduler_config.max_num_batched_tokens})."
+            )
+            if self.scheduler_config.long_prefill_token_threshold > 0:
+                assert (
+                    self.scheduler_config.long_prefill_token_threshold
+                    >= self.cache_config.block_size
+                )
+            assert not self.scheduler_config.disable_chunked_mm_input, (
+                "Chunked MM input is required because we need the flexibility to "
+                "schedule a multiple of block_size tokens even if they are in the "
+                "middle of a mm input"
+            )
         if self.compilation_config.debug_dump_path:
             self.compilation_config.debug_dump_path = (
                 self.compilation_config.debug_dump_path.absolute().expanduser()
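The restored Mamba align-mode checks require block_size to fit within max_num_batched_tokens, require any positive long_prefill_token_threshold to be at least block_size, and require chunked multimodal input to stay enabled. A standalone sketch of the same conditions, using plain ints and bools and an illustrative function name:

def check_mamba_align_constraints(
    block_size: int,
    max_num_batched_tokens: int,
    long_prefill_token_threshold: int,
    disable_chunked_mm_input: bool,
) -> None:
    # block_size must fit within one scheduling step.
    if block_size > max_num_batched_tokens:
        raise ValueError(
            f"In Mamba cache align mode, block_size ({block_size}) must be <= "
            f"max_num_batched_tokens ({max_num_batched_tokens})."
        )
    # A positive long-prefill threshold must cover at least one block.
    if 0 < long_prefill_token_threshold < block_size:
        raise ValueError("long_prefill_token_threshold must be >= block_size")
    # Chunked MM input gives the scheduler the flexibility to cut at block
    # boundaries even inside a multimodal input.
    if disable_chunked_mm_input:
        raise ValueError("Chunked MM input must stay enabled in align mode")


check_mamba_align_constraints(16, 8192, 0, False)  # passes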
@@ -1442,57 +1488,6 @@ class VllmConfig:
             f"compilation_config={self.compilation_config!r}"
         )
 
-    def validate_block_size(self) -> None:
-        """Validate block_size against DCP and mamba constraints.
-
-        Called after Platform.update_block_size_for_backend() has
-        finalised block_size, so that the checks see the real value
-        rather than the initial None sentinel.
-        """
-        block_size = self.cache_config.block_size
-        assert block_size is not None, (
-            "validate_block_size called before block_size was set"
-        )
-
-        # DCP interleave-size compatibility
-        if self.parallel_config.decode_context_parallel_size > 1:
-            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
-                self.parallel_config.cp_kv_cache_interleave_size
-                != self.parallel_config.dcp_kv_cache_interleave_size
-            ):
-                self.parallel_config.cp_kv_cache_interleave_size = (
-                    self.parallel_config.dcp_kv_cache_interleave_size
-                )
-                logger.warning_once(
-                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
-                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
-                    "deprecated when PCP is fully supported."
-                )
-            assert (
-                self.parallel_config.cp_kv_cache_interleave_size <= block_size
-                and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
-            ), (
-                f"Block_size({block_size}) should be greater "
-                "than or equal to and divisible by cp_kv_cache_interleave_size "
-                f"({self.parallel_config.cp_kv_cache_interleave_size})."
-            )
-
-        # Mamba cache align-mode constraints
-        if self.cache_config.mamba_cache_mode == "align":
-            assert block_size <= self.scheduler_config.max_num_batched_tokens, (
-                "In Mamba cache align mode, block_size "
-                f"({block_size}) must be <= "
-                "max_num_batched_tokens "
-                f"({self.scheduler_config.max_num_batched_tokens})."
-            )
-            if self.scheduler_config.long_prefill_token_threshold > 0:
-                assert self.scheduler_config.long_prefill_token_threshold >= block_size
-            assert not self.scheduler_config.disable_chunked_mm_input, (
-                "Chunked MM input is required because we need the flexibility "
-                "to schedule a multiple of block_size tokens even if they are "
-                "in the middle of a mm input"
-            )
-
     @model_validator(mode="after")
     def validate_mamba_block_size(self) -> "VllmConfig":
         if self.model_config is None:
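The removed validate_block_size helper made the ordering explicit: these checks are only meaningful once the platform has replaced the None sentinel with a real block size, which is why the revert runs them inline after current_platform.check_and_update_config(self). A toy illustration of that ordering concern (not vLLM code; function names are illustrative):

from typing import Optional


def finalize_block_size(block_size: Optional[int]) -> int:
    # Stand-in for the platform step that replaces the None sentinel.
    return 16 if block_size is None else block_size


def validate(block_size: Optional[int], interleave: int) -> None:
    # Running this before finalize_block_size() would trip the first assert.
    assert block_size is not None, "validate called before block_size was set"
    assert interleave <= block_size and block_size % interleave == 0


bs = finalize_block_size(None)  # sentinel replaced with a concrete value
validate(bs, interleave=8)      # safe: the checks see a real int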