[CI] Revert PRs 34818 and 33600 (#34979)
@@ -19,6 +19,7 @@ else:
 
 logger = init_logger(__name__)
 
+BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]
 CacheDType = Literal[
     "auto",
     "bfloat16",
@@ -38,11 +39,13 @@ KVOffloadingBackend = Literal["native", "lmcache"]
 class CacheConfig:
     """Configuration for the KV cache."""
 
-    block_size: SkipValidation[int] = None  # type: ignore[assignment]
-    """Size of a contiguous cache block in number of tokens.
+    block_size: SkipValidation[BlockSize] = None  # type: ignore[assignment]
+    """Size of a contiguous cache block in number of tokens. On CUDA devices,
+    only block sizes up to 32 are supported.
 
-    This is None until the platform sets it. Always an int by the time
-    the engine starts."""
+    This config has no static default. If left unspecified by the user, it will
+    be set in `Platform.check_and_update_config()` based on the current
+    platform."""
     gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
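For reference, a minimal standalone sketch (not vLLM code) of the pattern the restored hunk describes: block_size is typed as a Literal of allowed values, starts as a None sentinel, and is only filled in later by a platform hook, with CUDA accepting sizes up to 32. ToyCacheConfig and pick_block_size are illustrative names.

from dataclasses import dataclass
from typing import Literal, Optional, get_args

BlockSize = Literal[1, 8, 16, 32, 64, 128, 256]


@dataclass
class ToyCacheConfig:
    # None sentinel until a platform hook picks a concrete value.
    block_size: Optional[BlockSize] = None


def pick_block_size(cfg: ToyCacheConfig, is_cuda: bool) -> None:
    # Stand-in for the platform hook that fills in the default.
    if cfg.block_size is None:
        cfg.block_size = 16 if is_cuda else 128
    # Per the restored docstring, CUDA only supports block sizes up to 32.
    if is_cuda and cfg.block_size > 32:
        raise ValueError(f"block_size={cfg.block_size} is not supported on CUDA")
    assert cfg.block_size in get_args(BlockSize)


cfg = ToyCacheConfig()
pick_block_size(cfg, is_cuda=True)
print(cfg.block_size)  # -> 16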
@@ -915,6 +915,32 @@ class VllmConfig:
         )
         current_platform.check_and_update_config(self)
 
+        # If DCP, ensure the block size is right.
+        if self.parallel_config.decode_context_parallel_size > 1:
+            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
+                self.parallel_config.cp_kv_cache_interleave_size
+                != self.parallel_config.dcp_kv_cache_interleave_size
+            ):
+                self.parallel_config.cp_kv_cache_interleave_size = (
+                    self.parallel_config.dcp_kv_cache_interleave_size
+                )
+                logger.warning_once(
+                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
+                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
+                    "deprecated when PCP is fully supported."
+                )
+            assert (
+                self.parallel_config.cp_kv_cache_interleave_size
+                <= self.cache_config.block_size
+                and self.cache_config.block_size
+                % self.parallel_config.cp_kv_cache_interleave_size
+                == 0
+            ), (
+                f"Block_size({self.cache_config.block_size}) should be greater "
+                "than or equal to and divisible by cp_kv_cache_interleave_size "
+                f"({self.parallel_config.cp_kv_cache_interleave_size})."
+            )
+
         # Do this after all the updates to compilation_config.mode
         effective_dp_size = (
             self.parallel_config.data_parallel_size
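The assertion re-added above encodes two requirements when decode context parallel is enabled: block_size must be at least cp_kv_cache_interleave_size and be an exact multiple of it, and a dcp_kv_cache_interleave_size greater than 1 takes precedence over the cp value. A hedged standalone restatement of the same logic (the helper name is illustrative, not a vLLM API):

def check_dcp_interleave(
    block_size: int,
    cp_interleave: int,
    dcp_interleave: int,
    dcp_size: int,
) -> int:
    # Mirrors the re-added check: only relevant when DCP is in use.
    if dcp_size > 1:
        if dcp_interleave > 1 and cp_interleave != dcp_interleave:
            # In vLLM this override is accompanied by a warning_once.
            cp_interleave = dcp_interleave
        if not (cp_interleave <= block_size and block_size % cp_interleave == 0):
            raise ValueError(
                f"block_size ({block_size}) must be >= and divisible by "
                f"cp_kv_cache_interleave_size ({cp_interleave})"
            )
    return cp_interleave


# block_size=16 with interleave 8 passes; interleave 3 would fail (16 % 3 != 0).
assert check_dcp_interleave(16, cp_interleave=1, dcp_interleave=8, dcp_size=2) == 8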
@@ -1082,6 +1108,26 @@ class VllmConfig:
             # Default to enable HMA if not explicitly disabled by user or logic above.
             self.scheduler_config.disable_hybrid_kv_cache_manager = False
 
+        if self.cache_config.mamba_cache_mode == "align":
+            assert (
+                self.cache_config.block_size
+                <= self.scheduler_config.max_num_batched_tokens
+            ), (
+                "In Mamba cache align mode, block_size "
+                f"({self.cache_config.block_size}) must be <= "
+                "max_num_batched_tokens "
+                f"({self.scheduler_config.max_num_batched_tokens})."
+            )
+            if self.scheduler_config.long_prefill_token_threshold > 0:
+                assert (
+                    self.scheduler_config.long_prefill_token_threshold
+                    >= self.cache_config.block_size
+                )
+            assert not self.scheduler_config.disable_chunked_mm_input, (
+                "Chunked MM input is required because we need the flexibility to "
+                "schedule a multiple of block_size tokens even if they are in the "
+                "middle of a mm input"
+            )
         if self.compilation_config.debug_dump_path:
             self.compilation_config.debug_dump_path = (
                 self.compilation_config.debug_dump_path.absolute().expanduser()
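The restored Mamba align-mode checks require block_size to fit within max_num_batched_tokens, require any positive long_prefill_token_threshold to be at least block_size, and require chunked multimodal input to stay enabled. A standalone sketch of the same conditions, using plain ints and bools and an illustrative function name:

def check_mamba_align_constraints(
    block_size: int,
    max_num_batched_tokens: int,
    long_prefill_token_threshold: int,
    disable_chunked_mm_input: bool,
) -> None:
    # block_size must fit within one scheduling step.
    if block_size > max_num_batched_tokens:
        raise ValueError(
            f"In Mamba cache align mode, block_size ({block_size}) must be <= "
            f"max_num_batched_tokens ({max_num_batched_tokens})."
        )
    # A positive long-prefill threshold must cover at least one block.
    if 0 < long_prefill_token_threshold < block_size:
        raise ValueError("long_prefill_token_threshold must be >= block_size")
    # Chunked MM input gives the scheduler the flexibility to cut at block
    # boundaries even inside a multimodal input.
    if disable_chunked_mm_input:
        raise ValueError("Chunked MM input must stay enabled in align mode")


check_mamba_align_constraints(16, 8192, 0, False)  # passes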
@@ -1442,57 +1488,6 @@ class VllmConfig:
             f"compilation_config={self.compilation_config!r}"
         )
 
-    def validate_block_size(self) -> None:
-        """Validate block_size against DCP and mamba constraints.
-
-        Called after Platform.update_block_size_for_backend() has
-        finalised block_size, so that the checks see the real value
-        rather than the initial None sentinel.
-        """
-        block_size = self.cache_config.block_size
-        assert block_size is not None, (
-            "validate_block_size called before block_size was set"
-        )
-
-        # DCP interleave-size compatibility
-        if self.parallel_config.decode_context_parallel_size > 1:
-            if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
-                self.parallel_config.cp_kv_cache_interleave_size
-                != self.parallel_config.dcp_kv_cache_interleave_size
-            ):
-                self.parallel_config.cp_kv_cache_interleave_size = (
-                    self.parallel_config.dcp_kv_cache_interleave_size
-                )
-                logger.warning_once(
-                    "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
-                    "_interleave_size. And dcp-kv-cache-interleave-size will be "
-                    "deprecated when PCP is fully supported."
-                )
-            assert (
-                self.parallel_config.cp_kv_cache_interleave_size <= block_size
-                and block_size % self.parallel_config.cp_kv_cache_interleave_size == 0
-            ), (
-                f"Block_size({block_size}) should be greater "
-                "than or equal to and divisible by cp_kv_cache_interleave_size "
-                f"({self.parallel_config.cp_kv_cache_interleave_size})."
-            )
-
-        # Mamba cache align-mode constraints
-        if self.cache_config.mamba_cache_mode == "align":
-            assert block_size <= self.scheduler_config.max_num_batched_tokens, (
-                "In Mamba cache align mode, block_size "
-                f"({block_size}) must be <= "
-                "max_num_batched_tokens "
-                f"({self.scheduler_config.max_num_batched_tokens})."
-            )
-            if self.scheduler_config.long_prefill_token_threshold > 0:
-                assert self.scheduler_config.long_prefill_token_threshold >= block_size
-            assert not self.scheduler_config.disable_chunked_mm_input, (
-                "Chunked MM input is required because we need the flexibility "
-                "to schedule a multiple of block_size tokens even if they are "
-                "in the middle of a mm input"
-            )
-
     @model_validator(mode="after")
     def validate_mamba_block_size(self) -> "VllmConfig":
         if self.model_config is None:
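The removed validate_block_size helper made the ordering explicit: these checks are only meaningful once the platform has replaced the None sentinel with a real block size, which is why the revert runs them inline after current_platform.check_and_update_config(self). A toy illustration of that ordering concern (not vLLM code; function names are illustrative):

from typing import Optional


def finalize_block_size(block_size: Optional[int]) -> int:
    # Stand-in for the platform step that replaces the None sentinel.
    return 16 if block_size is None else block_size


def validate(block_size: Optional[int], interleave: int) -> None:
    # Running this before finalize_block_size() would trip the first assert.
    assert block_size is not None, "validate called before block_size was set"
    assert interleave <= block_size and block_size % interleave == 0


bs = finalize_block_size(None)  # sentinel replaced with a concrete value
validate(bs, interleave=8)      # safe: the checks see a real int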