[CI] Revert PRs 34818 and 33600 (#34979)
This commit is contained in:
@@ -915,6 +915,32 @@ class VllmConfig:
|
||||
)
|
||||
current_platform.check_and_update_config(self)
|
||||
|
||||
# If DCP, ensure the block size is right.
|
||||
if self.parallel_config.decode_context_parallel_size > 1:
|
||||
if self.parallel_config.dcp_kv_cache_interleave_size > 1 and (
|
||||
self.parallel_config.cp_kv_cache_interleave_size
|
||||
!= self.parallel_config.dcp_kv_cache_interleave_size
|
||||
):
|
||||
self.parallel_config.cp_kv_cache_interleave_size = (
|
||||
self.parallel_config.dcp_kv_cache_interleave_size
|
||||
)
|
||||
logger.warning_once(
|
||||
"cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
|
||||
"_interleave_size. And dcp-kv-cache-interleave-size will be "
|
||||
"deprecated when PCP is fully supported."
|
||||
)
|
||||
assert (
|
||||
self.parallel_config.cp_kv_cache_interleave_size
|
||||
<= self.cache_config.block_size
|
||||
and self.cache_config.block_size
|
||||
% self.parallel_config.cp_kv_cache_interleave_size
|
||||
== 0
|
||||
), (
|
||||
f"Block_size({self.cache_config.block_size}) should be greater "
|
||||
"than or equal to and divisible by cp_kv_cache_interleave_size "
|
||||
f"({self.parallel_config.cp_kv_cache_interleave_size})."
|
||||
)
|
||||
|
||||
# Do this after all the updates to compilation_config.mode
|
||||
effective_dp_size = (
|
||||
self.parallel_config.data_parallel_size
|
||||
@@ -1082,6 +1108,26 @@ class VllmConfig:
|
||||
# Default to enable HMA if not explicitly disabled by user or logic above.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = False
|
||||
|
||||
if self.cache_config.mamba_cache_mode == "align":
|
||||
assert (
|
||||
self.cache_config.block_size
|
||||
<= self.scheduler_config.max_num_batched_tokens
|
||||
), (
|
||||
"In Mamba cache align mode, block_size "
|
||||
f"({self.cache_config.block_size}) must be <= "
|
||||
"max_num_batched_tokens "
|
||||
f"({self.scheduler_config.max_num_batched_tokens})."
|
||||
)
|
||||
if self.scheduler_config.long_prefill_token_threshold > 0:
|
||||
assert (
|
||||
self.scheduler_config.long_prefill_token_threshold
|
||||
>= self.cache_config.block_size
|
||||
)
|
||||
assert not self.scheduler_config.disable_chunked_mm_input, (
|
||||
"Chunked MM input is required because we need the flexibility to "
|
||||
"schedule a multiple of block_size tokens even if they are in the "
|
||||
"middle of a mm input"
|
||||
)
|
||||
if self.compilation_config.debug_dump_path:
|
||||
self.compilation_config.debug_dump_path = (
|
||||
self.compilation_config.debug_dump_path.absolute().expanduser()
|
||||
@@ -1442,57 +1488,6 @@ class VllmConfig:
|
||||
f"compilation_config={self.compilation_config!r}"
|
||||
)
|
||||
|
||||
def validate_block_size(self) -> None:
    """Check the finalised block size against DCP and Mamba limits.

    Intended to run once ``cache_config.block_size`` holds its final
    value (i.e. after ``Platform.update_block_size_for_backend()``), so
    the comparisons below never see the initial ``None`` sentinel.

    Side effect: when decode context parallelism is enabled and
    ``dcp_kv_cache_interleave_size`` is > 1 and differs from
    ``cp_kv_cache_interleave_size``, the latter is overwritten with the
    former and a one-time warning is logged.

    Raises:
        AssertionError: if ``block_size`` is unset or any of the
            constraints checked below is violated.
    """
    block_size = self.cache_config.block_size
    assert block_size is not None, (
        "validate_block_size called before block_size was set"
    )

    # --- Decode context parallelism: interleave size vs. block size ---
    parallel_cfg = self.parallel_config
    if parallel_cfg.decode_context_parallel_size > 1:
        dcp_interleave = parallel_cfg.dcp_kv_cache_interleave_size
        if (
            dcp_interleave > 1
            and parallel_cfg.cp_kv_cache_interleave_size != dcp_interleave
        ):
            # The DCP-specific setting takes precedence over the generic one.
            parallel_cfg.cp_kv_cache_interleave_size = dcp_interleave
            logger.warning_once(
                "cp_kv_cache_interleave_size is overridden by dcp_kv_cache"
                "_interleave_size. And dcp-kv-cache-interleave-size will be "
                "deprecated when PCP is fully supported."
            )

        cp_interleave = parallel_cfg.cp_kv_cache_interleave_size
        fits_in_block = cp_interleave <= block_size
        assert fits_in_block and block_size % cp_interleave == 0, (
            f"Block_size({block_size}) should be greater "
            "than or equal to and divisible by cp_kv_cache_interleave_size "
            f"({cp_interleave})."
        )

    # --- Mamba cache "align" mode: block size vs. scheduler limits ---
    if self.cache_config.mamba_cache_mode != "align":
        return

    sched_cfg = self.scheduler_config
    token_budget = sched_cfg.max_num_batched_tokens
    assert block_size <= token_budget, (
        "In Mamba cache align mode, block_size "
        f"({block_size}) must be <= "
        "max_num_batched_tokens "
        f"({token_budget})."
    )

    # A threshold of 0 (or less) skips the long-prefill check.
    prefill_threshold = sched_cfg.long_prefill_token_threshold
    if prefill_threshold > 0:
        assert prefill_threshold >= block_size

    assert not sched_cfg.disable_chunked_mm_input, (
        "Chunked MM input is required because we need the flexibility "
        "to schedule a multiple of block_size tokens even if they are "
        "in the middle of a mm input"
    )
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_mamba_block_size(self) -> "VllmConfig":
|
||||
if self.model_config is None:
|
||||
|
||||
Reference in New Issue
Block a user