[V1][Hybrid] Mamba Prefix Caching with align mode (#30877)
Signed-off-by: huanghaoyan.hhy <huanghaoyan.hhy@alibaba-inc.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
@@ -999,6 +999,17 @@ class VllmConfig:
|
||||
# Default to enable HMA if not explicitly disabled by user or logic above.
|
||||
self.scheduler_config.disable_hybrid_kv_cache_manager = False
|
||||
|
||||
if self.cache_config.mamba_cache_mode == "align":
|
||||
if self.scheduler_config.long_prefill_token_threshold > 0:
|
||||
assert (
|
||||
self.scheduler_config.long_prefill_token_threshold
|
||||
>= self.cache_config.block_size
|
||||
)
|
||||
assert not self.scheduler_config.disable_chunked_mm_input, (
|
||||
"Chunked MM input is required because we need the flexibility to "
|
||||
"schedule a multiple of block_size tokens even if they are in the "
|
||||
"middle of a mm input"
|
||||
)
|
||||
if self.compilation_config.debug_dump_path:
|
||||
self.compilation_config.debug_dump_path = (
|
||||
self.compilation_config.debug_dump_path.absolute().expanduser()
|
||||
|
||||
Reference in New Issue
Block a user