[v1][core] Support for attention-free models (#20811)

Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
This commit is contained in:
Christian Pinto
2025-07-15 15:20:01 +01:00
committed by GitHub
parent 56fe4bedd6
commit 4ffd963fa0
3 changed files with 33 additions and 3 deletions

View File

@@ -139,7 +139,13 @@ class EngineCore:
         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention free models don't need memory for kv cache
+            available_gpu_memory = [0] * len(kv_cache_specs)
         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size