[V1] Enable prefill optimization for Gemma3n (#22628)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
This commit is contained in:
@@ -145,12 +145,19 @@ class CacheConfig:
|
||||
|
||||
self._verify_cache_dtype()
|
||||
self._verify_prefix_caching()
|
||||
self._verify_kv_sharing_fast_prefill()
|
||||
|
||||
def metrics_info(self):
    """Return this config's attributes as a string-to-string mapping.

    Each entry of the instance ``__dict__`` is kept under its attribute
    name, with the value passed through ``str``. The resulting dict is the
    form used for the Prometheus metrics info.
    """
    stringified = {}
    for attr_name, attr_value in self.__dict__.items():
        stringified[attr_name] = str(attr_value)
    return stringified
|
||||
|
||||
def _verify_kv_sharing_fast_prefill(self) -> None:
|
||||
if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
|
||||
raise NotImplementedError(
|
||||
"Fast prefill optimization for KV sharing is not supported "
|
||||
"in V0 currently.")
|
||||
|
||||
@model_validator(mode='after')
|
||||
def _verify_args(self) -> Self:
|
||||
if self.cpu_offload_gb < 0:
|
||||
@@ -162,11 +169,6 @@ class CacheConfig:
|
||||
"GPU memory utilization must be less than 1.0. Got "
|
||||
f"{self.gpu_memory_utilization}.")
|
||||
|
||||
if self.kv_sharing_fast_prefill:
|
||||
logger.warning_once(
|
||||
"--kv-sharing-fast-prefill is currently work in progress "
|
||||
"and not functional yet (i.e. no prefill savings)")
|
||||
|
||||
return self
|
||||
|
||||
def _verify_cache_dtype(self) -> None:
|
||||
|
||||
Reference in New Issue
Block a user