[V1] Enable prefill optimization for Gemma3n (#22628)

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-28 14:54:30 -07:00
parent 7ffbf27239
commit cb293f6a79
9 changed files with 591 additions and 236 deletions
--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -145,12 +145,19 @@ class CacheConfig:

        self._verify_cache_dtype()
        self._verify_prefix_caching()
+        self._verify_kv_sharing_fast_prefill()

    def metrics_info(self):
        # convert cache_config to dict(key: str, value: str) for prometheus
        # metrics info
        return {key: str(value) for key, value in self.__dict__.items()}

+    def _verify_kv_sharing_fast_prefill(self) -> None:
+        if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
+            raise NotImplementedError(
+                "Fast prefill optimization for KV sharing is not supported "
+                "in V0 currently.")
+
    @model_validator(mode='after')
    def _verify_args(self) -> Self:
        if self.cpu_offload_gb < 0:
@@ -162,11 +169,6 @@ class CacheConfig:
                "GPU memory utilization must be less than 1.0. Got "
                f"{self.gpu_memory_utilization}.")

-        if self.kv_sharing_fast_prefill:
-            logger.warning_once(
-                "--kv-sharing-fast-prefill is currently work in progress "
-                "and not functional yet (i.e. no prefill savings)")
-
        return self

    def _verify_cache_dtype(self) -> None: