[Hardware][Gaudi][Feature] Support Contiguous Cache Fetch (#12139)

Signed-off-by: yuzhou <yuzhou@habana.ai> Signed-off-by: zhouyu5 <yu.zhou@intel.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2025-02-19 11:40:19 +08:00
parent 00b69c2d27
commit d0a7a2769d
4 changed files with 80 additions and 47 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -89,6 +89,7 @@ if TYPE_CHECKING:
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
    VLLM_CUDART_SO_PATH: Optional[str] = None
+    VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True


 def get_default_cache_root():
@@ -585,6 +586,13 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # specify the path through environment variable VLLM_CUDART_SO_PATH.
    "VLLM_CUDART_SO_PATH":
    lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
+
+    # Use contiguous cache fetch to avoid the costly gather operation on
+    # Gaudi3. Only applicable to the HPU contiguous cache. Read from the
+    # VLLM_CONTIGUOUS_PA variable; on when "1"/"true" (any case, default).
+    "VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH":
+    lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in
+    ("1", "true"),
 }

 # end-env-vars-definition