[Core] Support reset_prefix_cache (#12284)
This commit is contained in:
@@ -1132,6 +1132,9 @@ class LLM:
|
||||
def stop_profile(self) -> None:
    """Stop profiling by delegating to the underlying LLM engine."""
    engine = self.llm_engine
    engine.stop_profile()
|
||||
|
||||
def reset_prefix_cache(self) -> bool:
    """Reset the engine's prefix cache.

    Returns:
        The success flag reported by the underlying LLM engine.
    """
    was_reset = self.llm_engine.reset_prefix_cache()
    return was_reset
|
||||
|
||||
def sleep(self, level: int = 1):
|
||||
"""
|
||||
Put the engine to sleep. The engine should not process any requests.
|
||||
@@ -1150,6 +1153,7 @@ class LLM:
|
||||
where previous model weights are not needed. It reduces CPU memory
|
||||
pressure.
|
||||
"""
|
||||
self.reset_prefix_cache()
|
||||
self.llm_engine.sleep(level=level)
|
||||
|
||||
def wake_up(self):
|
||||
|
||||
@@ -518,6 +518,18 @@ TASK_HANDLERS: Dict[str, Dict[str, tuple]] = {
|
||||
},
|
||||
}
|
||||
|
||||
if envs.VLLM_SERVER_DEV_MODE:

    @router.post("/reset_prefix_cache")
    async def reset_prefix_cache(raw_request: Request):
        """Reset the prefix cache (dev-mode only endpoint).

        The API server does not verify that the reset actually
        succeeded; it always answers 200 after forwarding the call
        to the engine client.
        """
        logger.info("Resetting prefix cache...")
        client = engine_client(raw_request)
        await client.reset_prefix_cache()
        return Response(status_code=200)
|
||||
|
||||
|
||||
@router.post("/invocations")
|
||||
async def invocations(raw_request: Request):
|
||||
|
||||
Reference in New Issue
Block a user