[Model] Allow users to control, per request, whether reading from the prefix cache is skipped. (#28194)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
This commit is contained in:
wang.yuqi
2025-11-16 16:04:50 +08:00
committed by GitHub
parent d231876ce3
commit a55b64635c
5 changed files with 67 additions and 8 deletions

View File

@@ -127,6 +127,8 @@ class Request:
self.get_hash_new_full_blocks = partial(block_hasher, self)
self.block_hashes = self.get_hash_new_full_blocks()
self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache()
@classmethod
def from_engine_core_request(
cls,
@@ -180,6 +182,19 @@ class Request:
def num_output_tokens(self) -> int:
    """Return the number of entries currently held in ``_output_token_ids``."""
    generated = self._output_token_ids
    return len(generated)
def get_skip_reading_prefix_cache(self) -> bool:
    """Resolve the ``skip_reading_prefix_cache`` setting for this request.

    The value is taken from ``sampling_params`` when that object exists and
    its ``skip_reading_prefix_cache`` is not ``None``; otherwise from
    ``pooling_params`` under the same conditions. When neither supplies a
    non-``None`` value, ``False`` is returned.
    """
    # Sampling params are consulted first; pooling params are only read when
    # the sampling side gave no explicit value.
    sampling = self.sampling_params
    if sampling is not None:
        choice = sampling.skip_reading_prefix_cache
        if choice is not None:
            return choice

    pooling = self.pooling_params
    if pooling is not None:
        choice = pooling.skip_reading_prefix_cache
        if choice is not None:
            return choice

    # Nothing was set explicitly on either params object.
    return False
def is_finished(self) -> bool:
    """Return the result of ``RequestStatus.is_finished`` for ``self.status``."""
    status_check = RequestStatus.is_finished
    return status_check(self.status)