[Model] Allow users to control skip reading cache per request. (#28194)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Authored by wang.yuqi on 2025-11-16 16:04:50 +08:00; committed by GitHub.
parent d231876ce3
commit a55b64635c
5 changed files with 67 additions and 8 deletions

@@ -254,6 +254,8 @@ class SamplingParams(
     generated token can complete the sequence."""
     _bad_words_token_ids: list[list[int]] | None = None
+    skip_reading_prefix_cache: bool | None = None
+
     @staticmethod
     def from_optional(
         n: int | None = 1,
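
The new field makes skipping prefix-cache reads a per-request knob. A minimal usage sketch (the model name is a placeholder; this assumes the offline LLM entrypoint, with the engine-side handling of the flag living in the scheduler changes of this commit that are not shown here):

from vllm import LLM, SamplingParams

# Placeholder model; any model works with prefix caching enabled.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

params = SamplingParams(
    max_tokens=16,
    prompt_logprobs=1,  # request per-prompt-token logprobs
    skip_reading_prefix_cache=True,  # the new per-request knob
)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)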
@@ -414,6 +416,12 @@ class SamplingParams(
             self.structured_outputs = self.guided_decoding
             self.guided_decoding = None
 
+        if self.skip_reading_prefix_cache is None:
+            # With prefix caching enabled, a cache hit means prompt logprobs
+            # are returned for fewer than n_prompt_tokens tokens, so skip
+            # reading the cache for this request.
+            self.skip_reading_prefix_cache = self.prompt_logprobs is not None
+
     def _verify_args(self) -> None:
         if not isinstance(self.n, int):
             raise ValueError(f"n must be an int, but is of type {type(self.n)}")
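
A sketch of the resulting defaulting behavior (assuming this __post_init__ runs on construction, as it does for vLLM's SamplingParams struct):

from vllm import SamplingParams

# Left unset, the flag defaults to True exactly when prompt logprobs are
# requested, because a prefix-cache hit would truncate them.
assert SamplingParams(prompt_logprobs=1).skip_reading_prefix_cache is True
assert SamplingParams().skip_reading_prefix_cache is False

# An explicit value is respected as-is.
p = SamplingParams(prompt_logprobs=1, skip_reading_prefix_cache=False)
assert p.skip_reading_prefix_cache is False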