[Model] Allow users to control skip reading cache per request. (#28194)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
Authored by wang.yuqi on 2025-11-16 16:04:50 +08:00; committed by GitHub.
parent d231876ce3
commit a55b64635c
5 changed files with 67 additions and 8 deletions

@@ -254,6 +254,8 @@ class SamplingParams(
     generated token can complete the sequence."""
     _bad_words_token_ids: list[list[int]] | None = None
+    skip_reading_prefix_cache: bool | None = None
+
     @staticmethod
     def from_optional(
         n: int | None = 1,
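
The new field makes skipping prefix-cache reads a per-request knob. A minimal usage sketch (the model name is a placeholder; this assumes the offline LLM entrypoint, with the engine-side handling of the flag living in the scheduler changes of this commit that are not shown here):

from vllm import LLM, SamplingParams

# Placeholder model; any model works with prefix caching enabled.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

params = SamplingParams(
    max_tokens=16,
    prompt_logprobs=1,  # request per-prompt-token logprobs
    skip_reading_prefix_cache=True,  # the new per-request knob
)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)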
@@ -414,6 +416,12 @@ class SamplingParams(
             self.structured_outputs = self.guided_decoding
             self.guided_decoding = None
 
+        if self.skip_reading_prefix_cache is None:
+            # With prefix caching enabled, a cache hit means prompt logprobs
+            # are returned for fewer than n_prompt_tokens tokens, so skip
+            # reading the cache for this request.
+            self.skip_reading_prefix_cache = self.prompt_logprobs is not None
+
     def _verify_args(self) -> None:
         if not isinstance(self.n, int):
             raise ValueError(f"n must be an int, but is of type {type(self.n)}")
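
A sketch of the resulting defaulting behavior (assuming this __post_init__ runs on construction, as it does for vLLM's SamplingParams struct):

from vllm import SamplingParams

# Left unset, the flag defaults to True exactly when prompt logprobs are
# requested, because a prefix-cache hit would truncate them.
assert SamplingParams(prompt_logprobs=1).skip_reading_prefix_cache is True
assert SamplingParams().skip_reading_prefix_cache is False

# An explicit value is respected as-is.
p = SamplingParams(prompt_logprobs=1, skip_reading_prefix_cache=False)
assert p.skip_reading_prefix_cache is False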