[Core] [Frontend] Make detokenization optional (#3749)

Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-04-03 21:52:18 -07:00
parent 498eb5cfa3
commit aabe8f40f2
3 changed files with 53 additions and 9 deletions
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -88,6 +88,7 @@ class SamplingParams:
            log probability of the sampled token, so there  may be up to
            `logprobs+1` elements in the response.
        prompt_logprobs: Number of log probabilities to return per prompt token.
+        detokenize: Whether to detokenize the output. Defaults to True.
        skip_special_tokens: Whether to skip special tokens in the output.
        spaces_between_special_tokens: Whether to add spaces between special
            tokens in the output.  Defaults to True.
@@ -118,6 +119,7 @@ class SamplingParams:
        min_tokens: int = 0,
        logprobs: Optional[int] = None,
        prompt_logprobs: Optional[int] = None,
+        detokenize: bool = True,
        skip_special_tokens: bool = True,
        spaces_between_special_tokens: bool = True,
        logits_processors: Optional[List[LogitsProcessor]] = None,
@@ -150,6 +152,10 @@ class SamplingParams:
        self.min_tokens = min_tokens
        self.logprobs = logprobs
        self.prompt_logprobs = prompt_logprobs
+        # NOTE: This parameter is only exposed at the engine level for now.
+        # It is not exposed in the OpenAI API server, as the OpenAI API does
+        # not support returning only a list of token IDs.
+        self.detokenize = detokenize
        self.skip_special_tokens = skip_special_tokens
        self.spaces_between_special_tokens = spaces_between_special_tokens
        self.logits_processors = logits_processors
@@ -210,6 +216,10 @@ class SamplingParams:
        if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
            raise ValueError(f"prompt_logprobs must be non-negative, got "
                             f"{self.prompt_logprobs}.")
+        if self.stop and not self.detokenize:
+            raise ValueError(
+                "stop strings are only supported when detokenize is True. "
+                "Set detokenize=True to use stop.")

    def _verify_beam_search(self) -> None:
        if self.best_of == 1: