[Core] Add optional flags to check for repetitive token patterns in engine output (#35451)
Signed-off-by: aykoppol <aykoppol+git@gmail.com>
@@ -38,6 +38,7 @@ from vllm.logprobs import Logprob
 from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs
 from vllm.sampling_params import (
     BeamSearchParams,
+    RepetitionDetectionParams,
     RequestOutputKind,
     SamplingParams,
     StructuredOutputsParams,
@@ -336,6 +337,16 @@ class ChatCompletionRequest(OpenAIBaseModel):
         ),
     )
+
+    repetition_detection: RepetitionDetectionParams | None = Field(
+        default=None,
+        description="Parameters for detecting repetitive N-gram patterns "
+        "in output tokens. If such repetition is detected, generation will "
+        "be ended early. LLMs can sometimes generate repetitive, unhelpful "
+        "token patterns, stopping only when they hit the maximum output length "
+        "(e.g. 'abcdabcdabcd...' or the same emoji over and over). This feature "
+        "can detect such behavior and terminate early, saving time and tokens.",
+    )
 
     # --8<-- [end:chat-completion-extra-params]
 
     def build_chat_params(
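For context, a request would opt into this check by sending the new field in the chat-completions body. Below is a minimal sketch using the OpenAI Python client against a vLLM server; the keys inside repetition_detection (ngram_size, max_repeats) are hypothetical placeholders, since the actual fields of RepetitionDetectionParams are not shown in this diff.

# Sketch only: the inner keys of repetition_detection are hypothetical,
# not taken from this commit.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="some-model",
    messages=[{"role": "user", "content": "List the alphabet forever."}],
    # vLLM-specific field added by this commit; inner keys are assumed.
    extra_body={"repetition_detection": {"ngram_size": 4, "max_repeats": 8}},
)
print(resp.choices[0].message.content)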
@@ -499,6 +510,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             allowed_token_ids=self.allowed_token_ids,
             extra_args=extra_args or None,
             skip_clone=True,  # Created fresh per request, safe to skip clone
+            repetition_detection=self.repetition_detection,
         )
 
     @model_validator(mode="before")
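Since RepetitionDetectionParams is imported from vllm.sampling_params, the same object presumably flows through to SamplingParams for offline use. The sketch below assumes SamplingParams accepts a repetition_detection keyword (this hunk only shows the serving-layer forwarding inside build_chat_params) and again uses hypothetical field names.

# Sketch under assumptions: SamplingParams taking repetition_detection directly,
# and the RepetitionDetectionParams fields shown here, are not confirmed by this diff.
from vllm import LLM, SamplingParams
from vllm.sampling_params import RepetitionDetectionParams

llm = LLM(model="some-model")
params = SamplingParams(
    max_tokens=512,
    # ngram_size / max_repeats are hypothetical field names.
    repetition_detection=RepetitionDetectionParams(ngram_size=4, max_repeats=8),
)
outputs = llm.generate(["List the alphabet forever."], params)
print(outputs[0].outputs[0].text)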