[Sampler] Support returning final logprobs (#22387)

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
22quinn
2025-08-20 21:28:32 -07:00
committed by GitHub
parent f64ee61d9e
commit f571ff8eb6
7 changed files with 125 additions and 69 deletions

View File

@@ -257,11 +257,16 @@ def is_init_field(cls: ConfigType, name: str) -> bool:
TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal["raw_logprobs", "raw_logits", "processed_logprobs",
"processed_logits"]
MMEncoderTPMode = Literal["weights", "data"]
class LogprobsMode(enum.Enum):
    """Selects what values are returned as logprobs / prompt_logprobs.

    "Raw" members report values taken before any logit processors
    (e.g. bad words) are applied; "processed" members report values
    taken after all processors, including temperature and top_k/top_p.
    The string values match the previously accepted literal strings.
    """
    RAW_LOGITS = "raw_logits"  # logits before any logit processors
    RAW_LOGPROBS = "raw_logprobs"  # logprobs before any logit processors
    PROCESSED_LOGITS = "processed_logits"  # logits after all processors
    PROCESSED_LOGPROBS = "processed_logprobs"  # logprobs after all processors
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class ModelConfig:
@@ -363,12 +368,13 @@ class ModelConfig:
specified in `SamplingParams`. The default value comes from the default for the
OpenAI Chat Completions API. -1 means no cap, i.e. all (output_length *
vocab_size) logprobs are allowed to be returned and it may cause OOM."""
logprobs_mode: LogprobsMode = "raw_logprobs"
logprobs_mode: LogprobsMode = LogprobsMode.RAW_LOGPROBS
"""Indicates the content returned in the logprobs and prompt_logprobs.
Supported modes:
1) raw_logprobs, 2) processed_logprobs, 3) raw_logits, 4) processed_logits.
Raw means the values before applying logit processors, like bad words.
Processed means the values after applying such processors.
Raw means the values before applying any logit processors, like bad words.
Processed means the values after applying all processors, including
temperature and top_k/top_p.
"""
disable_sliding_window: bool = False
"""Whether to disable sliding window. If True, we will disable the sliding
@@ -2586,7 +2592,7 @@ class MultiModalConfig:
skip_mm_profiling: bool = False
"""
When enabled, skips multimodal memory profiling and only profiles with
When enabled, skips multimodal memory profiling and only profiles with
language backbone model during engine initialization.
This reduces engine startup time but shifts the responsibility to users for
@@ -2649,24 +2655,24 @@ class PoolerConfig:
## for embeddings models
normalize: Optional[bool] = None
"""
Whether to normalize the embeddings outputs.
Whether to normalize the embeddings outputs.
"""
dimensions: Optional[int] = None
"""
Reduce the dimensions of embeddings if model
Reduce the dimensions of embeddings if model
supports matryoshka representation.
"""
## for classification models
activation: Optional[bool] = None
"""
Whether to apply activation function to the classification outputs.
Whether to apply activation function to the classification outputs.
"""
## for reward models
softmax: Optional[bool] = None
"""
Whether to apply softmax to the reward outputs.
Whether to apply softmax to the reward outputs.
"""
step_tag_id: Optional[int] = None
"""
@@ -2692,9 +2698,9 @@ class PoolerConfig:
max_embed_len: Optional[int] = None
"""
Maximum input length allowed for embedding generation. When set, allows
Maximum input length allowed for embedding generation. When set, allows
inputs longer than max_embed_len to be accepted for embedding models.
This parameter enables accepting long inputs without requiring
This parameter enables accepting long inputs without requiring
VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
max_embed_len, it will be handled according to the original max_model_len
validation logic. Defaults to None (i.e. set to max_model_len).