[Misc] Rename think_start_str/think_end_str to reasoning_start_str/reasoning_end_str (#38242)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
(cherry picked from commit cbe7d18096)
This commit is contained in:
Chauncey
2026-04-02 00:56:45 +08:00
committed by khluu
parent 29982d48b3
commit 3a30a1a6a8
5 changed files with 55 additions and 49 deletions

View File

@@ -244,12 +244,12 @@ response = client.chat.completions.create(
Some models, such as [Qwen3](https://qwen.readthedocs.io/en/latest/getting_started/quickstart.html#thinking-budget), [DeepSeek](https://www.alibabacloud.com/help/en/model-studio/deep-thinking), and [Nemotron3](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16), support a thinking budget that limits the maximum number of tokens used for reasoning. Some models, such as [Qwen3](https://qwen.readthedocs.io/en/latest/getting_started/quickstart.html#thinking-budget), [DeepSeek](https://www.alibabacloud.com/help/en/model-studio/deep-thinking), and [Nemotron3](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16), support a thinking budget that limits the maximum number of tokens used for reasoning.
Token counting starts from `think_start_str`. Once the reasoning token count reaches the configured `thinking_token_budget`, vLLM forces the model to produce `think_end_str`, effectively terminating the reasoning block. Token counting starts from `reasoning_start_str`. Once the reasoning token count reaches the configured `thinking_token_budget`, vLLM forces the model to produce `reasoning_end_str`, effectively terminating the reasoning block.
To use this feature: To use this feature:
- `--reasoning-parser` enables reasoning extraction. - `--reasoning-parser` enables reasoning extraction.
- `--reasoning-config` defines the reasoning boundary tokens (e.g., `think_start_str`, `think_end_str`). - `--reasoning-config` defines the reasoning boundary tokens (e.g., `reasoning_start_str`, `reasoning_end_str`).
- `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit. - `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit.
If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`. If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`.
@@ -257,20 +257,20 @@ If `thinking_token_budget` is not specified, no explicit reasoning limit is appl
`--reasoning-config` accepts a JSON object corresponding to `--reasoning-config` accepts a JSON object corresponding to
[ReasoningConfig][vllm.config.ReasoningConfig] with the following fields: [ReasoningConfig][vllm.config.ReasoningConfig] with the following fields:
| Field | Type | Description | | Field | Type | Description |
|-------------------|----------------|--------------------------------------------------| |-----------------------|----------------|--------------------------------------------------|
| `think_start_str` | `str \| null` | String that marks the start of reasoning content | | `reasoning_start_str` | `str \| null` | String that marks the start of reasoning content |
| `think_end_str` | `str \| null` | String that marks the end of reasoning content | | `reasoning_end_str` | `str \| null` | String that marks the end of reasoning content |
!!! note !!! note
`think_end_str` can include a transition phrase before the think end token. For example, setting `think_end_str` to `"I have to give the solution based on the thinking directly now.</think>"` instructs the model to emit that phrase when the budget is exhausted, making the reasoning termination more natural. `reasoning_end_str` can include a transition phrase before the reasoning end token. For example, setting `reasoning_end_str` to `"I have to give the solution based on the reasoning directly now.</think>"` instructs the model to emit that phrase when the budget is exhausted, making the reasoning termination more natural.
### Online Serving ### Online Serving
```bash ```bash
vllm serve Qwen/Qwen3-0.6B \ vllm serve Qwen/Qwen3-0.6B \
--reasoning-parser qwen3 \ --reasoning-parser qwen3 \
--reasoning-config '{"think_start_str": "<think>", "think_end_str": "I have to give the solution based on the thinking directly now.</think>"}' --reasoning-config '{"reasoning_start_str": "<think>", "reasoning_end_str": "I have to give the solution based on the reasoning directly now.</think>"}'
``` ```
Then make a request with `thinking_token_budget` to limit the reasoning tokens: Then make a request with `thinking_token_budget` to limit the reasoning tokens:
@@ -298,8 +298,8 @@ from vllm.config import ReasoningConfig
llm = LLM( llm = LLM(
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
reasoning_config=ReasoningConfig( reasoning_config=ReasoningConfig(
think_start_str="<think>", reasoning_start_str="<think>",
think_end_str="I have to give the solution based on the thinking directly now.</think>", reasoning_end_str="I have to give the solution based on the reasoning directly now.</think>",
), ),
) )

View File

@@ -20,7 +20,7 @@ def server():
"--reasoning-parser", "--reasoning-parser",
"qwen3", "qwen3",
"--reasoning-config", "--reasoning-config",
'{"think_start_str": "<think>", "think_end_str": "</think>"}', '{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}',
"--max-model-len", "--max-model-len",
"2048", "2048",
"--enforce-eager", "--enforce-eager",

View File

@@ -103,8 +103,8 @@ class LogitsProcsRequestParams:
class MockReasoningConfig: class MockReasoningConfig:
"""Mock reasoning config for testing ThinkingTokenBudgetLogitsProcessor.""" """Mock reasoning config for testing ThinkingTokenBudgetLogitsProcessor."""
think_start_token_ids = [THINK_START_TOKEN_ID] reasoning_start_token_ids = [THINK_START_TOKEN_ID]
think_end_token_ids = [THINK_END_TOKEN_ID] reasoning_end_token_ids = [THINK_END_TOKEN_ID]
def _generate_fake_sampling_metadata( def _generate_fake_sampling_metadata(
@@ -491,7 +491,7 @@ def _thinking_budget_validate(
# Find if thinking has started in output tokens # Find if thinking has started in output tokens
thinking_started = False thinking_started = False
start_tokens = tb_processor.think_start_token_ids start_tokens = tb_processor.reasoning_start_token_ids
if len(start_tokens) > 0: if len(start_tokens) > 0:
for i in range(len(output_tokens) - len(start_tokens) + 1): for i in range(len(output_tokens) - len(start_tokens) + 1):
@@ -518,7 +518,7 @@ def _thinking_budget_validate(
) )
# Validate that only end tokens are allowed # Validate that only end tokens are allowed
end_tokens = tb_processor.think_end_token_ids end_tokens = tb_processor.reasoning_end_token_ids
if len(end_tokens) > 0: if len(end_tokens) > 0:
expected_end_token_id = end_tokens[ expected_end_token_id = end_tokens[
min(state["end_count"], len(end_tokens) - 1) min(state["end_count"], len(end_tokens) - 1)

View File

@@ -12,7 +12,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
class ReasoningConfig: class ReasoningConfig:
"""Configuration for reasoning models. """Configuration for reasoning models.
Set `think_start_str` and `think_end_str` to the strings that delimit Set `reasoning_start_str` and `reasoning_end_str` to the strings that delimit
the reasoning block (e.g. `"<think>"` and `"</think>"`). The the reasoning block (e.g. `"<think>"` and `"</think>"`). The
corresponding token IDs are derived automatically via corresponding token IDs are derived automatically via
`initialize_token_ids` and are not intended to be set directly. `initialize_token_ids` and are not intended to be set directly.
@@ -20,53 +20,55 @@ class ReasoningConfig:
# NOTE: These parameters are temporary, the intent is to derive them # NOTE: These parameters are temporary, the intent is to derive them
# automatically from the reasoning parser in a future version. # automatically from the reasoning parser in a future version.
think_start_str: str = "<think>" reasoning_start_str: str = "<think>"
"""String that indicates the start of reasoning.""" """String that indicates the start of reasoning."""
think_end_str: str = "</think>" reasoning_end_str: str = "</think>"
"""String that indicates the end of reasoning content.""" """String that indicates the end of reasoning content."""
_think_start_token_ids: list[int] | None = field( _reasoning_start_token_ids: list[int] | None = field(
default=None, init=False, repr=False default=None, init=False, repr=False
) )
"""Private backing field for `think_start_token_ids`. Set by """Private backing field for `reasoning_start_token_ids`. Set by
`initialize_token_ids`. Not intended to be configured directly.""" `initialize_token_ids`. Not intended to be configured directly."""
_think_end_token_ids: list[int] | None = field(default=None, init=False, repr=False) _reasoning_end_token_ids: list[int] | None = field(
"""Private backing field for `think_end_token_ids`. Set by default=None, init=False, repr=False
)
"""Private backing field for `reasoning_end_token_ids`. Set by
`initialize_token_ids`. Not intended to be configured directly.""" `initialize_token_ids`. Not intended to be configured directly."""
@property @property
def think_start_token_ids(self) -> list[int] | None: def reasoning_start_token_ids(self) -> list[int] | None:
"""Token IDs derived from `think_start_str`. Set automatically by """Token IDs derived from `reasoning_start_str`. Set automatically by
`initialize_token_ids`. Not intended to be configured directly.""" `initialize_token_ids`. Not intended to be configured directly."""
return self._think_start_token_ids return self._reasoning_start_token_ids
@property @property
def think_end_token_ids(self) -> list[int] | None: def reasoning_end_token_ids(self) -> list[int] | None:
"""Token IDs derived from `think_end_str`. Set automatically by """Token IDs derived from `reasoning_end_str`. Set automatically by
`initialize_token_ids`. Not intended to be configured directly.""" `initialize_token_ids`. Not intended to be configured directly."""
return self._think_end_token_ids return self._reasoning_end_token_ids
def initialize_token_ids(self, model_config: ModelConfig) -> None: def initialize_token_ids(self, model_config: ModelConfig) -> None:
"""Initialize reasoning token IDs from strings using the tokenizer.""" """Initialize reasoning token IDs from strings using the tokenizer."""
if ( if (
self._think_start_token_ids is not None self._reasoning_start_token_ids is not None
and self._think_end_token_ids is not None and self._reasoning_end_token_ids is not None
): ):
return return
tokenizer = cached_tokenizer_from_config(model_config=model_config) tokenizer = cached_tokenizer_from_config(model_config=model_config)
self._think_start_token_ids = tokenizer.encode( self._reasoning_start_token_ids = tokenizer.encode(
self.think_start_str, add_special_tokens=False self.reasoning_start_str, add_special_tokens=False
) )
self._think_end_token_ids = tokenizer.encode( self._reasoning_end_token_ids = tokenizer.encode(
self.think_end_str, add_special_tokens=False self.reasoning_end_str, add_special_tokens=False
) )
if not self._think_start_token_ids or not self._think_end_token_ids: if not self._reasoning_start_token_ids or not self._reasoning_end_token_ids:
raise ValueError( raise ValueError(
f"ReasoningConfig: failed to tokenize reasoning strings: " f"ReasoningConfig: failed to tokenize reasoning strings: "
f"think_start_str='{self.think_start_str}', " f"reasoning_start_str='{self.reasoning_start_str}', "
f"think_end_str='{self.think_end_str}'. " f"reasoning_end_str='{self.reasoning_end_str}'. "
"Ensure the strings are valid tokens in the model's vocabulary." "Ensure the strings are valid tokens in the model's vocabulary."
) )

View File

@@ -303,10 +303,12 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
# Check if thinking is enabled # Check if thinking is enabled
self.is_enabled = reasoning_config is not None self.is_enabled = reasoning_config is not None
self.think_start_token_ids = getattr( self.reasoning_start_token_ids = getattr(
reasoning_config, "think_start_token_ids", [] reasoning_config, "reasoning_start_token_ids", []
)
self.reasoning_end_token_ids = getattr(
reasoning_config, "reasoning_end_token_ids", []
) )
self.think_end_token_ids = getattr(reasoning_config, "think_end_token_ids", [])
self.pin_memory = is_pin_memory self.pin_memory = is_pin_memory
self.device = device self.device = device
@@ -357,15 +359,15 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
think_count = 0 think_count = 0
else: else:
last_start = self._find_last_sequence_index( last_start = self._find_last_sequence_index(
prompt_tok_ids, self.think_start_token_ids prompt_tok_ids, self.reasoning_start_token_ids
) )
last_end = self._find_last_sequence_index( last_end = self._find_last_sequence_index(
prompt_tok_ids, self.think_end_token_ids prompt_tok_ids, self.reasoning_end_token_ids
) )
in_think = last_start > last_end in_think = last_start > last_end
if in_think: if in_think:
think_count = len(prompt_tok_ids) - ( think_count = len(prompt_tok_ids) - (
last_start + len(self.think_start_token_ids) last_start + len(self.reasoning_start_token_ids)
) )
else: else:
think_count = 0 think_count = 0
@@ -405,8 +407,8 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
state["prev_output_length"] = current_length state["prev_output_length"] = current_length
# Check if new tokens contain think start or end sequences # Check if new tokens contain think start or end sequences
start_len = len(self.think_start_token_ids) start_len = len(self.reasoning_start_token_ids)
end_len = len(self.think_end_token_ids) end_len = len(self.reasoning_end_token_ids)
# Look for think sequences in recent tokens (including boundary) # Look for think sequences in recent tokens (including boundary)
# Check overlapping regions where sequences might span boundaries # Check overlapping regions where sequences might span boundaries
@@ -415,10 +417,10 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
# Find any think start/end sequences in recent tokens # Find any think start/end sequences in recent tokens
recent_start_pos = self._find_last_sequence_index( recent_start_pos = self._find_last_sequence_index(
recent_tokens, self.think_start_token_ids recent_tokens, self.reasoning_start_token_ids
) )
recent_end_pos = self._find_last_sequence_index( recent_end_pos = self._find_last_sequence_index(
recent_tokens, self.think_end_token_ids recent_tokens, self.reasoning_end_token_ids
) )
# Update state based on recent sequences # Update state based on recent sequences
@@ -469,7 +471,7 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
else: else:
# In end mode # In end mode
state["end_count"] += 1 state["end_count"] += 1
if state["end_count"] >= len(self.think_end_token_ids): if state["end_count"] >= len(self.reasoning_end_token_ids):
state.update( state.update(
{ {
"in_end": False, "in_end": False,
@@ -530,7 +532,9 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
state = self._state.get(i) state = self._state.get(i)
if state and state["in_end"]: if state and state["in_end"]:
self.mask[i] = True self.mask[i] = True
self.force_token_ids[i] = self.think_end_token_ids[state["end_count"]] self.force_token_ids[i] = self.reasoning_end_token_ids[
state["end_count"]
]
# Check in CPU first not to sync with GPU # Check in CPU first not to sync with GPU
has_active_thinking = any( has_active_thinking = any(