diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 4a9f279e0..9d41185f1 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -244,12 +244,12 @@ response = client.chat.completions.create( Some models, such as [Qwen3](https://qwen.readthedocs.io/en/latest/getting_started/quickstart.html#thinking-budget), [DeepSeek](https://www.alibabacloud.com/help/en/model-studio/deep-thinking), and [Nemotron3](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16), support a thinking budget that limits the maximum number of tokens used for reasoning. -Token counting starts from `think_start_str`. Once the reasoning token count reaches the configured `thinking_token_budget`, vLLM forces the model to produce `think_end_str`, effectively terminating the reasoning block. +Token counting starts from `reasoning_start_str`. Once the reasoning token count reaches the configured `thinking_token_budget`, vLLM forces the model to produce `reasoning_end_str`, effectively terminating the reasoning block. To use this feature: - `--reasoning-parser` enables reasoning extraction. -- `--reasoning-config` defines the reasoning boundary tokens (e.g., `think_start_str`, `think_end_str`). +- `--reasoning-config` defines the reasoning boundary tokens (e.g., `reasoning_start_str`, `reasoning_end_str`). - `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit. If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`. 
@@ -257,20 +257,20 @@ If `thinking_token_budget` is not specified, no explicit reasoning limit is appl `--reasoning-config` accepts a JSON object corresponding to [ReasoningConfig][vllm.config.ReasoningConfig] with the following fields: -| Field | Type | Description | -|-------------------|----------------|--------------------------------------------------| -| `think_start_str` | `str \| null` | String that marks the start of reasoning content | -| `think_end_str` | `str \| null` | String that marks the end of reasoning content | +| Field | Type | Description | +|-----------------------|----------------|--------------------------------------------------| +| `reasoning_start_str` | `str \| null` | String that marks the start of reasoning content | +| `reasoning_end_str` | `str \| null` | String that marks the end of reasoning content | !!! note - `think_end_str` can include a transition phrase before the think end token. For example, setting `think_end_str` to `"I have to give the solution based on the thinking directly now.</think>"` instructs the model to emit that phrase when the budget is exhausted, making the reasoning termination more natural. + `reasoning_end_str` can include a transition phrase before the reasoning end token. For example, setting `reasoning_end_str` to `"I have to give the solution based on the reasoning directly now.</think>"` instructs the model to emit that phrase when the budget is exhausted, making the reasoning termination more natural. 
### Online Serving ```bash vllm serve Qwen/Qwen3-0.6B \ --reasoning-parser qwen3 \ - --reasoning-config '{"think_start_str": "<think>", "think_end_str": "I have to give the solution based on the thinking directly now.</think>"}' + --reasoning-config '{"reasoning_start_str": "<think>", "reasoning_end_str": "I have to give the solution based on the reasoning directly now.</think>"}' ``` Then make a request with `thinking_token_budget` to limit the reasoning tokens: @@ -298,8 +298,8 @@ from vllm.config import ReasoningConfig llm = LLM( model="Qwen/Qwen3-0.6B", reasoning_config=ReasoningConfig( - think_start_str="<think>", - think_end_str="I have to give the solution based on the thinking directly now.</think>", + reasoning_start_str="<think>", + reasoning_end_str="I have to give the solution based on the reasoning directly now.</think>", ), ) diff --git a/tests/v1/entrypoints/openai/test_thinking_token_budget.py b/tests/v1/entrypoints/openai/test_thinking_token_budget.py index f574b07b6..b3f6d53ab 100644 --- a/tests/v1/entrypoints/openai/test_thinking_token_budget.py +++ b/tests/v1/entrypoints/openai/test_thinking_token_budget.py @@ -20,7 +20,7 @@ def server(): "--reasoning-parser", "qwen3", "--reasoning-config", - '{"think_start_str": "<think>", "think_end_str": "</think>"}', + '{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}', "--max-model-len", "2048", "--enforce-eager", diff --git a/tests/v1/logits_processors/test_correctness.py b/tests/v1/logits_processors/test_correctness.py index 792168877..bc2cc1720 100644 --- a/tests/v1/logits_processors/test_correctness.py +++ b/tests/v1/logits_processors/test_correctness.py @@ -103,8 +103,8 @@ class LogitsProcsRequestParams: class MockReasoningConfig: """Mock reasoning config for testing ThinkingTokenBudgetLogitsProcessor.""" - think_start_token_ids = [THINK_START_TOKEN_ID] - think_end_token_ids = [THINK_END_TOKEN_ID] + reasoning_start_token_ids = [THINK_START_TOKEN_ID] + reasoning_end_token_ids = [THINK_END_TOKEN_ID] def _generate_fake_sampling_metadata( @@ -491,7 +491,7 @@ def 
_thinking_budget_validate( # Find if thinking has started in output tokens thinking_started = False - start_tokens = tb_processor.think_start_token_ids + start_tokens = tb_processor.reasoning_start_token_ids if len(start_tokens) > 0: for i in range(len(output_tokens) - len(start_tokens) + 1): @@ -518,7 +518,7 @@ def _thinking_budget_validate( ) # Validate that only end tokens are allowed - end_tokens = tb_processor.think_end_token_ids + end_tokens = tb_processor.reasoning_end_token_ids if len(end_tokens) > 0: expected_end_token_id = end_tokens[ min(state["end_count"], len(end_tokens) - 1) diff --git a/vllm/config/reasoning.py b/vllm/config/reasoning.py index 872e05580..be1e2b6da 100644 --- a/vllm/config/reasoning.py +++ b/vllm/config/reasoning.py @@ -12,7 +12,7 @@ from vllm.tokenizers import cached_tokenizer_from_config class ReasoningConfig: """Configuration for reasoning models. - Set `think_start_str` and `think_end_str` to the strings that delimit + Set `reasoning_start_str` and `reasoning_end_str` to the strings that delimit the reasoning block (e.g. `"<think>"` and `"</think>"`). The corresponding token IDs are derived automatically via `initialize_token_ids` and are not intended to be set directly. @@ -20,53 +20,55 @@ class ReasoningConfig: # NOTE: These parameters are temporary, the intent is to derive them # automatically from the reasoning parser in a future version. - think_start_str: str = "<think>" + reasoning_start_str: str = "<think>" """String that indicates the start of reasoning.""" - think_end_str: str = "</think>" + reasoning_end_str: str = "</think>" """String that indicates the end of reasoning content.""" - _think_start_token_ids: list[int] | None = field( + _reasoning_start_token_ids: list[int] | None = field( default=None, init=False, repr=False ) - """Private backing field for `think_start_token_ids`. Set by + """Private backing field for `reasoning_start_token_ids`. Set by `initialize_token_ids`. 
Not intended to be configured directly.""" - _think_end_token_ids: list[int] | None = field(default=None, init=False, repr=False) - """Private backing field for `think_end_token_ids`. Set by + _reasoning_end_token_ids: list[int] | None = field( + default=None, init=False, repr=False + ) + """Private backing field for `reasoning_end_token_ids`. Set by `initialize_token_ids`. Not intended to be configured directly.""" @property - def think_start_token_ids(self) -> list[int] | None: - """Token IDs derived from `think_start_str`. Set automatically by + def reasoning_start_token_ids(self) -> list[int] | None: + """Token IDs derived from `reasoning_start_str`. Set automatically by `initialize_token_ids`. Not intended to be configured directly.""" - return self._think_start_token_ids + return self._reasoning_start_token_ids @property - def think_end_token_ids(self) -> list[int] | None: - """Token IDs derived from `think_end_str`. Set automatically by + def reasoning_end_token_ids(self) -> list[int] | None: + """Token IDs derived from `reasoning_end_str`. Set automatically by `initialize_token_ids`. 
Not intended to be configured directly.""" - return self._think_end_token_ids + return self._reasoning_end_token_ids def initialize_token_ids(self, model_config: ModelConfig) -> None: """Initialize reasoning token IDs from strings using the tokenizer.""" if ( - self._think_start_token_ids is not None - and self._think_end_token_ids is not None + self._reasoning_start_token_ids is not None + and self._reasoning_end_token_ids is not None ): return tokenizer = cached_tokenizer_from_config(model_config=model_config) - self._think_start_token_ids = tokenizer.encode( - self.think_start_str, add_special_tokens=False + self._reasoning_start_token_ids = tokenizer.encode( + self.reasoning_start_str, add_special_tokens=False ) - self._think_end_token_ids = tokenizer.encode( - self.think_end_str, add_special_tokens=False + self._reasoning_end_token_ids = tokenizer.encode( + self.reasoning_end_str, add_special_tokens=False ) - if not self._think_start_token_ids or not self._think_end_token_ids: + if not self._reasoning_start_token_ids or not self._reasoning_end_token_ids: raise ValueError( f"ReasoningConfig: failed to tokenize reasoning strings: " - f"think_start_str='{self.think_start_str}', " - f"think_end_str='{self.think_end_str}'. " + f"reasoning_start_str='{self.reasoning_start_str}', " + f"reasoning_end_str='{self.reasoning_end_str}'. " "Ensure the strings are valid tokens in the model's vocabulary." 
) diff --git a/vllm/v1/sample/logits_processor/builtin.py b/vllm/v1/sample/logits_processor/builtin.py index c92f33402..0d9b67017 100644 --- a/vllm/v1/sample/logits_processor/builtin.py +++ b/vllm/v1/sample/logits_processor/builtin.py @@ -303,10 +303,12 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor): # Check if thinking is enabled self.is_enabled = reasoning_config is not None - self.think_start_token_ids = getattr( - reasoning_config, "think_start_token_ids", [] + self.reasoning_start_token_ids = getattr( + reasoning_config, "reasoning_start_token_ids", [] + ) + self.reasoning_end_token_ids = getattr( + reasoning_config, "reasoning_end_token_ids", [] ) - self.think_end_token_ids = getattr(reasoning_config, "think_end_token_ids", []) self.pin_memory = is_pin_memory self.device = device @@ -357,15 +359,15 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor): think_count = 0 else: last_start = self._find_last_sequence_index( - prompt_tok_ids, self.think_start_token_ids + prompt_tok_ids, self.reasoning_start_token_ids ) last_end = self._find_last_sequence_index( - prompt_tok_ids, self.think_end_token_ids + prompt_tok_ids, self.reasoning_end_token_ids ) in_think = last_start > last_end if in_think: think_count = len(prompt_tok_ids) - ( - last_start + len(self.think_start_token_ids) + last_start + len(self.reasoning_start_token_ids) ) else: think_count = 0 @@ -405,8 +407,8 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor): state["prev_output_length"] = current_length # Check if new tokens contain think start or end sequences - start_len = len(self.think_start_token_ids) - end_len = len(self.think_end_token_ids) + start_len = len(self.reasoning_start_token_ids) + end_len = len(self.reasoning_end_token_ids) # Look for think sequences in recent tokens (including boundary) # Check overlapping regions where sequences might span boundaries @@ -415,10 +417,10 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor): # Find any think 
start/end sequences in recent tokens recent_start_pos = self._find_last_sequence_index( - recent_tokens, self.think_start_token_ids + recent_tokens, self.reasoning_start_token_ids ) recent_end_pos = self._find_last_sequence_index( - recent_tokens, self.think_end_token_ids + recent_tokens, self.reasoning_end_token_ids ) # Update state based on recent sequences @@ -469,7 +471,7 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor): else: # In end mode state["end_count"] += 1 - if state["end_count"] >= len(self.think_end_token_ids): + if state["end_count"] >= len(self.reasoning_end_token_ids): state.update( { "in_end": False, @@ -530,7 +532,9 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor): state = self._state.get(i) if state and state["in_end"]: self.mask[i] = True - self.force_token_ids[i] = self.think_end_token_ids[state["end_count"]] + self.force_token_ids[i] = self.reasoning_end_token_ids[ + state["end_count"] + ] # Check in CPU first not to sync with GPU has_active_thinking = any(