Compare commits
3 Commits
v0.19.0rc0
...
v0.19.0rc1
| Author | SHA1 | Date | |
|---|---|---|---|
| | c284a6671c | | |
| | 3a30a1a6a8 | | |
| | 29982d48b3 | | |
@@ -167,7 +167,7 @@ Priority is **1 = highest** (tried first).
|
|||||||
| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | --------- | --- | --------------- | ------------ |
|
| ------- | ------- | ------ | --------- | ----------- | ---------- | ---- | --------- | --- | --------------- | ------------ |
|
||||||
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A |
|
||||||
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
|
| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x |
|
||||||
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x |
|
| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.0 |
|
||||||
| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 |
|
| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 |
|
||||||
| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x |
|
| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x |
|
||||||
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
|
| `FLASH_ATTN` | FA4* | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥10.0 |
|
||||||
|
|||||||
@@ -244,12 +244,12 @@ response = client.chat.completions.create(
|
|||||||
|
|
||||||
Some models, such as [Qwen3](https://qwen.readthedocs.io/en/latest/getting_started/quickstart.html#thinking-budget), [DeepSeek](https://www.alibabacloud.com/help/en/model-studio/deep-thinking), and [Nemotron3](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16), support a thinking budget that limits the maximum number of tokens used for reasoning.
|
Some models, such as [Qwen3](https://qwen.readthedocs.io/en/latest/getting_started/quickstart.html#thinking-budget), [DeepSeek](https://www.alibabacloud.com/help/en/model-studio/deep-thinking), and [Nemotron3](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16), support a thinking budget that limits the maximum number of tokens used for reasoning.
|
||||||
|
|
||||||
Token counting starts from `think_start_str`. Once the reasoning token count reaches the configured `thinking_token_budget`, vLLM forces the model to produce `think_end_str`, effectively terminating the reasoning block.
|
Token counting starts from `reasoning_start_str`. Once the reasoning token count reaches the configured `thinking_token_budget`, vLLM forces the model to produce `reasoning_end_str`, effectively terminating the reasoning block.
|
||||||
|
|
||||||
To use this feature:
|
To use this feature:
|
||||||
|
|
||||||
- `--reasoning-parser` enables reasoning extraction.
|
- `--reasoning-parser` enables reasoning extraction.
|
||||||
- `--reasoning-config` defines the reasoning boundary tokens (e.g., `think_start_str`, `think_end_str`).
|
- `--reasoning-config` defines the reasoning boundary tokens (e.g., `reasoning_start_str`, `reasoning_end_str`).
|
||||||
- `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit.
|
- `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit.
|
||||||
|
|
||||||
If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`.
|
If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`.
|
||||||
@@ -257,20 +257,20 @@ If `thinking_token_budget` is not specified, no explicit reasoning limit is appl
|
|||||||
`--reasoning-config` accepts a JSON object corresponding to
|
`--reasoning-config` accepts a JSON object corresponding to
|
||||||
[ReasoningConfig][vllm.config.ReasoningConfig] with the following fields:
|
[ReasoningConfig][vllm.config.ReasoningConfig] with the following fields:
|
||||||
|
|
||||||
| Field | Type | Description |
|
| Field | Type | Description |
|
||||||
|-------------------|----------------|--------------------------------------------------|
|
|-----------------------|----------------|--------------------------------------------------|
|
||||||
| `think_start_str` | `str \| null` | String that marks the start of reasoning content |
|
| `reasoning_start_str` | `str \| null` | String that marks the start of reasoning content |
|
||||||
| `think_end_str` | `str \| null` | String that marks the end of reasoning content |
|
| `reasoning_end_str` | `str \| null` | String that marks the end of reasoning content |
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
`think_end_str` can include a transition phrase before the think end token. For example, setting `think_end_str` to `"I have to give the solution based on the thinking directly now.</think>"` instructs the model to emit that phrase when the budget is exhausted, making the reasoning termination more natural.
|
`reasoning_end_str` can include a transition phrase before the reasoning end token. For example, setting `reasoning_end_str` to `"I have to give the solution based on the reasoning directly now.</think>"` instructs the model to emit that phrase when the budget is exhausted, making the reasoning termination more natural.
|
||||||
|
|
||||||
### Online Serving
|
### Online Serving
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
vllm serve Qwen/Qwen3-0.6B \
|
vllm serve Qwen/Qwen3-0.6B \
|
||||||
--reasoning-parser qwen3 \
|
--reasoning-parser qwen3 \
|
||||||
--reasoning-config '{"think_start_str": "<think>", "think_end_str": "I have to give the solution based on the thinking directly now.</think>"}'
|
--reasoning-config '{"reasoning_start_str": "<think>", "reasoning_end_str": "I have to give the solution based on the reasoning directly now.</think>"}'
|
||||||
```
|
```
|
||||||
|
|
||||||
Then make a request with `thinking_token_budget` to limit the reasoning tokens:
|
Then make a request with `thinking_token_budget` to limit the reasoning tokens:
|
||||||
@@ -298,8 +298,8 @@ from vllm.config import ReasoningConfig
|
|||||||
llm = LLM(
|
llm = LLM(
|
||||||
model="Qwen/Qwen3-0.6B",
|
model="Qwen/Qwen3-0.6B",
|
||||||
reasoning_config=ReasoningConfig(
|
reasoning_config=ReasoningConfig(
|
||||||
think_start_str="<think>",
|
reasoning_start_str="<think>",
|
||||||
think_end_str="I have to give the solution based on the thinking directly now.</think>",
|
reasoning_end_str="I have to give the solution based on the reasoning directly now.</think>",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -239,6 +239,17 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
|
|||||||
assert metadata_missing["video_backend"] == "test_video_backend_override_2"
|
assert metadata_missing["video_backend"] == "test_video_backend_override_2"
|
||||||
|
|
||||||
|
|
||||||
|
def _make_jpeg_b64_frames(n: int, width: int = 8, height: int = 8) -> list[str]:
|
||||||
|
"""Return *n* tiny base64-encoded JPEG frames."""
|
||||||
|
frames: list[str] = []
|
||||||
|
for i in range(n):
|
||||||
|
img = Image.new("RGB", (width, height), color=(i % 256, 0, 0))
|
||||||
|
buf = io.BytesIO()
|
||||||
|
img.save(buf, format="JPEG")
|
||||||
|
frames.append(pybase64.b64encode(buf.getvalue()).decode("ascii"))
|
||||||
|
return frames
|
||||||
|
|
||||||
|
|
||||||
def test_load_base64_jpeg_returns_metadata():
|
def test_load_base64_jpeg_returns_metadata():
|
||||||
"""Regression test: load_base64 with video/jpeg must return metadata.
|
"""Regression test: load_base64 with video/jpeg must return metadata.
|
||||||
|
|
||||||
@@ -248,16 +259,8 @@ def test_load_base64_jpeg_returns_metadata():
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
num_test_frames = 3
|
num_test_frames = 3
|
||||||
frame_width, frame_height = 8, 8
|
|
||||||
|
|
||||||
# Build a few tiny JPEG frames and base64-encode them
|
|
||||||
b64_frames = []
|
|
||||||
for i in range(num_test_frames):
|
|
||||||
img = Image.new("RGB", (frame_width, frame_height), color=(i * 80, 0, 0))
|
|
||||||
buf = io.BytesIO()
|
|
||||||
img.save(buf, format="JPEG")
|
|
||||||
b64_frames.append(pybase64.b64encode(buf.getvalue()).decode("ascii"))
|
|
||||||
|
|
||||||
|
b64_frames = _make_jpeg_b64_frames(num_test_frames)
|
||||||
data = ",".join(b64_frames)
|
data = ",".join(b64_frames)
|
||||||
|
|
||||||
imageio = ImageMediaIO()
|
imageio = ImageMediaIO()
|
||||||
@@ -287,3 +290,52 @@ def test_load_base64_jpeg_returns_metadata():
|
|||||||
# Default fps=1 → duration == num_frames
|
# Default fps=1 → duration == num_frames
|
||||||
assert metadata["fps"] == 1.0
|
assert metadata["fps"] == 1.0
|
||||||
assert metadata["duration"] == float(num_test_frames)
|
assert metadata["duration"] == float(num_test_frames)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_base64_jpeg_enforces_num_frames_limit():
|
||||||
|
"""Frames beyond num_frames must be truncated in the video/jpeg path.
|
||||||
|
|
||||||
|
Without the limit an attacker can send thousands of base64 JPEG frames
|
||||||
|
in a single request and exhaust server memory (OOM).
|
||||||
|
"""
|
||||||
|
num_frames_limit = 4
|
||||||
|
sent_frames = 20
|
||||||
|
|
||||||
|
b64_frames = _make_jpeg_b64_frames(sent_frames)
|
||||||
|
data = ",".join(b64_frames)
|
||||||
|
|
||||||
|
imageio = ImageMediaIO()
|
||||||
|
videoio = VideoMediaIO(imageio, num_frames=num_frames_limit)
|
||||||
|
frames, metadata = videoio.load_base64("video/jpeg", data)
|
||||||
|
|
||||||
|
assert frames.shape[0] == num_frames_limit
|
||||||
|
assert metadata["total_num_frames"] == num_frames_limit
|
||||||
|
assert metadata["frames_indices"] == list(range(num_frames_limit))
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_base64_jpeg_no_limit_when_num_frames_negative():
|
||||||
|
"""When num_frames is -1, all frames should be loaded without truncation."""
|
||||||
|
sent_frames = 10
|
||||||
|
|
||||||
|
b64_frames = _make_jpeg_b64_frames(sent_frames)
|
||||||
|
data = ",".join(b64_frames)
|
||||||
|
|
||||||
|
imageio = ImageMediaIO()
|
||||||
|
videoio = VideoMediaIO(imageio, num_frames=-1)
|
||||||
|
frames, metadata = videoio.load_base64("video/jpeg", data)
|
||||||
|
|
||||||
|
assert frames.shape[0] == sent_frames
|
||||||
|
assert metadata["total_num_frames"] == sent_frames
|
||||||
|
assert metadata["frames_indices"] == list(range(sent_frames))
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_base64_jpeg_raises_on_zero_num_frames():
|
||||||
|
"""num_frames=0 is invalid and should raise ValueError."""
|
||||||
|
b64_frames = _make_jpeg_b64_frames(3)
|
||||||
|
data = ",".join(b64_frames)
|
||||||
|
|
||||||
|
imageio = ImageMediaIO()
|
||||||
|
videoio = VideoMediaIO(imageio, num_frames=0)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="num_frames must be greater than 0 or -1"):
|
||||||
|
videoio.load_base64("video/jpeg", data)
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def server():
|
|||||||
"--reasoning-parser",
|
"--reasoning-parser",
|
||||||
"qwen3",
|
"qwen3",
|
||||||
"--reasoning-config",
|
"--reasoning-config",
|
||||||
'{"think_start_str": "<think>", "think_end_str": "</think>"}',
|
'{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}',
|
||||||
"--max-model-len",
|
"--max-model-len",
|
||||||
"2048",
|
"2048",
|
||||||
"--enforce-eager",
|
"--enforce-eager",
|
||||||
|
|||||||
@@ -103,8 +103,8 @@ class LogitsProcsRequestParams:
|
|||||||
class MockReasoningConfig:
|
class MockReasoningConfig:
|
||||||
"""Mock reasoning config for testing ThinkingTokenBudgetLogitsProcessor."""
|
"""Mock reasoning config for testing ThinkingTokenBudgetLogitsProcessor."""
|
||||||
|
|
||||||
think_start_token_ids = [THINK_START_TOKEN_ID]
|
reasoning_start_token_ids = [THINK_START_TOKEN_ID]
|
||||||
think_end_token_ids = [THINK_END_TOKEN_ID]
|
reasoning_end_token_ids = [THINK_END_TOKEN_ID]
|
||||||
|
|
||||||
|
|
||||||
def _generate_fake_sampling_metadata(
|
def _generate_fake_sampling_metadata(
|
||||||
@@ -491,7 +491,7 @@ def _thinking_budget_validate(
|
|||||||
|
|
||||||
# Find if thinking has started in output tokens
|
# Find if thinking has started in output tokens
|
||||||
thinking_started = False
|
thinking_started = False
|
||||||
start_tokens = tb_processor.think_start_token_ids
|
start_tokens = tb_processor.reasoning_start_token_ids
|
||||||
|
|
||||||
if len(start_tokens) > 0:
|
if len(start_tokens) > 0:
|
||||||
for i in range(len(output_tokens) - len(start_tokens) + 1):
|
for i in range(len(output_tokens) - len(start_tokens) + 1):
|
||||||
@@ -518,7 +518,7 @@ def _thinking_budget_validate(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Validate that only end tokens are allowed
|
# Validate that only end tokens are allowed
|
||||||
end_tokens = tb_processor.think_end_token_ids
|
end_tokens = tb_processor.reasoning_end_token_ids
|
||||||
if len(end_tokens) > 0:
|
if len(end_tokens) > 0:
|
||||||
expected_end_token_id = end_tokens[
|
expected_end_token_id = end_tokens[
|
||||||
min(state["end_count"], len(end_tokens) - 1)
|
min(state["end_count"], len(end_tokens) - 1)
|
||||||
|
|||||||
@@ -235,10 +235,11 @@ def _resolve_import_to_file(
|
|||||||
|
|
||||||
|
|
||||||
def _find_cc_in_function(tree: ast.AST, func_name: str) -> str | None:
|
def _find_cc_in_function(tree: ast.AST, func_name: str) -> str | None:
|
||||||
"""Find a compute capability from is_device_capability_family() calls in a function.
|
"""Find a compute capability from is_device_capability*() calls in a function.
|
||||||
|
|
||||||
Looks for the pattern: current_platform.is_device_capability_family(N)
|
Handles two patterns:
|
||||||
and converts N (e.g. 100) to a CC string (e.g. "10.x").
|
- is_device_capability_family(N): "M.x" (e.g. 100 -> "10.x")
|
||||||
|
- is_device_capability(N): "M.m" (e.g. 100 -> "10.0")
|
||||||
"""
|
"""
|
||||||
for node in ast.walk(tree):
|
for node in ast.walk(tree):
|
||||||
if not isinstance(node, ast.FunctionDef) or node.name != func_name:
|
if not isinstance(node, ast.FunctionDef) or node.name != func_name:
|
||||||
@@ -247,12 +248,15 @@ def _find_cc_in_function(tree: ast.AST, func_name: str) -> str | None:
|
|||||||
if (
|
if (
|
||||||
isinstance(n, ast.Call)
|
isinstance(n, ast.Call)
|
||||||
and isinstance(n.func, ast.Attribute)
|
and isinstance(n.func, ast.Attribute)
|
||||||
and n.func.attr == "is_device_capability_family"
|
|
||||||
and n.args
|
and n.args
|
||||||
and isinstance(n.args[0], ast.Constant)
|
and isinstance(n.args[0], ast.Constant)
|
||||||
and isinstance(n.args[0].value, int)
|
and isinstance(n.args[0].value, int)
|
||||||
):
|
):
|
||||||
return f"{n.args[0].value // 10}.x"
|
val = n.args[0].value
|
||||||
|
if n.func.attr == "is_device_capability_family":
|
||||||
|
return f"{val // 10}.x"
|
||||||
|
elif n.func.attr == "is_device_capability":
|
||||||
|
return f"{val // 10}.{val % 10}"
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from vllm.tokenizers import cached_tokenizer_from_config
|
|||||||
class ReasoningConfig:
|
class ReasoningConfig:
|
||||||
"""Configuration for reasoning models.
|
"""Configuration for reasoning models.
|
||||||
|
|
||||||
Set `think_start_str` and `think_end_str` to the strings that delimit
|
Set `reasoning_start_str` and `reasoning_end_str` to the strings that delimit
|
||||||
the reasoning block (e.g. `"<think>"` and `"</think>"`). The
|
the reasoning block (e.g. `"<think>"` and `"</think>"`). The
|
||||||
corresponding token IDs are derived automatically via
|
corresponding token IDs are derived automatically via
|
||||||
`initialize_token_ids` and are not intended to be set directly.
|
`initialize_token_ids` and are not intended to be set directly.
|
||||||
@@ -20,53 +20,55 @@ class ReasoningConfig:
|
|||||||
|
|
||||||
# NOTE: These parameters are temporary, the intent is to derive them
|
# NOTE: These parameters are temporary, the intent is to derive them
|
||||||
# automatically from the reasoning parser in a future version.
|
# automatically from the reasoning parser in a future version.
|
||||||
think_start_str: str = "<think>"
|
reasoning_start_str: str = "<think>"
|
||||||
"""String that indicates the start of reasoning."""
|
"""String that indicates the start of reasoning."""
|
||||||
think_end_str: str = "</think>"
|
reasoning_end_str: str = "</think>"
|
||||||
"""String that indicates the end of reasoning content."""
|
"""String that indicates the end of reasoning content."""
|
||||||
|
|
||||||
_think_start_token_ids: list[int] | None = field(
|
_reasoning_start_token_ids: list[int] | None = field(
|
||||||
default=None, init=False, repr=False
|
default=None, init=False, repr=False
|
||||||
)
|
)
|
||||||
"""Private backing field for `think_start_token_ids`. Set by
|
"""Private backing field for `reasoning_start_token_ids`. Set by
|
||||||
`initialize_token_ids`. Not intended to be configured directly."""
|
`initialize_token_ids`. Not intended to be configured directly."""
|
||||||
_think_end_token_ids: list[int] | None = field(default=None, init=False, repr=False)
|
_reasoning_end_token_ids: list[int] | None = field(
|
||||||
"""Private backing field for `think_end_token_ids`. Set by
|
default=None, init=False, repr=False
|
||||||
|
)
|
||||||
|
"""Private backing field for `reasoning_end_token_ids`. Set by
|
||||||
`initialize_token_ids`. Not intended to be configured directly."""
|
`initialize_token_ids`. Not intended to be configured directly."""
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def think_start_token_ids(self) -> list[int] | None:
|
def reasoning_start_token_ids(self) -> list[int] | None:
|
||||||
"""Token IDs derived from `think_start_str`. Set automatically by
|
"""Token IDs derived from `reasoning_start_str`. Set automatically by
|
||||||
`initialize_token_ids`. Not intended to be configured directly."""
|
`initialize_token_ids`. Not intended to be configured directly."""
|
||||||
return self._think_start_token_ids
|
return self._reasoning_start_token_ids
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def think_end_token_ids(self) -> list[int] | None:
|
def reasoning_end_token_ids(self) -> list[int] | None:
|
||||||
"""Token IDs derived from `think_end_str`. Set automatically by
|
"""Token IDs derived from `reasoning_end_str`. Set automatically by
|
||||||
`initialize_token_ids`. Not intended to be configured directly."""
|
`initialize_token_ids`. Not intended to be configured directly."""
|
||||||
return self._think_end_token_ids
|
return self._reasoning_end_token_ids
|
||||||
|
|
||||||
def initialize_token_ids(self, model_config: ModelConfig) -> None:
|
def initialize_token_ids(self, model_config: ModelConfig) -> None:
|
||||||
"""Initialize reasoning token IDs from strings using the tokenizer."""
|
"""Initialize reasoning token IDs from strings using the tokenizer."""
|
||||||
if (
|
if (
|
||||||
self._think_start_token_ids is not None
|
self._reasoning_start_token_ids is not None
|
||||||
and self._think_end_token_ids is not None
|
and self._reasoning_end_token_ids is not None
|
||||||
):
|
):
|
||||||
return
|
return
|
||||||
|
|
||||||
tokenizer = cached_tokenizer_from_config(model_config=model_config)
|
tokenizer = cached_tokenizer_from_config(model_config=model_config)
|
||||||
|
|
||||||
self._think_start_token_ids = tokenizer.encode(
|
self._reasoning_start_token_ids = tokenizer.encode(
|
||||||
self.think_start_str, add_special_tokens=False
|
self.reasoning_start_str, add_special_tokens=False
|
||||||
)
|
)
|
||||||
self._think_end_token_ids = tokenizer.encode(
|
self._reasoning_end_token_ids = tokenizer.encode(
|
||||||
self.think_end_str, add_special_tokens=False
|
self.reasoning_end_str, add_special_tokens=False
|
||||||
)
|
)
|
||||||
|
|
||||||
if not self._think_start_token_ids or not self._think_end_token_ids:
|
if not self._reasoning_start_token_ids or not self._reasoning_end_token_ids:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"ReasoningConfig: failed to tokenize reasoning strings: "
|
f"ReasoningConfig: failed to tokenize reasoning strings: "
|
||||||
f"think_start_str='{self.think_start_str}', "
|
f"reasoning_start_str='{self.reasoning_start_str}', "
|
||||||
f"think_end_str='{self.think_end_str}'. "
|
f"reasoning_end_str='{self.reasoning_end_str}'. "
|
||||||
"Ensure the strings are valid tokens in the model's vocabulary."
|
"Ensure the strings are valid tokens in the model's vocabulary."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -80,8 +80,15 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
|
|||||||
"image/jpeg",
|
"image/jpeg",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if self.num_frames > 0:
|
||||||
|
frame_parts = data.split(",", self.num_frames)[: self.num_frames]
|
||||||
|
elif self.num_frames == 0:
|
||||||
|
raise ValueError("num_frames must be greater than 0 or -1")
|
||||||
|
else:
|
||||||
|
frame_parts = data.split(",")
|
||||||
|
|
||||||
frames = np.stack(
|
frames = np.stack(
|
||||||
[np.asarray(load_frame(frame_data)) for frame_data in data.split(",")]
|
[np.asarray(load_frame(frame_data)) for frame_data in frame_parts]
|
||||||
)
|
)
|
||||||
total = int(frames.shape[0])
|
total = int(frames.shape[0])
|
||||||
fps = float(self.kwargs.get("fps", 1))
|
fps = float(self.kwargs.get("fps", 1))
|
||||||
|
|||||||
@@ -289,10 +289,10 @@ def supports_trtllm_attention() -> bool:
|
|||||||
if envs.VLLM_BATCH_INVARIANT:
|
if envs.VLLM_BATCH_INVARIANT:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Requires SM100 and NVIDIA artifactory to be accessible to download cubins
|
# TRTLLM attention is currently only validated on SM100 (CC 10.0).
|
||||||
return (
|
# SM103 (GB300) hangs with FlashInfer >= 0.6.7.
|
||||||
current_platform.is_device_capability_family(100) and has_nvidia_artifactory()
|
# See: https://github.com/flashinfer-ai/flashinfer/issues/2939
|
||||||
)
|
return current_platform.is_device_capability(100) and has_nvidia_artifactory()
|
||||||
|
|
||||||
|
|
||||||
def force_use_trtllm_attention() -> bool | None:
|
def force_use_trtllm_attention() -> bool | None:
|
||||||
|
|||||||
@@ -303,10 +303,12 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
|
|||||||
# Check if thinking is enabled
|
# Check if thinking is enabled
|
||||||
self.is_enabled = reasoning_config is not None
|
self.is_enabled = reasoning_config is not None
|
||||||
|
|
||||||
self.think_start_token_ids = getattr(
|
self.reasoning_start_token_ids = getattr(
|
||||||
reasoning_config, "think_start_token_ids", []
|
reasoning_config, "reasoning_start_token_ids", []
|
||||||
|
)
|
||||||
|
self.reasoning_end_token_ids = getattr(
|
||||||
|
reasoning_config, "reasoning_end_token_ids", []
|
||||||
)
|
)
|
||||||
self.think_end_token_ids = getattr(reasoning_config, "think_end_token_ids", [])
|
|
||||||
|
|
||||||
self.pin_memory = is_pin_memory
|
self.pin_memory = is_pin_memory
|
||||||
self.device = device
|
self.device = device
|
||||||
@@ -357,15 +359,15 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
|
|||||||
think_count = 0
|
think_count = 0
|
||||||
else:
|
else:
|
||||||
last_start = self._find_last_sequence_index(
|
last_start = self._find_last_sequence_index(
|
||||||
prompt_tok_ids, self.think_start_token_ids
|
prompt_tok_ids, self.reasoning_start_token_ids
|
||||||
)
|
)
|
||||||
last_end = self._find_last_sequence_index(
|
last_end = self._find_last_sequence_index(
|
||||||
prompt_tok_ids, self.think_end_token_ids
|
prompt_tok_ids, self.reasoning_end_token_ids
|
||||||
)
|
)
|
||||||
in_think = last_start > last_end
|
in_think = last_start > last_end
|
||||||
if in_think:
|
if in_think:
|
||||||
think_count = len(prompt_tok_ids) - (
|
think_count = len(prompt_tok_ids) - (
|
||||||
last_start + len(self.think_start_token_ids)
|
last_start + len(self.reasoning_start_token_ids)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
think_count = 0
|
think_count = 0
|
||||||
@@ -405,8 +407,8 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
|
|||||||
state["prev_output_length"] = current_length
|
state["prev_output_length"] = current_length
|
||||||
|
|
||||||
# Check if new tokens contain think start or end sequences
|
# Check if new tokens contain think start or end sequences
|
||||||
start_len = len(self.think_start_token_ids)
|
start_len = len(self.reasoning_start_token_ids)
|
||||||
end_len = len(self.think_end_token_ids)
|
end_len = len(self.reasoning_end_token_ids)
|
||||||
|
|
||||||
# Look for think sequences in recent tokens (including boundary)
|
# Look for think sequences in recent tokens (including boundary)
|
||||||
# Check overlapping regions where sequences might span boundaries
|
# Check overlapping regions where sequences might span boundaries
|
||||||
@@ -415,10 +417,10 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
|
|||||||
|
|
||||||
# Find any think start/end sequences in recent tokens
|
# Find any think start/end sequences in recent tokens
|
||||||
recent_start_pos = self._find_last_sequence_index(
|
recent_start_pos = self._find_last_sequence_index(
|
||||||
recent_tokens, self.think_start_token_ids
|
recent_tokens, self.reasoning_start_token_ids
|
||||||
)
|
)
|
||||||
recent_end_pos = self._find_last_sequence_index(
|
recent_end_pos = self._find_last_sequence_index(
|
||||||
recent_tokens, self.think_end_token_ids
|
recent_tokens, self.reasoning_end_token_ids
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update state based on recent sequences
|
# Update state based on recent sequences
|
||||||
@@ -469,7 +471,7 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
|
|||||||
else:
|
else:
|
||||||
# In end mode
|
# In end mode
|
||||||
state["end_count"] += 1
|
state["end_count"] += 1
|
||||||
if state["end_count"] >= len(self.think_end_token_ids):
|
if state["end_count"] >= len(self.reasoning_end_token_ids):
|
||||||
state.update(
|
state.update(
|
||||||
{
|
{
|
||||||
"in_end": False,
|
"in_end": False,
|
||||||
@@ -530,7 +532,9 @@ class ThinkingTokenBudgetLogitsProcessor(LogitsProcessor):
|
|||||||
state = self._state.get(i)
|
state = self._state.get(i)
|
||||||
if state and state["in_end"]:
|
if state and state["in_end"]:
|
||||||
self.mask[i] = True
|
self.mask[i] = True
|
||||||
self.force_token_ids[i] = self.think_end_token_ids[state["end_count"]]
|
self.force_token_ids[i] = self.reasoning_end_token_ids[
|
||||||
|
state["end_count"]
|
||||||
|
]
|
||||||
|
|
||||||
# Check in CPU first not to sync with GPU
|
# Check in CPU first not to sync with GPU
|
||||||
has_active_thinking = any(
|
has_active_thinking = any(
|
||||||
|
|||||||
Reference in New Issue
Block a user