diff --git a/tests/detokenizer/test_min_tokens.py b/tests/detokenizer/test_min_tokens.py index 1f8e94469..37cc3ca1b 100644 --- a/tests/detokenizer/test_min_tokens.py +++ b/tests/detokenizer/test_min_tokens.py @@ -39,7 +39,6 @@ def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str): mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py index 5624332ef..44215cb72 100644 --- a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py +++ b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py @@ -35,7 +35,6 @@ def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0): mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/tokenizers_/test_detokenize.py b/tests/tokenizers_/test_detokenize.py index ad6c5fb41..2f173bec8 100644 --- a/tests/tokenizers_/test_detokenize.py +++ b/tests/tokenizers_/test_detokenize.py @@ -67,7 +67,6 @@ def _run_incremental_decode( mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/tool_parsers/test_step3p5_tool_parser.py b/tests/tool_parsers/test_step3p5_tool_parser.py index 6da1e0855..b3cb4e20f 100644 --- a/tests/tool_parsers/test_step3p5_tool_parser.py +++ b/tests/tool_parsers/test_step3p5_tool_parser.py @@ -1123,7 +1123,7 @@ rectangle # Encode all content tokens at once all_token_ids = step3p5_tokenizer.encode(model_output, add_special_tokens=False) - eos_token_id = getattr(step3p5_tokenizer, "eos_token_id", None) + eos_token_id = step3p5_tokenizer.eos_token_id # Include EOS token in delta_token_ids if available if eos_token_id is not None: diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index b91d59e46..ceb8ec424 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -84,13 +84,15 @@ def make_request( ) mm_features.append(mm_feature) + sampling_params = SamplingParams(max_tokens=17) + sampling_params.update_from_generation_config({}, eos_token_id=100) + return Request( request_id=request_id, prompt_token_ids=prompt_token_ids, mm_features=mm_features if mm_features else None, - sampling_params=SamplingParams(max_tokens=17), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=100, lora_request=None, cache_salt=cache_salt, block_hasher=get_request_block_hasher(block_size, hash_fn), diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index e2c924a61..9a968a473 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -75,13 +75,15 @@ def make_request( ) mm_features.append(mm_feature) + sampling_params = SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs) + sampling_params.update_from_generation_config({}, eos_token_id=100) + return Request( request_id=request_id, prompt_token_ids=prompt_token_ids, mm_features=mm_features if mm_features else None, - sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=100, lora_request=lora_request, cache_salt=cache_salt, block_hasher=get_request_block_hasher(block_size, hash_fn), diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py index cb4dfc046..1d03bd104 100644 --- a/tests/v1/core/test_priority_scheduler_random.py +++ b/tests/v1/core/test_priority_scheduler_random.py @@ -48,10 +48,9 @@ def _create_random_request( request_id = uuid.uuid4().hex - sampling_params = SamplingParams( - ignore_eos=False, - max_tokens=max_tokens, - ) + sampling_params = SamplingParams(ignore_eos=False, max_tokens=max_tokens) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) + mm_features = [] for j, position in enumerate(mm_positions): identifier = f"{request_id}_hash_{j}" @@ -79,7 +78,6 @@ def _create_random_request( sampling_params=sampling_params, pooling_params=None, mm_features=mm_features if mm_features else None, - eos_token_id=EOS_TOKEN_ID, arrival_time=arrival_time, priority=priority, block_hasher=block_hasher, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 376b06a5e..0713aa8ab 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -469,8 +469,7 @@ def test_stop_via_update_from_output(): # Test case 4: Ignore EOS flag scheduler = create_scheduler(num_speculative_tokens=2) - requests = create_requests(num_requests=1, max_tokens=10) - requests[0].sampling_params.ignore_eos = True + requests = create_requests(num_requests=1, max_tokens=10, ignore_eos=True) requests[0].num_computed_tokens = requests[0].num_tokens scheduler.requests[requests[0].request_id] = requests[0] scheduler.running.append(requests[0]) @@ -515,12 +514,12 @@ def test_check_stop_min_tokens(): max_tokens=20, min_tokens=5, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) request = Request( request_id="0", prompt_token_ids=[0, 1, 2], sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) # Simulate having generated 3 output tokens (less than min_tokens=5) request.append_output_token_ids([10, 11, EOS_TOKEN_ID]) # EOS token present @@ -551,12 +550,12 @@ def test_check_stop_min_tokens(): max_tokens=20, min_tokens=0, ) + sampling_params_no_min.update_from_generation_config({}, EOS_TOKEN_ID) request_no_min = Request( request_id="1", prompt_token_ids=[0, 1, 2], sampling_params=sampling_params_no_min, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) request_no_min.append_output_token_ids([10, EOS_TOKEN_ID]) @@ -571,12 +570,12 @@ def test_check_stop_min_tokens(): min_tokens=5, stop_token_ids=[42], ) + sampling_params_stop.update_from_generation_config({}, EOS_TOKEN_ID) request_stop = Request( request_id="2", prompt_token_ids=[0, 1, 2], sampling_params=sampling_params_stop, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) # Only 3 output tokens, less than min_tokens=5, but has stop token request_stop.append_output_token_ids([10, 11, 42]) @@ -1877,6 +1876,7 @@ def create_requests_with_priority( stop_token_ids=stop_token_ids, prompt_logprobs=prompt_logprobs, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) requests = [] if mm_hashes_list is not None: @@ -1938,7 +1938,6 @@ def create_requests_with_priority( sampling_params=sampling_params, pooling_params=None, mm_features=mm_features if mm_features else None, - eos_token_id=EOS_TOKEN_ID, arrival_time=arrival_times[i], priority=priorities[i], block_hasher=block_hasher, @@ -2429,13 +2428,13 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): max_tokens=16, structured_outputs=structured_outputs_params, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) request = Request( request_id="0", prompt_token_ids=[0, 1], mm_features=None, sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) scheduler.add_request(request) output = scheduler.schedule() diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 00eb61285..90c174adf 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -174,6 +174,7 @@ def create_requests( num_tokens: int = 10, mm_hashes_list: list[list[str]] | None = None, mm_positions: list[list[PlaceholderRange]] | None = None, + ignore_eos: bool = False, max_tokens: int = 16, stop_token_ids: list[int] | None = None, prompt_logprobs: int | None = None, @@ -188,11 +189,12 @@ def create_requests( block_hasher = get_request_block_hasher(block_size, sha256) sampling_params = SamplingParams( - ignore_eos=False, + ignore_eos=ignore_eos, max_tokens=max_tokens, stop_token_ids=stop_token_ids, prompt_logprobs=prompt_logprobs, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) requests = [] if mm_hashes_list is not None: @@ -250,7 +252,6 @@ def create_requests( sampling_params=sampling_params, pooling_params=None, mm_features=mm_features if mm_features else None, - eos_token_id=EOS_TOKEN_ID, block_hasher=block_hasher, ) requests.append(request) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 4f96ded7e..8d7377c28 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -54,7 +54,6 @@ def make_request() -> EngineCoreRequest: mm_features=None, sampling_params=SamplingParams(), pooling_params=None, - eos_token_id=None, arrival_time=time.time(), lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index ce0d70cc9..8f8a3cac9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -69,7 +69,6 @@ def make_request( mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=time.time(), lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py index 67a3b6b01..036a19b82 100644 --- a/tests/v1/engine/test_fast_incdec_prefix_err.py +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -32,7 +32,6 @@ def test_fast_inc_detok_invalid_utf8_err_case(): mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 7c78c5436..ece48e009 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -66,7 +66,6 @@ def test_incremental_detokenization( external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -487,7 +486,6 @@ def test_logprobs_processor( external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -663,6 +661,19 @@ def test_stop_token( prompt_string = dummy_test_vectors.prompt_strings[0] prompt_tokens = dummy_test_vectors.prompt_tokens[0] + sampling_params = SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=[], + stop_token_ids=stop_token_ids, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=num_sample_logprobs, + prompt_logprobs=None, + ignore_eos=ignore_eos, + ) + sampling_params.update_from_generation_config({}, eos_token_id) + # Make request. request_id = "request-0" request = EngineCoreRequest( @@ -670,22 +681,11 @@ def test_stop_token( external_req_id=request_id + "-ext", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=eos_token_id, arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=[], - stop_token_ids=stop_token_ids, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=num_sample_logprobs, - prompt_logprobs=None, - ignore_eos=ignore_eos, - ), + sampling_params=sampling_params, pooling_params=None, ) @@ -693,9 +693,8 @@ def test_stop_token( tokens_list=[generation_tokens], generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, prompt_logprobs_raw=None, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids, - ignore_eos=ignore_eos, + eos_token_id=sampling_params.eos_token_id, + stop_token_ids=sampling_params.stop_token_ids, request_ids=[request.request_id], ) @@ -775,7 +774,6 @@ def test_stop_string( external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -907,7 +905,6 @@ def test_iteration_stats(dummy_test_vectors): external_req_id=f"request-{idx}-ext", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -994,7 +991,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=lora_assignments[idx], cache_salt=None, @@ -1315,7 +1311,6 @@ def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors): external_req_id=f"external-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/test_parallel_sampling.py b/tests/v1/engine/test_parallel_sampling.py index fe6f15df2..395867c06 100644 --- a/tests/v1/engine/test_parallel_sampling.py +++ b/tests/v1/engine/test_parallel_sampling.py @@ -76,7 +76,6 @@ def make_request(sampling_params: SamplingParams) -> EngineCoreRequest: mm_features=None, sampling_params=sampling_params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index d14775668..de953a588 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -342,7 +342,6 @@ class MockEngineCore: prompt_logprobs_raw: list[LogprobsTensors] | None = None, eos_token_id: int | None = None, stop_token_ids: list[int] | None = None, - ignore_eos: bool = False, request_ids: list[str] | None = None, ) -> None: self.num_requests = len(tokens_list) @@ -355,7 +354,6 @@ class MockEngineCore: self.request_finished = [False for _ in range(self.num_requests)] self.eos_token_id = eos_token_id self.stop_token_ids = stop_token_ids - self.ignore_eos = ignore_eos self.request_ids = ( request_ids if request_ids is not None @@ -400,7 +398,7 @@ class MockEngineCore: if token_idx == len(token_ids) - 1: output.finish_reason = FinishReason.LENGTH self.request_finished[req_idx] = True - if not self.ignore_eos and new_token_id == self.eos_token_id: + if new_token_id == self.eos_token_id: output.finish_reason = FinishReason.STOP self.request_finished[req_idx] = True if new_token_id in (self.stop_token_ids or ()): diff --git a/tests/v1/kv_connector/unit/test_decode_bench_connector.py b/tests/v1/kv_connector/unit/test_decode_bench_connector.py index 93f4f8537..1d5343644 100644 --- a/tests/v1/kv_connector/unit/test_decode_bench_connector.py +++ b/tests/v1/kv_connector/unit/test_decode_bench_connector.py @@ -93,12 +93,14 @@ class DecodeBenchTestRunner: """Create a new request with given token IDs.""" self.req_id += 1 + sampling_params = SamplingParams(max_tokens=100) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) + req = Request( request_id=str(self.req_id), prompt_token_ids=token_ids, - sampling_params=SamplingParams(max_tokens=100), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, block_hasher=self._block_hasher, ) diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py index cfe8d810c..57ddaa8bf 100644 --- a/tests/v1/kv_connector/unit/test_lmcache_integration.py +++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py @@ -142,12 +142,14 @@ def test_request_interface(): from vllm.sampling_params import SamplingParams from vllm.v1.request import Request + sampling_params = SamplingParams(max_tokens=10) + sampling_params.update_from_generation_config({}, eos_token_id=100) + req = Request( request_id="test_request", prompt_token_ids=[1, 2, 3], - sampling_params=SamplingParams(max_tokens=10), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=100, lora_request=None, ) assumes(req, "mm_features", is_instance_of=(list, NoneType)) diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py index 5b84202a5..cc89ed1dc 100644 --- a/tests/v1/kv_connector/unit/test_offloading_connector.py +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -226,12 +226,14 @@ class RequestRunner: def new_request(self, token_ids: list[int]): self.req_id += 1 + sampling_params = SamplingParams(max_tokens=1000) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) + req = Request( request_id=str(self.req_id), prompt_token_ids=token_ids, - sampling_params=SamplingParams(max_tokens=1000), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, block_hasher=self._block_hasher, ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index e754a0917..d843bd6ff 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -212,6 +212,7 @@ def create_request( max_tokens = 1 if do_remote_decode else max_tokens sampling_params = SamplingParams(max_tokens=max_tokens) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) common_prefix = [1] * common_prefix_len if common_prefix_len > 0 else [] suffix = [i * request_id for i in range(num_tokens - common_prefix_len)] @@ -223,7 +224,6 @@ def create_request( sampling_params=sampling_params, pooling_params=None, mm_features=None, - eos_token_id=EOS_TOKEN_ID, block_hasher=get_request_block_hasher(block_size, hash_fn), ) req.kv_transfer_params = kv_transfer_params diff --git a/tests/v1/streaming_input/test_scheduler_streaming.py b/tests/v1/streaming_input/test_scheduler_streaming.py index f8d8c3cb3..fd9f6b17f 100644 --- a/tests/v1/streaming_input/test_scheduler_streaming.py +++ b/tests/v1/streaming_input/test_scheduler_streaming.py @@ -43,7 +43,6 @@ class DummyRequest(Request): stop_token_ids=[STOP_TOKEN], max_tokens=max_tokens ), pooling_params=None, - eos_token_id=None, mm_features=mm_features, resumable=resumable, ) diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py index 362f75c49..704ed8b9c 100644 --- a/tests/v1/structured_output/test_backend_guidance.py +++ b/tests/v1/structured_output/test_backend_guidance.py @@ -83,6 +83,7 @@ def test_grammar_bitmask_with_specdec(): ), ) sampling_params.structured_outputs._backend = "guidance" + sampling_params.update_from_generation_config({}, tokenizer.eos_token_id) my_req_id = f"my_req_id_{i}" request = Request( @@ -90,7 +91,6 @@ def test_grammar_bitmask_with_specdec(): prompt_token_ids=prompt[:i], sampling_params=sampling_params, pooling_params=None, - eos_token_id=tokenizer.eos_token_id, ) structured_output_manager.grammar_init(request) @@ -147,13 +147,13 @@ def test_grammar_init_async_and_sync(async_grammar): ), ) sampling_params.structured_outputs._backend = "guidance" + sampling_params.update_from_generation_config({}, tokenizer.eos_token_id) request = Request( "test_request", prompt_token_ids=prompt, sampling_params=sampling_params, pooling_params=None, - eos_token_id=tokenizer.eos_token_id, ) structured_output_manager.grammar_init(request) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b2cdccbed..2699f70cb 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -77,24 +77,6 @@ class InputPreprocessor: def get_tokenizer(self) -> TokenizerLike: return self.renderer.get_tokenizer() - def get_bos_token_id(self) -> int | None: - if self.tokenizer is None: - logger.warning_once( - "Using None for BOS token id because tokenizer is not initialized" - ) - return None - - return self.tokenizer.bos_token_id - - def get_eos_token_id(self) -> int | None: - if self.tokenizer is None: - logger.warning_once( - "Using None for EOS token id because tokenizer is not initialized" - ) - return None - - return self.tokenizer.eos_token_id - def get_decoder_start_token_id(self) -> int: """ Obtain the decoder start token id employed by an encoder/decoder @@ -106,11 +88,10 @@ class InputPreprocessor: if dec_start_token_id is None: logger.warning_once( - "Falling back on for decoder start token " - "id because decoder start token id is not " - "available." + "Falling back on for decoder start token id " + "because decoder start token id is not available." ) - dec_start_token_id = self.get_bos_token_id() + dec_start_token_id = self.renderer.get_bos_token_id() if dec_start_token_id is None: raise RuntimeError("Cannot find decoder start token id or ") diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index adf2ee552..0002bdf89 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -6,6 +6,7 @@ from collections.abc import Sequence from typing import TYPE_CHECKING, Any, overload from vllm.inputs import EmbedsPrompt, TextPrompt, TokensPrompt +from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.utils.async_utils import AsyncMicrobatchTokenizer @@ -26,6 +27,8 @@ if TYPE_CHECKING: ConversationMessage, ) +logger = init_logger(__name__) + class BaseRenderer(ABC): @classmethod @@ -63,6 +66,24 @@ class BaseRenderer(ABC): return self._async_tokenizer + def get_bos_token_id(self) -> int | None: + if self.tokenizer is None: + logger.warning_once( + "Using None for BOS token id because tokenizer is not initialized" + ) + return None + + return self.tokenizer.bos_token_id + + def get_eos_token_id(self) -> int | None: + if self.tokenizer is None: + logger.warning_once( + "Using None for EOS token id because tokenizer is not initialized" + ) + return None + + return self.tokenizer.eos_token_id + # Step 1: Convert raw inputs to prompts def render_prompt( self, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5603e5dc4..520481c58 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -223,6 +223,7 @@ class SamplingParams( # The below fields are not supposed to be used as an input. # They are set in post_init. output_text_buffer_length: int = 0 + _eos_token_id: int | None = None _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors @@ -477,24 +478,26 @@ class SamplingParams( def update_from_generation_config( self, generation_config: dict[str, Any], - model_eos_token_id: int | None = None, + eos_token_id: int | None = None, ) -> None: """Update if there are non-default values from generation_config""" + if not self.ignore_eos: + self._eos_token_id = eos_token_id - if model_eos_token_id is not None: + if eos_token_id is not None: # Add the eos token id into the sampling_params to support # min_tokens processing. - self._all_stop_token_ids.add(model_eos_token_id) + self._all_stop_token_ids.add(eos_token_id) # Update eos_token_id for generation if (eos_ids := generation_config.get("eos_token_id")) is not None: # it can be either int or list of int eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids) - if model_eos_token_id is not None: + if eos_token_id is not None: # We don't need to include the primary eos_token_id in # stop_token_ids since it's handled separately for stopping # purposes. - eos_ids.discard(model_eos_token_id) + eos_ids.discard(eos_token_id) if eos_ids: self._all_stop_token_ids.update(eos_ids) if not self.ignore_eos: @@ -550,6 +553,10 @@ class SamplingParams( return SamplingType.RANDOM_SEED return SamplingType.RANDOM + @property + def eos_token_id(self) -> int | None: + return self._eos_token_id + @property def all_stop_token_ids(self) -> set[int]: return self._all_stop_token_ids diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 631973188..22e3aefb6 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -47,7 +47,7 @@ def check_stop(request: Request, max_model_len: int) -> bool: return False last_token_id = request.output_token_ids[-1] - if not sampling_params.ignore_eos and last_token_id == request.eos_token_id: + if last_token_id == sampling_params.eos_token_id: request.status = RequestStatus.FINISHED_STOPPED return True diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index d0b0370fb..1dd9f64f8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -9,6 +9,7 @@ from typing import Any, Literal import msgspec import numpy as np import torch +from typing_extensions import deprecated from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalFeatureSpec @@ -63,7 +64,6 @@ class EngineCoreRequest( mm_features: list[MultiModalFeatureSpec] | None sampling_params: SamplingParams | None pooling_params: PoolingParams | None - eos_token_id: int | None arrival_time: float lora_request: LoRARequest | None cache_salt: str | None @@ -99,6 +99,17 @@ class EngineCoreRequest( assert self.pooling_params is not None return self.pooling_params + @property + @deprecated( + "EngineCoreRequest.eos_token_id will be removed in v0.18. " + "Please use EngineCoreRequest.sampling_params.eos_token_id instead." + ) + def eos_token_id(self) -> int | None: + if self.sampling_params is None: + return None + + return self.sampling_params.eos_token_id + class EngineCoreEventType(enum.IntEnum): """The type of engine core request event.""" diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 8bd4b509a..4c105c87b 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -376,8 +376,6 @@ class InputProcessor: processed_inputs=processed_inputs, ) - eos_token_id = self.input_preprocessor.get_eos_token_id() - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) self._validate_model_inputs(encoder_inputs, decoder_inputs) @@ -403,7 +401,7 @@ class InputProcessor: sampling_params.update_from_generation_config( self.generation_config_fields, - None if self.tokenizer is None else self.tokenizer.eos_token_id, + self.renderer.get_eos_token_id(), ) if self.tokenizer is not None: sampling_params.update_from_tokenizer(self.tokenizer) @@ -446,7 +444,6 @@ class InputProcessor: mm_features=mm_features, sampling_params=sampling_params, pooling_params=pooling_params, - eos_token_id=eos_token_id, arrival_time=arrival_time, lora_request=lora_request, cache_salt=decoder_inputs.get("cache_salt"), diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 970b7e1eb..66ade0097 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any import torch +from typing_extensions import deprecated from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams @@ -62,7 +63,6 @@ class Request: prompt_token_ids: list[int] | None, sampling_params: SamplingParams | None, pooling_params: PoolingParams | None, - eos_token_id: int | None, client_index: int = 0, arrival_time: float | None = None, prompt_embeds: torch.Tensor | None = None, @@ -80,8 +80,6 @@ class Request: self.priority = priority self.sampling_params = sampling_params self.pooling_params = pooling_params - # Because of LoRA, the eos token id can be different for each request. - self.eos_token_id = eos_token_id self.lora_request = lora_request self.structured_output_request = StructuredOutputRequest.from_sampling_params( sampling_params @@ -176,6 +174,17 @@ class Request: # None entry in the queue means finished. self.streaming_queue: deque[StreamingUpdate | None] | None = None + @property + @deprecated( + "Request.eos_token_id will be removed in v0.18. " + "Please use Request.sampling_params.eos_token_id instead." + ) + def eos_token_id(self) -> int | None: + if self.sampling_params is None: + return None + + return self.sampling_params.eos_token_id + @classmethod def from_engine_core_request( cls, @@ -190,7 +199,6 @@ class Request: mm_features=request.mm_features, sampling_params=request.sampling_params, pooling_params=request.pooling_params, - eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, lora_request=request.lora_request, cache_salt=request.cache_salt, diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 1419cdce1..aadd057b1 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -185,14 +185,13 @@ re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$") re_replacement_seq = re.compile(r"^.{0,6}�+.{0,6}$") -def _reduced_vocabulary( - tokenizer: TokenizerLike, eos_token_id: int -) -> dict[bytes, list[int]]: +def _reduced_vocabulary(tokenizer: TokenizerLike) -> dict[bytes, list[int]]: """Create a map from vocabulary tokens to lists of equivalent token ids. Returns: A Dict of token string -> equivalent token ids """ + eos_token_id = tokenizer.eos_token_id unicode_to_bytes = { v: k for k, v in convert_slow_tokenizer.bytes_to_unicode().items() @@ -260,30 +259,13 @@ def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary: if hasattr(tokenizer, "_outlines_vocabulary"): return tokenizer._outlines_vocabulary # type: ignore - try: - if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id is not None: - eos_token_id = tokenizer.eos_token_id - else: - raise ValueError( - "Error during structured outputs setup for outlines: Tokenizer " - f"({type(tokenizer)}) has no `eos_token_id` property, but " - "`eos_token_id` is required for structured outputs to work properly." - ) + reduced_vocab = _reduced_vocabulary(tokenizer) + vocabulary = OutlinesVocabulary( + oc.Vocabulary(tokenizer.eos_token_id, reduced_vocab) + ) + tokenizer._outlines_vocabulary = vocabulary # type: ignore - reduced_vocab = _reduced_vocabulary( - tokenizer, - eos_token_id, # type: ignore - ) - vocabulary = OutlinesVocabulary(oc.Vocabulary(eos_token_id, reduced_vocab)) - tokenizer._outlines_vocabulary = vocabulary # type: ignore - - return vocabulary - except AttributeError as e: - raise ValueError( - "Cannot get the vocabulary of the tokenizer " - f"({type(tokenizer)}). The tokenizer should have a " - "get_vocab method." - ) from e + return vocabulary def grammar_is_likely_lark(grammar_str: str) -> bool: