diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py
index 760ec8acb..6095d1ec8 100644
--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@@ -59,11 +59,16 @@ class MockModelConfig:
         return self.diff_sampling_param or {}
 
 
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+
+
 def _build_renderer(model_config: MockModelConfig):
     _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
 
     return HfRenderer(
-        model_config,
+        MockVllmConfig(model_config),
         tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
     )
 
diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py
index 800bf75f0..d5a266831 100644
--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
@@ -58,6 +58,11 @@ class MockModelConfig:
         return self.diff_sampling_param or {}
 
 
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+
+
 def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
     models = OpenAIServingModels(
         engine_client=engine,
@@ -74,7 +79,7 @@ def _build_renderer(model_config: MockModelConfig):
     _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
 
     return HfRenderer(
-        model_config,
+        MockVllmConfig(model_config),
         tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
     )
 
diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py
index 56fe31556..450a788a3 100644
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -57,6 +57,11 @@ class MockModelConfig:
         return self.diff_sampling_param or {}
 
 
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+
+
 class MockLoRAResolver(LoRAResolver):
     async def resolve_lora(
         self, base_model_name: str, lora_name: str
@@ -91,7 +96,7 @@ def _build_renderer(model_config: MockModelConfig):
     _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
 
     return HfRenderer(
-        model_config,
+        MockVllmConfig(model_config),
         tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
     )
 
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index b57f00ab7..2cef772c2 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -534,11 +534,16 @@ class MockModelConfig:
         return self.diff_sampling_param or {}
 
 
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+
+
 def _build_renderer(model_config: MockModelConfig):
     _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
 
     return HfRenderer(
-        model_config,
+        MockVllmConfig(model_config),
         tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
     )
 
@@ -749,7 +754,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated():
     mock_engine.io_processor = MagicMock()
 
     mock_tokenizer = MagicMock(spec=MistralTokenizer)
-    mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={})
+    mock_renderer = MistralRenderer(
+        MockVllmConfig(mock_engine.model_config),
+        tokenizer_kwargs={},
+    )
     mock_renderer._tokenizer = mock_tokenizer
     # Force the Mistral chat template renderer to return token IDs.
     # Choose a prompt length that is < max_model_len, but large enough that
@@ -788,7 +796,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected():
     mock_engine.io_processor = MagicMock()
 
     mock_tokenizer = MagicMock(spec=MistralTokenizer)
-    mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={})
+    mock_renderer = MistralRenderer(
+        MockVllmConfig(mock_engine.model_config),
+        tokenizer_kwargs={},
+    )
     mock_renderer._tokenizer = mock_tokenizer
     # prompt_token_ids length == max_model_len should be rejected for
     # completion-like requests (ChatCompletionRequest).
diff --git a/tests/renderers/test_completions.py b/tests/renderers/test_completions.py
index 1cef8551c..ec6d8a688 100644
--- a/tests/renderers/test_completions.py
+++ b/tests/renderers/test_completions.py
@@ -40,6 +40,11 @@ class MockModelConfig:
     is_encoder_decoder: bool = False
 
 
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+
+
 @dataclass
 class DummyTokenizer:
     truncation_side: str = "left"
@@ -72,7 +77,7 @@ def _build_renderer(
     _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config)
 
     renderer = HfRenderer(
-        model_config,
+        MockVllmConfig(model_config),
         tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name},
     )
 
@@ -104,14 +109,14 @@ class TestValidatePrompt:
         renderer = _build_renderer(MockModelConfig())
 
         with pytest.raises(ValueError, match="at least one prompt"):
-            renderer.render_prompts(_preprocess_prompt(renderer.config, []))
+            renderer.render_prompts(_preprocess_prompt(renderer.model_config, []))
 
     def test_invalid_type(self):
         renderer = _build_renderer(MockModelConfig())
 
         with pytest.raises(TypeError, match="should be a list of integers"):
             renderer.render_prompts(
-                _preprocess_prompt(renderer.config, [[1, 2], ["foo", "bar"]])  # type: ignore[arg-type]
+                _preprocess_prompt(renderer.model_config, [[1, 2], ["foo", "bar"]])  # type: ignore[arg-type]
             )
 
 
@@ -120,7 +125,9 @@ class TestRenderPrompt:
         renderer = _build_renderer(MockModelConfig())
         tokens = [101, 7592, 2088]
 
-        prompts = renderer.render_prompts(_preprocess_prompt(renderer.config, tokens))
+        prompts = renderer.render_prompts(
+            _preprocess_prompt(renderer.model_config, tokens)
+        )
         results = renderer.tokenize_prompts(
             prompts,
             TokenizeParams(max_total_tokens=100),
@@ -134,7 +141,7 @@ class TestRenderPrompt:
         token_lists = [[101, 7592, 2088], [102, 1234, 5678, 9012], [103, 4567]]
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, token_lists)
+            _preprocess_prompt(renderer.model_config, token_lists)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -151,7 +158,7 @@ class TestRenderPrompt:
         text_input = "x" * 10
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, text_input)
+            _preprocess_prompt(renderer.model_config, text_input)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -166,7 +173,7 @@ class TestRenderPrompt:
         text_list_input = ["x" * 10, "x" * 12, "x" * 14]
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, text_list_input)
+            _preprocess_prompt(renderer.model_config, text_list_input)
        )
         results = renderer.tokenize_prompts(
             prompts,
@@ -181,7 +188,7 @@ class TestRenderPrompt:
         renderer = _build_renderer(MockModelConfig())
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, "x" * 200)
+            _preprocess_prompt(renderer.model_config, "x" * 200)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -195,7 +202,7 @@ class TestRenderPrompt:
         renderer = _build_renderer(MockModelConfig())
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, "x" * 200)
+            _preprocess_prompt(renderer.model_config, "x" * 200)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -209,7 +216,7 @@ class TestRenderPrompt:
         renderer = _build_renderer(MockModelConfig())
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, "x" * 200)
+            _preprocess_prompt(renderer.model_config, "x" * 200)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -224,7 +231,7 @@ class TestRenderPrompt:
         long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]  # 10 tokens
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, long_tokens)
+            _preprocess_prompt(renderer.model_config, long_tokens)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -240,7 +247,7 @@ class TestRenderPrompt:
         long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]  # 10 tokens
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, long_tokens)
+            _preprocess_prompt(renderer.model_config, long_tokens)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -257,7 +264,7 @@ class TestRenderPrompt:
         # Exceeds max_total_tokens and max_total_tokens * VLLM_MAX_CHARS_PER_TOKEN
         long_tokens = "x" * 150
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, long_tokens)
+            _preprocess_prompt(renderer.model_config, long_tokens)
         )
 
         with pytest.raises(
@@ -278,7 +285,7 @@ class TestRenderPrompt:
         # Exceeds max_total_tokens but not max_total_tokens * VLLM_MAX_CHARS_PER_TOKEN
         long_tokens = "x" * 150
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, long_tokens)
+            _preprocess_prompt(renderer.model_config, long_tokens)
        )
 
         with pytest.raises(
@@ -299,7 +306,7 @@ class TestRenderPrompt:
         long_tokens = list(range(150))  # Exceeds max_total_tokens=100
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, long_tokens)
+            _preprocess_prompt(renderer.model_config, long_tokens)
         )
 
         with pytest.raises(
@@ -315,7 +322,7 @@ class TestRenderPrompt:
         renderer = _build_renderer(MockModelConfig(skip_tokenizer_init=True))
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, "Hello world")
+            _preprocess_prompt(renderer.model_config, "Hello world")
         )
 
         with pytest.raises(ValueError, match="`skip_tokenizer_init=True`"):
@@ -328,7 +335,9 @@ class TestRenderPrompt:
         renderer = _build_renderer(MockModelConfig())
         tokens = [1, 2, 3, 4]
 
-        prompts = renderer.render_prompts(_preprocess_prompt(renderer.config, tokens))
+        prompts = renderer.render_prompts(
+            _preprocess_prompt(renderer.model_config, tokens)
+        )
         results = renderer.tokenize_prompts(
             prompts,
             TokenizeParams(
@@ -358,7 +367,7 @@ class TestRenderEmbedPrompt:
         embed_bytes = self._create_test_embed_bytes(tensor_input)
 
         prompts = renderer.render_prompts(
-            _preprocess_prompt(renderer.config, embed_bytes)
+            _preprocess_prompt(renderer.model_config, embed_bytes)
         )
         results = renderer.tokenize_prompts(
             prompts,
@@ -379,7 +388,7 @@ class TestRenderEmbedPrompt:
 
         prompts = renderer.render_prompts(
             _preprocess_prompt(
-                renderer.config,
+                renderer.model_config,
                 [self._create_test_embed_bytes(t) for t in tensor_inputs],
             )
         )
@@ -400,7 +409,7 @@ class TestRenderEmbedPrompt:
 
         prompts = renderer.render_prompts(
             _preprocess_prompt(
-                renderer.config, self._create_test_embed_bytes(tensor_input)
+                renderer.model_config, self._create_test_embed_bytes(tensor_input)
             )
         )
         results = renderer.tokenize_prompts(
@@ -427,7 +436,7 @@ class TestRenderEmbedPrompt:
 
         prompts = renderer.render_prompts(
             _preprocess_prompt(
-                renderer.config, self._create_test_embed_bytes(tensor_input)
+                renderer.model_config, self._create_test_embed_bytes(tensor_input)
             )
         )
         results = renderer.tokenize_prompts(
@@ -446,7 +455,7 @@ class TestRenderEmbedPrompt:
 
         prompts = renderer.render_prompts(
             _preprocess_prompt(
-                renderer.config, self._create_test_embed_bytes(tensor_input)
+                renderer.model_config, self._create_test_embed_bytes(tensor_input)
             )
         )
         results = renderer.tokenize_prompts(
@@ -466,7 +475,7 @@ class TestRenderEmbedPrompt:
 
         prompts = renderer.render_prompts(
             _preprocess_prompt(
-                renderer.config,
+                renderer.model_config,
                 [text_input, self._create_test_embed_bytes(tensor_input)],
             )
         )
diff --git a/tests/renderers/test_mistral.py b/tests/renderers/test_mistral.py
index f1d73e738..8c68f750a 100644
--- a/tests/renderers/test_mistral.py
+++ b/tests/renderers/test_mistral.py
@@ -38,6 +38,11 @@ class MockModelConfig:
     is_encoder_decoder: bool = False
 
 
+@dataclass
+class MockVllmConfig:
+    model_config: MockModelConfig
+
+
 @pytest.mark.asyncio
 async def test_async_mistral_tokenizer_does_not_block_event_loop():
     expected_tokens = [1, 2, 3]
@@ -50,7 +55,10 @@ async def test_async_mistral_tokenizer_does_not_block_event_loop():
     mock_model_config = MockModelConfig(skip_tokenizer_init=True)
     mock_tokenizer = Mock(spec=MistralTokenizer)
     mock_tokenizer.apply_chat_template = mocked_apply_chat_template
-    mock_renderer = MistralRenderer(mock_model_config, tokenizer_kwargs={})
+    mock_renderer = MistralRenderer(
+        MockVllmConfig(mock_model_config),
+        tokenizer_kwargs={},
+    )
     mock_renderer._tokenizer = mock_tokenizer
 
     task = mock_renderer.render_messages_async([], ChatParams())
diff --git a/tests/test_inputs.py b/tests/test_inputs.py
index 03e470427..fb1bbd21e 100644
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.inputs.preprocess import InputPreprocessor
 
 pytestmark = pytest.mark.cpu_test
@@ -20,7 +20,8 @@ pytestmark = pytest.mark.cpu_test
 )
 def test_preprocessor_always_mm_code_path(model_id, prompt):
     model_config = ModelConfig(model=model_id)
-    input_preprocessor = InputPreprocessor(model_config)
+    vllm_config = VllmConfig(model_config=model_config)
+    input_preprocessor = InputPreprocessor(vllm_config)
 
     # HF processor adds sep token
     tokenizer = input_preprocessor.get_tokenizer()
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 2699f70cb..ef1f2e0bf 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -6,7 +6,7 @@ from typing import Any, overload
 
 from typing_extensions import assert_never
 
-from vllm.config import ModelConfig, ObservabilityConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.multimodal.cache import BaseMultiModalProcessorCache
@@ -54,17 +54,16 @@ logger = init_logger(__name__)
 class InputPreprocessor:
     def __init__(
         self,
-        model_config: ModelConfig,
-        observability_config: ObservabilityConfig | None = None,
+        vllm_config: VllmConfig,
         renderer: BaseRenderer | None = None,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         mm_processor_cache: BaseMultiModalProcessorCache | None = None,
     ) -> None:
         super().__init__()
 
-        self.model_config = model_config
-        self.observability_config = observability_config
-        self.renderer = renderer or renderer_from_config(model_config)
+        self.model_config = vllm_config.model_config
+        self.observability_config = vllm_config.observability_config
+        self.renderer = renderer or renderer_from_config(vllm_config)
         self.mm_registry = mm_registry
         self.mm_processor_cache = mm_processor_cache
diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index 0002bdf89..05058c549 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -21,7 +21,7 @@ from .inputs.preprocess import extract_target_prompt
 from .params import ChatParams, TokenizeParams
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import VllmConfig
     from vllm.entrypoints.chat_utils import (
         ChatCompletionMessageParam,
         ConversationMessage,
@@ -35,15 +35,15 @@ class BaseRenderer(ABC):
     @abstractmethod
     def from_config(
         cls,
-        config: "ModelConfig",
+        config: "VllmConfig",
         tokenizer_kwargs: dict[str, Any],
     ) -> "BaseRenderer":
         raise NotImplementedError
 
-    def __init__(self, config: "ModelConfig") -> None:
+    def __init__(self, config: "VllmConfig") -> None:
         super().__init__()
 
-        self.config = config
+        self.model_config = config.model_config
 
         # Lazy initialization since offline LLM doesn't use async
         self._async_tokenizer: AsyncMicrobatchTokenizer | None = None
@@ -90,7 +90,7 @@ class BaseRenderer(ABC):
         prompt: DictPrompt | bytes,
     ) -> DictPrompt:
         if isinstance(prompt, bytes):
-            embeds = safe_load_prompt_embeds(self.config, prompt)
+            embeds = safe_load_prompt_embeds(self.model_config, prompt)
             prompt = EmbedsPrompt(prompt_embeds=embeds)
 
         return prompt
@@ -310,7 +310,7 @@ class BaseRenderer(ABC):
             return
 
         for prompt in prompts:
-            target_prompt = extract_target_prompt(self.config, prompt)
+            target_prompt = extract_target_prompt(self.model_config, prompt)
             target_prompt.update(prompt_extras)  # type: ignore[arg-type]
 
     # Top-level methods
@@ -325,7 +325,7 @@ class BaseRenderer(ABC):
 
         # NOTE: Some MM models have non-default `add_special_tokens`
         # so we handle tokenization in multi-modal processor
-        if self.config.is_multimodal_model:
+        if self.model_config.is_multimodal_model:
             self._apply_prompt_extras(dict_prompts, prompt_extras)
             return dict_prompts
diff --git a/vllm/renderers/deepseek_v32.py b/vllm/renderers/deepseek_v32.py
index e4cc3f0fb..f03a5973f 100644
--- a/vllm/renderers/deepseek_v32.py
+++ b/vllm/renderers/deepseek_v32.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
     ConversationMessage,
@@ -26,19 +26,20 @@ class DeepseekV32Renderer(BaseRenderer):
     @classmethod
     def from_config(
         cls,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> "BaseRenderer":
         return cls(config, tokenizer_kwargs)
 
     def __init__(
         self,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(config)
 
-        if config.skip_tokenizer_init:
+        model_config = self.model_config
+        if model_config.skip_tokenizer_init:
             tokenizer = None
         else:
             tokenizer = cached_get_tokenizer(
@@ -67,7 +68,7 @@ class DeepseekV32Renderer(BaseRenderer):
         tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = parse_chat_messages(
             messages,
-            self.config,
+            self.model_config,
             content_format="string",
         )
 
@@ -93,7 +94,7 @@ class DeepseekV32Renderer(BaseRenderer):
         tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
-            self.config,
+            self.model_config,
             content_format="string",
         )
diff --git a/vllm/renderers/grok2.py b/vllm/renderers/grok2.py
index 141c72aa7..7e8681d82 100644
--- a/vllm/renderers/grok2.py
+++ b/vllm/renderers/grok2.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
     ConversationMessage,
@@ -25,19 +25,20 @@ class Grok2Renderer(BaseRenderer):
     @classmethod
     def from_config(
         cls,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> "BaseRenderer":
         return cls(config, tokenizer_kwargs)
 
     def __init__(
         self,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(config)
 
-        if config.skip_tokenizer_init:
+        model_config = self.model_config
+        if model_config.skip_tokenizer_init:
             tokenizer = None
         else:
             tokenizer = cached_get_tokenizer(
@@ -66,7 +67,7 @@ class Grok2Renderer(BaseRenderer):
         tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = parse_chat_messages(
             messages,
-            self.config,
+            self.model_config,
             content_format="string",
         )
 
@@ -92,7 +93,7 @@ class Grok2Renderer(BaseRenderer):
         tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
-            self.config,
+            self.model_config,
             content_format="string",
         )
diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py
index 83b17e961..407b28ae1 100644
--- a/vllm/renderers/hf.py
+++ b/vllm/renderers/hf.py
@@ -14,7 +14,7 @@ import jinja2.nodes
 import jinja2.parser
 import jinja2.sandbox
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
     ChatTemplateContentFormat,
@@ -589,23 +589,24 @@ class HfRenderer(BaseRenderer):
     @classmethod
     def from_config(
         cls,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> "BaseRenderer":
         return cls(config, tokenizer_kwargs)
 
     def __init__(
         self,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(config)
 
+        model_config = self.model_config
         self.use_unified_vision_chunk = getattr(
-            config.hf_config, "use_unified_vision_chunk", False
+            model_config.hf_config, "use_unified_vision_chunk", False
         )
 
-        if config.skip_tokenizer_init:
+        if model_config.skip_tokenizer_init:
             tokenizer = None
         else:
             tokenizer = cast(
@@ -634,7 +635,7 @@ class HfRenderer(BaseRenderer):
         messages: list[ChatCompletionMessageParam],
         params: ChatParams,
     ) -> tuple[list[ConversationMessage], DictPrompt]:
-        model_config = self.config
+        model_config = self.model_config
         tokenizer = self.get_tokenizer()
 
         conversation, mm_data, mm_uuids = parse_chat_messages(
@@ -688,7 +689,7 @@ class HfRenderer(BaseRenderer):
         messages: list[ChatCompletionMessageParam],
         params: ChatParams,
     ) -> tuple[list[ConversationMessage], DictPrompt]:
-        model_config = self.config
+        model_config = self.model_config
         tokenizer = self.get_tokenizer()
 
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
diff --git a/vllm/renderers/mistral.py b/vllm/renderers/mistral.py
index 3d3141bdc..ae8078f41 100644
--- a/vllm/renderers/mistral.py
+++ b/vllm/renderers/mistral.py
@@ -3,7 +3,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any
 
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
     ConversationMessage,
@@ -54,19 +54,20 @@ class MistralRenderer(BaseRenderer):
     @classmethod
     def from_config(
         cls,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> "BaseRenderer":
         return cls(config, tokenizer_kwargs)
 
     def __init__(
         self,
-        config: ModelConfig,
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> None:
         super().__init__(config)
 
-        if config.skip_tokenizer_init:
+        model_config = self.model_config
+        if model_config.skip_tokenizer_init:
             tokenizer = None
         else:
             tokenizer = cached_get_tokenizer(
@@ -100,7 +101,7 @@ class MistralRenderer(BaseRenderer):
         tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = parse_chat_messages(
             messages,
-            self.config,
+            self.model_config,
             content_format="string",
         )
 
@@ -126,7 +127,7 @@ class MistralRenderer(BaseRenderer):
         tokenizer = self.get_tokenizer()
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
-            self.config,
+            self.model_config,
             content_format="string",
         )
diff --git a/vllm/renderers/registry.py b/vllm/renderers/registry.py
index 3abc7c9fe..cd09c80f9 100644
--- a/vllm/renderers/registry.py
+++ b/vllm/renderers/registry.py
@@ -10,7 +10,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
 from .base import BaseRenderer
 
 if TYPE_CHECKING:
-    from vllm.config import ModelConfig
+    from vllm.config import VllmConfig
 
 logger = init_logger(__name__)
 
@@ -55,7 +55,7 @@ class RendererRegistry:
     def load_renderer(
         self,
         renderer_mode: str,
-        config: "ModelConfig",
+        config: "VllmConfig",
         tokenizer_kwargs: dict[str, Any],
     ) -> BaseRenderer:
         renderer_cls = self.load_renderer_cls(renderer_mode)
@@ -71,12 +71,16 @@ RENDERER_REGISTRY = RendererRegistry(
 """The global `RendererRegistry` instance."""
 
 
-def renderer_from_config(config: "ModelConfig", **kwargs):
+def renderer_from_config(config: "VllmConfig", **kwargs):
+    model_config = config.model_config
     tokenizer_mode, tokenizer_name, args, kwargs = tokenizer_args_from_config(
-        config, **kwargs
+        model_config, **kwargs
     )
 
-    if config.tokenizer_mode == "auto" and config.model_impl == "terratorch":
+    if (
+        model_config.tokenizer_mode == "auto"
+        and model_config.model_impl == "terratorch"
+    ):
         renderer_mode = "terratorch"
     else:
         renderer_mode = tokenizer_mode
diff --git a/vllm/renderers/terratorch.py b/vllm/renderers/terratorch.py
index 2d00ebccb..0ee97f852 100644
--- a/vllm/renderers/terratorch.py
+++ b/vllm/renderers/terratorch.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from typing import Any
 
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
     ConversationMessage,
@@ -24,15 +24,16 @@ class TerratorchRenderer(BaseRenderer):
     @classmethod
     def from_config(
         cls,
-        config: "ModelConfig",
+        config: VllmConfig,
         tokenizer_kwargs: dict[str, Any],
     ) -> "BaseRenderer":
         return cls(config)
 
-    def __init__(self, config: ModelConfig) -> None:
+    def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
 
-        if not config.skip_tokenizer_init:
+        model_config = self.model_config
+        if not model_config.skip_tokenizer_init:
             raise ValueError("Terratorch renderer requires `skip_tokenizer_init=True`")
 
     @property
@@ -47,7 +48,7 @@ class TerratorchRenderer(BaseRenderer):
         messages: list[ChatCompletionMessageParam],
         params: ChatParams,
     ) -> tuple[list[ConversationMessage], DictPrompt]:
-        model_config = self.config
+        model_config = self.model_config
 
         conversation, mm_data, mm_uuids = parse_chat_messages(
             messages,
@@ -68,7 +69,7 @@ class TerratorchRenderer(BaseRenderer):
         messages: list[ChatCompletionMessageParam],
         params: ChatParams,
     ) -> tuple[list[ConversationMessage], DictPrompt]:
-        model_config = self.config
+        model_config = self.model_config
 
         conversation, mm_data, mm_uuids = await parse_chat_messages_async(
             messages,
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index bab898da6..87410c420 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -132,7 +132,7 @@ class AsyncLLM(EngineClient):
                 "enabling logging without default stat loggers."
             )
 
-        self.renderer = renderer = renderer_from_config(self.model_config)
+        self.renderer = renderer = renderer_from_config(self.vllm_config)
         self.io_processor = get_io_processor(
             self.vllm_config,
             self.model_config.io_processor_plugin,
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index 4c105c87b..1bda736fe 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -59,7 +59,7 @@ class InputProcessor:
 
         self.generation_config_fields = model_config.try_get_generation_config()
 
-        self.renderer = renderer or renderer_from_config(model_config)
+        self.renderer = renderer or renderer_from_config(vllm_config)
 
         self.mm_registry = mm_registry
         self.mm_processor_cache = mm_registry.processor_cache_from_config(vllm_config)
@@ -75,8 +75,7 @@ class InputProcessor:
                 mm_budget.reset_cache()  # Not used anymore
 
         self.input_preprocessor = InputPreprocessor(
-            model_config,
-            self.observability_config,
+            vllm_config,
             renderer=renderer,
             mm_registry=mm_registry,
             mm_processor_cache=self.mm_processor_cache,
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 76aa8f438..c7eb93dc8 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -90,7 +90,7 @@ class LLMEngine:
         self.dp_group = None
         self.should_execute_dummy_batch = False
 
-        self.renderer = renderer = renderer_from_config(self.model_config)
+        self.renderer = renderer = renderer_from_config(self.vllm_config)
         self.io_processor = get_io_processor(
             self.vllm_config,
             self.model_config.io_processor_plugin,
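
Usage sketch (illustrative, not part of the patch): after this change, renderers are built from a full `VllmConfig` instead of a bare `ModelConfig`, and `BaseRenderer.config` is replaced by `BaseRenderer.model_config`. The model name below is an arbitrary placeholder; the wrapping pattern mirrors the `tests/test_inputs.py` hunk above.

    from vllm.config import ModelConfig, VllmConfig
    from vllm.renderers.registry import renderer_from_config

    # Build the model config, then wrap it in a VllmConfig, as the updated
    # call sites (AsyncLLM, LLMEngine, InputProcessor) now do.
    model_config = ModelConfig(model="facebook/opt-125m")  # placeholder model id
    vllm_config = VllmConfig(model_config=model_config)

    # renderer_from_config() now takes the VllmConfig and selects the renderer
    # mode from vllm_config.model_config internally.
    renderer = renderer_from_config(vllm_config)

    # The renderer keeps only the model config it actually needs.
    assert renderer.model_config is model_config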