[Renderer] Separate out RendererConfig from ModelConfig (#30145)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Authored by: Cyrus Leung, 2025-12-07 15:15:42 +08:00 (committed by GitHub)
parent a49d813fa8
commit 27f4c2fd46
105 changed files with 969 additions and 797 deletions
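
In essence, the tokenizer- and processor-related settings that previously lived on ModelConfig now live on a new RendererConfig, reachable as vllm_config.renderer_config and carrying a back-reference to the model config at renderer_config.model_config; call sites are updated accordingly. A minimal sketch of the recurring migration pattern, assuming an already-constructed VllmConfig instance named vllm_config (field names follow the hunks below):

    # Before this commit: tokenizer settings were read off ModelConfig.
    from vllm.tokenizers import get_tokenizer

    model_config = vllm_config.model_config
    tokenizer = get_tokenizer(
        model_config.tokenizer,
        revision=model_config.tokenizer_revision,
        tokenizer_mode=model_config.tokenizer_mode,
        trust_remote_code=model_config.trust_remote_code,
    )

    # After this commit: the same settings are read off RendererConfig,
    # which also keeps a reference back to the model config
    # (e.g. renderer_config.model_config.model).
    renderer_config = vllm_config.renderer_config
    tokenizer = get_tokenizer(
        renderer_config.tokenizer,
        revision=renderer_config.tokenizer_revision,
        tokenizer_mode=renderer_config.tokenizer_mode,
        trust_remote_code=renderer_config.trust_remote_code,
    )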

View File

@@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-    model_config = model.vllm_config.model_config
+    renderer_config = model.vllm_config.renderer_config
     quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
@@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
     from vllm.tokenizers import get_tokenizer
     tokenizer = get_tokenizer(
-        model_config.tokenizer,
-        revision=model_config.tokenizer_revision,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        revision=renderer_config.tokenizer_revision,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
@@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-    model_config = model.vllm_config.model_config
+    renderer_config = model.vllm_config.renderer_config
     quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
@@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.tokenizers import get_tokenizer
     tokenizer = get_tokenizer(
-        model_config.tokenizer,
-        revision=model_config.tokenizer_revision,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        revision=renderer_config.tokenizer_revision,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]

View File

@@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
-        model_config = vllm_config.model_config
-        tokenizer = cached_tokenizer_from_config(model_config)
+        renderer_config = vllm_config.renderer_config
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
         self.sam_model = build_sam_vit_b()

View File

@@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
-        model_config = vllm_config.model_config
-        tokenizer = cached_tokenizer_from_config(model_config)
+        renderer_config = vllm_config.renderer_config
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
         self.vision = self._init_vision_module(

View File

@@ -18,7 +18,7 @@ from transformers.models.gemma3n import (
 )
 from transformers.models.siglip import SiglipImageProcessorFast
-from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration(
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         language: Optional[str],
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -798,7 +798,9 @@ class Gemma3nForConditionalGeneration(
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
         return SpeechToTextConfig(
             # Let's set this to 30 as suggested in the docs for now, although

View File

@@ -34,7 +34,7 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import BatchFeature, PretrainedConfig
-from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration(
         else:
             raise ValueError(f"Unsupported task type {task_type}")
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         chat = [dict(role="user", content=user_prompt)]
         prompt = tokenizer.apply_chat_template(
             chat,
@@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """Get the number of audio tokens for an audio duration in sec."""
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
         hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
         proj_win_size = processor.audio_processor.projector_window_size
         ds_rate = processor.audio_processor.projector_downsample_rate
@@ -903,7 +903,9 @@ class GraniteSpeechForConditionalGeneration(
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
         """Get the stt config for this model."""
         # Default settings are reasonable for this model and we don't currently

View File

@@ -6,7 +6,7 @@ import numpy as np
 import torch
 import torch.nn as nn
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import RendererConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import (
     DispatchPooler,
@@ -29,12 +29,12 @@ logger = init_logger(__name__)
 class GritLMMeanPool(nn.Module):
     """As `MeanPool`, but only includes non-instruction tokens."""
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, renderer_config: RendererConfig):
         super().__init__()
-        self.model_config = model_config
+        self.renderer_config = renderer_config
-        tokenizer = cached_tokenizer_from_config(self.model_config)
+        tokenizer = cached_tokenizer_from_config(self.renderer_config)
         # Collect the tokens needed for pattern matching.
         # "▁<" is different from "_<". The former uses "▁" to indicate that
@@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module):
 class GritLMPooler(Pooler):
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, renderer_config: RendererConfig):
         super().__init__()
-        self.pooling = GritLMMeanPool(model_config)
+        self.pooling = GritLMMeanPool(renderer_config)
         self.head = PoolerHead(PoolerNormalize())
     def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM):
         self.pooler = DispatchPooler(
             {
                 "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": GritLMPooler(vllm_config.model_config),
+                "embed": GritLMPooler(vllm_config.renderer_config),
             }
         )

View File

@@ -19,7 +19,7 @@ from torch import Tensor
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
-from vllm.config import ModelConfig, SpeechToTextConfig
+from vllm.config import RendererConfig, SpeechToTextConfig
 from vllm.inputs import TokensPrompt
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -887,7 +887,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -930,7 +930,9 @@ class SupportsTranscription(Protocol):
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
+        cls,
+        renderer_config: RendererConfig,
+        task_type: Literal["transcribe", "translate"],
     ) -> SpeechToTextConfig:
         """Get the speech to text config for the ASR model."""
         ...
@@ -940,7 +942,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR

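The interface hunks above switch the SupportsTranscription hooks from ModelConfig to RendererConfig. A minimal sketch of what an implementing ASR model now overrides; the class name and method bodies are placeholders, the import path for SupportsTranscription is assumed to be vllm.model_executor.models.interfaces, and only methods whose names appear in the hunks are shown:

    from typing import Literal

    import numpy as np

    from vllm.config import RendererConfig, SpeechToTextConfig
    from vllm.inputs.data import PromptType
    from vllm.model_executor.models.interfaces import SupportsTranscription


    class MyASRModel(SupportsTranscription):  # placeholder class name
        @classmethod
        def get_speech_to_text_config(
            cls,
            renderer_config: RendererConfig,
            task_type: Literal["transcribe", "translate"],
        ) -> SpeechToTextConfig:
            # Typically derived from the renderer config, e.g. via
            # cached_processor_from_config(renderer_config) as in the
            # Whisper and GraniteSpeech hunks.
            ...

        @classmethod
        def get_generation_prompt(
            cls,
            audio: np.ndarray,
            stt_config: SpeechToTextConfig,
            renderer_config: RendererConfig,
            language: str | None,
            task_type: Literal["transcribe", "translate"],
            request_prompt: str,
        ) -> PromptType:
            # Build the model-specific transcription/translation prompt here.
            ...
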
View File

@@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
         hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
         hf_processor.video_processor = cached_video_processor_from_config(
-            self.ctx.model_config,
+            self.ctx.renderer_config,
             processor_cls=InternVLVideoProcessor,
             size=hf_processor.image_processor.size,
             **kwargs,

View File

@@ -1169,16 +1169,17 @@ class NemotronH_Nano_VL_V2(
         self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
         self.config = config
         self.model_config = vllm_config.model_config
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
-        tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
-        self._img_start_token_ids = tokenizer.encode(
+        self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
+        self._img_start_token_ids = self._tokenizer.encode(
             IMG_START, add_special_tokens=False
         )
-        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
-        self._img_context_token_ids = tokenizer.encode(
+        self._img_end_token_ids = self._tokenizer.encode(
+            IMG_END, add_special_tokens=False
+        )
+        self._img_context_token_ids = self._tokenizer.encode(
             IMG_CONTEXT, add_special_tokens=False
         )
@@ -1364,7 +1365,7 @@ class NemotronH_Nano_VL_V2(
         input_embeds for the LLM.
         """
         device = video_embeddings.device
-        tokenizer = cached_tokenizer_from_config(self.model_config)
+        tokenizer = self._tokenizer
         # Generate video replacement token IDs using get_video_repl
         # This tokenizes each frame separator independently, then uses pre-tokenized

View File

@@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
     def get_image_processor(self, **kwargs: object):
         return cached_image_processor_from_config(
-            self.ctx.model_config,
+            self.ctx.renderer_config,
             **kwargs,
         )

View File

@@ -193,7 +193,7 @@ class PixtralProcessorAdapter:
 class PixtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")

View File

@@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
 from transformers import BatchFeature, TensorType, WhisperConfig
 from transformers.tokenization_utils_base import TextInput
-from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -176,7 +176,7 @@ class VoxtralProcessorAdapter:
 class VoxtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")
@@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration(
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
# update quant config to so that ignored module and target module names
# match the vLLM model names
@@ -450,9 +450,11 @@ class VoxtralForConditionalGeneration(
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         audio_config = tokenizer.instruct.audio_encoder.audio_config
         max_audio_clip_s = audio_config.chunk_length_s
         sample_rate = audio_config.sampling_rate
@@ -468,17 +470,17 @@ class VoxtralForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,  # not needed here
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
         to_language: str | None,
     ) -> PromptType:
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         audio = Audio(audio, int(stt_config.sample_rate), format="wav")  # lossless
         req = TranscriptionRequest(
-            model=model_config.model,
+            model=renderer_config.model_config.model,
             audio=RawAudio.from_audio(audio),
             language=language,
         )
@@ -494,14 +496,14 @@ class VoxtralForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
         This is used for estimating the amount of processing for this audio.
         """
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         adapter = VoxtralProcessorAdapter(tokenizer)
         return adapter.get_num_audio_tokens(
             int(audio_duration_s * stt_config.sample_rate)

View File

@@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
 from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.attention.layers.cross_attention import CrossAttention
-from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
@@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,  # not needed here
+        renderer_config: RendererConfig,  # not needed here
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -847,9 +847,11 @@ class WhisperForConditionalGeneration(
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
@@ -861,9 +863,9 @@ class WhisperForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
         hop_length = processor.feature_extractor.hop_length
         assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder