[Renderer] Separate out RendererConfig from ModelConfig (#30145)
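
Tokenizer and prompt-rendering settings (tokenizer, tokenizer_mode, tokenizer_revision, trust_remote_code) move from ModelConfig to a dedicated RendererConfig on VllmConfig, and the tokenizer/processor helpers now take the renderer config. A minimal sketch of the new access pattern (the helper's import path is an assumption here; only the call sites appear in the diff below):

    from vllm.config import VllmConfig
    # Import path assumed; cached_tokenizer_from_config is the helper
    # used throughout the diff below.
    from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config

    def build_tokenizer(vllm_config: VllmConfig):
        # Before this change: cached_tokenizer_from_config(vllm_config.model_config)
        return cached_tokenizer_from_config(vllm_config.renderer_config)
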
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-    model_config = model.vllm_config.model_config
+    renderer_config = model.vllm_config.renderer_config
     quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
 
@@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
     from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
-        model_config.tokenizer,
-        revision=model_config.tokenizer_revision,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        revision=renderer_config.tokenizer_revision,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     false_id = tokenizer.convert_tokens_to_ids(tokens[0])
@@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
     from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-    model_config = model.vllm_config.model_config
+    renderer_config = model.vllm_config.renderer_config
    quant_config = model.vllm_config.quant_config
     text_config = model.config.get_text_config()
 
@@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
     from vllm.tokenizers import get_tokenizer
 
     tokenizer = get_tokenizer(
-        model_config.tokenizer,
-        revision=model_config.tokenizer_revision,
-        tokenizer_mode=model_config.tokenizer_mode,
-        trust_remote_code=model_config.trust_remote_code,
+        renderer_config.tokenizer,
+        revision=renderer_config.tokenizer_revision,
+        tokenizer_mode=renderer_config.tokenizer_mode,
+        trust_remote_code=renderer_config.trust_remote_code,
     )
 
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
@@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
 
-        model_config = vllm_config.model_config
-        tokenizer = cached_tokenizer_from_config(model_config)
+        renderer_config = vllm_config.renderer_config
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
 
         self.sam_model = build_sam_vit_b()
@@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.projector_config = config.projector_config
         self.text_config = config.text_config
 
-        model_config = vllm_config.model_config
-        tokenizer = cached_tokenizer_from_config(model_config)
+        renderer_config = vllm_config.renderer_config
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
 
         self.vision = self._init_vision_module(
@@ -18,7 +18,7 @@ from transformers.models.gemma3n import (
 )
 from transformers.models.siglip import SiglipImageProcessorFast
 
-from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration(
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         language: Optional[str],
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -798,7 +798,9 @@ class Gemma3nForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
         return SpeechToTextConfig(
             # Let's set this to 30 as suggested in the docs for now, although
@@ -34,7 +34,7 @@ import torch.nn.functional as F
 from torch import nn
 from transformers import BatchFeature, PretrainedConfig
 
-from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration(
         else:
             raise ValueError(f"Unsupported task type {task_type}")
 
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         chat = [dict(role="user", content=user_prompt)]
         prompt = tokenizer.apply_chat_template(
             chat,
@@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """Get the number of audio tokens for an audio duration in sec."""
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
         hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
         proj_win_size = processor.audio_processor.projector_window_size
         ds_rate = processor.audio_processor.projector_downsample_rate
@@ -903,7 +903,9 @@ class GraniteSpeechForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
         """Get the stt config for this model."""
         # Default settings are reasonable for this model and we don't currently
@@ -6,7 +6,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import RendererConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.pooler import (
     DispatchPooler,
@@ -29,12 +29,12 @@ logger = init_logger(__name__)
 class GritLMMeanPool(nn.Module):
     """As `MeanPool`, but only includes non-instruction tokens."""
 
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, renderer_config: RendererConfig):
         super().__init__()
 
-        self.model_config = model_config
+        self.renderer_config = renderer_config
 
-        tokenizer = cached_tokenizer_from_config(self.model_config)
+        tokenizer = cached_tokenizer_from_config(self.renderer_config)
 
         # Collect the tokens needed for pattern matching.
         # "▁<" is different from "_<". The former uses "▁" to indicate that
@@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module):
 
 
 class GritLMPooler(Pooler):
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, renderer_config: RendererConfig):
         super().__init__()
 
-        self.pooling = GritLMMeanPool(model_config)
+        self.pooling = GritLMMeanPool(renderer_config)
         self.head = PoolerHead(PoolerNormalize())
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
@@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM):
         self.pooler = DispatchPooler(
             {
                 "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": GritLMPooler(vllm_config.model_config),
+                "embed": GritLMPooler(vllm_config.renderer_config),
             }
         )
@@ -19,7 +19,7 @@ from torch import Tensor
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
 
-from vllm.config import ModelConfig, SpeechToTextConfig
+from vllm.config import RendererConfig, SpeechToTextConfig
 from vllm.inputs import TokensPrompt
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -887,7 +887,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio: np.ndarray,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
@@ -930,7 +930,9 @@ class SupportsTranscription(Protocol):
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
+        cls,
+        renderer_config: RendererConfig,
+        task_type: Literal["transcribe", "translate"],
     ) -> SpeechToTextConfig:
         """Get the speech to text config for the ASR model."""
         ...
@@ -940,7 +942,7 @@ class SupportsTranscription(Protocol):
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
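
The `SupportsTranscription` protocol hunks above are what downstream ASR models implement. As a hypothetical sketch (not part of this diff, and assuming `SpeechToTextConfig`'s fields all have defaults), an implementation of the new classmethod signature looks like:

    from typing import Literal

    from vllm.config import RendererConfig, SpeechToTextConfig

    class MyASRModel:  # hypothetical example class
        @classmethod
        def get_speech_to_text_config(
            cls,
            renderer_config: RendererConfig,
            task_type: Literal["transcribe", "translate"],
        ) -> SpeechToTextConfig:
            # Tokenizer/processor lookups in the hunks below now go through
            # renderer_config rather than model_config.
            return SpeechToTextConfig()
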
@@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
         hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
         hf_processor.video_processor = cached_video_processor_from_config(
-            self.ctx.model_config,
+            self.ctx.renderer_config,
             processor_cls=InternVLVideoProcessor,
             size=hf_processor.image_processor.size,
             **kwargs,
@@ -1169,16 +1169,17 @@ class NemotronH_Nano_VL_V2(
         self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
 
         self.config = config
-        self.model_config = vllm_config.model_config
 
         # Pre-tokenize special tokens for video processing
         # to avoid repeated tokenization
-        tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
-        self._img_start_token_ids = tokenizer.encode(
+        self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
+        self._img_start_token_ids = self._tokenizer.encode(
             IMG_START, add_special_tokens=False
         )
-        self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
-        self._img_context_token_ids = tokenizer.encode(
+        self._img_end_token_ids = self._tokenizer.encode(
+            IMG_END, add_special_tokens=False
+        )
+        self._img_context_token_ids = self._tokenizer.encode(
             IMG_CONTEXT, add_special_tokens=False
         )
 
@@ -1364,7 +1365,7 @@ class NemotronH_Nano_VL_V2(
         input_embeds for the LLM.
         """
         device = video_embeddings.device
-        tokenizer = cached_tokenizer_from_config(self.model_config)
+        tokenizer = self._tokenizer
 
         # Generate video replacement token IDs using get_video_repl
         # This tokenizes each frame separator independently, then uses pre-tokenized
@@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
 
     def get_image_processor(self, **kwargs: object):
         return cached_image_processor_from_config(
-            self.ctx.model_config,
+            self.ctx.renderer_config,
             **kwargs,
         )
 
@@ -193,7 +193,7 @@ class PixtralProcessorAdapter:
 
 class PixtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")
 
@@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
 from transformers import BatchFeature, TensorType, WhisperConfig
 from transformers.tokenization_utils_base import TextInput
 
-from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
@@ -176,7 +176,7 @@ class VoxtralProcessorAdapter:
 
 class VoxtralProcessingInfo(BaseProcessingInfo):
     def get_tokenizer(self) -> MistralTokenizer:
-        tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
+        tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
         if not isinstance(tokenizer, MistralTokenizer):
             raise ValueError("This model requires `--tokenizer-mode mistral`")
 
@@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration(
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
-        self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
+        self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
 
         # update quant config to so that ignored module and target module names
         # match the vLLM model names
@@ -450,9 +450,11 @@ class VoxtralForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
     ) -> SpeechToTextConfig:
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         audio_config = tokenizer.instruct.audio_encoder.audio_config
         max_audio_clip_s = audio_config.chunk_length_s
         sample_rate = audio_config.sampling_rate
@@ -468,17 +470,17 @@ class VoxtralForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,  # not needed here
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
         request_prompt: str,
         to_language: str | None,
     ) -> PromptType:
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         audio = Audio(audio, int(stt_config.sample_rate), format="wav")  # lossless
         req = TranscriptionRequest(
-            model=model_config.model,
+            model=renderer_config.model_config.model,
             audio=RawAudio.from_audio(audio),
             language=language,
         )
@@ -494,14 +496,14 @@ class VoxtralForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
         """
         Map from audio duration to number of audio tokens produced by the ASR
         model, without running a forward pass.
         This is used for estimating the amount of processing for this audio.
         """
-        tokenizer = cached_tokenizer_from_config(model_config)
+        tokenizer = cached_tokenizer_from_config(renderer_config)
         adapter = VoxtralProcessorAdapter(tokenizer)
         return adapter.get_num_audio_tokens(
             int(audio_duration_s * stt_config.sample_rate)
@@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
 from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention, MultiHeadAttention
 from vllm.attention.layers.cross_attention import CrossAttention
-from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
+from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
@@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
     def get_generation_prompt(
         cls,
         audio: np.ndarray,
-        model_config: ModelConfig,  # not needed here
+        renderer_config: RendererConfig,  # not needed here
         stt_config: SpeechToTextConfig,
         language: str | None,
         task_type: Literal["transcribe", "translate"],
@@ -847,9 +847,11 @@ class WhisperForConditionalGeneration(
 
     @classmethod
     def get_speech_to_text_config(
-        cls, model_config: ModelConfig, task_type: str
+        cls,
+        renderer_config: RendererConfig,
+        task_type: str,
    ) -> SpeechToTextConfig:
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
 
         return SpeechToTextConfig(
             max_audio_clip_s=processor.feature_extractor.chunk_length,
@@ -861,9 +863,9 @@ class WhisperForConditionalGeneration(
         cls,
         audio_duration_s: float,
         stt_config: SpeechToTextConfig,
-        model_config: ModelConfig,
+        renderer_config: RendererConfig,
     ) -> int | None:
-        processor = cached_processor_from_config(model_config)
+        processor = cached_processor_from_config(renderer_config)
         hop_length = processor.feature_extractor.hop_length
         assert hop_length is not None
         # NOTE(NickLucche) user can't pass encoder