[Renderer] Separate out RendererConfig from ModelConfig (#30145)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-12-07 15:15:42 +08:00
committed by GitHub
parent a49d813fa8
commit 27f4c2fd46
105 changed files with 969 additions and 797 deletions

View File

@@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.attention.layers.cross_attention import CrossAttention
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs.data import PromptType
@@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
def get_generation_prompt(
cls,
audio: np.ndarray,
model_config: ModelConfig, # not needed here
renderer_config: RendererConfig, # not needed here
stt_config: SpeechToTextConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
@@ -847,9 +847,11 @@ class WhisperForConditionalGeneration(
@classmethod
def get_speech_to_text_config(
cls, model_config: ModelConfig, task_type: str
cls,
renderer_config: RendererConfig,
task_type: str,
) -> SpeechToTextConfig:
processor = cached_processor_from_config(model_config)
processor = cached_processor_from_config(renderer_config)
return SpeechToTextConfig(
max_audio_clip_s=processor.feature_extractor.chunk_length,
@@ -861,9 +863,9 @@ class WhisperForConditionalGeneration(
cls,
audio_duration_s: float,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
renderer_config: RendererConfig,
) -> int | None:
processor = cached_processor_from_config(model_config)
processor = cached_processor_from_config(renderer_config)
hop_length = processor.feature_extractor.hop_length
assert hop_length is not None
# NOTE(NickLucche) user can't pass encoder