- **Add SPDX license headers to python source files** - **Check for SPDX headers using pre-commit** commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745 Author: Russell Bryant <rbryant@redhat.com> Date: Fri Jan 31 14:18:24 2025 -0500 Add SPDX license headers to python source files This commit adds SPDX license headers to python source files as recommended to the project by the Linux Foundation. These headers provide a concise way that is both human and machine readable for communicating license information for each source file. It helps avoid any ambiguity about the license of the code and can also be easily used by tools to help manage license compliance. The Linux Foundation runs license scans against the codebase to help ensure we are in compliance with the licenses of the code we use, including dependencies. Having these headers in place helps that tool do its job. More information can be found on the SPDX site: - https://spdx.dev/learn/handling-license-info/ Signed-off-by: Russell Bryant <rbryant@redhat.com> commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea Author: Russell Bryant <rbryant@redhat.com> Date: Fri Jan 31 14:36:32 2025 -0500 Check for SPDX headers using pre-commit Signed-off-by: Russell Bryant <rbryant@redhat.com> --------- Signed-off-by: Russell Bryant <rbryant@redhat.com>
102 lines
4.0 KiB
Python
102 lines
4.0 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
# Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py
|
|
from typing import Any, Dict, Optional
|
|
|
|
import transformers
|
|
|
|
|
|
class UltravoxConfig(transformers.PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a
    [`UltravoxForConditionalGeneration`]. It is used to instantiate an
    Ultravox model according to the specified arguments, defining the model
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`, *optional*):
            Custom audio config or dict
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone. Can be any of `LlamaConfig`
            or `MistralConfig`.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        audio_token_index (`int`, *optional*, defaults to 32000):
            The audio token index to encode the audio prompt.
        stack_factor (`int`, *optional*, defaults to 8):
            Audio downsampling factor for the multimodal projector.
        norm_init (`float`, *optional*, defaults to 0.4):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
        text_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the text model.
        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
            The LoRA configuration for finetuning the audio model.
    """

    model_type = "ultravox"
    is_composition = False

    @staticmethod
    def _resolve_backbone_config(
        model_id: Optional[str],
        config_dict: Optional[Dict[str, Any]],
        default_model_type: str,
    ) -> "transformers.PretrainedConfig":
        """Resolve a backbone (text or audio) config.

        When ``model_id`` is given, the config is fetched from the hub via
        vLLM's ``get_config``; otherwise it is built from ``config_dict``
        through ``transformers.CONFIG_MAPPING``, falling back to
        ``default_model_type`` when the dict does not name a model type.
        """
        if model_id is not None:
            # Deferred import to avoid a circular import with
            # vllm.transformers_utils.config.
            from vllm.transformers_utils.config import get_config

            return get_config(model_id, trust_remote_code=False)

        config_dict = config_dict or {}
        return transformers.CONFIG_MAPPING[config_dict.get(
            "model_type", default_model_type)](**config_dict)

    def __init__(
        self,
        audio_config: Optional[Dict[str, Any]] = None,
        text_config: Optional[Dict[str, Any]] = None,
        audio_model_id: Optional[str] = None,
        text_model_id: Optional[str] = None,
        ignore_index: int = -100,
        audio_token_index: int = 32000,
        hidden_size: int = 4096,
        stack_factor: int = 8,
        norm_init: float = 0.4,
        projector_act: str = "swiglu",
        text_model_lora_config: Optional[Dict[str, Any]] = None,
        audio_model_lora_config: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        self.ignore_index = ignore_index

        self.audio_model_id = audio_model_id
        self.text_model_id = text_model_id
        self.audio_token_index = audio_token_index

        self.hidden_size = hidden_size
        self.stack_factor = stack_factor
        self.norm_init = norm_init
        self.projector_act = projector_act

        # Both backbones follow the same resolution rule: an explicit model id
        # wins over an inline config dict. Defaults: llama text, whisper audio.
        self.text_config = self._resolve_backbone_config(
            text_model_id, text_config, "llama")
        self.audio_config = self._resolve_backbone_config(
            audio_model_id, audio_config, "whisper")

        self.text_model_lora_config = text_model_lora_config or {}
        self.audio_model_lora_config = audio_model_lora_config or {}

        # Surface commonly-read text-backbone fields at the top level.
        self.vocab_size = self.text_config.vocab_size

        self.initializer_range = self.text_config.initializer_range

        super().__init__(**kwargs)