[Model] Define merge_by_field_config MM interface (U-Z) (#26261)

Signed-off-by: Ayush Satyam <ayushsatyam146@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-07 12:15:49 +05:30
parent 4dbdf4a294
commit 5f7e8a916a
4 changed files with 32 additions and 25 deletions
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -61,7 +61,7 @@ from vllm.transformers_utils.tokenizer import (
 )

 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription
-from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix
+from .utils import init_vllm_registered_model, maybe_prefix

 logger = init_logger(__name__)

@@ -337,6 +337,8 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
 class VoxtralForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, SupportsTranscription
 ):
+    merge_by_field_config = True
+
    supported_languages = ISO639_1_SUPPORTED_LANGS

    packed_modules_mapping = {
@@ -445,7 +447,6 @@ class VoxtralForConditionalGeneration(
                f"Incorrect type of audio_arrays. Got type: {type(audio_arrays)}"
            )

-        audio_arrays = flatten_bn(audio_arrays)
        if isinstance(audio_arrays, torch.Tensor):
            audio_arrays = list(audio_arrays.unbind(0))
        return audio_arrays