diff --git a/tests/entrypoints/openai/test_realtime_validation.py b/tests/entrypoints/openai/test_realtime_validation.py
index d0a37cd5e..e0868a87d 100644
--- a/tests/entrypoints/openai/test_realtime_validation.py
+++ b/tests/entrypoints/openai/test_realtime_validation.py
@@ -126,8 +126,8 @@ async def test_multi_chunk_streaming(
     assert event["type"] == "transcription.done"
     assert event["text"] == full_text
     assert full_text == (
-        " He has first words I spoke in the original phonograph."
+        " First words I spoke in the original phonograph."
         " A little piece of practical poetry. Mary had a little lamb,"
-        " it squeaked with quite a flow, and everywhere that Mary went,"
+        " it sleeps with quite a flow, and everywhere that Mary went,"
         " the lamb was sure to go"
     )
diff --git a/tests/models/multimodal/generation/test_voxtral_realtime.py b/tests/models/multimodal/generation/test_voxtral_realtime.py
index d7906003a..a8fe162f8 100644
--- a/tests/models/multimodal/generation/test_voxtral_realtime.py
+++ b/tests/models/multimodal/generation/test_voxtral_realtime.py
@@ -37,7 +37,7 @@ EXPECTED_TEXT = [
     (
         " First words I spoke in the original phonograph. "
         "A little piece of practical poetry. Mary had a little lamb,"
-        " it sleeps with quite a snow, and everywhere that Mary went, "
+        " its fleece was quite a slow, and everywhere that Mary went, "
         "the lamb was sure to go."
     ),
     (
@@ -246,13 +246,6 @@ async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine)
     texts = [tokenizer.decode(output_tokens) for output_tokens in output_tokens_list]
 
-    # 'true' streaming and 'offline' streaming differ a bit because log-mels are
-    # differently noramalized
-    texts[0] = (
-        texts[0]
-        .replace("He has f", "F")
-        .replace("its fleece was quite a slow", "it sleeps with quite a snow")
-    )
     texts[1] = texts[1].replace("a base hit", "OBS").replace("oh my", "oh, my")
 
     assert texts == EXPECTED_TEXT
 
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 2fc987682..86ee98147 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -782,7 +782,19 @@ class VoxtralEncoderModel(nn.Module):
         magnitudes = stft[..., :-1].abs() ** 2
         mel_spec = self.mel_filters.T @ magnitudes
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()
-        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+
+        if global_log_mel_max := self.config.global_log_mel_max:
+            if not isinstance(global_log_mel_max, float):
+                raise TypeError(f"{global_log_mel_max=} needs to be of type float.")
+            log_spec_max = torch.tensor(
+                global_log_mel_max,
+                device=log_spec.device,
+                dtype=log_spec.dtype,
+            )
+        else:
+            log_spec_max = log_spec.max()
+
+        log_spec = torch.maximum(log_spec, log_spec_max - 8.0)
         log_spec = (log_spec + 4.0) / 4.0
         return log_spec.to(input_dtype)
 
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 6a9985583..1a0e25021 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -248,6 +248,9 @@ def _remap_mistral_audio_args(config: dict) -> dict:
             sliding_window=encoder_args.get("sliding_window", None),
             block_pool_size=block_pool_size,
             pos_embed=encoder_args.get("pos_embed", "sinusoidal"),
+            global_log_mel_max=encoder_args["audio_encoding_args"].get(
+                "global_log_mel_max"
+            ),
             # only needed for RoPE
             max_position_embeddings=block_pool_size * config["max_position_embeddings"],
         ),