[Voxtral Realtime] Introduce global log mel max (#33574)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-02-02 23:01:47 +01:00
parent 089cd4f002
commit 5019c59dd2
4 changed files with 19 additions and 11 deletions
--- a/tests/models/multimodal/generation/test_voxtral_realtime.py
+++ b/tests/models/multimodal/generation/test_voxtral_realtime.py
@@ -37,7 +37,7 @@ EXPECTED_TEXT = [
    (
        " First words I spoke in the original phonograph. "
        "A little piece of practical poetry. Mary had a little lamb,"
-        " it sleeps with quite a snow, and everywhere that Mary went, "
+        " its fleece was quite a slow, and everywhere that Mary went, "
        "the lamb was sure to go."
    ),
    (
@@ -246,13 +246,6 @@ async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine)

    texts = [tokenizer.decode(output_tokens) for output_tokens in output_tokens_list]

-    # 'true' streaming and 'offline' streaming differ a bit because log-mels are
-    # differently noramalized
-    texts[0] = (
-        texts[0]
-        .replace("He has f", "F")
-        .replace("its fleece was quite a slow", "it sleeps with quite a snow")
-    )
    texts[1] = texts[1].replace("a base hit", "OBS").replace("oh my", "oh, my")

    assert texts == EXPECTED_TEXT