[Frontend] Gemma3n audio transcriptions/translations endpoint (#23735)

Signed-off-by: NickLucche <nlucches@redhat.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-09-01 12:07:46 +02:00
parent 107284959a
commit d46934b229
9 changed files with 189 additions and 63 deletions
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -12,8 +12,6 @@ import pytest
 import pytest_asyncio
 import soundfile as sf

-from vllm.assets.audio import AudioAsset
-
 from ...utils import RemoteOpenAIServer

 MODEL_NAME = "openai/whisper-large-v3-turbo"
@@ -24,20 +22,6 @@ MISTRAL_FORMAT_ARGS = [
 ]


-@pytest.fixture
-def mary_had_lamb():
-    path = AudioAsset('mary_had_lamb').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
-@pytest.fixture
-def winning_call():
-    path = AudioAsset('winning_call').get_local_path()
-    with open(str(path), "rb") as f:
-        yield f
-
-
@pytest.fixture(scope="module")
 def server():
    with RemoteOpenAIServer(MODEL_NAME, SERVER_ARGS) as remote_server:
@@ -76,6 +60,25 @@ async def test_basic_audio(mary_had_lamb, model_name):
        assert out_usage["seconds"] == 16, out_usage["seconds"]


+@pytest.mark.asyncio
+async def test_basic_audio_gemma(foscolo):
+    # Gemma transcribes some of our usual audio samples particularly badly, so
+    # this test uses the `foscolo` fixture instead; WER is evaluated separately.
+    model_name = "google/gemma-3n-E2B-it"
+    server_args = ["--enforce-eager"]
+    # The module-scoped `server` fixture serves MODEL_NAME; launch a Gemma one.
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=foscolo,
+            language="it",
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert "da cui vergine nacque Venere" in out  # substring match only
+
+
@pytest.mark.asyncio
 async def test_non_asr_model(winning_call):
    # text to text model