[Frontend] Add support for transcriptions and translations to run_batch (#33934)

Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
Pooya Davoodi
2026-02-07 05:24:57 -08:00
committed by GitHub
parent 4df44c16ba
commit 2cb2340f7a
3 changed files with 555 additions and 101 deletions

View File

@@ -7,6 +7,7 @@ import tempfile
import pytest
from vllm.assets.audio import AudioAsset
from vllm.entrypoints.openai.run_batch import BatchRequestOutput
# Default model for the generic run_batch tests — presumably the non-audio
# tests below this chunk; the audio tests override it with Whisper models.
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
@@ -42,6 +43,27 @@ INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/re
# Two chat-completion requests against a reasoning model (Qwen3-0.6B),
# one JSONL request per line.
INPUT_REASONING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Solve this math problem: 2+2=?"}]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "Qwen/Qwen3-0.6B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "What is the capital of France?"}]}}"""
# This is a valid but minimal audio file for testing
MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="

# Single transcription request whose audio is embedded as a base64 data: URL.
# Built by plain concatenation so the JSON braces need no {{ }} escaping.
INPUT_TRANSCRIPTION_BATCH = (
    '{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
    '"body": {"model": "openai/whisper-large-v3", "file_url": "data:audio/wav;base64,'
    + MINIMAL_WAV_BASE64
    + '", "response_format": "json"}}\n'
)
# Transcription request that fetches the audio from an HTTP URL instead of a
# data: URL.  Built by plain concatenation so the JSON braces need no escaping.
INPUT_TRANSCRIPTION_HTTP_BATCH = (
    '{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/transcriptions", '
    '"body": {"model": "openai/whisper-large-v3", "file_url": "'
    + AudioAsset("mary_had_lamb").url
    + '", "response_format": "json"}}\n'
)

# Italian -> English translation request against the same audio asset.
INPUT_TRANSLATION_BATCH = (
    '{"custom_id": "request-1", "method": "POST", "url": "/v1/audio/translations", '
    '"body": {"model": "openai/whisper-small", "file_url": "'
    + AudioAsset("mary_had_lamb").url
    + '", "response_format": "text", "language": "it", "to_language": "en", '
    '"temperature": 0.0}}\n'
)
def test_empty_file():
with (
@@ -238,3 +260,121 @@ def test_reasoning_parser():
]
assert reasoning is not None
assert len(reasoning) > 0
def test_transcription():
    """Run ``vllm run-batch`` on a transcription request whose audio is an
    inline base64 ``data:`` URL, and validate every output line is a
    well-formed, successful transcription response."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_TRANSCRIPTION_BATCH)
        input_file.flush()

        # subprocess.run() waits for the child itself; the previous
        # Popen + communicate() + wait() sequence was redundant because
        # communicate() already waits for process termination.
        proc = subprocess.run(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "openai/whisper-large-v3",
            ],
        )
        assert proc.returncode == 0, f"{proc=}"

        contents = output_file.read()
        print(f"\n\ncontents: {contents}\n\n")
        for line in contents.strip().split("\n"):
            # Ensure the line parses as a valid batch output record.
            BatchRequestOutput.model_validate_json(line)

            # Ensure the record is a successful transcription response
            # carrying both the text and the usage accounting.
            line_dict = json.loads(line)
            assert isinstance(line_dict, dict)
            assert line_dict["error"] is None
            response_body = line_dict["response"]["body"]
            assert response_body is not None
            assert "text" in response_body
            assert "usage" in response_body
def test_transcription_http_url():
    """Run ``vllm run-batch`` on a transcription request whose audio is
    fetched from an HTTP URL, and validate the transcribed text."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_TRANSCRIPTION_HTTP_BATCH)
        input_file.flush()

        # subprocess.run() waits for the child itself; the previous
        # Popen + communicate() + wait() sequence was redundant because
        # communicate() already waits for process termination.
        proc = subprocess.run(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "openai/whisper-large-v3",
            ],
        )
        assert proc.returncode == 0, f"{proc=}"

        contents = output_file.read()
        for line in contents.strip().split("\n"):
            # Ensure the line parses as a valid batch output record.
            BatchRequestOutput.model_validate_json(line)

            # Ensure the record is a successful transcription response.
            line_dict = json.loads(line)
            assert isinstance(line_dict, dict)
            assert line_dict["error"] is None
            response_body = line_dict["response"]["body"]
            assert response_body is not None
            assert "text" in response_body
            assert "usage" in response_body

            # The known asset should transcribe to the nursery rhyme.
            transcription_text = response_body["text"]
            assert "Mary had a little lamb" in transcription_text
def test_translation():
    """Run ``vllm run-batch`` on an audio translation request (Italian ->
    English) and validate every output line is a successful text response."""
    with (
        tempfile.NamedTemporaryFile("w") as input_file,
        tempfile.NamedTemporaryFile("r") as output_file,
    ):
        input_file.write(INPUT_TRANSLATION_BATCH)
        input_file.flush()

        # subprocess.run() waits for the child itself; the previous
        # Popen + communicate() + wait() sequence was redundant because
        # communicate() already waits for process termination.
        proc = subprocess.run(
            [
                "vllm",
                "run-batch",
                "-i",
                input_file.name,
                "-o",
                output_file.name,
                "--model",
                "openai/whisper-small",
            ],
        )
        assert proc.returncode == 0, f"{proc=}"

        contents = output_file.read()
        for line in contents.strip().split("\n"):
            # Ensure the line parses as a valid batch output record.
            BatchRequestOutput.model_validate_json(line)

            # Ensure the record is a successful translation response.
            line_dict = json.loads(line)
            assert isinstance(line_dict, dict)
            assert line_dict["error"] is None
            response_body = line_dict["response"]["body"]
            assert response_body is not None
            assert "text" in response_body

            # Case-insensitive check: translated output wording varies, but
            # it should mention the subject of the rhyme.
            translation_text = response_body["text"]
            translation_text_lower = str(translation_text).strip().lower()
            assert "mary" in translation_text_lower or "lamb" in translation_text_lower