diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
index 9f92bf559..c993e1ebd 100644
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
@@ -518,7 +518,8 @@ class OpenAISpeechToText(OpenAIServing):
                         total_segments.extend(segments)
                         text_parts.extend([seg.text for seg in segments])
                     else:
-                        text_parts.append(op.outputs[0].text)
+                        raw_text = op.outputs[0].text
+                        text_parts.append(self.model_cls.post_process_output(raw_text))
             text = "".join(text_parts)
             if self.task_type == "transcribe":
                 final_response: ResponseType
@@ -607,6 +608,10 @@ class OpenAISpeechToText(OpenAIServing):
                     assert len(res.outputs) == 1
                     output = res.outputs[0]
 
+                    # TODO: Streaming bypasses post_process_output, so a
+                    # structured prefix (e.g., Qwen3-ASR "language X<asr_text>")
+                    # may reach clients. Stripping it here needs buffering,
+                    # since deltas may split the tag across chunks.
                     delta_message = DeltaMessage(content=output.text)
                     completion_tokens += len(output.token_ids)
 
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index e6ee212af..f05231356 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1145,6 +1145,22 @@ class SupportsTranscription(Protocol):
         """
         return None
 
+    @classmethod
+    def post_process_output(cls, text: str) -> str:
+        """
+        Post-process the raw model output text.
+
+        The default is a pass-through; models that emit structured output
+        (e.g., language tags, special tokens) override this to strip it.
+
+        Args:
+            text: Raw decoded text from the model.
+
+        Returns:
+            The text to return to the user (unchanged in this default).
+        """
+        return text
+
 
 @overload
 def supports_transcription(
diff --git a/vllm/model_executor/models/qwen3_asr.py b/vllm/model_executor/models/qwen3_asr.py
index 43a9c49c4..605ccee48 100644
--- a/vllm/model_executor/models/qwen3_asr.py
+++ b/vllm/model_executor/models/qwen3_asr.py
@@ -90,6 +90,7 @@ from vllm.transformers_utils.processors.qwen3_asr import (
 )
 
 logger = init_logger(__name__)
+_ASR_TEXT_TAG = "<asr_text>"  # Tag that precedes the transcription text.
 
 
 def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
@@ -556,7 +557,7 @@ class Qwen3ASRForConditionalGeneration(
         else:
             prompt = (
                 f"<|im_start|>user\n{audio_placeholder}<|im_end|>\n"
-                f"<|im_start|>assistant\nlanguage {full_lang_name_to}<asr_text>"
+                f"<|im_start|>assistant\nlanguage {full_lang_name_to}{_ASR_TEXT_TAG}"
             )
 
         prompt_token_ids = tokenizer.encode(prompt)
@@ -565,3 +566,21 @@ class Qwen3ASRForConditionalGeneration(
             "multi_modal_data": {"audio": audio},
         }
         return cast(PromptType, prompt_dict)
+
+    @classmethod
+    def post_process_output(cls, text: str) -> str:
+        """
+        Extract the clean transcription from raw Qwen3-ASR output.
+
+        Raw output has the form "language {lang}<asr_text>{transcription}".
+        Everything up to and including the last <asr_text> tag is dropped;
+        text without the tag is returned unchanged, and empty input yields
+        an empty string.
+        """
+        if not text:
+            return ""
+
+        # rpartition yields an empty separator when the tag is absent,
+        # in which case the input is passed through untouched.
+        _, found_tag, transcription = text.rpartition(_ASR_TEXT_TAG)
+        return transcription if found_tag else text