diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
index 9f92bf559..c993e1ebd 100644
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ b/vllm/entrypoints/openai/translations/speech_to_text.py
@@ -518,7 +518,8 @@ class OpenAISpeechToText(OpenAIServing):
                         total_segments.extend(segments)
                         text_parts.extend([seg.text for seg in segments])
                     else:
-                        text_parts.append(op.outputs[0].text)
+                        raw_text = op.outputs[0].text
+                        text_parts.append(self.model_cls.post_process_output(raw_text))
             text = "".join(text_parts)
             if self.task_type == "transcribe":
                 final_response: ResponseType
@@ -607,6 +608,10 @@ class OpenAISpeechToText(OpenAIServing):
                     assert len(res.outputs) == 1
                     output = res.outputs[0]
 
+                    # TODO: For models that output structured formats (e.g.,
+                    # Qwen3-ASR with "language X<asr_text>" prefix), streaming
+                    # would need buffering to strip the prefix properly since
+                    # deltas may split the tag across chunks.
                     delta_message = DeltaMessage(content=output.text)
                     completion_tokens += len(output.token_ids)
 
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index e6ee212af..f05231356 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1145,6 +1145,22 @@ class SupportsTranscription(Protocol):
         """
         return None
 
+    @classmethod
+    def post_process_output(cls, text: str) -> str:
+        """
+        Post-process the raw model output text.
+
+        Some ASR models output structured formats (e.g., language tags,
+        special tokens) that need to be stripped before returning to the user.
+
+        Args:
+            text: Raw decoded text from the model.
+
+        Returns:
+            Cleaned transcription text.
+        """
+        return text
+
 
 @overload
 def supports_transcription(
diff --git a/vllm/model_executor/models/qwen3_asr.py b/vllm/model_executor/models/qwen3_asr.py
index 43a9c49c4..605ccee48 100644
--- a/vllm/model_executor/models/qwen3_asr.py
+++ b/vllm/model_executor/models/qwen3_asr.py
@@ -90,6 +90,7 @@ from vllm.transformers_utils.processors.qwen3_asr import (
 )
 
 logger = init_logger(__name__)
+_ASR_TEXT_TAG = "<asr_text>"
 
 
 def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
@@ -556,7 +557,7 @@ class Qwen3ASRForConditionalGeneration(
         else:
             prompt = (
                 f"<|im_start|>user\n{audio_placeholder}<|im_end|>\n"
-                f"<|im_start|>assistant\nlanguage {full_lang_name_to}"
+                f"<|im_start|>assistant\nlanguage {full_lang_name_to}{_ASR_TEXT_TAG}"
             )
 
         prompt_token_ids = tokenizer.encode(prompt)
@@ -565,3 +566,21 @@ class Qwen3ASRForConditionalGeneration(
             "multi_modal_data": {"audio": audio},
         }
         return cast(PromptType, prompt_dict)
+
+    @classmethod
+    def post_process_output(cls, text: str) -> str:
+        """
+        Post-process Qwen3-ASR raw output to extract clean transcription.
+
+        The model outputs in format: "language {lang}<asr_text>{transcription}"
+        This method strips the language prefix and asr_text tags.
+        """
+        if not text:
+            return ""
+
+        if _ASR_TEXT_TAG not in text:
+            return text
+
+        # Split on tag and take the transcription part
+        _, text_part = text.rsplit(_ASR_TEXT_TAG, 1)
+        return text_part