[Bugfix] Fix Qwen3ASR language asr tag in output (#33410)

Signed-off-by: NickLucche <nlucches@redhat.com>
This commit is contained in:
Nicolò Lucchesi
2026-01-31 06:24:49 +01:00
committed by GitHub
parent 8ecd213c0b
commit e77f162cf5
3 changed files with 42 additions and 2 deletions

View File

@@ -1145,6 +1145,22 @@ class SupportsTranscription(Protocol):
"""
return None
@classmethod
def post_process_output(cls, text: str) -> str:
    """
    Hook for cleaning up decoded model output before it is returned.

    ASR models may wrap the transcription in a structured format
    (for example language tags or special tokens). Implementations
    can override this hook to remove such wrapping.

    Args:
        text: Raw decoded text from the model.

    Returns:
        The transcription text; this default implementation returns
        ``text`` unchanged.
    """
    # Default behavior is the identity: nothing is stripped here.
    return text
@overload
def supports_transcription(

View File

@@ -90,6 +90,7 @@ from vllm.transformers_utils.processors.qwen3_asr import (
)
logger = init_logger(__name__)
# Marker that follows the "language {lang}" prefix and introduces the
# transcription text. It is appended to the assistant prompt when building
# the generation prompt and is split on when post-processing decoded output.
_ASR_TEXT_TAG = "<asr_text>"
def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
@@ -556,7 +557,7 @@ class Qwen3ASRForConditionalGeneration(
else:
prompt = (
f"<|im_start|>user\n{audio_placeholder}<|im_end|>\n"
f"<|im_start|>assistant\nlanguage {full_lang_name_to}<asr_text>"
f"<|im_start|>assistant\nlanguage {full_lang_name_to}{_ASR_TEXT_TAG}"
)
prompt_token_ids = tokenizer.encode(prompt)
@@ -565,3 +566,21 @@ class Qwen3ASRForConditionalGeneration(
"multi_modal_data": {"audio": audio},
}
return cast(PromptType, prompt_dict)
@classmethod
def post_process_output(cls, text: str) -> str:
    """
    Extract the clean transcription from raw Qwen3-ASR output.

    The raw output has the shape
    "language {lang}<asr_text>{transcription}". Everything up to and
    including the last ``<asr_text>`` tag is dropped.

    Args:
        text: Raw decoded text from the model.

    Returns:
        An empty string for falsy input, the input unchanged when it
        contains no ``<asr_text>`` tag, and otherwise the text that
        follows the last tag.
    """
    # Falsy input (empty string or None) always maps to an empty string.
    if not text:
        return ""
    # rpartition yields ("", "", text) when the tag is absent, so the final
    # element is the untouched input in that case and the text after the
    # last tag otherwise.
    _prefix, _tag, transcription = text.rpartition(_ASR_TEXT_TAG)
    return transcription