diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 791344b4f..e5c85dbe8 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -654,6 +654,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 
 | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 | ------------ | ------ | ----------------- | -------------------- | ------------------------- |
+| `CohereAsrForConditionalGeneration` | Cohere-Transcribe | `CohereLabs/cohere-transcribe-03-2026` | | |
 | `FireRedASR2ForConditionalGeneration` | FireRedASR2 | `allendou/FireRedASR2-LLM-vllm`, etc. | | |
 | `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | |
 | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index f384dc2bb..690aada03 100755
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -72,8 +72,7 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
 # CohereASR
 def run_cohere_asr(question: str, audio_count: int) -> ModelRequestData:
     assert audio_count == 1, "CohereASR only supports a single audio input per prompt"
-    # TODO (ekagra): add HF ckpt after asr release
-    model_name = "/host/engines/vllm/audio/2b-release"
+    model_name = "CohereLabs/cohere-transcribe-03-2026"
 
     prompt = (
         "<|startofcontext|><|startoftranscript|>"
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
index c4c7b8b7f..194c52eae 100644
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -19,6 +19,7 @@
 import soundfile
 import torch
 from datasets import load_dataset
 from evaluate import load
+from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
 
 from vllm.tokenizers import get_tokenizer
@@ -33,6 +34,16 @@ def to_bytes(y, sr):
     return buffer
 
 
+# Not all models have a normalizer, so use Whisper's as a standard option.
+normalizer_model_info = HF_EXAMPLE_MODELS.find_hf_info("openai/whisper-large-v3")
+normalizer_tokenizer = get_tokenizer(
+    "openai/whisper-large-v3",
+    tokenizer_mode=normalizer_model_info.tokenizer_mode,
+    trust_remote_code=normalizer_model_info.trust_remote_code,
+)
+normalizer = EnglishTextNormalizer(normalizer_tokenizer.english_spelling_normalizer)
+
+
 async def transcribe_audio(client, tokenizer, y, sr):
     # Send loaded audio directly instead of loading from disk,
     # don't account for that time though
@@ -58,8 +69,8 @@ async def bound_transcribe(sem, client, tokenizer, audio, reference):
     async with sem:
         result = await transcribe_audio(client, tokenizer, *audio)
         # Normalize *english* output/reference for evaluation.
-        out = tokenizer.normalize(result[2])
-        ref = tokenizer.normalize(reference)
+        out = normalizer(result[2])
+        ref = normalizer(reference)
         return result[:2] + (out, ref)
 
 
@@ -156,8 +167,9 @@ def run_evaluation(
     "model_config",
     [
         ("openai/whisper-large-v3", 12.744980),
-        # TODO (ekagra): add HF ckpt after asr release
-        # ("/host/engines/vllm/audio/2b-release", 11.73),
+        # TODO (ekagra): turn on after asr release
+        # CohereASR is used to test the code paths for variable encoder lengths
+        # ("CohereLabs/cohere-transcribe-03-2026", 11.92),
     ],
 )
 # Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 6ffd5d50a..feb074f11 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -1128,8 +1128,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
         tokenizer_mode="mistral",
     ),
     # [Encoder-decoder]
-    "CohereASRForConditionalGeneration": _HfExamplesInfo(
-        "/host/engines/vllm/audio/2b-release",
+    "CohereAsrForConditionalGeneration": _HfExamplesInfo(
+        "CohereLabs/cohere-transcribe-03-2026",
         trust_remote_code=True,
         is_available_online=False,  # TODO (ekagra): revert after asr release
     ),
diff --git a/vllm/model_executor/models/cohere_asr.py b/vllm/model_executor/models/cohere_asr.py
index 2f8513823..1cebea56a 100644
--- a/vllm/model_executor/models/cohere_asr.py
+++ b/vllm/model_executor/models/cohere_asr.py
@@ -1988,7 +1988,7 @@ class CohereASRMultiModalProcessor(EncDecMultiModalProcessor[CohereASRProcessing
     info=CohereASRProcessingInfo,
     dummy_inputs=CohereASRDummyInputsBuilder,
 )
-class CohereASRForConditionalGeneration(
+class CohereAsrForConditionalGeneration(
     nn.Module, SupportsTranscription, SupportsMultiModal
 ):
     packed_modules_mapping = {
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index c3e7edb7d..839aba11c 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -525,9 +525,9 @@ _MULTIMODAL_MODELS = {
     "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),
     "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"),
     # [Encoder-decoder]
-    "CohereASRForConditionalGeneration": (
+    "CohereAsrForConditionalGeneration": (
         "cohere_asr",
-        "CohereASRForConditionalGeneration",
+        "CohereAsrForConditionalGeneration",
     ),
     "NemotronParseForConditionalGeneration": (
         "nemotron_parse",
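
Note on the normalization change in `test_transcription_api_correctness.py`: swapping `tokenizer.normalize(...)` for a standalone `EnglishTextNormalizer` decouples text normalization from the model under test, so checkpoints without a built-in normalizer (such as CohereASR) can be scored with the same English rules as Whisper, keeping WER numbers comparable across models. Below is a minimal standalone sketch of that pattern, not the test's exact code; it assumes `transformers` and `evaluate` are installed and the Whisper checkpoint is reachable on the Hugging Face Hub, and the sample strings are invented for illustration.

```python
from evaluate import load
from transformers import WhisperTokenizer
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer

# WhisperTokenizer.normalize() wraps this same class; constructing it
# directly makes the normalizer independent of the model being evaluated.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v3")
normalizer = EnglishTextNormalizer(tokenizer.english_spelling_normalizer)

# Case and punctuation differences are removed before scoring.
hypothesis = normalizer("Mister Quilter is the Apostle of the middle classes!")
reference = normalizer("mister quilter is the apostle of the middle classes")
assert hypothesis == reference

# The correctness test then scores normalized strings with evaluate's
# "wer" metric against the expected values in `model_config`.
wer = load("wer")
print(wer.compute(predictions=[hypothesis], references=[reference]))  # 0.0
```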