[Cohere] Enable Cohere-Transcribe (#38120)
Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
This commit is contained in:
@@ -654,6 +654,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
|
||||
|
||||
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `CohereAsrForConditionalGeneration` | Cohere-Transcribe | `CohereLabs/cohere-transcribe-03-2026` | | |
| `FireRedASR2ForConditionalGeneration` | FireRedASR2 | `allendou/FireRedASR2-LLM-vllm`, etc. | | |
| `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | |
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
|
||||
|
||||
@@ -72,8 +72,7 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
|
||||
# CohereASR
|
||||
def run_cohere_asr(question: str, audio_count: int) -> ModelRequestData:
|
||||
assert audio_count == 1, "CohereASR only support single audio input per prompt"
|
||||
# TODO (ekagra): add HF ckpt after asr release
|
||||
model_name = "/host/engines/vllm/audio/2b-release"
|
||||
model_name = "CohereLabs/cohere-transcribe-03-2026"
|
||||
|
||||
prompt = (
|
||||
"<|startofcontext|><|startoftranscript|>"
|
||||
|
||||
@@ -19,6 +19,7 @@ import soundfile
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from evaluate import load
|
||||
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
|
||||
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
@@ -33,6 +34,16 @@ def to_bytes(y, sr):
|
||||
return buffer
|
||||
|
||||
|
||||
# Not all models ship a normalizer, so use Whisper's as the standard option.
|
||||
normalizer_model_info = HF_EXAMPLE_MODELS.find_hf_info("openai/whisper-large-v3")
|
||||
normalizer_tokenizer = get_tokenizer(
|
||||
"openai/whisper-large-v3",
|
||||
tokenizer_mode=normalizer_model_info.tokenizer_mode,
|
||||
trust_remote_code=normalizer_model_info.trust_remote_code,
|
||||
)
|
||||
normalizer = EnglishTextNormalizer(normalizer_tokenizer.english_spelling_normalizer)
|
||||
|
||||
|
||||
async def transcribe_audio(client, tokenizer, y, sr):
|
||||
# Send loaded audio directly instead of loading from disk,
|
||||
# don't account for that time though
|
||||
@@ -58,8 +69,8 @@ async def bound_transcribe(sem, client, tokenizer, audio, reference):
|
||||
async with sem:
|
||||
result = await transcribe_audio(client, tokenizer, *audio)
|
||||
# Normalize *english* output/reference for evaluation.
|
||||
out = tokenizer.normalize(result[2])
|
||||
ref = tokenizer.normalize(reference)
|
||||
out = normalizer(result[2])
|
||||
ref = normalizer(reference)
|
||||
return result[:2] + (out, ref)
|
||||
|
||||
|
||||
@@ -156,8 +167,9 @@ def run_evaluation(
|
||||
"model_config",
|
||||
[
|
||||
("openai/whisper-large-v3", 12.744980),
|
||||
# TODO (ekagra): add HF ckpt after asr release
|
||||
# ("/host/engines/vllm/audio/2b-release", 11.73),
|
||||
# TODO (ekagra): turn on after asr release
|
||||
# CohereASR is used to test the variable encoder length code paths
|
||||
# ("CohereLabs/cohere-transcribe-03-2026", 11.92),
|
||||
],
|
||||
)
|
||||
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
|
||||
|
||||
@@ -1128,8 +1128,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
tokenizer_mode="mistral",
|
||||
),
|
||||
# [Encoder-decoder]
|
||||
"CohereASRForConditionalGeneration": _HfExamplesInfo(
|
||||
"/host/engines/vllm/audio/2b-release",
|
||||
"CohereAsrForConditionalGeneration": _HfExamplesInfo(
|
||||
"CohereLabs/cohere-transcribe-03-2026",
|
||||
trust_remote_code=True,
|
||||
is_available_online=False, # TODO (ekagra): revert after asr release
|
||||
),
|
||||
|
||||
@@ -1988,7 +1988,7 @@ class CohereASRMultiModalProcessor(EncDecMultiModalProcessor[CohereASRProcessing
|
||||
info=CohereASRProcessingInfo,
|
||||
dummy_inputs=CohereASRDummyInputsBuilder,
|
||||
)
|
||||
class CohereASRForConditionalGeneration(
|
||||
class CohereAsrForConditionalGeneration(
|
||||
nn.Module, SupportsTranscription, SupportsMultiModal
|
||||
):
|
||||
packed_modules_mapping = {
|
||||
|
||||
@@ -525,9 +525,9 @@ _MULTIMODAL_MODELS = {
|
||||
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),
|
||||
"VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"),
|
||||
# [Encoder-decoder]
|
||||
"CohereASRForConditionalGeneration": (
|
||||
"CohereAsrForConditionalGeneration": (
|
||||
"cohere_asr",
|
||||
"CohereASRForConditionalGeneration",
|
||||
"CohereAsrForConditionalGeneration",
|
||||
),
|
||||
"NemotronParseForConditionalGeneration": (
|
||||
"nemotron_parse",
|
||||
|
||||
Reference in New Issue
Block a user