Unverified commit 7b54f60d, authored by Ekagra Ranjan and committed by GitHub
Browse files

[Cohere] Enable Cohere-Transcribe (#38120)


Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
parent a0e8c740
...@@ -654,6 +654,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. ...@@ -654,6 +654,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
| ------------ | ------ | ----------------- | -------------------- | ------------------------- | | ------------ | ------ | ----------------- | -------------------- | ------------------------- |
| `CohereAsrForConditionalGeneration` | Cohere-Transcribe | `CohereLabs/cohere-transcribe-03-2026` | | |
| `FireRedASR2ForConditionalGeneration` | FireRedASR2 | `allendou/FireRedASR2-LLM-vllm`, etc. | | | | `FireRedASR2ForConditionalGeneration` | FireRedASR2 | `allendou/FireRedASR2-LLM-vllm`, etc. | | |
| `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | | | `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | |
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
......
...@@ -72,8 +72,7 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData: ...@@ -72,8 +72,7 @@ def run_audioflamingo3(question: str, audio_count: int) -> ModelRequestData:
# CohereASR # CohereASR
def run_cohere_asr(question: str, audio_count: int) -> ModelRequestData: def run_cohere_asr(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, "CohereASR only support single audio input per prompt" assert audio_count == 1, "CohereASR only support single audio input per prompt"
# TODO (ekagra): add HF ckpt after asr release model_name = "CohereLabs/cohere-transcribe-03-2026"
model_name = "/host/engines/vllm/audio/2b-release"
prompt = ( prompt = (
"<|startofcontext|><|startoftranscript|>" "<|startofcontext|><|startoftranscript|>"
......
...@@ -19,6 +19,7 @@ import soundfile ...@@ -19,6 +19,7 @@ import soundfile
import torch import torch
from datasets import load_dataset from datasets import load_dataset
from evaluate import load from evaluate import load
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
...@@ -33,6 +34,16 @@ def to_bytes(y, sr): ...@@ -33,6 +34,16 @@ def to_bytes(y, sr):
return buffer return buffer
# not all models have a normalizer so use the one from whisper as a standard option
normalizer_model_info = HF_EXAMPLE_MODELS.find_hf_info("openai/whisper-large-v3")
normalizer_tokenizer = get_tokenizer(
"openai/whisper-large-v3",
tokenizer_mode=normalizer_model_info.tokenizer_mode,
trust_remote_code=normalizer_model_info.trust_remote_code,
)
normalizer = EnglishTextNormalizer(normalizer_tokenizer.english_spelling_normalizer)
async def transcribe_audio(client, tokenizer, y, sr): async def transcribe_audio(client, tokenizer, y, sr):
# Send loaded audio directly instead of loading from disk, # Send loaded audio directly instead of loading from disk,
# don't account for that time though # don't account for that time though
...@@ -58,8 +69,8 @@ async def bound_transcribe(sem, client, tokenizer, audio, reference): ...@@ -58,8 +69,8 @@ async def bound_transcribe(sem, client, tokenizer, audio, reference):
async with sem: async with sem:
result = await transcribe_audio(client, tokenizer, *audio) result = await transcribe_audio(client, tokenizer, *audio)
# Normalize *english* output/reference for evaluation. # Normalize *english* output/reference for evaluation.
out = tokenizer.normalize(result[2]) out = normalizer(result[2])
ref = tokenizer.normalize(reference) ref = normalizer(reference)
return result[:2] + (out, ref) return result[:2] + (out, ref)
...@@ -156,8 +167,9 @@ def run_evaluation( ...@@ -156,8 +167,9 @@ def run_evaluation(
"model_config", "model_config",
[ [
("openai/whisper-large-v3", 12.744980), ("openai/whisper-large-v3", 12.744980),
# TODO (ekagra): add HF ckpt after asr release # TODO (ekagra): turn on after asr release
# ("/host/engines/vllm/audio/2b-release", 11.73), # CohereASR is used to test the variable encoder length code paths
# ("CohereLabs/cohere-transcribe-03-2026", 11.92),
], ],
) )
# Original dataset is 20GB+ in size, hence we use a pre-filtered slice. # Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
......
...@@ -1128,8 +1128,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -1128,8 +1128,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
tokenizer_mode="mistral", tokenizer_mode="mistral",
), ),
# [Encoder-decoder] # [Encoder-decoder]
"CohereASRForConditionalGeneration": _HfExamplesInfo( "CohereAsrForConditionalGeneration": _HfExamplesInfo(
"/host/engines/vllm/audio/2b-release", "CohereLabs/cohere-transcribe-03-2026",
trust_remote_code=True, trust_remote_code=True,
is_available_online=False, # TODO (ekagra): revert after asr release is_available_online=False, # TODO (ekagra): revert after asr release
), ),
......
...@@ -1988,7 +1988,7 @@ class CohereASRMultiModalProcessor(EncDecMultiModalProcessor[CohereASRProcessing ...@@ -1988,7 +1988,7 @@ class CohereASRMultiModalProcessor(EncDecMultiModalProcessor[CohereASRProcessing
info=CohereASRProcessingInfo, info=CohereASRProcessingInfo,
dummy_inputs=CohereASRDummyInputsBuilder, dummy_inputs=CohereASRDummyInputsBuilder,
) )
class CohereASRForConditionalGeneration( class CohereAsrForConditionalGeneration(
nn.Module, SupportsTranscription, SupportsMultiModal nn.Module, SupportsTranscription, SupportsMultiModal
): ):
packed_modules_mapping = { packed_modules_mapping = {
......
...@@ -525,9 +525,9 @@ _MULTIMODAL_MODELS = { ...@@ -525,9 +525,9 @@ _MULTIMODAL_MODELS = {
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),
"VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"), "VoxtralRealtimeGeneration": ("voxtral_realtime", "VoxtralRealtimeGeneration"),
# [Encoder-decoder] # [Encoder-decoder]
"CohereASRForConditionalGeneration": ( "CohereAsrForConditionalGeneration": (
"cohere_asr", "cohere_asr",
"CohereASRForConditionalGeneration", "CohereAsrForConditionalGeneration",
), ),
"NemotronParseForConditionalGeneration": ( "NemotronParseForConditionalGeneration": (
"nemotron_parse", "nemotron_parse",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment