[Model] Add Granite 4.0 1B speech to supported models (#38019)

Signed-off-by: Nick Cao <ncao@redhat.com>
This commit is contained in:
Nick Cao
2026-03-24 14:23:41 -04:00
committed by GitHub
parent 057fc94cbd
commit 935c46dd9b
4 changed files with 20 additions and 9 deletions

View File

@@ -658,7 +658,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
| `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | |
| `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
| `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-4.0-1b-speech`, `ibm-granite/granite-speech-3.3-2b`, etc. | ✅︎ | ✅︎ |
| `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ |
| `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ |
| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |

View File

@@ -29,10 +29,13 @@ def vllm_to_hf_output(
MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
# Audio lora co-exists directly in the model directory, but
# currently still needs to be passed directly to vLLM.
audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
MODEL_NAME_4_0 = "ibm-granite/granite-4.0-1b-speech"
# Audio lora co-exists directly in the 3.3 model directory,
# while the 4.0 model has its adapters merged into the weights.
models: dict[str, str | None] = {
MODEL_NAME: MODEL_NAME,
MODEL_NAME_4_0: None,
}
@pytest.fixture
@@ -60,6 +63,7 @@ def run_test(
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
attention_config: dict | None = None,
audio_lora_path: str | None = None,
):
"""Inference result should be the same between hf and vllm.
@@ -84,12 +88,14 @@ def run_test(
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
enable_lora=audio_lora_path is not None,
max_lora_rank=64,
enforce_eager=True,
attention_config=attention_config,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
lora_request = (
LoRARequest("audio", 1, audio_lora_path) if audio_lora_path else None
)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(
prompts,
@@ -125,7 +131,7 @@ def run_test(
)
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("model,audio_lora_path", models.items())
@pytest.mark.parametrize(
"dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
)
@@ -138,6 +144,7 @@ def test_models(
hf_runner,
vllm_runner,
model: str,
audio_lora_path: str | None,
audio_assets: AudioTestAssets,
granite_speech_attention_config,
dtype: str,
@@ -167,4 +174,5 @@ def test_models(
num_logprobs=num_logprobs,
tensor_parallel_size=1,
attention_config=granite_speech_attention_config,
audio_lora_path=audio_lora_path,
)

View File

@@ -810,7 +810,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
),
"GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
"ibm-granite/granite-speech-3.3-2b"
"ibm-granite/granite-speech-3.3-2b",
extras={"4.0-1b": "ibm-granite/granite-4.0-1b-speech"},
),
"GLM4VForCausalLM": _HfExamplesInfo(
"zai-org/glm-4v-9b",

View File

@@ -75,12 +75,14 @@ from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
# NOTE lang support is based on what is written here:
# https://huggingface.co/ibm-granite/granite-speech-3.3-2b
# https://huggingface.co/ibm-granite/granite-4.0-1b-speech
# Though this may vary from model to model, and also many langs
# work pretty well with zero shot.
ISO639_1_SUPPORTED_LANGS = {
"en": "English",
"fr": "French",
"de": "German",
"ja": "Japanese",
"pt": "Portuguese",
"es": "Spanish",
}