diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 09051a37f..791344b4f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -658,7 +658,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | | | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ | -| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ | +| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-4.0-1b-speech`, `ibm-granite/granite-speech-3.3-2b`, etc. | ✅︎ | ✅︎ | | `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ | | `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ | | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ | diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index f0650d4c2..038a15d05 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -29,10 +29,13 @@ def vllm_to_hf_output( MODEL_NAME = "ibm-granite/granite-speech-3.3-2b" -# Audio lora co-exists directly in the model directory, but -# currently still needs to be passed directly to vLLM. -audio_lora_path = MODEL_NAME -models = [MODEL_NAME] +MODEL_NAME_4_0 = "ibm-granite/granite-4.0-1b-speech" +# Audio lora co-exists directly in the 3.3 model directory, +# while the 4.0 model has adapters merged into the weights.
+models: dict[str, str | None] = { + MODEL_NAME: MODEL_NAME, + MODEL_NAME_4_0: None, +} @pytest.fixture @@ -60,6 +63,7 @@ def run_test( tensor_parallel_size: int, distributed_executor_backend: str | None = None, attention_config: dict | None = None, + audio_lora_path: str | None = None, ): """Inference result should be the same between hf and vllm. @@ -84,12 +88,14 @@ def run_test( limit_mm_per_prompt={"audio": 1}, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, - enable_lora=True, + enable_lora=audio_lora_path is not None, max_lora_rank=64, enforce_eager=True, attention_config=attention_config, ) as vllm_model: - lora_request = LoRARequest("audio", 1, audio_lora_path) + lora_request = ( + LoRARequest("audio", 1, audio_lora_path) if audio_lora_path else None + ) vllm_outputs_per_case = [ vllm_model.generate_greedy_logprobs( prompts, @@ -125,7 +131,7 @@ def run_test( ) -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model,audio_lora_path", models.items()) @pytest.mark.parametrize( "dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"] ) @@ -138,6 +144,7 @@ def test_models( hf_runner, vllm_runner, model: str, + audio_lora_path: str | None, audio_assets: AudioTestAssets, granite_speech_attention_config, dtype: str, @@ -167,4 +174,5 @@ def test_models( num_logprobs=num_logprobs, tensor_parallel_size=1, attention_config=granite_speech_attention_config, + audio_lora_path=audio_lora_path, ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 0d1e8e348..6ffd5d50a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -810,7 +810,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { ), "GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"), "GraniteSpeechForConditionalGeneration": _HfExamplesInfo( - "ibm-granite/granite-speech-3.3-2b" + "ibm-granite/granite-speech-3.3-2b", + extras={"4.0-1b": "ibm-granite/granite-4.0-1b-speech"}, ), "GLM4VForCausalLM": 
_HfExamplesInfo( "zai-org/glm-4v-9b", diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 1209f1cbe..b97fc67f1 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -75,12 +75,14 @@ from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix # NOTE lang support is based on what is written here: # https://huggingface.co/ibm-granite/granite-speech-3.3-2b +# https://huggingface.co/ibm-granite/granite-4.0-1b-speech # Though this may vary from model to model, and also many langs # work pretty well with zero shot. ISO639_1_SUPPORTED_LANGS = { "en": "English", "fr": "French", "de": "German", + "ja": "Japanese", "pt": "Portuguese", "es": "Spanish", }