[Model] Add Granite 4.0 1B speech to supported models (#38019)

Signed-off-by: Nick Cao <ncao@redhat.com>

[Model] Add Granite 4.0 1B speech to supported models (#38019)
Signed-off-by: Nick Cao <ncao@redhat.com>
935c46dd · Nick Cao · GitHub · 057fc94c · 935c46dd · 935c46dd
Unverified · Commit 935c46dd · authored Mar 24, 2026 by Nick Cao · committed by GitHub Mar 24, 2026
4 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -658,7 +658,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition.
 | `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | |
 | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | |
 | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ |
-| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ |
+| `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-4.0-1b-speech`, `ibm-granite/granite-speech-3.3-2b`, etc. | ✅︎ | ✅︎ |
 | `Qwen3ASRForConditionalGeneration` | Qwen3-ASR | `Qwen/Qwen3-ASR-1.7B`, etc. | | ✅︎ |
 | `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, etc. | | ✅︎ |
 | `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. | ✅︎ | ✅︎ |

--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -29,10 +29,13 @@ def vllm_to_hf_output(


 MODEL_NAME = "ibm-granite/granite-speech-3.3-2b"
-# Audio lora co-exists directly in the model directory, but
-# currently still needs to be passed directly to vLLM.
-audio_lora_path = MODEL_NAME
-models = [MODEL_NAME]
+MODEL_NAME_4_0 = "ibm-granite/granite-4.0-1b-speech"
+# The audio LoRA co-exists directly in the 3.3 model directory, whereas
+# the 4.0 model has its adapters merged into the weights.
+models: dict[str, str | None] = {
+    MODEL_NAME: MODEL_NAME,
+    MODEL_NAME_4_0: None,
+}


 @pytest.fixture
@@ -60,6 +63,7 @@ def run_test(
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
    attention_config: dict | None = None,
+    audio_lora_path: str | None = None,
 ):
    """Inference result should be the same between hf and vllm.

@@ -84,12 +88,14 @@ def run_test(
        limit_mm_per_prompt={"audio": 1},
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
-        enable_lora=True,
+        enable_lora=audio_lora_path is not None,
        max_lora_rank=64,
        enforce_eager=True,
        attention_config=attention_config,
    ) as vllm_model:
-        lora_request = LoRARequest("audio", 1, audio_lora_path)
+        lora_request = (
+            LoRARequest("audio", 1, audio_lora_path) if audio_lora_path else None
+        )
        vllm_outputs_per_case = [
            vllm_model.generate_greedy_logprobs(
                prompts,
@@ -125,7 +131,7 @@ def run_test(
        )


-@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("model,audio_lora_path", models.items())
 @pytest.mark.parametrize(
    "dtype", ["float16"] if current_platform.is_rocm() else ["bfloat16"]
 )
@@ -138,6 +144,7 @@ def test_models(
    hf_runner,
    vllm_runner,
    model: str,
+    audio_lora_path: str | None,
    audio_assets: AudioTestAssets,
    granite_speech_attention_config,
    dtype: str,
@@ -167,4 +174,5 @@ def test_models(
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
        attention_config=granite_speech_attention_config,
+        audio_lora_path=audio_lora_path,
    )
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -810,7 +810,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    ),
    "GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"),
    "GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
-        "ibm-granite/granite-speech-3.3-2b"
+        "ibm-granite/granite-speech-3.3-2b",
+        extras={"4.0-1b": "ibm-granite/granite-4.0-1b-speech"},
    ),
    "GLM4VForCausalLM": _HfExamplesInfo(
        "zai-org/glm-4v-9b",

--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -75,12 +75,14 @@ from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix

 # NOTE lang support is based on what is written here:
 # https://huggingface.co/ibm-granite/granite-speech-3.3-2b
+# https://huggingface.co/ibm-granite/granite-4.0-1b-speech
 # Though this may vary from model to model, and also many langs
 # work pretty well with zero shot.
 ISO639_1_SUPPORTED_LANGS = {
    "en": "English",
    "fr": "French",
    "de": "German",
+    "ja": "Japanese",
    "pt": "Portuguese",
    "es": "Spanish",
 }