[Model] Add GraniteMoeHybrid 4.0 model (#17497)

Signed-off-by: Thomas Ortner <boh@zurich.ibm.com> Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com> Co-authored-by: Thomas Ortner <boh@zurich.ibm.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>

[Model] Add GraniteMoeHybrid 4.0 model (#17497)
Signed-off-by: Thomas Ortner <boh@zurich.ibm.com> Signed-off-by: Stanislaw Wozniak <stw@zurich.ibm.com> Co-authored-by: Thomas Ortner <boh@zurich.ibm.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
999328be · Stan Wozniak · GitHub · 98834fef · 999328be · 999328be
Unverified Commit 999328be authored May 06, 2025 by Stan Wozniak Committed by GitHub May 06, 2025
6 changed files
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -385,6 +385,11 @@ See [this page](#generative-models) for more information on how to use generativ
  * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.
  * ✅︎
  * ✅︎
+- * `GraniteMoeHybridForCausalLM`
+  * Granite 4.0 MoE Hybrid
+  * `ibm-granite/granite-4.0-tiny-preview`, etc.
+  * ✅︎
+  * ✅︎
 - * `GraniteMoeSharedForCausalLM`
  * Granite MoE Shared
  * `ibm-research/moe-7b-1b-active-shared-experts` (test model)

--- a/tests/models/language/generation/test_granitemoehybrid.py
+++ b/tests/models/language/generation/test_granitemoehybrid.py
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from ...utils import check_logprobs_close
+
+# Checkpoints under test; each entry is passed as `model` to both the
+# vLLM runner and the HF runner in the test below.
+MODELS = [
+    "ibm-granite/granite-4.0-tiny-preview",
+]
+
+
+# Runs greedy generation with logprobs through both the vLLM runner and the
+# HF runner for every (model, dtype) combination, then hands the two result
+# sets to check_logprobs_close. The test is skipped unconditionally for now;
+# see the skip reason below.
+@pytest.mark.skip(
+    reason="Granite 4.0 is not yet available in huggingface transformers")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_model_equivalence_to_hf_greedy(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+):
+    # vLLM side: same prompts, max_tokens and num_logprobs as the HF run below.
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    # HF side: the reference outputs (passed as outputs_0 / "hf" below).
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+
+    # Compare the two runs; the names are only labels handed to the helper.
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -23,6 +23,9 @@ SSM_MODELS = [

 HYBRID_MODELS = [
    "ai21labs/Jamba-tiny-dev",
+    # NOTE: ibm-granite/granite-4.0-tiny-preview is currently skipped as
+    # it is not yet available in huggingface transformers
+    # "ibm-granite/granite-4.0-tiny-preview",
    # NOTE: Running Plamo2 in transformers implementation requires to install
    # causal-conv1d package, which is not listed as a test dependency as it's
    # not compatible with pip-compile.

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -166,6 +166,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                          {"1b": "EleutherAI/pythia-1.4b"}),
    "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
    "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview",  # noqa: E501
+                                                   min_transformers_version="4.52.0"),  # noqa: E501
    "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
    "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                             trust_remote_code=True),

--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -64,6 +64,7 @@ _TEXT_GENERATION_MODELS = {
    "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
    "GraniteForCausalLM": ("granite", "GraniteForCausalLM"),
    "GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"),
+    "GraniteMoeHybridForCausalLM": ("granitemoehybrid", "GraniteMoeHybridForCausalLM"),   # noqa: E501
    "GraniteMoeSharedForCausalLM": ("granitemoeshared", "GraniteMoeSharedForCausalLM"),   # noqa: E501
    "GritLM": ("gritlm", "GritLM"),
    "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),