[Model] Nemotron Parse 1.1 Support (#30864)

Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>

[Model] Nemotron Parse 1.1 Support (#30864)
Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
ee212918 · amitz-nv · GitHub · af1b07b0 · ee212918 · ee212918
Unverified Commit ee212918 authored Jan 05, 2026 by amitz-nv Committed by GitHub Jan 05, 2026
13 changed files
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -9,6 +9,7 @@ pytest-timeout
 pytest-cov
 # testing utils
+albumentations # required for Nemotron Parse in test_common.py
 backoff # required for phi4mm test
 blobfile # required for kimi-vl test
 einops # required for MPT, qwen-vl
@@ -31,7 +32,7 @@ transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.8 # required for voxtral test
 num2words # required for smolvlm test
-open_clip_torch==2.32.0 # Required for nemotron_vl test
+open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.9.2 # required for model evaluation test

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,7 +27,9 @@ aiosignal==1.4.0
 albucore==0.0.16
    # via terratorch
 albumentations==1.4.6
-    # via terratorch
+    # via
+    #   -r requirements/test.in
+    #   terratorch
 alembic==1.16.4
    # via mlflow
 annotated-types==0.7.0

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -685,6 +685,7 @@ class HfRunner:
        images: PromptImageInput | None = None,
        audios: PromptAudioInput | None = None,
        videos: PromptVideoInput | None = None,
+        use_cache: bool = True,
        **kwargs: Any,
    ) -> list[TokensTextLogprobs]:
        all_inputs = self.get_inputs(
@@ -698,7 +699,7 @@ class HfRunner:
        for inputs in all_inputs:
            output: "GenerateOutput" = self.model.generate(
                **self.wrap_device(inputs),
-                use_cache=True,
+                use_cache=use_cache,
                do_sample=False,
                max_new_tokens=max_tokens,
                output_hidden_states=True,

--- a/tests/models/multimodal/generation/test_nemotron_parse.py
+++ b/tests/models/multimodal/generation/test_nemotron_parse.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Sequence
+import pytest
+from transformers import AutoModel
+from tests.models.utils import check_logprobs_close
+from vllm.assets.image import ImageAsset
+from ....conftest import HfRunner, PromptImageInput, VllmRunner
+from ....utils import create_new_process_for_each_test
+# Test image: the "paper-11" asset loaded with the png extension, converted to RGB.
+IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
+# Prompt string paired with IMAGE for every sample in test_models.
+PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
+def run_test(
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    inputs: Sequence[tuple[list[str], PromptImageInput]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Check that HF and vLLM produce close greedy logprobs for each case.
+
+    Every ``(prompts, images)`` pair in ``inputs`` is run through the vLLM
+    runner first and through the HF runner afterwards; the two result lists
+    are then compared case by case with ``check_logprobs_close``.
+    """
+    vllm_kwargs = dict(
+        dtype=dtype,
+        max_num_seqs=64,
+        limit_mm_per_prompt={"image": 1},
+        trust_remote_code=True,
+    )
+    vllm_results = []
+    # The vLLM runner context is closed before the HF runner is created.
+    with vllm_runner(model, **vllm_kwargs) as vllm_engine:
+        for case_prompts, case_images in inputs:
+            vllm_results.append(
+                vllm_engine.generate_greedy_logprobs(
+                    case_prompts,
+                    max_tokens,
+                    num_logprobs=num_logprobs,
+                    images=case_images,
+                )
+            )
+
+    hf_results = []
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_engine:
+        for case_prompts, case_images in inputs:
+            hf_results.append(
+                hf_engine.generate_greedy_logprobs_limit(
+                    case_prompts,
+                    max_tokens,
+                    num_logprobs=num_logprobs,
+                    images=case_images,
+                    # HF Nemotron Parse crashes here unless use_cache is off.
+                    use_cache=False,
+                )
+            )
+
+    # Pairwise comparison: HF is reported as "hf", vLLM as "vllm".
+    for hf_case, vllm_case in zip(hf_results, vllm_results):
+        check_logprobs_close(
+            outputs_0_lst=hf_case,
+            outputs_1_lst=vllm_case,
+            name_0="hf",
+            name_1="vllm",
+        )
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("num_logprobs", [5])
+@create_new_process_for_each_test("spawn")
+def test_models(
+    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
+) -> None:
+    """Compare HF and vLLM on a batch of ten identical prompt/image pairs."""
+    batch_size = 10
+    # One test case: the same prompt and image repeated batch_size times.
+    single_case = ([PROMPT] * batch_size, [IMAGE] * batch_size)
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs=[single_case],
+        model=model,
+        dtype=dtype,
+        max_tokens=100,
+        num_logprobs=num_logprobs,
+    )
--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -40,15 +40,15 @@ def run_radio_test(
        for image in images
    ]
-    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+    hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    # RADIO model on HF does not properly handle torch_dtype argument
    # And relies on args["dtype"] which we have to patch manually:
-    config.args["dtype"] = torch_dtype
+    hf_config.args["dtype"] = torch_dtype
    hf_model = AutoModel.from_pretrained(
        model_id,
-        config=config,
+        config=hf_config,
        dtype=torch_dtype,
        trust_remote_code=True,
    ).to("cuda")
@@ -62,13 +62,14 @@ def run_radio_test(
    hf_model.make_preprocessor_external()
    hf_outputs_per_image = [
-        hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
+        hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
    ]
-    radio_config = RadioConfig(
+    vllm_config = RadioConfig(
-        model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
+        model_name=hf_config.args["model"],
+        **hf_config.args,
    )
-    vllm_model = RadioModel(radio_config)
+    vllm_model = RadioModel(vllm_config)
    vllm_model.load_weights(hf_model.state_dict())
    vllm_model = vllm_model.to("cuda", torch_dtype)
@@ -80,7 +81,8 @@ def run_radio_test(
    cos_similar = nn.CosineSimilarity(dim=-1)
    for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
-        assert cos_similar(vllm_output, hf_output).mean() > 0.99
+        assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
+        assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99
 @pytest.mark.parametrize(

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -102,6 +102,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
 # incorrect token ids. So we need use `add_special_tokens=False` here
 # to leave bos_token to be added by the processor.
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "nemotron_parse": False,
    "ovis": False,
    "ovis2_5": False,
    "paligemma": False,

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -907,6 +907,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        is_available_online=False,
    ),
    # [Encoder-decoder]
+    "NemotronParseForConditionalGeneration": _HfExamplesInfo(
+        "nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True
+    ),
    "WhisperForConditionalGeneration": _HfExamplesInfo(
        "openai/whisper-large-v3-turbo",
        extras={"v3": "openai/whisper-large-v3"},

--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -42,8 +42,11 @@ class ImageAsset:
        )
    @property
-    def pil_image(self, ext="jpg") -> Image.Image:
+    def pil_image(self) -> Image.Image:
-        image_path = self.get_path(ext)
+        return self.pil_image_ext(ext="jpg")
+    def pil_image_ext(self, ext: str) -> Image.Image:
+        image_path = self.get_path(ext=ext)
        return Image.open(image_path)
    @property

--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1220,7 +1220,7 @@ class NemotronH_Nano_VL_V2(
        n = pixel_values.shape[0]
        vit_embeds_list = []
        for i in range(0, n, micro_batch_size):
-            vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
+            _, vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
            vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
            h = w = int(vit_embeds.shape[1] ** 0.5)
            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
@@ -1695,12 +1695,7 @@ class NemotronH_Nano_VL_V2(
            patch_size=patch_size,
            norm_mean=hf_config.norm_mean,
            norm_std=hf_config.norm_std,
-            reg_tokens=(
+            **hf_config_vision.args,
-                hf_config_vision.args.get("register_multiple")
-                if hasattr(hf_config_vision, "args")
-                and isinstance(hf_config_vision.args, dict)
-                else None
-            ),
        )
        return RadioModel(config=radio_config)

--- a/vllm/model_executor/models/nemotron_parse.py
+++ b/vllm/model_executor/models/nemotron_parse.py
--- a/vllm/model_executor/models/radio.py
+++ b/vllm/model_executor/models/radio.py
@@ -427,15 +427,17 @@ class RadioInternVisionModel(nn.Module):
            to_2tuple(config.patch_size), config.image_size
        )
        max_img_size = int(
-            round(config.max_img_size / config.patch_size) * config.patch_size
+            round(config.cpe_max_size / config.patch_size) * config.patch_size
        )
+        unique_teachers = set(t["name"] for t in config.teachers)
        self.patch_generator = ViTPatchGenerator(
            config.patch_size,
            config.hidden_size,
            input_dims=self.img_size,
            max_input_dims=max_img_size,
            cls_token=True,
-            register_multiple=config.reg_tokens,
+            num_cls_tokens=len(unique_teachers) if config.cls_token_per_teacher else 1,
+            register_multiple=config.register_multiple,
        )
        self.encoder = InternVisionEncoder(
@@ -489,11 +491,20 @@ class RadioModel(nn.Module):
            prefix=prefix,
        )
+        summary_idxs = None
+        if config.teachers:
+            summary_idxs = torch.tensor(
+                [i for i, t in enumerate(config.teachers) if t.get("use_summary", True)]
+            )
+            if summary_idxs.numel() > 0:
+                self.register_buffer("summary_idxs", summary_idxs)
+        self.summary_idxs = summary_idxs
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        pixel_embeds: torch.Tensor | None = None,
-    ) -> torch.FloatTensor:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
        y = self.model(pixel_values)
        return self._extract_final(y)
@@ -546,10 +557,17 @@ class RadioModel(nn.Module):
        return loaded_params
-    def _extract_final(self, y: torch.Tensor):
+    def _extract_final(
+        self, y: torch.Tensor
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
        # Remove CLS + REGISTERS tokens
        patch_gen = getattr(self.model, "patch_generator", None)
        if patch_gen is not None:
+            all_summary = y[:, : patch_gen.num_cls_tokens]
+            if self.summary_idxs is not None:
+                bb_summary = all_summary[:, self.summary_idxs]
+            else:
+                bb_summary = all_summary
            all_feat = y[:, patch_gen.num_skip :]
-        return all_feat
+        return bb_summary.flatten(1), all_feat
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -428,6 +428,10 @@ _MULTIMODAL_MODELS = {
    "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
    "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"),  # noqa: E501
    # [Encoder-decoder]
+    "NemotronParseForConditionalGeneration": (
+        "nemotron_parse",
+        "NemotronParseForConditionalGeneration",
+    ),
    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
 }

--- a/vllm/transformers_utils/configs/radio.py
+++ b/vllm/transformers_utils/configs/radio.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Radio vision model configuration"""
+from typing import Any
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
@@ -36,12 +38,15 @@ class RadioConfig(PretrainedConfig):
        layer_norm_eps: The epsilon used by the layer normalization layers.
        initializer_factor: A factor for initializing all weight matrices.
        hidden_act: The non-linear activation function in the encoder.
-        max_img_size: Maximum image size for position embeddings.
+        cpe_max_size: Maximum image size for position embeddings.
        norm_mean: Mean values for image normalization (RGB channels).
            Defaults to (0.48145466, 0.4578275, 0.40821073)).
        norm_std: Standard deviation values for image normalization
            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
-        reg_tokens: Number of register tokens to use.
+        register_multiple: Number of register tokens to use.
+        teachers: A list of teacher model configurations. Each teacher configuration is
+            a dict with keys like "name" and some may have "use_summary".
+        cls_token_per_teacher: Whether to use a separate CLS token for each teacher.
    """
    model_type = "radio"
@@ -57,10 +62,12 @@ class RadioConfig(PretrainedConfig):
        layer_norm_eps: float = 1e-6,
        initializer_factor: float = 1.0,
        hidden_act: str = "gelu",
-        max_img_size: int = 2048,
+        cpe_max_size: int = 2048,
        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
-        reg_tokens: int | None = None,
+        register_multiple: int | None = None,
+        teachers: list[dict[str, Any]] | None = None,
+        cls_token_per_teacher: bool = False,
        **kwargs,
    ):
        self.model_name = model_name
@@ -78,12 +85,14 @@ class RadioConfig(PretrainedConfig):
        self.layer_norm_eps = layer_norm_eps
        self.initializer_factor = initializer_factor
        self.hidden_act = hidden_act
-        self.max_img_size = max_img_size
+        self.cpe_max_size = cpe_max_size
        self.norm_mean = (
            list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
        )
        self.norm_std = (
            list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
        )
-        self.reg_tokens = reg_tokens
+        self.register_multiple = register_multiple
+        self.teachers = teachers if teachers is not None else []
+        self.cls_token_per_teacher = cls_token_per_teacher
        super().__init__(**kwargs)