Unverified commit ee212918, authored by amitz-nv and committed by GitHub
Browse files

[Model] Nemotron Parse 1.1 Support (#30864)


Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
parent af1b07b0
...@@ -9,6 +9,7 @@ pytest-timeout ...@@ -9,6 +9,7 @@ pytest-timeout
pytest-cov pytest-cov
# testing utils # testing utils
albumentations # required for Nemotron Parse in test_common.py
backoff # required for phi4mm test backoff # required for phi4mm test
blobfile # required for kimi-vl test blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl einops # required for MPT, qwen-vl
...@@ -31,7 +32,7 @@ transformers_stream_generator # required for qwen-vl test ...@@ -31,7 +32,7 @@ transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.8 # required for voxtral test mistral_common[image,audio] >= 1.8.8 # required for voxtral test
num2words # required for smolvlm test num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.9.2 # required for model evaluation test lm-eval[api]>=0.4.9.2 # required for model evaluation test
......
...@@ -27,7 +27,9 @@ aiosignal==1.4.0 ...@@ -27,7 +27,9 @@ aiosignal==1.4.0
albucore==0.0.16 albucore==0.0.16
# via terratorch # via terratorch
albumentations==1.4.6 albumentations==1.4.6
# via terratorch # via
# -r requirements/test.in
# terratorch
alembic==1.16.4 alembic==1.16.4
# via mlflow # via mlflow
annotated-types==0.7.0 annotated-types==0.7.0
......
...@@ -685,6 +685,7 @@ class HfRunner: ...@@ -685,6 +685,7 @@ class HfRunner:
images: PromptImageInput | None = None, images: PromptImageInput | None = None,
audios: PromptAudioInput | None = None, audios: PromptAudioInput | None = None,
videos: PromptVideoInput | None = None, videos: PromptVideoInput | None = None,
use_cache: bool = True,
**kwargs: Any, **kwargs: Any,
) -> list[TokensTextLogprobs]: ) -> list[TokensTextLogprobs]:
all_inputs = self.get_inputs( all_inputs = self.get_inputs(
...@@ -698,7 +699,7 @@ class HfRunner: ...@@ -698,7 +699,7 @@ class HfRunner:
for inputs in all_inputs: for inputs in all_inputs:
output: "GenerateOutput" = self.model.generate( output: "GenerateOutput" = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs),
use_cache=True, use_cache=use_cache,
do_sample=False, do_sample=False,
max_new_tokens=max_tokens, max_new_tokens=max_tokens,
output_hidden_states=True, output_hidden_states=True,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import pytest
from transformers import AutoModel
from tests.models.utils import check_logprobs_close
from vllm.assets.image import ImageAsset
from ....conftest import HfRunner, PromptImageInput, VllmRunner
from ....utils import create_new_process_for_each_test
# Test image: the "paper-11" asset requested with the "png" extension and
# converted to RGB. This runs once, when the module is imported.
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
# Prompt string that test_models pairs with every copy of IMAGE.
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that HF and vLLM yield close greedy logprobs for every case.

    Each element of ``inputs`` is a ``(prompts, images)`` pair. Every pair is
    first run through the vLLM runner and then through the HF runner; the two
    result lists are then compared pairwise with ``check_logprobs_close``.
    """
    # NOTE(review): indentation was lost in the scraped source; the two runner
    # contexts are assumed to be sequential rather than nested - confirm.
    vllm_results = []
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        for case_prompts, case_images in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )

    hf_results = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        for case_prompts, case_images in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                    # HF Nemotron Parse crashes here without this
                    use_cache=False,
                )
            )

    # Compare the reference (HF) output of each case against vLLM's.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """Run the HF-vs-vLLM comparison on one case of ten prompt/image pairs."""
    # A single case made of the same prompt and the same image repeated.
    repeats = 10
    single_case = ([PROMPT] * repeats, [IMAGE] * repeats)
    run_test(
        hf_runner,
        vllm_runner,
        inputs=[single_case],
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )
...@@ -40,15 +40,15 @@ def run_radio_test( ...@@ -40,15 +40,15 @@ def run_radio_test(
for image in images for image in images
] ]
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
# RADIO model on HF does not properly handle torch_dtype argument # RADIO model on HF does not properly handle torch_dtype argument
# And relies on args["dtype"] which we have to patch manually: # And relies on args["dtype"] which we have to patch manually:
config.args["dtype"] = torch_dtype hf_config.args["dtype"] = torch_dtype
hf_model = AutoModel.from_pretrained( hf_model = AutoModel.from_pretrained(
model_id, model_id,
config=config, config=hf_config,
dtype=torch_dtype, dtype=torch_dtype,
trust_remote_code=True, trust_remote_code=True,
).to("cuda") ).to("cuda")
...@@ -62,13 +62,14 @@ def run_radio_test( ...@@ -62,13 +62,14 @@ def run_radio_test(
hf_model.make_preprocessor_external() hf_model.make_preprocessor_external()
hf_outputs_per_image = [ hf_outputs_per_image = [
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values
] ]
radio_config = RadioConfig( vllm_config = RadioConfig(
model_name=config.args["model"], reg_tokens=config.args["register_multiple"] model_name=hf_config.args["model"],
**hf_config.args,
) )
vllm_model = RadioModel(radio_config) vllm_model = RadioModel(vllm_config)
vllm_model.load_weights(hf_model.state_dict()) vllm_model.load_weights(hf_model.state_dict())
vllm_model = vllm_model.to("cuda", torch_dtype) vllm_model = vllm_model.to("cuda", torch_dtype)
...@@ -80,7 +81,8 @@ def run_radio_test( ...@@ -80,7 +81,8 @@ def run_radio_test(
cos_similar = nn.CosineSimilarity(dim=-1) cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image): for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
assert cos_similar(vllm_output, hf_output).mean() > 0.99 assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99
assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -102,6 +102,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: ...@@ -102,6 +102,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
# incorrect token ids. So we need use `add_special_tokens=False` here # incorrect token ids. So we need use `add_special_tokens=False` here
# to leave bos_token to be added by the processor. # to leave bos_token to be added by the processor.
_ADD_SPECIAL_TOKENS_OVERRIDES = { _ADD_SPECIAL_TOKENS_OVERRIDES = {
"nemotron_parse": False,
"ovis": False, "ovis": False,
"ovis2_5": False, "ovis2_5": False,
"paligemma": False, "paligemma": False,
......
...@@ -907,6 +907,9 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -907,6 +907,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
is_available_online=False, is_available_online=False,
), ),
# [Encoder-decoder] # [Encoder-decoder]
"NemotronParseForConditionalGeneration": _HfExamplesInfo(
"nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True
),
"WhisperForConditionalGeneration": _HfExamplesInfo( "WhisperForConditionalGeneration": _HfExamplesInfo(
"openai/whisper-large-v3-turbo", "openai/whisper-large-v3-turbo",
extras={"v3": "openai/whisper-large-v3"}, extras={"v3": "openai/whisper-large-v3"},
......
...@@ -42,8 +42,11 @@ class ImageAsset: ...@@ -42,8 +42,11 @@ class ImageAsset:
) )
@property @property
def pil_image(self, ext="jpg") -> Image.Image: def pil_image(self) -> Image.Image:
image_path = self.get_path(ext) return self.pil_image_ext(ext="jpg")
def pil_image_ext(self, ext: str) -> Image.Image:
image_path = self.get_path(ext=ext)
return Image.open(image_path) return Image.open(image_path)
@property @property
......
...@@ -1220,7 +1220,7 @@ class NemotronH_Nano_VL_V2( ...@@ -1220,7 +1220,7 @@ class NemotronH_Nano_VL_V2(
n = pixel_values.shape[0] n = pixel_values.shape[0]
vit_embeds_list = [] vit_embeds_list = []
for i in range(0, n, micro_batch_size): for i in range(0, n, micro_batch_size):
vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size]) _, vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
vit_embeds = vit_embeds.to(dtype=torch.bfloat16) vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
h = w = int(vit_embeds.shape[1] ** 0.5) h = w = int(vit_embeds.shape[1] ** 0.5)
vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
...@@ -1695,12 +1695,7 @@ class NemotronH_Nano_VL_V2( ...@@ -1695,12 +1695,7 @@ class NemotronH_Nano_VL_V2(
patch_size=patch_size, patch_size=patch_size,
norm_mean=hf_config.norm_mean, norm_mean=hf_config.norm_mean,
norm_std=hf_config.norm_std, norm_std=hf_config.norm_std,
reg_tokens=( **hf_config_vision.args,
hf_config_vision.args.get("register_multiple")
if hasattr(hf_config_vision, "args")
and isinstance(hf_config_vision.args, dict)
else None
),
) )
return RadioModel(config=radio_config) return RadioModel(config=radio_config)
......
This diff is collapsed.
...@@ -427,15 +427,17 @@ class RadioInternVisionModel(nn.Module): ...@@ -427,15 +427,17 @@ class RadioInternVisionModel(nn.Module):
to_2tuple(config.patch_size), config.image_size to_2tuple(config.patch_size), config.image_size
) )
max_img_size = int( max_img_size = int(
round(config.max_img_size / config.patch_size) * config.patch_size round(config.cpe_max_size / config.patch_size) * config.patch_size
) )
unique_teachers = set(t["name"] for t in config.teachers)
self.patch_generator = ViTPatchGenerator( self.patch_generator = ViTPatchGenerator(
config.patch_size, config.patch_size,
config.hidden_size, config.hidden_size,
input_dims=self.img_size, input_dims=self.img_size,
max_input_dims=max_img_size, max_input_dims=max_img_size,
cls_token=True, cls_token=True,
register_multiple=config.reg_tokens, num_cls_tokens=len(unique_teachers) if config.cls_token_per_teacher else 1,
register_multiple=config.register_multiple,
) )
self.encoder = InternVisionEncoder( self.encoder = InternVisionEncoder(
...@@ -489,11 +491,20 @@ class RadioModel(nn.Module): ...@@ -489,11 +491,20 @@ class RadioModel(nn.Module):
prefix=prefix, prefix=prefix,
) )
summary_idxs = None
if config.teachers:
summary_idxs = torch.tensor(
[i for i, t in enumerate(config.teachers) if t.get("use_summary", True)]
)
if summary_idxs.numel() > 0:
self.register_buffer("summary_idxs", summary_idxs)
self.summary_idxs = summary_idxs
def forward( def forward(
self, self,
pixel_values: torch.Tensor | None = None, pixel_values: torch.Tensor | None = None,
pixel_embeds: torch.Tensor | None = None, pixel_embeds: torch.Tensor | None = None,
) -> torch.FloatTensor: ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
y = self.model(pixel_values) y = self.model(pixel_values)
return self._extract_final(y) return self._extract_final(y)
...@@ -546,10 +557,17 @@ class RadioModel(nn.Module): ...@@ -546,10 +557,17 @@ class RadioModel(nn.Module):
return loaded_params return loaded_params
def _extract_final(self, y: torch.Tensor): def _extract_final(
self, y: torch.Tensor
) -> tuple[torch.FloatTensor, torch.FloatTensor]:
# Remove CLS + REGISTERS tokens # Remove CLS + REGISTERS tokens
patch_gen = getattr(self.model, "patch_generator", None) patch_gen = getattr(self.model, "patch_generator", None)
if patch_gen is not None: if patch_gen is not None:
all_summary = y[:, : patch_gen.num_cls_tokens]
if self.summary_idxs is not None:
bb_summary = all_summary[:, self.summary_idxs]
else:
bb_summary = all_summary
all_feat = y[:, patch_gen.num_skip :] all_feat = y[:, patch_gen.num_skip :]
return all_feat return bb_summary.flatten(1), all_feat
...@@ -428,6 +428,10 @@ _MULTIMODAL_MODELS = { ...@@ -428,6 +428,10 @@ _MULTIMODAL_MODELS = {
"VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501
"VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501 "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501
# [Encoder-decoder] # [Encoder-decoder]
"NemotronParseForConditionalGeneration": (
"nemotron_parse",
"NemotronParseForConditionalGeneration",
),
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
} }
......
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Radio vision model configuration""" """Radio vision model configuration"""
from typing import Any
from transformers.configuration_utils import PretrainedConfig from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging from transformers.utils import logging
...@@ -36,12 +38,15 @@ class RadioConfig(PretrainedConfig): ...@@ -36,12 +38,15 @@ class RadioConfig(PretrainedConfig):
layer_norm_eps: The epsilon used by the layer normalization layers. layer_norm_eps: The epsilon used by the layer normalization layers.
initializer_factor: A factor for initializing all weight matrices. initializer_factor: A factor for initializing all weight matrices.
hidden_act: The non-linear activation function in the encoder. hidden_act: The non-linear activation function in the encoder.
max_img_size: Maximum image size for position embeddings. cpe_max_size: Maximum image size for position embeddings.
norm_mean: Mean values for image normalization (RGB channels). norm_mean: Mean values for image normalization (RGB channels).
Defaults to (0.48145466, 0.4578275, 0.40821073)). Defaults to (0.48145466, 0.4578275, 0.40821073)).
norm_std: Standard deviation values for image normalization norm_std: Standard deviation values for image normalization
(RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)). (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
reg_tokens: Number of register tokens to use. register_multiple: Number of register tokens to use.
teachers: A list of teacher model configurations. Each teacher configuration is
a dict with keys like "name" and some may have "use_summary".
cls_token_per_teacher: Whether to use a separate CLS token for each teacher.
""" """
model_type = "radio" model_type = "radio"
...@@ -57,10 +62,12 @@ class RadioConfig(PretrainedConfig): ...@@ -57,10 +62,12 @@ class RadioConfig(PretrainedConfig):
layer_norm_eps: float = 1e-6, layer_norm_eps: float = 1e-6,
initializer_factor: float = 1.0, initializer_factor: float = 1.0,
hidden_act: str = "gelu", hidden_act: str = "gelu",
max_img_size: int = 2048, cpe_max_size: int = 2048,
norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN, norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD, norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
reg_tokens: int | None = None, register_multiple: int | None = None,
teachers: list[dict[str, Any]] | None = None,
cls_token_per_teacher: bool = False,
**kwargs, **kwargs,
): ):
self.model_name = model_name self.model_name = model_name
...@@ -78,12 +85,14 @@ class RadioConfig(PretrainedConfig): ...@@ -78,12 +85,14 @@ class RadioConfig(PretrainedConfig):
self.layer_norm_eps = layer_norm_eps self.layer_norm_eps = layer_norm_eps
self.initializer_factor = initializer_factor self.initializer_factor = initializer_factor
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.max_img_size = max_img_size self.cpe_max_size = cpe_max_size
self.norm_mean = ( self.norm_mean = (
list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
) )
self.norm_std = ( self.norm_std = (
list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
) )
self.reg_tokens = reg_tokens self.register_multiple = register_multiple
self.teachers = teachers if teachers is not None else []
self.cls_token_per_teacher = cls_token_per_teacher
super().__init__(**kwargs) super().__init__(**kwargs)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment