Unverified Commit fa9e6802 authored by Netanel Haber, committed by GitHub
Browse files

Fix Nano Nemotron VL regressions (#38655)


Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
parent 55064354
...@@ -7,6 +7,7 @@ from typing import Any, Literal ...@@ -7,6 +7,7 @@ from typing import Any, Literal
import pytest import pytest
from packaging.version import Version from packaging.version import Version
from transformers import PretrainedConfig
from transformers import __version__ as TRANSFORMERS_VERSION from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.config.model import ModelDType, TokenizerMode from vllm.config.model import ModelDType, TokenizerMode
...@@ -1004,7 +1005,26 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -1004,7 +1005,26 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True, trust_remote_code=True,
), ),
"NemotronH_Nano_VL_V2": _HfExamplesInfo( "NemotronH_Nano_VL_V2": _HfExamplesInfo(
"nano_vl_dummy", is_available_online=False, trust_remote_code=True "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
max_model_len=4096,
# NemotronH layers are constructed via `hybrid_override_pattern`:
use_original_num_layers=True,
hf_overrides={
"vision_config": PretrainedConfig(
args={
"min_num_patches": 1, # Trigger image dynamic res
"max_num_patches": 12,
"model": "vit_huge_patch16_224",
},
# Trigger conv3d:
video_temporal_patch_size=2,
),
"text_config": {
"num_hidden_layers": 2,
"hybrid_override_pattern": "M*",
},
},
trust_remote_code=True,
), ),
"OpenCUAForConditionalGeneration": _HfExamplesInfo( "OpenCUAForConditionalGeneration": _HfExamplesInfo(
"xlangai/OpenCUA-7B", trust_remote_code=True "xlangai/OpenCUA-7B", trust_remote_code=True
......
...@@ -447,9 +447,16 @@ def dummy_hf_overrides( ...@@ -447,9 +447,16 @@ def dummy_hf_overrides(
Dummy HF overrides function used to create dummy model Dummy HF overrides function used to create dummy model
with only minimum nums of layer. with only minimum nums of layer.
""" """
hf_config.update(exist_overrides or {}) # Copy because this helper is called more than once
# while loading config, and we `.pop()`
exist_overrides = (exist_overrides or {}).copy()
text_config_override = exist_overrides.pop("text_config", None)
hf_config.update(exist_overrides)
text_config = hf_config.get_text_config() text_config = hf_config.get_text_config()
if text_config_override is not None:
# multimodal test models may override *some* text-model fields
text_config.update(text_config_override)
# Ensure at least 2 expert per group # Ensure at least 2 expert per group
# Since `grouped_topk` assumes top-2 # Since `grouped_topk` assumes top-2
......
...@@ -7,7 +7,6 @@ ...@@ -7,7 +7,6 @@
# LICENSE is in root directory. # LICENSE is in root directory.
# -------------------------------------------------------- # --------------------------------------------------------
import copy
import math import math
import warnings import warnings
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
...@@ -17,7 +16,7 @@ from typing import Annotated, Literal, TypeAlias ...@@ -17,7 +16,7 @@ from typing import Annotated, Literal, TypeAlias
import torch import torch
import torch.nn as nn import torch.nn as nn
from transformers import BatchFeature from transformers import BatchFeature, PretrainedConfig
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
...@@ -210,11 +209,15 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo): ...@@ -210,11 +209,15 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo):
@cached_property @cached_property
def is_dynamic_tiler(self) -> bool: def is_dynamic_tiler(self) -> bool:
return self.get_hf_processor().dynamic_tiler is not None return BaseNanoNemotronVLProcessor.use_dynamic_resolution(self.get_hf_config())
@cached_property @property
def supports_video(self): def supports_video(self):
return self.get_hf_processor().supports_video return True
@property
def supports_audio(self) -> bool:
return self.sound_config is not None
def get_video_token(self) -> str | None: def get_video_token(self) -> str | None:
return IMG_CONTEXT return IMG_CONTEXT
...@@ -223,8 +226,8 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo): ...@@ -223,8 +226,8 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo):
return self.ctx.get_mm_config().video_pruning_rate return self.ctx.get_mm_config().video_pruning_rate
@property @property
def audio_extractor(self) -> ParakeetExtractor | None: def sound_config(self) -> PretrainedConfig | None:
return self.get_hf_processor().audio_extractor return getattr(self.get_hf_config(), "sound_config", None)
def get_default_tok_params(self) -> TokenizeParams: def get_default_tok_params(self) -> TokenizeParams:
return super().get_default_tok_params().with_kwargs(add_special_tokens=False) return super().get_default_tok_params().with_kwargs(add_special_tokens=False)
...@@ -232,14 +235,14 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo): ...@@ -232,14 +235,14 @@ class NanoNemotronVLProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, int | None]: def get_supported_mm_limits(self) -> Mapping[str, int | None]:
image_limit = {"image": None} image_limit = {"image": None}
video_limit = {"video": None} if self.supports_video else {} video_limit = {"video": None} if self.supports_video else {}
audio_limit = {"audio": None} if self.audio_extractor is not None else {} audio_limit = {"audio": None} if self.supports_audio else {}
return {**image_limit, **video_limit, **audio_limit} return {**image_limit, **video_limit, **audio_limit}
def get_data_parser(self): def get_data_parser(self):
target_sr = None target_sr = None
target_channels = None target_channels = None
if extractor := self.audio_extractor: if self.sound_config:
target_sr = extractor.sampling_rate target_sr = self.sound_config.sampling_rate
target_channels = 1 target_channels = 1
return MultiModalDataParser( return MultiModalDataParser(
...@@ -371,7 +374,7 @@ class NanoNemotronVLMultiModalProcessor( ...@@ -371,7 +374,7 @@ class NanoNemotronVLMultiModalProcessor(
fields = self._get_image_fields_config(hf_inputs) fields = self._get_image_fields_config(hf_inputs)
if self.info.supports_video: if self.info.supports_video:
fields |= self._get_video_fields_config(hf_inputs) fields |= self._get_video_fields_config(hf_inputs)
if self.info.audio_extractor: if self.info.supports_audio:
fields |= self._get_audio_fields_config(hf_inputs) fields |= self._get_audio_fields_config(hf_inputs)
return fields return fields
...@@ -399,9 +402,8 @@ class NanoNemotronVLMultiModalProcessor( ...@@ -399,9 +402,8 @@ class NanoNemotronVLMultiModalProcessor(
if isinstance(images, ImageEmbeddingItems): if isinstance(images, ImageEmbeddingItems):
feature_size = images.get_feature_size(item_idx) feature_size = images.get_feature_size(item_idx)
elif tiler := hf_processor.dynamic_tiler: elif self.info.is_dynamic_tiler:
image = images.get(item_idx) feature_size = out_mm_data["num_tokens_per_image"][item_idx]
feature_size = tiler.get_cached_feature_size(image)
else: else:
image_size = images.get_image_size(item_idx) image_size = images.get_image_size(item_idx)
max_num_tiles = hf_processor.max_num_tiles max_num_tiles = hf_processor.max_num_tiles
...@@ -536,7 +538,7 @@ class NanoNemotronVLMultiModalProcessor( ...@@ -536,7 +538,7 @@ class NanoNemotronVLMultiModalProcessor(
prompt_repls.append( prompt_repls.append(
self._get_prompt_repl_video(mm_items, hf_processor, out_mm_data) self._get_prompt_repl_video(mm_items, hf_processor, out_mm_data)
) )
if self.info.audio_extractor: if self.info.supports_audio:
prompt_repls.append( prompt_repls.append(
self._get_prompt_repl_audio(mm_items, hf_processor, out_mm_data) self._get_prompt_repl_audio(mm_items, hf_processor, out_mm_data)
) )
...@@ -772,12 +774,14 @@ class NanoNemotronVLDummyInputsBuilder( ...@@ -772,12 +774,14 @@ class NanoNemotronVLDummyInputsBuilder(
else: else:
dummy_video = {} dummy_video = {}
if extractor := self.info.audio_extractor: if sound_config := self.info.sound_config:
num_audios = mm_counts.get("audio", 0) num_audios = mm_counts.get("audio", 0)
audio_overrides = mm_options.get("audio") if mm_options else None audio_overrides = mm_options.get("audio") if mm_options else None
tokens_per_audio = max(1, seq_len // max(num_audios, 1)) tokens_per_audio = max(1, seq_len // max(num_audios, 1))
max_audio_num_samples = MAX_AUDIO_LEN_S * extractor.sampling_rate max_audio_num_samples = MAX_AUDIO_LEN_S * sound_config.sampling_rate
calculated_max_audio_num_samples = extractor.audio_length(tokens_per_audio) calculated_max_audio_num_samples = ParakeetExtractor.audio_length(
sound_config, tokens_per_audio
)
audio_len = min(max_audio_num_samples, calculated_max_audio_num_samples) audio_len = min(max_audio_num_samples, calculated_max_audio_num_samples)
dummy_audio = { dummy_audio = {
"audio": self._get_dummy_audios( "audio": self._get_dummy_audios(
...@@ -1029,9 +1033,13 @@ class NemotronH_Nano_VL_V2( ...@@ -1029,9 +1033,13 @@ class NemotronH_Nano_VL_V2(
data=image_embeds, data=image_embeds,
) )
pixel_values_flat = kwargs.pop("pixel_values_flat", None)
if pixel_values_flat is None:
return None
if self.dynamic_resolution: if self.dynamic_resolution:
pixel_values_flat = DynamicResolutionImageTiler.stack( pixel_values_flat = DynamicResolutionImageTiler.stack(
kwargs.pop("pixel_values_flat"), self.patch_size pixel_values_flat, self.patch_size
) )
return NanoNemotronVLImagePixelInputsDynamic( return NanoNemotronVLImagePixelInputsDynamic(
pixel_values_flat=pixel_values_flat, **kwargs pixel_values_flat=pixel_values_flat, **kwargs
...@@ -1497,15 +1505,13 @@ class NemotronH_Nano_VL_V2( ...@@ -1497,15 +1505,13 @@ class NemotronH_Nano_VL_V2(
@classmethod @classmethod
def get_mamba_state_shape_from_config(cls, vllm_config: "VllmConfig"): def get_mamba_state_shape_from_config(cls, vllm_config: "VllmConfig"):
text_config = vllm_config.model_config.hf_config.text_config text_config = vllm_config.model_config.hf_config.text_config
temp_vllm_config = copy.deepcopy(vllm_config) temp_vllm_config = vllm_config.with_hf_config(text_config)
temp_vllm_config.model_config.hf_config = text_config
return NemotronHForCausalLM.get_mamba_state_shape_from_config(temp_vllm_config) return NemotronHForCausalLM.get_mamba_state_shape_from_config(temp_vllm_config)
@classmethod @classmethod
def get_mamba_state_dtype_from_config(cls, vllm_config: "VllmConfig"): def get_mamba_state_dtype_from_config(cls, vllm_config: "VllmConfig"):
text_config = vllm_config.model_config.hf_config.text_config text_config = vllm_config.model_config.hf_config.text_config
temp_vllm_config = copy.deepcopy(vllm_config) temp_vllm_config = vllm_config.with_hf_config(text_config)
temp_vllm_config.model_config.hf_config = text_config
return NemotronHForCausalLM.get_mamba_state_dtype_from_config(temp_vllm_config) return NemotronHForCausalLM.get_mamba_state_dtype_from_config(temp_vllm_config)
@classmethod @classmethod
......
...@@ -159,5 +159,7 @@ class ParakeetExtractor(ParakeetFeatureExtractor): ...@@ -159,5 +159,7 @@ class ParakeetExtractor(ParakeetFeatureExtractor):
outputs["audio_num_clips"] = audio_num_clips outputs["audio_num_clips"] = audio_num_clips
return outputs return outputs
def audio_length(self, audio_tokens: int) -> int: @staticmethod
return int(audio_tokens * self.config.subsampling_factor * self.hop_length) def audio_length(raw_config: PretrainedConfig, audio_tokens: int) -> int:
config = ExtractorConfig.from_hf_config(raw_config)
return int(audio_tokens * config.subsampling_factor * config.hop_length)
...@@ -176,7 +176,6 @@ class ViTPatchGenerator(nn.Module): ...@@ -176,7 +176,6 @@ class ViTPatchGenerator(nn.Module):
temporal_patch_size=temporal_patch_size, temporal_patch_size=temporal_patch_size,
**factory, **factory,
) )
self._video_embedder_loaded = False
if abs_pos: if abs_pos:
scale = embed_dim**-0.5 scale = embed_dim**-0.5
...@@ -225,12 +224,7 @@ class ViTPatchGenerator(nn.Module): ...@@ -225,12 +224,7 @@ class ViTPatchGenerator(nn.Module):
Returns: Returns:
Embedded patches with temporal compression applied. Embedded patches with temporal compression applied.
""" """
if not self._video_embedder_loaded: assert self.temporal_patch_size > 1
raise ValueError(
"Temporal compression (video_temporal_patch_size > 1) requires "
"video_embedder weights, but they were never loaded. "
"Ensure the checkpoint was trained with temporal compression."
)
T = self.temporal_patch_size T = self.temporal_patch_size
input_size = x.shape[2:] input_size = x.shape[2:]
...@@ -794,9 +788,6 @@ class RadioModel(nn.Module): ...@@ -794,9 +788,6 @@ class RadioModel(nn.Module):
weight_loader(param, weight) weight_loader(param, weight)
loaded_params.add(vllm_key) loaded_params.add(vllm_key)
if "model.patch_generator.video_embedder.weight" in loaded_params:
self.model.patch_generator._video_embedder_loaded = True
return loaded_params return loaded_params
def _extract_final( def _extract_final(
......
...@@ -44,15 +44,19 @@ class ExtractorConfig: ...@@ -44,15 +44,19 @@ class ExtractorConfig:
subsampling_factor: int subsampling_factor: int
subsampling_conv_kernel_size: int subsampling_conv_kernel_size: int
subsampling_conv_stride: int subsampling_conv_stride: int
hop_length: int = 160
"""Default `160`: Matches HF default"""
clip_duration_s: int = 30 clip_duration_s: int = 30
clip_min_duration_s: float = 0.1 clip_min_duration_s: float = 0.1
@staticmethod @staticmethod
def from_hf_config(config: PretrainedConfig) -> "ExtractorConfig": def from_hf_config(config: PretrainedConfig) -> "ExtractorConfig":
assert isinstance(config, PretrainedConfig) assert isinstance(config, PretrainedConfig)
hop_length = int(getattr(config, "hop_length", ExtractorConfig.hop_length))
return ExtractorConfig( return ExtractorConfig(
feature_size=config.num_mel_bins, feature_size=config.num_mel_bins,
sampling_rate=config.sampling_rate, sampling_rate=config.sampling_rate,
hop_length=hop_length,
subsampling_factor=config.subsampling_factor, subsampling_factor=config.subsampling_factor,
subsampling_conv_kernel_size=config.subsampling_conv_kernel_size, subsampling_conv_kernel_size=config.subsampling_conv_kernel_size,
subsampling_conv_stride=config.subsampling_conv_stride, subsampling_conv_stride=config.subsampling_conv_stride,
......
...@@ -356,15 +356,6 @@ class DynamicResolutionImageTiler: ...@@ -356,15 +356,6 @@ class DynamicResolutionImageTiler:
feature_sizes.append(param.num_embeddings) feature_sizes.append(param.num_embeddings)
return images, feature_sizes return images, feature_sizes
feature_size_cache: dict[Image.Image, int] = {}
@classmethod
def get_cached_feature_size(cls, image: Image.Image) -> int:
feature_size = cls.feature_size_cache[id(image)]
# hard assert that we only use the feature size once
del cls.feature_size_cache[id(image)]
return feature_size
@dataclass @dataclass
class DynamicResolutionParams: class DynamicResolutionParams:
media: Image.Image media: Image.Image
...@@ -519,7 +510,6 @@ class DynamicResolutionImageTiler: ...@@ -519,7 +510,6 @@ class DynamicResolutionImageTiler:
param, token_count = self.process_media(media, tokens_for_media) param, token_count = self.process_media(media, tokens_for_media)
params.append(param) params.append(param)
token_counts.append(token_count) token_counts.append(token_count)
self.feature_size_cache[id(param.media)] = param.num_embeddings
# Step 2: Check if total tokens is within budget # Step 2: Check if total tokens is within budget
total_tokens = sum(token_counts) total_tokens = sum(token_counts)
...@@ -857,13 +847,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ...@@ -857,13 +847,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
@property @property
def supports_video(self) -> bool: def supports_video(self) -> bool:
return self.video_token_id is not None return True
@property @property
def video_token_id(self) -> int | None: def video_token_id(self) -> int:
if self.video_token is None: assert self.video_token is not None
return None return self.tokenizer.get_vocab()[self.video_token]
return self.tokenizer.get_vocab().get(self.video_token, None)
@property @property
def image_token_id(self) -> int: def image_token_id(self) -> int:
...@@ -1055,6 +1044,13 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ...@@ -1055,6 +1044,13 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
text_inputs = self.tokenizer(text, add_special_tokens=False) text_inputs = self.tokenizer(text, add_special_tokens=False)
combined_inputs = {**text_inputs, **video_inputs, **audio_inputs} combined_inputs = {**text_inputs, **video_inputs, **audio_inputs}
frames_indices = combined_inputs.get("frames_indices")
ragged_frames_indices = (
isinstance(frames_indices, list)
and len({len(frame_indices) for frame_indices in frames_indices}) > 1
)
if ragged_frames_indices:
combined_inputs.pop("frames_indices")
if self.dynamic_tiler is None: if self.dynamic_tiler is None:
batch = BatchFeature( batch = BatchFeature(
...@@ -1066,6 +1062,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ...@@ -1066,6 +1062,12 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
# allow images to be exempt from the BatchFeature validation: # allow images to be exempt from the BatchFeature validation:
# We will .stack() them in _parse_and_validate_image_input # We will .stack() them in _parse_and_validate_image_input
batch.update(image_inputs) batch.update(image_inputs)
if ragged_frames_indices:
assert isinstance(frames_indices, list)
batch["frames_indices"] = [
torch.as_tensor(frame_indices, dtype=torch.int64)
for frame_indices in frames_indices
]
return batch return batch
def get_image_repl( def get_image_repl(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment