Unverified commit e93ff6c8, authored by Eugene Khvedchenya, committed by GitHub
Browse files

Nemotron Nano V2 VL + EVS Video Support (#27107)


Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Natan Bagrov <nbagrov@nvidia.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
parent 1c691f4a
......@@ -43,32 +43,6 @@ to_4tuple = _ntuple(4)
to_ntuple = _ntuple
class InputConditioner(nn.Module):
    """Normalize an input tensor using stored mean and std buffers.

    The mean and std values are converted with ``_to_tensor`` and divided by
    ``input_scale`` before being registered as the ``norm_mean`` and
    ``norm_std`` buffers. ``forward`` subtracts the mean, divides by the std,
    and optionally casts the result to ``dtype``.
    """

    def __init__(
        self,
        input_scale: float,
        norm_mean: norm_t,
        norm_std: norm_t,
        dtype: torch.dtype = None,
    ):
        super().__init__()
        self.dtype = dtype
        # Both statistics are rescaled by input_scale once, at construction.
        for buffer_name, stat in (
            ("norm_mean", norm_mean),
            ("norm_std", norm_std),
        ):
            self.register_buffer(buffer_name, _to_tensor(stat) / input_scale)

    def forward(self, x: torch.Tensor):
        """Return ``(x - norm_mean) / norm_std``, cast to ``dtype`` if set."""
        normalized = (x - self.norm_mean) / self.norm_std
        if self.dtype is None:
            return normalized
        return normalized.to(self.dtype)
def _to_tensor(v: norm_t):
    """Convert *v* to a float32 tensor reshaped to ``(-1, 1, 1)``."""
    as_float = torch.as_tensor(v, dtype=torch.float32)
    # Two trailing singleton dims are appended via the (-1, 1, 1) view.
    return as_float.view(-1, 1, 1)
class ClsToken(nn.Module):
def __init__(
self,
......@@ -507,11 +481,6 @@ class RadioModel(nn.Module):
super().__init__()
self.config = config
self.input_conditioner = InputConditioner(
input_scale=1.0,
norm_mean=config.norm_mean,
norm_std=config.norm_std,
)
self.model = RadioInternVisionModel(
config=config,
quant_config=quant_config,
......@@ -525,8 +494,7 @@ class RadioModel(nn.Module):
pixel_values: torch.Tensor | None = None,
pixel_embeds: torch.Tensor | None = None,
) -> torch.FloatTensor:
x = self.input_conditioner(pixel_values)
y = self.model(x)
y = self.model(pixel_values)
return self._extract_final(y)
def load_weights(self, weights) -> set[str]:
......@@ -548,6 +516,10 @@ class RadioModel(nn.Module):
# Skip buffers not used in vLLM
if sub in {"summary_idxs"}:
continue
if sub.startswith("input_conditioner."):
# we normalize in the input processor,
# based on norm and std values from the config
continue
vllm_key = None
if sub.startswith("model.patch_generator."):
......
......@@ -223,7 +223,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
height,
)
height = min(height, overrides.height)
video = np.full((num_frames, width, height, 3), 255)
video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
return [video] * num_videos
......
......@@ -13,10 +13,13 @@ import numpy.typing as npt
from PIL import Image
from vllm import envs
from vllm.logger import init_logger
from .base import MediaIO
from .image import ImageMediaIO
logger = init_logger(__name__)
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
num_frames, _, _, channels = frames.shape
......@@ -103,6 +106,7 @@ class OpenCVVideoBackend(VideoLoader):
cls,
data: bytes,
num_frames: int = -1,
fps: int = -1,
**kwargs,
) -> tuple[npt.NDArray, dict[str, Any]]:
import cv2
......@@ -116,14 +120,20 @@ class OpenCVVideoBackend(VideoLoader):
original_fps = cap.get(cv2.CAP_PROP_FPS)
duration = total_frames_num / original_fps if original_fps > 0 else 0
# resample video to target num_frames
full_read = num_frames == -1 or total_frames_num < num_frames
if full_read:
num_frames = total_frames_num
frame_idx = list(range(0, num_frames))
# resample video to target num_frames and fps
# - the minimum of the two will be used
num_frames_to_sample = total_frames_num
if num_frames > 0:
num_frames_to_sample = min(num_frames, total_frames_num)
if fps > 0:
num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
num_frames_to_sample = max(1, num_frames_to_sample) # at least one sample
if num_frames_to_sample == total_frames_num:
frame_idx = list(range(0, num_frames_to_sample))
else:
uniform_sampled_frames = np.linspace(
0, total_frames_num - 1, num_frames, dtype=int
0, total_frames_num - 1, num_frames_to_sample, dtype=int
)
frame_idx = uniform_sampled_frames.tolist()
......@@ -132,7 +142,7 @@ class OpenCVVideoBackend(VideoLoader):
frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)
i = 0
for idx in range(total_frames_num):
for idx in range(max(frame_idx) + 1):
ok = cap.grab()
if not ok:
break
......@@ -142,8 +152,8 @@ class OpenCVVideoBackend(VideoLoader):
frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
i += 1
assert i == num_frames, (
f"Expected reading {num_frames} frames, "
assert i == num_frames_to_sample, (
f"Expected reading {num_frames_to_sample} frames, "
f"but only loaded {i} frames from video."
)
......@@ -151,14 +161,14 @@ class OpenCVVideoBackend(VideoLoader):
# NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
# can cause incorrect timestamp calculation without num_frames=-1.
metadata = {
"total_num_frames": num_frames,
"fps": num_frames / duration,
"total_num_frames": total_frames_num,
"fps": original_fps,
"duration": duration,
"video_backend": "opencv",
"frames_indices": list(range(num_frames)),
"frames_indices": list(frame_idx),
# extra field used to control hf processor's video
# sampling behavior
"do_sample_frames": num_frames == total_frames_num,
"do_sample_frames": num_frames_to_sample == total_frames_num,
}
return frames, metadata
......
......@@ -1735,20 +1735,32 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
pin_memory=self.pin_memory,
merge_by_field_config=model.merge_by_field_config,
):
curr_group_outputs = []
# EVS-related change.
# (ekhvedchenia): Temporary hack to limit peak memory usage when
# processing multimodal data.This solves the issue with scheduler
# processing multimodal data. This solves the issue with scheduler
# putting too many video samples into a single batch. Scheduler
# uses pruned vision tokens count to compare it versus compute
# budget which is incorrect (Either input media size or non-pruned
# output vision tokens count should be considered)
curr_group_outputs = []
if self.is_multimodal_pruning_enabled and modality == "video":
micro_batch_size = 1
for i in range(0, num_items, micro_batch_size):
micro_batch_mm_inputs = dict(
(k, v[i : i + micro_batch_size])
for k, v in mm_kwargs_group.items()
# TODO(ywang96): Fix memory profiling to take EVS into account and
# remove this hack.
if (
self.is_multimodal_pruning_enabled
and modality == "video"
and num_items > 1
):
for video_mm_kwargs_item in filter(
lambda item: item.modality == "video", mm_kwargs
):
_, _, micro_batch_mm_inputs = next(
group_mm_kwargs_by_modality(
[video_mm_kwargs_item],
device=self.device,
pin_memory=self.pin_memory,
merge_by_field_config=model.merge_by_field_config,
)
)
micro_batch_outputs = model.get_multimodal_embeddings(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment