Nemotron Nano V2 VL + EVS Video Support (#27107)

Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Natan Bagrov <nbagrov@nvidia.com>
Co-authored-by: Roger Wang <hey@rogerw.io>

Nemotron Nano V2 VL + EVS Video Support (#27107)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Natan Bagrov <nbagrov@nvidia.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
e93ff6c8 · Eugene Khvedchenya · GitHub · 1c691f4a · e93ff6c8 · e93ff6c8
Unverified Commit e93ff6c8 authored Oct 20, 2025 by Eugene Khvedchenya Committed by GitHub Oct 20, 2025
5 changed files
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
--- a/vllm/model_executor/models/radio.py
+++ b/vllm/model_executor/models/radio.py
@@ -43,32 +43,6 @@ to_4tuple = _ntuple(4)
 to_ntuple = _ntuple


-class InputConditioner(nn.Module):
-    def __init__(
-        self,
-        input_scale: float,
-        norm_mean: norm_t,
-        norm_std: norm_t,
-        dtype: torch.dtype = None,
-    ):
-        super().__init__()
-
-        self.dtype = dtype
-
-        self.register_buffer("norm_mean", _to_tensor(norm_mean) / input_scale)
-        self.register_buffer("norm_std", _to_tensor(norm_std) / input_scale)
-
-    def forward(self, x: torch.Tensor):
-        y = (x - self.norm_mean) / self.norm_std
-        if self.dtype is not None:
-            y = y.to(self.dtype)
-        return y
-
-
-def _to_tensor(v: norm_t):
-    return torch.as_tensor(v, dtype=torch.float32).view(-1, 1, 1)
-
-
 class ClsToken(nn.Module):
    def __init__(
        self,
@@ -507,11 +481,6 @@ class RadioModel(nn.Module):
        super().__init__()

        self.config = config
-        self.input_conditioner = InputConditioner(
-            input_scale=1.0,
-            norm_mean=config.norm_mean,
-            norm_std=config.norm_std,
-        )
        self.model = RadioInternVisionModel(
            config=config,
            quant_config=quant_config,
@@ -525,8 +494,7 @@ class RadioModel(nn.Module):
        pixel_values: torch.Tensor | None = None,
        pixel_embeds: torch.Tensor | None = None,
    ) -> torch.FloatTensor:
-        x = self.input_conditioner(pixel_values)
-        y = self.model(x)
+        y = self.model(pixel_values)
        return self._extract_final(y)

    def load_weights(self, weights) -> set[str]:
@@ -548,6 +516,10 @@ class RadioModel(nn.Module):
            # Skip buffers not used in vLLM
            if sub in {"summary_idxs"}:
                continue
+            if sub.startswith("input_conditioner."):
+                # Normalization is done in the input processor using the
+                # config's norm_mean/norm_std, so skip these buffers.
+                continue

            vllm_key = None
            if sub.startswith("model.patch_generator."):

--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -223,7 +223,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
                        height,
                    )
                height = min(height, overrides.height)
-        video = np.full((num_frames, width, height, 3), 255)
+        video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
        return [video] * num_videos



--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -13,10 +13,13 @@ import numpy.typing as npt
 from PIL import Image

 from vllm import envs
+from vllm.logger import init_logger

 from .base import MediaIO
 from .image import ImageMediaIO

+logger = init_logger(__name__)
+

 def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
    num_frames, _, _, channels = frames.shape
@@ -103,6 +106,7 @@ class OpenCVVideoBackend(VideoLoader):
        cls,
        data: bytes,
        num_frames: int = -1,
+        fps: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        import cv2
@@ -116,14 +120,20 @@ class OpenCVVideoBackend(VideoLoader):
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames_num / original_fps if original_fps > 0 else 0

-        # resample video to target num_frames
-        full_read = num_frames == -1 or total_frames_num < num_frames
-        if full_read:
-            num_frames = total_frames_num
-            frame_idx = list(range(0, num_frames))
+        # resample video to target num_frames and fps
+        # - the minimum of the two will be used
+        num_frames_to_sample = total_frames_num
+        if num_frames > 0:
+            num_frames_to_sample = min(num_frames, total_frames_num)
+        if fps > 0:
+            num_frames_to_sample = min(num_frames_to_sample, math.floor(duration * fps))
+        num_frames_to_sample = max(1, num_frames_to_sample)  # at least one sample
+
+        if num_frames_to_sample == total_frames_num:
+            frame_idx = list(range(0, num_frames_to_sample))
        else:
            uniform_sampled_frames = np.linspace(
-                0, total_frames_num - 1, num_frames, dtype=int
+                0, total_frames_num - 1, num_frames_to_sample, dtype=int
            )
            frame_idx = uniform_sampled_frames.tolist()

@@ -132,7 +142,7 @@ class OpenCVVideoBackend(VideoLoader):
        frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8)

        i = 0
-        for idx in range(total_frames_num):
+        for idx in range(max(frame_idx) + 1):
            ok = cap.grab()
            if not ok:
                break
@@ -142,8 +152,8 @@ class OpenCVVideoBackend(VideoLoader):
                    frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    i += 1

-        assert i == num_frames, (
-            f"Expected reading {num_frames} frames, "
+        assert i == num_frames_to_sample, (
+            f"Expected reading {num_frames_to_sample} frames, "
            f"but only loaded {i} frames from video."
        )

@@ -151,14 +161,14 @@ class OpenCVVideoBackend(VideoLoader):
        # NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
        # can cause incorrect timestamp calculation without num_frames=-1.
        metadata = {
-            "total_num_frames": num_frames,
-            "fps": num_frames / duration,
+            "total_num_frames": total_frames_num,
+            "fps": original_fps,
            "duration": duration,
            "video_backend": "opencv",
-            "frames_indices": list(range(num_frames)),
+            "frames_indices": list(frame_idx),
            # extra field used to control hf processor's video
            # sampling behavior
-            "do_sample_frames": num_frames == total_frames_num,
+            "do_sample_frames": num_frames_to_sample == total_frames_num,
        }

        return frames, metadata

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1735,20 +1735,32 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            pin_memory=self.pin_memory,
            merge_by_field_config=model.merge_by_field_config,
        ):
+            curr_group_outputs = []
+
+            # EVS-related change.
            # (ekhvedchenia): Temporary hack to limit peak memory usage when
-            # processing multimodal data.This solves the issue with scheduler
+            # processing multimodal data. This solves the issue with scheduler
            # putting too many video samples into a single batch. Scheduler
            # uses pruned vision tokens count to compare it versus compute
            # budget which is incorrect (Either input media size or non-pruned
            # output vision tokens count should be considered)
-            curr_group_outputs = []
-
-            if self.is_multimodal_pruning_enabled and modality == "video":
-                micro_batch_size = 1
-                for i in range(0, num_items, micro_batch_size):
-                    micro_batch_mm_inputs = dict(
-                        (k, v[i : i + micro_batch_size])
-                        for k, v in mm_kwargs_group.items()
+            # TODO(ywang96): Fix memory profiling to take EVS into account and
+            # remove this hack.
+            if (
+                self.is_multimodal_pruning_enabled
+                and modality == "video"
+                and num_items > 1
+            ):
+                for video_mm_kwargs_item in filter(
+                    lambda item: item.modality == "video", mm_kwargs
+                ):
+                    _, _, micro_batch_mm_inputs = next(
+                        group_mm_kwargs_by_modality(
+                            [video_mm_kwargs_item],
+                            device=self.device,
+                            pin_memory=self.pin_memory,
+                            merge_by_field_config=model.merge_by_field_config,
+                        )
                    )

                    micro_batch_outputs = model.get_multimodal_embeddings(