Unverified Commit 1fcccda4 authored by Kevin Tuan's avatar Kevin Tuan Committed by GitHub
Browse files

fix(internvl): fix accuracy issue of normalization (#10375)

parent 79acec4f
# Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py # Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
from functools import lru_cache
import numpy as np import numpy as np
import torch import torch
import torchvision.transforms as T import torchvision.transforms as T
...@@ -19,6 +21,20 @@ from sglang.srt.multimodal.processors.base_processor import ( ...@@ -19,6 +21,20 @@ from sglang.srt.multimodal.processors.base_processor import (
class InternVLImageProcessor(BaseMultimodalProcessor): class InternVLImageProcessor(BaseMultimodalProcessor):
models = [InternVLChatModel, InternS1ForConditionalGeneration] models = [InternVLChatModel, InternS1ForConditionalGeneration]
# Fixed per-channel normalization statistics (three values each), named after
# the ImageNet dataset. _get_normalize_tensors turns them into tensors that are
# applied as (img - mean) / std.
# NOTE(review): channel order is presumably RGB — confirm against the image loader.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
@staticmethod
@lru_cache(maxsize=1)
def _get_normalize_tensors(device="cuda", dtype=torch.float32):
    """Return the cached ``(mean, std)`` normalization tensors.

    Each tensor is built from the class constants ``IMAGENET_MEAN`` /
    ``IMAGENET_STD`` and reshaped with ``view(-1, 1, 1)``, giving one entry
    per listed channel value followed by two singleton dimensions.

    Args:
        device: Device the tensors are created on (defaults to ``"cuda"``).
        dtype: Element type of the tensors (defaults to ``torch.float32``).

    Returns:
        A ``(mean, std)`` tuple of tensors.

    The result is memoized by ``lru_cache(maxsize=1)``: only the most
    recent argument combination is kept, and repeated calls with the same
    arguments return the very same tensor objects, so callers should not
    modify them in place.
    """
    owner = InternVLImageProcessor
    # Build mean first, then std, using one shared construction expression.
    mean, std = (
        torch.tensor(stats, device=device, dtype=dtype).view(-1, 1, 1)
        for stats in (owner.IMAGENET_MEAN, owner.IMAGENET_STD)
    )
    return mean, std
def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs): def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
super().__init__(hf_config, server_args, _image_processor, *args, **kwargs) super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
image_size = ( image_size = (
...@@ -88,6 +104,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor): ...@@ -88,6 +104,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
bound, fps, max_frame, first_idx=0, num_segments=num_segments bound, fps, max_frame, first_idx=0, num_segments=num_segments
) )
mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
for frame_index in frame_indices: for frame_index in frame_indices:
# Load frame # Load frame
frame = vr[frame_index] frame = vr[frame_index]
...@@ -97,10 +115,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor): ...@@ -97,10 +115,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
img_np = frame.asnumpy() img_np = frame.asnumpy()
img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0 img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
# Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
mean = img.mean(dim=[1, 2], keepdim=True)
# Prevent division by zero; clamp to minimum value of 1e-6
std = img.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
img = (img - mean) / std img = (img - mean) / std
tiles = InternVLImageProcessor.dynamic_preprocess( tiles = InternVLImageProcessor.dynamic_preprocess(
...@@ -188,6 +202,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor): ...@@ -188,6 +202,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
num_patches_list = [] num_patches_list = []
pixel_values = [] pixel_values = []
mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
# Process each input with allocated frames # Process each input with allocated frames
for image_index, image in enumerate(base_output.images): for image_index, image in enumerate(base_output.images):
try: try:
...@@ -201,10 +217,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor): ...@@ -201,10 +217,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
else: else:
tensor = image.cuda() # assume already tensor tensor = image.cuda() # assume already tensor
# Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
mean = tensor.mean(dim=[1, 2], keepdim=True)
# Prevent division by zero; clamp to minimum value of 1e-6
std = tensor.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
tensor = (tensor - mean) / std tensor = (tensor - mean) / std
tiles = self.dynamic_preprocess( tiles = self.dynamic_preprocess(
tensor, image_size=448, max_num=12, use_thumbnail=True tensor, image_size=448, max_num=12, use_thumbnail=True
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment