Unverified Commit 18e519ec authored by Isotr0py, committed by GitHub
Browse files

[Bugfix] Fix ndarray video color from VideoAsset (#21064)


Signed-off-by: Isotr0py <2037008807@qq.com>
parent 1eaff278
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from pathlib import Path
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
import pytest import pytest
from PIL import Image
from vllm import envs from vllm.assets.base import get_vllm_public_assets
from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list
from vllm.multimodal.image import ImageMediaIO from vllm.multimodal.image import ImageMediaIO
from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader, from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader,
VideoMediaIO) VideoMediaIO)
from .utils import cosine_similarity, create_video_from_image, normalize_image
NUM_FRAMES = 10 NUM_FRAMES = 10
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3) FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3) FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
...@@ -59,8 +67,9 @@ class Assert10Frames1FPSVideoLoader(VideoLoader): ...@@ -59,8 +67,9 @@ class Assert10Frames1FPSVideoLoader(VideoLoader):
return FAKE_OUTPUT_2 return FAKE_OUTPUT_2
def test_video_media_io_kwargs(): def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
envs.VLLM_VIDEO_LOADER_BACKEND = "assert_10_frames_1_fps" with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps")
imageio = ImageMediaIO() imageio = ImageMediaIO()
# Verify that different args pass/fail assertions as expected. # Verify that different args pass/fail assertions as expected.
...@@ -86,3 +95,51 @@ def test_video_media_io_kwargs(): ...@@ -86,3 +95,51 @@ def test_video_media_io_kwargs():
with pytest.raises(AssertionError, match="bad fps"): with pytest.raises(AssertionError, match="bad fps"):
videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0}) videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0})
_ = videoio.load_bytes(b"test") _ = videoio.load_bytes(b"test")
@pytest.mark.parametrize("is_color", [True, False])
@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")])
def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
    """
    Check that every OpenCV-backed video reader hands back RGB frames.

    A short video is built from a single still image (color or grayscale,
    for each parametrized codec/container pair) and then decoded through
    `video_to_ndarrays`, `video_to_pil_images_list` and
    `VideoMediaIO.load_file`. Each decoded frame must stay close to the
    source image in per-pixel cosine similarity.
    """
    image_path = get_vllm_public_assets(filename="stop_sign.jpg",
                                        s3_prefix="vision_model_images")
    image = Image.open(image_path)

    def assert_frames_match(decoded_frames, reference):
        # Compare each decoded frame against the reference image, pixel by
        # pixel along the channel axis.
        for decoded in decoded_frames:
            sim = cosine_similarity(normalize_image(np.array(decoded)),
                                    normalize_image(np.array(reference)))
            # Only a tiny fraction of pixels may yield NaN similarity.
            assert np.sum(np.isnan(sim)) / sim.size < 0.001
            assert np.nanmean(sim) > 0.99

    with tempfile.TemporaryDirectory() as tmpdir:
        if not is_color:
            image_path = f"{tmpdir}/test_grayscale_image.png"
            image = image.convert("L")
            image.save(image_path)
            # Convert to gray RGB for comparison
            image = image.convert("RGB")

        video_path = f"{tmpdir}/test_RGB_video.{ext}"
        create_video_from_image(
            image_path,
            video_path,
            num_frames=2,
            is_color=is_color,
            fourcc=fourcc,
        )

        # Raw ndarray frames.
        assert_frames_match(video_to_ndarrays(video_path), image)

        # PIL image frames.
        assert_frames_match(video_to_pil_images_list(video_path), image)

        # Frames loaded through the multimodal media IO layer.
        io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path))
        assert_frames_match(io_frames, image)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cv2
import numpy as np import numpy as np
import numpy.typing as npt
from PIL import Image from PIL import Image
...@@ -31,3 +33,47 @@ def random_audio( ...@@ -31,3 +33,47 @@ def random_audio(
): ):
audio_len = rng.randint(min_len, max_len) audio_len = rng.randint(min_len, max_len)
return rng.rand(audio_len), sr return rng.rand(audio_len), sr
def create_video_from_image(
    image_path: str,
    video_path: str,
    num_frames: int = 10,
    fps: float = 1.0,
    is_color: bool = True,
    fourcc: str = "mp4v",
) -> str:
    """Write a video consisting of ``num_frames`` copies of one image.

    Args:
        image_path: Image file to read with ``cv2.imread``.
        video_path: Destination path handed to ``cv2.VideoWriter``.
        num_frames: How many times the image is written as a frame.
        fps: Frame rate passed to the writer.
        is_color: When False, the image is converted to single-channel
            grayscale and the writer is opened with ``isColor=False``.
        fourcc: Four-character codec code, expanded into
            ``cv2.VideoWriter_fourcc``.

    Returns:
        ``video_path``, unchanged.

    Raises:
        ValueError: If the image cannot be read.
        RuntimeError: If the video writer cannot be opened.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Failed to read image from {image_path!r}")

    if not is_color:
        # Convert to grayscale if is_color is False
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # The first two axes are (height, width) for both the 3-channel and the
    # single-channel grayscale array.
    height, width = image.shape[:2]

    video_writer = cv2.VideoWriter(
        video_path,
        cv2.VideoWriter_fourcc(*fourcc),
        fps,
        (width, height),
        isColor=is_color,
    )
    try:
        if not video_writer.isOpened():
            # An unopened writer would otherwise drop every frame silently.
            raise RuntimeError(
                f"Failed to open video writer for {video_path!r} "
                f"with fourcc {fourcc!r}")
        for _ in range(num_frames):
            video_writer.write(image)
    finally:
        # Always release the writer, even if a write fails.
        video_writer.release()
    return video_path
def cosine_similarity(A: npt.NDArray,
                      B: npt.NDArray,
                      axis: int = -1) -> npt.NDArray:
    """Compute cosine similarity between A and B along ``axis``.

    The element-wise product of the inputs is summed over ``axis`` and
    divided by the product of their norms over the same axis. No guard is
    applied for zero norms.
    """
    dot_product = np.sum(A * B, axis=axis)
    norm_product = np.linalg.norm(A, axis=axis) * np.linalg.norm(B, axis=axis)
    return dot_product / norm_product
def normalize_image(image: npt.NDArray) -> npt.NDArray:
    """Cast the image to float32 and divide by 255.

    Inputs holding values in 0..255 are therefore mapped into [0, 1].
    """
    as_float = image.astype(np.float32)
    return as_float / 255.0
\ No newline at end of file
...@@ -59,7 +59,9 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: ...@@ -59,7 +59,9 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
if idx in frame_indices: # only decompress needed if idx in frame_indices: # only decompress needed
ret, frame = cap.retrieve() ret, frame = cap.retrieve()
if ret: if ret:
frames.append(frame) # OpenCV uses BGR format, we need to convert it to RGB
# for PIL and transformers compatibility
frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
frames = np.stack(frames) frames = np.stack(frames)
if len(frames) < num_frames: if len(frames) < num_frames:
...@@ -71,10 +73,7 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: ...@@ -71,10 +73,7 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
def video_to_pil_images_list(path: str, def video_to_pil_images_list(path: str,
num_frames: int = -1) -> list[Image.Image]: num_frames: int = -1) -> list[Image.Image]:
frames = video_to_ndarrays(path, num_frames) frames = video_to_ndarrays(path, num_frames)
return [ return [Image.fromarray(frame) for frame in frames]
Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
for frame in frames
]
def video_get_metadata(path: str) -> dict[str, Any]: def video_get_metadata(path: str) -> dict[str, Any]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment