Unverified commit 7d8bbe6f, authored by Isotr0py and committed by GitHub
Browse files

[CI/Build] Automatically patch video metadata for multimodal processor test (#35822)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent 25e02647
...@@ -33,32 +33,9 @@ from ...registry import ( ...@@ -33,32 +33,9 @@ from ...registry import (
) )
def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: def add_video_metadata(mm_data: MultiModalDataDict) -> MultiModalDataDict:
""" """
Patch the multimodal data for GLM4.1V model. Add metadata to video mm_data
"""
# Ensure video metadata is included
if "video" in mm_data:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
num_frames = len(video)
mm_data["video"] = (
video,
{
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
},
)
return mm_data
def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
"""
Patch the multimodal data for Qwen3-VL model.
""" """
def create_metadata(frames: np.ndarray): def create_metadata(frames: np.ndarray):
...@@ -119,18 +96,7 @@ _IGNORE_MM_KEYS = { ...@@ -119,18 +96,7 @@ _IGNORE_MM_KEYS = {
} }
MM_DATA_PATCHES = { MM_DATA_PATCHES = {
# Ernie4.5-VL, GLM4.1V and Qwen3-VL requires video metadata
"ernie4_5_moe_vl": qwen3_vl_patch_mm_data,
"glm4v": glm4_1v_patch_mm_data,
"glm4v_moe": glm4_1v_patch_mm_data,
"glm_ocr": glm4_1v_patch_mm_data,
"glmasr": glmasr_patch_mm_data, "glmasr": glmasr_patch_mm_data,
"interns1_pro": qwen3_vl_patch_mm_data,
"molmo2": qwen3_vl_patch_mm_data,
"qwen3_5": qwen3_vl_patch_mm_data,
"qwen3_5_moe": qwen3_vl_patch_mm_data,
"qwen3_vl": qwen3_vl_patch_mm_data,
"qwen3_vl_moe": qwen3_vl_patch_mm_data,
} }
...@@ -176,6 +142,9 @@ def get_text_token_prompts( ...@@ -176,6 +142,9 @@ def get_text_token_prompts(
tokenizer: TokenizerLike = processor.info.get_tokenizer() tokenizer: TokenizerLike = processor.info.get_tokenizer()
model_config = processor.info.ctx.model_config model_config = processor.info.ctx.model_config
if processor.info.data_parser.video_needs_metadata:
mm_data = add_video_metadata(mm_data)
model_type = model_config.hf_config.model_type model_type = model_config.hf_config.model_type
if model_type in MM_DATA_PATCHES: if model_type in MM_DATA_PATCHES:
mm_data = MM_DATA_PATCHES[model_type](mm_data) mm_data = MM_DATA_PATCHES[model_type](mm_data)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment