Unverified commit 60f0843e, authored by Chatcharin Sangbutsarakum, committed by GitHub
Browse files

[Model] Remove unnecessary CUDA sync of Qwen2VL image and video preprocess (#24334)


Signed-off-by: Win <chatcharinsang@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
parent 8a466026
...@@ -1218,6 +1218,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1218,6 +1218,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
grid_thw = image_input["image_grid_thw"] grid_thw = image_input["image_grid_thw"]
assert grid_thw.ndim == 2 assert grid_thw.ndim == 2
grid_thw_list = grid_thw.tolist()
if image_input["type"] == "image_embeds": if image_input["type"] == "image_embeds":
image_embeds = image_input["image_embeds"] image_embeds = image_input["image_embeds"]
...@@ -1227,15 +1228,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1227,15 +1228,17 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
# Split concatenated embeddings for each image item. # Split concatenated embeddings for each image item.
merge_size = self.visual.spatial_merge_size merge_size = self.visual.spatial_merge_size
sizes = grid_thw.prod(-1) // merge_size // merge_size sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
(merge_size * merge_size)).tolist()
return image_embeds.split(sizes.tolist()) return image_embeds.split(sizes)
def _process_video_input( def _process_video_input(
self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]:
grid_thw = video_input["video_grid_thw"] grid_thw = video_input["video_grid_thw"]
assert grid_thw.ndim == 2 assert grid_thw.ndim == 2
grid_thw_list = grid_thw.tolist()
if video_input["type"] == "video_embeds": if video_input["type"] == "video_embeds":
video_embeds = video_input["video_embeds"] video_embeds = video_input["video_embeds"]
...@@ -1245,9 +1248,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1245,9 +1248,10 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
# Split concatenated embeddings for each video item. # Split concatenated embeddings for each video item.
merge_size = self.visual.spatial_merge_size merge_size = self.visual.spatial_merge_size
sizes = grid_thw.prod(-1) // merge_size // merge_size sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) //
(merge_size * merge_size)).tolist()
return video_embeds.split(sizes.tolist()) return video_embeds.split(sizes)
def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
modalities = {} modalities = {}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment