Unverified Commit 8234e663 authored by Kaichen Zhang - NTU's avatar Kaichen Zhang - NTU Committed by GitHub
Browse files

[Minor Fix] Fix llava modalities issue for single-image (#1402)

parent debbdb51
......@@ -185,11 +185,14 @@ class LlavaBaseForCausalLM(nn.Module):
new_image_features = []
height = width = self.num_patches_per_side
for image_idx, image_feature in enumerate(image_features):
if modalities_list[image_idx] == 1:
if modalities_list[image_idx] == "image":
image_aspect_ratio = (
self.config.image_aspect_ratio
) # single image
else:
elif (
modalities_list[image_idx] == "multi-images"
or modalities_list[image_idx] == "video"
):
image_aspect_ratio = "pad" # multi image
# image_aspect_ratio = (
# "anyres" if len(image_sizes[image_idx]) == 1 else "pad"
......@@ -319,6 +322,21 @@ class LlavaBaseForCausalLM(nn.Module):
.transpose(1, 2)
.contiguous()
) # N, C, H*W
if "unpad" in self.mm_patch_merge_type:
image_feature = torch.cat(
(
image_feature,
# Expand to (bs, 1, hidden_dim) and concat at the end of the image tokens
self.language_model.model.image_newline[
None, None
].expand(
image_feature.shape[0],
1,
image_feature.shape[-1],
),
),
dim=1,
)
new_image_features.append(image_feature)
image_features = new_image_features
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment