total_len:int# Total token count in the sample, including text and image tokens
labels:torch.Tensor=None
@dataclass
class ImageTaskSamplePacked(Sample):
    """A single packed sample (not a batch) made of several sub-samples.

    Symbols used in the shape notes on the fields below:
        P        -- number of sub-samples in the packed sample
        seq_len  -- total sequence length
        num_imgs -- number of images across all samples in the packed sample
    """

    # Sample name.
    __key__: str
    __restore_key__: Tuple[Union[str, int, tuple], ...]
    # Sample metadata. Deprecated.
    __subflavor__: Dict
    # Sample metadata.
    __subflavors__: Dict
    # Input tokens packed into a single tensor, shape (seq_len,).
    tokens: torch.Tensor
    # Target tokens packed into a single tensor, shape (seq_len,).
    labels: torch.Tensor
    # Input images.
    imgs: List[torch.Tensor]
    # Number of tiles for each image of each sample, length num_imgs.
    num_tiles: List[int]
    # Maximum length across sub-samples.
    max_length: int
    # Cumulative length of each sub-sample in this packed sample,
    # including text and image tokens, length P.
    cu_lengths: List[int]
# Typing for the resulting batch data after encode_batch()
@dataclass
class ImageTaskBatchPacked(Batch):
    """A batch of packed samples.

    Symbols used in the shape notes on the fields below:
        N        -- batch size
        P        -- number of samples in the packed sample
        seq_len  -- maximum sequence length
        num_imgs -- number of images across all samples in the packed sample
    """

    # Sample names.
    __key__: List[str]
    __restore_key__: Tuple[Union[str, int, tuple], ...]
    # Sample metadata. Deprecated.
    __subflavor__: Dict
    # Sample metadatas.
    __subflavors__: List[Dict]
    # Input tokens packed and padded, shape (N, seq_len).
    tokens: torch.Tensor
    # Target tokens packed and padded, shape (N, seq_len).
    labels: torch.Tensor
    # All image tiles stacked into a single tensor, shape (num_tiles, C, H, W).
    imgs: torch.Tensor
    # Number of tiles per image, shape (N, num_imgs).
    num_tiles: List[List[int]]
    # Maximum length across sub-samples, length N.
    max_lengths: List[int]
    # Cumulative length of each sub-sample in each packed sample of the batch,
    # shape (N, P).
    cu_lengths: List[List[int]]
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19
"""Finds the index of largest number that fits into the knapsack with the given capacity."""
index=bisect.bisect(numbers,capacity)
return-1ifindex==0else(index-1)
# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Except portions as noted which are Copyright (c) 2023 OpenGVLab and licensed under the MIT license found in LICENSE.
"COMMENT":"Sources for these prompts include https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning":{
"raw":[
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining":{
"raw":[
"Generate a short caption of the image.",
"Describe the image concisely.",
"Provide a brief description of the given image."
],
"llava":[
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"OCR":{
"raw":[
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"