Commit 0063a668 authored by chenzk

v1.0
import torch
from dataclasses import dataclass, field
from magma.processing_magma import MagmaProcessor
from typing import Dict, Optional, Sequence, List
import transformers
from data.utils.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
processor: MagmaProcessor
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels, pixel_values, image_sizes = \
tuple([instance[key] for instance in instances] for key in ("input_ids", "labels", "pixel_values", "image_sizes"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids,
batch_first=True,
padding_value=self.processor.tokenizer.pad_token_id)
labels = torch.nn.utils.rnn.pad_sequence(labels,
batch_first=True,
padding_value=IGNORE_INDEX)
input_ids = input_ids[:, :self.processor.tokenizer.model_max_length]
labels = labels[:, :self.processor.tokenizer.model_max_length]
        # each instance carries a list of per-crop tensors; concatenate so each
        # instance contributes one (num_crops, ...) tensor
        pixel_values = [torch.cat(pv, dim=0) for pv in pixel_values]
        image_sizes = [torch.cat(isz, dim=0) for isz in image_sizes]
        # zero-pad across the batch so instances with fewer crops line up
        pixel_values_padded = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True, padding_value=0)
        image_sizes_padded = torch.nn.utils.rnn.pad_sequence(image_sizes, batch_first=True, padding_value=0)
batch = dict(
input_ids=input_ids,
labels=labels,
attention_mask=input_ids.ne(self.processor.tokenizer.pad_token_id),
pixel_values=pixel_values_padded,
image_sizes=image_sizes_padded
)
return batch
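
# A minimal usage sketch (hypothetical setup: `processor` is a loaded
# MagmaProcessor and `train_dataset` yields dicts with input_ids, labels,
# pixel_values, and image_sizes):
#
#   from torch.utils.data import DataLoader
#   collator = DataCollatorForSupervisedDataset(processor=processor)
#   loader = DataLoader(train_dataset, batch_size=4, collate_fn=collator)
#   batch = next(iter(loader))
#   # input_ids/labels: (B, L), padded and truncated to model_max_length;
#   # pixel_values: (B, max_num_crops, ...), zero-padded across the batch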
@dataclass
class DataCollatorForHFDataset(object):
"""Collate hugging face examples for supervised fine-tuning."""
tokenizer: transformers.PreTrainedTokenizer
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances]
for key in ("input_ids", "labels"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids,
batch_first=True,
padding_value=self.tokenizer.pad_token_id)
labels = torch.nn.utils.rnn.pad_sequence(labels,
batch_first=True,
padding_value=IGNORE_INDEX)
input_ids = input_ids[:, :self.tokenizer.model_max_length]
labels = labels[:, :self.tokenizer.model_max_length]
batch = dict(
input_ids=input_ids,
labels=labels,
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
)
if 'image' in instances[0] and instances[0]['image'] is not None:
images = [instance['image'] for instance in instances]
            # images may differ in shape across instances, so pass them through
            # as a list rather than stacking into a single tensor
            batch['images'] = images
if 'add_im_loss' in instances[0]:
batch['add_im_loss'] = True
if 'max_num_crops' in instances[0]:
batch['max_num_crops'] = instances[0]['max_num_crops']
return batch
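
# A sketch of the text-only path (assuming `tokenizer` has a pad token set and
# `examples` is a list of dicts with 1-D input_ids/labels tensors):
#
#   collator = DataCollatorForHFDataset(tokenizer=tokenizer)
#   batch = collator(examples)
#   # optional 'image', 'add_im_loss', 'max_num_crops' keys are forwarded when present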
import json
import yaml
import torch
import random
import os
import glob
import pickle
from datasets import load_dataset
from .openx import OpenXDataItem
from tqdm import tqdm
class DataItem:
"""
Curate data items from all data sources
"""
def __init__(self, training_size=-1, local_run=False):
self.training_size = training_size
self.local_run = local_run
def _get_dataset_tag(self, data_path):
if "epic" in data_path.lower():
return "epic"
elif "open-x" in data_path or "openx" in data_path:
if 'traces' in data_path:
return "openx_magma"
else:
return "openx"
elif "sthv2" in data_path.lower():
return "sthv2"
elif "exoego4d" in data_path.lower():
return "exoego4d"
elif 'ego4d' in data_path.lower():
return "ego4d"
elif 'aitw' in data_path.lower():
return "aitw"
elif 'seeclick' in data_path.lower() and 'ocr' in data_path.lower():
return "seeclick_ocr"
elif 'seeclick' in data_path.lower():
return "seeclick"
elif 'mind2web' in data_path.lower():
return "mind2web"
elif 'vision2ui' in data_path.lower():
return "vision2ui"
elif 'llava' in data_path.lower():
return "llava"
elif 'magma' in data_path.lower():
return "magma"
elif 'sharegpt4v' in data_path.lower():
return "sharegpt4v"
        elif 'pixelprose' in data_path.lower():
            # make the "pixelprose" branch in _get_items reachable
            return "pixelprose"
        else:
            raise ValueError(f"Dataset tag not found for {data_path}")
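    # Tag-resolution examples for the chain above (hypothetical paths; note
    # that more specific substrings are checked first, e.g. seeclick+ocr
    # before plain seeclick):
    #
    #   _get_dataset_tag("/data/open-x/traces")     -> "openx_magma"
    #   _get_dataset_tag("/data/open-x/rt-1")       -> "openx"
    #   _get_dataset_tag("/data/seeclick_ocr.json") -> "seeclick_ocr"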
def _get_items(self, data_path, image_folder=None, processor=None, conversation_lib=None):
if data_path.endswith(".json"):
list_data_dict = json.load(open(data_path, "r"))
elif data_path.endswith(".jsonl"):
list_data_dict = [json.loads(line) for line in open(data_path, "r")]
elif data_path.endswith(".pth"):
list_data_dict = torch.load(data_path, map_location="cpu")
# random.shuffle(list_data_dict)
else:
if self._get_dataset_tag(data_path) == "openx":
list_data_dict = OpenXDataItem()(data_path, image_folder, processor=processor, conversation_lib=conversation_lib, local_run=self.local_run)
elif self._get_dataset_tag(data_path) == "pixelprose":
# Load the dataset
list_data_dict = load_dataset(
data_path,
cache_dir=image_folder
)
else:
data_folder = os.path.dirname(data_path)
# get file name from data_path
data_files = data_path.split('/')[-1].split('+')
list_data_dict = []
for file in data_files:
json_path = os.path.join(data_folder, file + '.json')
list_data_dict.extend(json.load(open(json_path, "r")))
return list_data_dict
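    # Example of the '+'-joined convention handled in the final branch above
    # (hypothetical names): a data_path of "/data/anno/llava_inst+sharegpt4v"
    # loads and concatenates /data/anno/llava_inst.json and
    # /data/anno/sharegpt4v.json.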
def __call__(self, data_path, processor=None, conversation_lib=None, is_eval=False):
assert data_path is not None, "Data path is not provided"
if data_path.endswith(".yaml"):
data_dict = yaml.load(open(data_path, "r"), Loader=yaml.FullLoader)
data_path_key = 'DATA_PATH' if not is_eval else 'DATA_PATH_VAL'
image_folder_key = 'IMAGE_FOLDER' if not is_eval else 'IMAGE_FOLDER_VAL'
assert len(data_dict[data_path_key]) == len(data_dict[image_folder_key]), "Data path and image folder mismatch"
items = {}
dataset_names = []
dataset_folders = []
for i, (data_path, image_folder) in enumerate(zip(data_dict[data_path_key], data_dict[image_folder_key])):
items_temp = self._get_items(data_path, image_folder, processor, conversation_lib)
dataset_tag = self._get_dataset_tag(data_path)
if dataset_tag != "openx":
# if self.training_size > 0:
# items_temp = items_temp[:self.training_size]
if dataset_tag in ['sthv2', "ego4d", "exoego4d"]:
for item in items_temp:
item['image_folder'] = image_folder
item['dataset_tag'] = dataset_tag
item['gpt_response'] = ''
item['global_instructions'] = item['annotations']
elif dataset_tag in ["openx_magma"]:
items_dict_temp = []
for item in items_temp:
items_dict_temp.append(
{
'image': item.replace('traces', 'images').replace('.pth', '.jpg'),
'trace': item,
'image_folder': image_folder,
'dataset_tag': dataset_tag
}
)
items_temp = items_dict_temp
                    else:
                        # add image_folder and dataset tag to each item
                        for item in items_temp:
                            item['image_folder'] = image_folder
                            item['dataset_tag'] = dataset_tag
if dataset_tag in items:
items[dataset_tag].extend(items_temp)
else:
items[dataset_tag] = items_temp
dataset_names.append(dataset_tag)
dataset_folders.append(image_folder)
else:
items = self._get_items(data_path)
dataset_names = None
dataset_folders = None
return items, dataset_names, dataset_folders
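
# A minimal usage sketch (hypothetical paths). The YAML config is assumed to
# pair data paths with image folders:
#
#   DATA_PATH:
#     - /data/anno/llava_inst.json
#   IMAGE_FOLDER:
#     - /data/images/llava
#
# and would be consumed as:
#
#   items, names, folders = DataItem(training_size=-1)("/configs/train.yaml")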
from .data_utils import Ego4d as ego4d
import torch
import torchvision
import re
import cv2
import numpy as np
import os
import yaml
from tqdm import tqdm
from PIL import Image
from data.utils.visual_trace import visual_trace
from data.utils.som_tom import som_prompting, tom_prompting
from data.conversations import Constructor
import logging
logger = logging.getLogger(__name__)
class Ego4d(Constructor):
def __init__(self, **kwargs):
super(Ego4d, self).__init__(**kwargs)
# load settings from settings.yaml file
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.yaml'), 'r') as file:
self.settings = yaml.safe_load(file)
self.spatial_quant_size = kwargs.get('spatial_quant_size', 256) # this is also used for open-x
self.num_clusters = self.settings['trace_processor']['num_clusters']
self.root_dir = kwargs.get('dataset_folder', None)
self.task = kwargs.get('task', 'agent')
self.use_som_tom = kwargs.get('mm_use_som_tom', True)
        if kwargs.get('training_size', 'default') == 'default':
            self.training_size = self.settings['training'].get('size', -1)
        else:
            self.training_size = kwargs.get('training_size', -1)
        # convert an 'M' suffix to millions, e.g., '10M' means 10,000,000;
        # the value may already be an int (from settings.yaml), so guard the
        # string check before parsing
        if isinstance(self.training_size, str) and 'M' in self.training_size:
            self.training_size = int(float(self.training_size.replace('M', '')) * 1000000)
        else:
            self.training_size = int(self.training_size)
self.filtered_verb = [
'converse',
'walk',
'laugh',
'stand',
'move around',
'looks around',
]
def __call__(self, **kwargs):
return super()._construct_conv(**kwargs)
def filter_items(self, items):
"""
Filter invalid items
"""
filtered_items = []
print("Filtering items")
for item in tqdm(items):
global_instruction = item['global_instructions']
if len(global_instruction) == 0:
continue
            # skip the item if the global instruction contains any filtered verb
if any(verb in global_instruction for verb in self.filtered_verb):
continue
seg_name = item['video'].split('/')[-1]
start_str, end_str = seg_name.split('___')[0:2]
start_time = float(start_str.split('_')[-1])
end_time = float(end_str.split('_')[-1])
if (end_time-start_time) < 1:
continue
filtered_items.append(item)
if self.training_size > 0 and self.training_size < len(filtered_items):
# sample uniformly self.training_size samples from the filtered items
filtered_items = filtered_items[::(len(filtered_items)//self.training_size)]
print(f"Keep {len(filtered_items)} items from {len(items)} items")
return filtered_items
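
# Sketch of the segment-name convention that filter_items assumes above
# (hypothetical file name): a video path ending in
# "start_12.0___end_15.5___clip.mp4" parses to start_time=12.0 and
# end_time=15.5, so the clip is kept only if it spans at least 1 second.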
# tracker settings
tracker:
backward_tracking: true
ckpt_path: ./checkpoints/cotracker2.pth
grid_query_frame: 0
grid_size: 32
save_dir: ./
# sft settings
trace_processor:
num_clusters: 5
  postive_factor_threshold: 0.5 # multiplied by the max value of the trace to get the threshold
  postive_speed_threshold: 2 # speed threshold for the positive trace
trace_planner:
quant_size: 200
skip_frames: 16
  step_to_predict: 16 # use the same setting as COIN since the videos are 30 fps
step_rightmost_ratio: 0.5 # the ratio of the rightmost point to set as the start frame
training:
size: 1_000_000
from .data_utils import EpicKitchen as epic
import torch
import torchvision
import re
import cv2
import numpy as np
import os
import yaml
from PIL import Image
from data.conversations import Constructor
class EpicKitchen(Constructor):
def __init__(self, **kwargs):
super(EpicKitchen, self).__init__(**kwargs)
# load settings from settings.yaml file
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.yaml'), 'r') as file:
self.settings = yaml.safe_load(file)
self.spatial_quant_size = kwargs.get('spatial_quant_size', 256) # this is also used for open-x
self.num_clusters = self.settings['trace_processor']['num_clusters']
self.root_dir = kwargs.get('dataset_folder', None)
self.task = kwargs.get('task', 'agent')
self.use_som_tom = kwargs.get('mm_use_som_tom', True)
def __call__(self, **kwargs):
if self.task == "captioner":
return super()._construct_caption(**kwargs)
else:
return super()._construct_conv(**kwargs)
def filter_items(self, items):
"""
filter out items that are not suitable for conversation construction
"""
filtered_items = []
for item in items:
# remove closeup videos
if 'closeup' in item['gpt_response'][0] or \
'close-up' in item['gpt_response'][0] or \
'close up' in item['gpt_response'][0] or \
'What you should do next' not in item['gpt_response'][0]:
continue
# item['gpt_response'][0] = item['gpt_response'][0].replace('blue', 'yellow')
filtered_items.append(item)
print(f"Filtered {len(items) - len(filtered_items)} items from {len(items)} items")
return filtered_items
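
# Example of the filter above (hypothetical responses): an item whose first
# gpt_response reads "A close-up of hands chopping onions..." is dropped, while
# one containing the phrase "What you should do next" is kept.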
# tracker settings
tracker:
ckpt_path: "./checkpoints/cotracker2.pth"
grid_size: 32
grid_query_frame: 0
backward_tracking: True
save_dir: "./"
# sft settings
trace_processor:
num_clusters: 5
  postive_factor_threshold: 0.5 # multiplied by the max value of the trace to get the threshold
  postive_speed_threshold: 1 # speed threshold for the positive trace
trace_planner:
step_rightmost_ratio: 0.5 # the ratio of the rightmost point to set as the start frame
from .data_utils import LlaVA as llava