Commit 0063a668 authored by chenzk

v1.0
import torch
from dataclasses import dataclass, field
from magma.processing_magma import MagmaProcessor
from typing import Dict, Optional, Sequence, List
import transformers
from data.utils.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
processor: MagmaProcessor
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels, pixel_values, image_sizes = \
tuple([instance[key] for instance in instances] for key in ("input_ids", "labels", "pixel_values", "image_sizes"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids,
batch_first=True,
padding_value=self.processor.tokenizer.pad_token_id)
labels = torch.nn.utils.rnn.pad_sequence(labels,
batch_first=True,
padding_value=IGNORE_INDEX)
input_ids = input_ids[:, :self.processor.tokenizer.model_max_length]
labels = labels[:, :self.processor.tokenizer.model_max_length]
        # each instance carries a list of per-crop tensors; concatenate so each
        # instance contributes one (num_crops, ...) tensor
        pixel_values = [torch.cat(pv, dim=0) for pv in pixel_values]
        image_sizes = [torch.cat(isz, dim=0) for isz in image_sizes]
        # zero-pad across the batch so instances with fewer crops line up
        pixel_values_padded = torch.nn.utils.rnn.pad_sequence(pixel_values, batch_first=True, padding_value=0)
        image_sizes_padded = torch.nn.utils.rnn.pad_sequence(image_sizes, batch_first=True, padding_value=0)
batch = dict(
input_ids=input_ids,
labels=labels,
attention_mask=input_ids.ne(self.processor.tokenizer.pad_token_id),
pixel_values=pixel_values_padded,
image_sizes=image_sizes_padded
)
return batch
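
# A minimal usage sketch (hypothetical setup: `processor` is a loaded
# MagmaProcessor and `train_dataset` yields dicts with input_ids, labels,
# pixel_values, and image_sizes):
#
#   from torch.utils.data import DataLoader
#   collator = DataCollatorForSupervisedDataset(processor=processor)
#   loader = DataLoader(train_dataset, batch_size=4, collate_fn=collator)
#   batch = next(iter(loader))
#   # input_ids/labels: (B, L), padded and truncated to model_max_length;
#   # pixel_values: (B, max_num_crops, ...), zero-padded across the batch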
@dataclass
class DataCollatorForHFDataset(object):
"""Collate hugging face examples for supervised fine-tuning."""
tokenizer: transformers.PreTrainedTokenizer
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances]
for key in ("input_ids", "labels"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids,
batch_first=True,
padding_value=self.tokenizer.pad_token_id)
labels = torch.nn.utils.rnn.pad_sequence(labels,
batch_first=True,
padding_value=IGNORE_INDEX)
input_ids = input_ids[:, :self.tokenizer.model_max_length]
labels = labels[:, :self.tokenizer.model_max_length]
batch = dict(
input_ids=input_ids,
labels=labels,
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
)
if 'image' in instances[0] and instances[0]['image'] is not None:
images = [instance['image'] for instance in instances]
            # images may differ in shape across instances, so pass them through
            # as a list rather than stacking into a single tensor
            batch['images'] = images
if 'add_im_loss' in instances[0]:
batch['add_im_loss'] = True
if 'max_num_crops' in instances[0]:
batch['max_num_crops'] = instances[0]['max_num_crops']
return batch
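
# A sketch of the text-only path (assuming `tokenizer` has a pad token set and
# `examples` is a list of dicts with 1-D input_ids/labels tensors):
#
#   collator = DataCollatorForHFDataset(tokenizer=tokenizer)
#   batch = collator(examples)
#   # optional 'image', 'add_im_loss', 'max_num_crops' keys are forwarded when present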
import json
import yaml
import torch
import random
import os
import glob
import pickle
from datasets import load_dataset
from .openx import OpenXDataItem
from tqdm import tqdm
class DataItem:
"""
Curate data items from all data sources
"""
def __init__(self, training_size=-1, local_run=False):
self.training_size = training_size
self.local_run = local_run
def _get_dataset_tag(self, data_path):
if "epic" in data_path.lower():
return "epic"
elif "open-x" in data_path or "openx" in data_path:
if 'traces' in data_path:
return "openx_magma"
else:
return "openx"
elif "sthv2" in data_path.lower():
return "sthv2"
elif "exoego4d" in data_path.lower():
return "exoego4d"
elif 'ego4d' in data_path.lower():
return "ego4d"
elif 'aitw' in data_path.lower():
return "aitw"
elif 'seeclick' in data_path.lower() and 'ocr' in data_path.lower():
return "seeclick_ocr"
elif 'seeclick' in data_path.lower():
return "seeclick"
elif 'mind2web' in data_path.lower():
return "mind2web"
elif 'vision2ui' in data_path.lower():
return "vision2ui"
elif 'llava' in data_path.lower():
return "llava"
elif 'magma' in data_path.lower():
return "magma"
elif 'sharegpt4v' in data_path.lower():
return "sharegpt4v"
        elif 'pixelprose' in data_path.lower():
            # make the "pixelprose" branch in _get_items reachable
            return "pixelprose"
        else:
            raise ValueError(f"Dataset tag not found for {data_path}")
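    # Tag-resolution examples for the chain above (hypothetical paths; note
    # that more specific substrings are checked first, e.g. seeclick+ocr
    # before plain seeclick):
    #
    #   _get_dataset_tag("/data/open-x/traces")     -> "openx_magma"
    #   _get_dataset_tag("/data/open-x/rt-1")       -> "openx"
    #   _get_dataset_tag("/data/seeclick_ocr.json") -> "seeclick_ocr"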
def _get_items(self, data_path, image_folder=None, processor=None, conversation_lib=None):
if data_path.endswith(".json"):
list_data_dict = json.load(open(data_path, "r"))
elif data_path.endswith(".jsonl"):
list_data_dict = [json.loads(line) for line in open(data_path, "r")]
elif data_path.endswith(".pth"):
list_data_dict = torch.load(data_path, map_location="cpu")
# random.shuffle(list_data_dict)
else:
if self._get_dataset_tag(data_path) == "openx":
list_data_dict = OpenXDataItem()(data_path, image_folder, processor=processor, conversation_lib=conversation_lib, local_run=self.local_run)
elif self._get_dataset_tag(data_path) == "pixelprose":
# Load the dataset
list_data_dict = load_dataset(
data_path,
cache_dir=image_folder
)
else:
data_folder = os.path.dirname(data_path)
# get file name from data_path
data_files = data_path.split('/')[-1].split('+')
list_data_dict = []
for file in data_files:
json_path = os.path.join(data_folder, file + '.json')
list_data_dict.extend(json.load(open(json_path, "r")))
return list_data_dict
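    # Example of the '+'-joined convention handled in the final branch above
    # (hypothetical names): a data_path of "/data/anno/llava_inst+sharegpt4v"
    # loads and concatenates /data/anno/llava_inst.json and
    # /data/anno/sharegpt4v.json.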
def __call__(self, data_path, processor=None, conversation_lib=None, is_eval=False):
assert data_path is not None, "Data path is not provided"
if data_path.endswith(".yaml"):
data_dict = yaml.load(open(data_path, "r"), Loader=yaml.FullLoader)
data_path_key = 'DATA_PATH' if not is_eval else 'DATA_PATH_VAL'
image_folder_key = 'IMAGE_FOLDER' if not is_eval else 'IMAGE_FOLDER_VAL'
assert len(data_dict[data_path_key]) == len(data_dict[image_folder_key]), "Data path and image folder mismatch"
items = {}
dataset_names = []
dataset_folders = []
for i, (data_path, image_folder) in enumerate(zip(data_dict[data_path_key], data_dict[image_folder_key])):
items_temp = self._get_items(data_path, image_folder, processor, conversation_lib)
dataset_tag = self._get_dataset_tag(data_path)
if dataset_tag != "openx":
# if self.training_size > 0:
# items_temp = items_temp[:self.training_size]
if dataset_tag in ['sthv2', "ego4d", "exoego4d"]:
for item in items_temp:
item['image_folder'] = image_folder
item['dataset_tag'] = dataset_tag
item['gpt_response'] = ''
item['global_instructions'] = item['annotations']
elif dataset_tag in ["openx_magma"]:
items_dict_temp = []
for item in items_temp:
items_dict_temp.append(
{
'image': item.replace('traces', 'images').replace('.pth', '.jpg'),
'trace': item,
'image_folder': image_folder,
'dataset_tag': dataset_tag
}
)
items_temp = items_dict_temp
                    else:
                        # add image_folder and dataset tag to each item
                        for item in items_temp:
                            item['image_folder'] = image_folder
                            item['dataset_tag'] = dataset_tag
if dataset_tag in items:
items[dataset_tag].extend(items_temp)
else:
items[dataset_tag] = items_temp
dataset_names.append(dataset_tag)
dataset_folders.append(image_folder)
else:
items = self._get_items(data_path)
dataset_names = None
dataset_folders = None
return items, dataset_names, dataset_folders
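
# A minimal usage sketch (hypothetical paths). The YAML config is assumed to
# pair data paths with image folders:
#
#   DATA_PATH:
#     - /data/anno/llava_inst.json
#   IMAGE_FOLDER:
#     - /data/images/llava
#
# and would be consumed as:
#
#   items, names, folders = DataItem(training_size=-1)("/configs/train.yaml")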
from .data_utils import Ego4d as ego4d
import torch
import torchvision
import re
import cv2
import numpy as np
import os
import yaml
from tqdm import tqdm
from PIL import Image
from data.utils.visual_trace import visual_trace
from data.utils.som_tom import som_prompting, tom_prompting
from data.conversations import Constructor
import logging
logger = logging.getLogger(__name__)
class Ego4d(Constructor):
def __init__(self, **kwargs):
super(Ego4d, self).__init__(**kwargs)
# load settings from settings.yaml file
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.yaml'), 'r') as file:
self.settings = yaml.safe_load(file)
self.spatial_quant_size = kwargs.get('spatial_quant_size', 256) # this is also used for open-x
self.num_clusters = self.settings['trace_processor']['num_clusters']
self.root_dir = kwargs.get('dataset_folder', None)
self.task = kwargs.get('task', 'agent')
self.use_som_tom = kwargs.get('mm_use_som_tom', True)
        if kwargs.get('training_size', 'default') == 'default':
            self.training_size = self.settings['training'].get('size', -1)
        else:
            self.training_size = kwargs.get('training_size', -1)
        # convert an 'M' suffix to millions, e.g., '10M' means 10,000,000;
        # the value may already be an int (from settings.yaml), so guard the
        # string check before parsing
        if isinstance(self.training_size, str) and 'M' in self.training_size:
            self.training_size = int(float(self.training_size.replace('M', '')) * 1000000)
        else:
            self.training_size = int(self.training_size)
self.filtered_verb = [
'converse',
'walk',
'laugh',
'stand',
'move around',
'looks around',
]
def __call__(self, **kwargs):
return super()._construct_conv(**kwargs)
def filter_items(self, items):
"""
Filter invalid items
"""
filtered_items = []
print("Filtering items")
for item in tqdm(items):
global_instruction = item['global_instructions']
if len(global_instruction) == 0:
continue
            # skip the item if the global instruction contains any filtered verb
if any(verb in global_instruction for verb in self.filtered_verb):
continue
seg_name = item['video'].split('/')[-1]
start_str, end_str = seg_name.split('___')[0:2]
start_time = float(start_str.split('_')[-1])
end_time = float(end_str.split('_')[-1])
if (end_time-start_time) < 1:
continue
filtered_items.append(item)
if self.training_size > 0 and self.training_size < len(filtered_items):
# sample uniformly self.training_size samples from the filtered items
filtered_items = filtered_items[::(len(filtered_items)//self.training_size)]
print(f"Keep {len(filtered_items)} items from {len(items)} items")
return filtered_items
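
# Sketch of the segment-name convention that filter_items assumes above
# (hypothetical file name): a video path ending in
# "start_12.0___end_15.5___clip.mp4" parses to start_time=12.0 and
# end_time=15.5, so the clip is kept only if it spans at least 1 second.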
# tracker settings
tracker:
backward_tracking: true
ckpt_path: ./checkpoints/cotracker2.pth
grid_query_frame: 0
grid_size: 32
save_dir: ./
# sft settings
trace_processor:
num_clusters: 5
  postive_factor_threshold: 0.5 # multiplied by the max value of the trace to get the threshold
  postive_speed_threshold: 2 # speed threshold for the positive trace
trace_planner:
quant_size: 200
skip_frames: 16
  step_to_predict: 16 # use the same setting as COIN since the videos are 30 fps
step_rightmost_ratio: 0.5 # the ratio of the rightmost point to set as the start frame
training:
size: 1_000_000
from .data_utils import EpicKitchen as epic
import torch
import torchvision
import re
import cv2
import numpy as np
import os
import yaml
from PIL import Image
from data.conversations import Constructor
class EpicKitchen(Constructor):
def __init__(self, **kwargs):
super(EpicKitchen, self).__init__(**kwargs)
# load settings from settings.yaml file
with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.yaml'), 'r') as file:
self.settings = yaml.safe_load(file)
self.spatial_quant_size = kwargs.get('spatial_quant_size', 256) # this is also used for open-x
self.num_clusters = self.settings['trace_processor']['num_clusters']
self.root_dir = kwargs.get('dataset_folder', None)
self.task = kwargs.get('task', 'agent')
self.use_som_tom = kwargs.get('mm_use_som_tom', True)
def __call__(self, **kwargs):
if self.task == "captioner":
return super()._construct_caption(**kwargs)
else:
return super()._construct_conv(**kwargs)
def filter_items(self, items):
"""
filter out items that are not suitable for conversation construction
"""
filtered_items = []
for item in items:
# remove closeup videos
if 'closeup' in item['gpt_response'][0] or \
'close-up' in item['gpt_response'][0] or \
'close up' in item['gpt_response'][0] or \
'What you should do next' not in item['gpt_response'][0]:
continue
# item['gpt_response'][0] = item['gpt_response'][0].replace('blue', 'yellow')
filtered_items.append(item)
print(f"Filtered {len(items) - len(filtered_items)} items from {len(items)} items")
return filtered_items
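
# Example of the filter above (hypothetical responses): an item whose first
# gpt_response reads "A close-up of hands chopping onions..." is dropped, while
# one containing the phrase "What you should do next" is kept.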
# tracker settings
tracker:
ckpt_path: "./checkpoints/cotracker2.pth"
grid_size: 32
grid_query_frame: 0
backward_tracking: True
save_dir: "./"
# sft settings
trace_processor:
num_clusters: 5
  postive_factor_threshold: 0.5 # multiplied by the max value of the trace to get the threshold
  postive_speed_threshold: 1 # speed threshold for the positive trace
trace_planner:
step_rightmost_ratio: 0.5 # the ratio of the rightmost point to set as the start frame
from .data_utils import LlaVA as llava