Commit ef30d662 authored by bailuo

init
{
"gradient_accumulation_steps": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_clipping": "auto",
"zero_allow_untested_optimizer": true,
"zero_force_ds_cpu_optimizer": false,
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"allgather_bucket_size": 5368709120,
"reduce_bucket_size": 5368709120,
"reduce_scatter": true,
"sub_group_size": 1e9,
"contiguous_gradients": true,
"allgather_partitions": true
},
"fp16": {
"enabled": false,
"initial_scale_power": 16
},
"bf16": {
"enabled": true
}
}
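# Usage sketch for the ZeRO-2 bf16 config above (illustrative addition, not part of
# the original files): DeepSpeed consumes such a config via ``deepspeed.initialize``,
# either as a dict or as a path to the JSON; the file name below is an assumption.
#
#   import deepspeed
#   from torch import nn
#
#   net = nn.Linear(16, 16)
#   engine, optimizer, _, _ = deepspeed.initialize(
#       model=net,
#       model_parameters=net.parameters(),
#       config='zero2_bf16.json',  # or pass the dict above directly
#   )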
import os
import numpy as np
import torch
from PIL import Image
from pycocotools import mask as _mask
from utils import REFER, Summary, AverageMeter, intersectionAndUnionGPU, master_only
DATASETS_ATTRIBUTES = {
'refcoco': {'splitBy': "unc", 'dataset_name': 'refcoco'},
'refcoco_plus': {'splitBy': "unc", 'dataset_name': 'refcoco+'},
'refcocog': {'splitBy': "umd", 'dataset_name': 'refcocog'},
}
class RESDataset:
METAINFO: dict = dict(name='Referring Expression Segmentation')
def __init__(self,
image_folder,
dataset_name,
data_path=None,
split='val',
):
self.split = split
self._set_attribute(dataset_name)
json_datas = self.json_file_preprocess(data_path)
self.json_datas = json_datas
self.image_folder = image_folder
def _set_attribute(self, dataset_name):
attr_dict = DATASETS_ATTRIBUTES[dataset_name]
self.splitBy = attr_dict['splitBy']
self.dataset_name = attr_dict['dataset_name']
def __len__(self):
return len(self.json_datas)
def real_len(self):
return len(self.json_datas)
def json_file_preprocess(self, data_path):
splitBy = self.splitBy
dataset_name = self.dataset_name
refer_api = REFER(data_path, dataset_name, splitBy)
ref_ids_train = refer_api.getRefIds(split=self.split)
images_ids_train = refer_api.getImgIds(ref_ids=ref_ids_train)
refs_train = refer_api.loadRefs(ref_ids=ref_ids_train)
self.img2refs = self.create_img_to_refs_mapping(refs_train)
image_infos = []
loaded_images = refer_api.loadImgs(image_ids=images_ids_train)
for item in loaded_images:
item = item.copy()
image_infos.append(item)
self.annotations = refer_api.Anns
refs = [self.img2refs[image_info['id']] for image_info in image_infos]
ret = []
for image_info, ref in zip(image_infos, refs):
if len(ref) == 0:
continue
sents = []
ann_ids = []
for _ref in ref:
for sent in _ref["sentences"]:
text = sent["sent"]
sents.append(text)
ann_ids.append(_ref["ann_id"])
            sampled_inds = list(range(len(sents)))
            sampled_sents = [sents[ind] for ind in sampled_inds]
            sampled_ann_ids = [ann_ids[ind] for ind in sampled_inds]
            selected_labels = sampled_sents
ret.append(
{'image_info': image_info,
'sampled_ann_id': sampled_ann_ids,
'selected_labels': selected_labels,
'image': image_info['file_name']
}
)
return ret
def create_img_to_refs_mapping(self, refs_train):
img2refs = {}
for ref in refs_train:
img2refs[ref["image_id"]] = img2refs.get(ref["image_id"], []) + [ref, ]
return img2refs
def decode_mask(self, annotations_ids, image_info):
flag = False
masks = []
for ann_id in annotations_ids:
if isinstance(ann_id, list):
flag = True
if -1 in ann_id:
assert len(ann_id) == 1
m = np.zeros((image_info["height"], image_info["width"])).astype(
np.uint8
)
else:
m_final = np.zeros(
(image_info["height"], image_info["width"])
).astype(np.uint8)
for ann_id_i in ann_id:
ann = self.annotations[ann_id_i]
if len(ann["segmentation"]) == 0:
m = np.zeros(
(image_info["height"], image_info["width"])
).astype(np.uint8)
else:
if type(ann["segmentation"][0]) == list: # polygon
rle = _mask.frPyObjects(
ann["segmentation"], image_info["height"], image_info["width"], )
else:
rle = ann["segmentation"]
for i in range(len(rle)):
if not isinstance(rle[i]["counts"], bytes):
rle[i]["counts"] = rle[i]["counts"].encode()
m = _mask.decode(rle)
m = np.sum(
m, axis=2
                        )  # sometimes there are multiple binary maps (corresponding to multiple segs)
m = m.astype(np.uint8) # convert to np.uint8
m_final = m_final | m
m = m_final
masks.append(m)
continue
ann = self.annotations[ann_id]
if len(ann["segmentation"]) == 0:
m = np.zeros((image_info["height"], image_info["width"])).astype(
np.uint8
)
masks.append(m)
continue
if type(ann["segmentation"][0]) == list: # polygon
rle = _mask.frPyObjects(
ann["segmentation"], image_info["height"], image_info["width"]
)
else:
rle = ann["segmentation"]
for i in range(len(rle)):
if not isinstance(rle[i]["counts"], bytes):
rle[i]["counts"] = rle[i]["counts"].encode()
m = _mask.decode(rle)
            m = np.sum(m, axis=2)  # sometimes there are multiple binary maps (corresponding to multiple segs)
m = m.astype(np.uint8) # convert to np.uint8
masks.append(m)
masks = np.stack(masks, axis=0)
masks = torch.from_numpy(masks)
return masks
def only_get_text_infos(self, json_data):
return {'sampled_sents': json_data['selected_labels']}
def get_questions(self, text_require_infos):
sampled_sents = text_require_infos['sampled_sents']
ret = []
for sent in sampled_sents:
ret.append("<image>\n Please segment {} in this image.".format(sent))
return ret
def filter_data_dict(self, data_dict):
names = ['image', 'text', 'gt_masks', 'img_id']
ret = {name: data_dict[name] for name in names}
return ret
def __getitem__(self, index):
index = index % self.real_len()
data_dict = self.json_datas[index]
text_require_infos = self.only_get_text_infos(data_dict)
questions = self.get_questions(text_require_infos)
assert data_dict.get('image', None) is not None
if data_dict.get('image', None) is not None:
image_file = data_dict['image']
image_file = os.path.join(self.image_folder, image_file)
image = Image.open(image_file).convert('RGB')
# process and get masks for evaluation
masks = self.decode_mask(data_dict['sampled_ann_id'], data_dict['image_info'])
data_dict['gt_masks'] = masks
data_dict['image'] = image
data_dict['text'] = questions
data_dict['img_id'] = str(index)
return self.filter_data_dict(data_dict)
@master_only
def evaluate(self, result, work_dir):
trackers = {
"intersection": AverageMeter("Intersec", ":6.3f", Summary.SUM),
"union": AverageMeter("Union", ":6.3f", Summary.SUM),
"gIoU": AverageMeter("gIoU", ":6.3f", Summary.SUM)
}
for pred_dict in result:
intersection, union, accuracy_iou = 0.0, 0.0, 0.0
masks = pred_dict['prediction_masks']
_masks = []
for mask in masks:
if mask is not None:
mask = rle_to_mask(mask)
_masks.append(mask)
targets = pred_dict['gt_masks']
_targets = rle_to_mask(targets)
for i_item, _mask in enumerate(_masks):
if _mask is None:
continue
_target = _targets[i_item: i_item+1]
for prediction, target in zip(_mask, _target):
prediction = torch.from_numpy(prediction).int().cuda()
target = torch.from_numpy(target).int().cuda()
intersect, union_, _ = intersectionAndUnionGPU(
prediction.contiguous().clone(), target.contiguous(), 2, ignore_index=255
)
intersection += intersect
union += union_
accuracy_iou += intersect / (union_ + 1e-5)
accuracy_iou[union_ == 0] += 1.0
intersection, union = intersection.cpu().numpy(), union.cpu().numpy()
accuracy_iou = accuracy_iou.cpu().numpy() / _targets.shape[0]
trackers["intersection"].update(intersection)
trackers["union"].update(union)
trackers["gIoU"].update(accuracy_iou, n=_targets.shape[0])
cur_results = {'pixel_intersection': trackers["intersection"].sum[1],
'pixel_union': trackers["union"].sum[1],
'gIoU': trackers["gIoU"].avg[1],
'mask_counts': trackers["gIoU"].count,
}
class_iou = cur_results['pixel_intersection'] / (cur_results['pixel_union'] + 1e-10)
global_iou = cur_results['gIoU']
        print('============================================')
        print('CIoU: {}, GIoU: {}'.format(class_iou, global_iou))
        print('============================================')
        print('RES_{}_{} successfully finished evaluating'.format(self.dataset_name, self.split))
return {'Acc': class_iou}
def rle_to_mask(rle):
mask = []
for r in rle:
m = _mask.decode(r)
m = np.uint8(m)
mask.append(m)
mask = np.stack(mask, axis=0)
return mask
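
# Minimal smoke-test sketch (an added example with assumed paths): the image and
# annotation roots below mirror the defaults used by the RES evaluation script
# (IMAGE_FOLDER / DATA_PATH); adjust them to your local layout.
if __name__ == '__main__':
    _ds = RESDataset(
        image_folder='./data/glamm_data/images/coco2014/train2014/',
        dataset_name='refcoco',
        data_path='./data/ref_seg/',
        split='val',
    )
    _sample = _ds[0]
    print(len(_ds), _sample['text'][0], _sample['gt_masks'].shape)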
from .RES import RESDataset
from .refVOS import RefVOSDataset
from torch.utils.data import Dataset
import copy
from collections.abc import Mapping
from typing import Union
from mmengine.config import Config
import logging
from mmengine.fileio import list_from_file
from mmengine.logging import print_log
from abc import abstractmethod
class BaseEvalDataset(Dataset):
METAINFO: dict = dict(name='default')
def __init__(self, metainfo: Union[Mapping, Config, None] = None):
self._metainfo = self._load_metainfo(copy.deepcopy(metainfo))
@classmethod
def _load_metainfo(cls,
metainfo: Union[Mapping, Config, None] = None) -> dict:
"""Collect meta information from the dictionary of meta.
Args:
metainfo (Mapping or Config, optional): Meta information dict.
                If ``metainfo`` contains an existing filename, it will be
                parsed by ``list_from_file``.
Returns:
dict: Parsed meta information.
"""
# avoid `cls.METAINFO` being overwritten by `metainfo`
cls_metainfo = copy.deepcopy(cls.METAINFO)
if metainfo is None:
return cls_metainfo
if not isinstance(metainfo, (Mapping, Config)):
raise TypeError('metainfo should be a Mapping or Config, '
f'but got {type(metainfo)}')
for k, v in metainfo.items():
if isinstance(v, str):
                # If the value is a string and can be loaded from the
                # corresponding backend, it is treated as the file name of a meta file.
try:
cls_metainfo[k] = list_from_file(v)
except (TypeError, FileNotFoundError):
print_log(
f'{v} is not a meta file, simply parsed as meta '
'information',
logger='current',
level=logging.WARNING)
cls_metainfo[k] = v
else:
cls_metainfo[k] = v
return cls_metainfo
@property
def metainfo(self) -> dict:
"""Get meta information of dataset.
Returns:
dict: meta information collected from ``BaseDataset.METAINFO``,
annotation file and metainfo argument during instantiation.
"""
return copy.deepcopy(self._metainfo)
@abstractmethod
def evaluate(self, results, work_dir):
pass
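
# Minimal subclass sketch (an added example): override METAINFO and implement
# ``evaluate``. With ``metainfo={'classes': ['a', 'b']}`` passed at construction
# time, ``self.metainfo`` becomes {'name': 'dummy', 'classes': ['a', 'b']}.
class _DummyEvalDataset(BaseEvalDataset):
    METAINFO: dict = dict(name='dummy')

    def __len__(self):
        return 0

    def evaluate(self, results, work_dir):
        return {}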
import os
import json
import mmengine
from PIL import Image
import copy
from mmengine.dist import master_only
from .base_eval_dataset import BaseEvalDataset
SEG_PROMPT = "<image>\nPlease segment {}."
class RefVOSDataset(BaseEvalDataset):
def __init__(self,
image_folder,
expression_file,
mask_file,
):
super().__init__()
vid2metaid, metas, mask_dict = self.json_file_preprocess(expression_file, mask_file)
self.vid2metaid = vid2metaid
self.videos = list(self.vid2metaid.keys())
self.mask_dict = mask_dict
self.text_data = metas
self.image_folder = image_folder
def __len__(self):
return len(self.text_data)
def real_len(self):
return len(self.text_data)
def json_file_preprocess(self, expression_file, mask_file):
with open(expression_file, 'r') as f:
expression_datas = json.load(f)['videos']
metas = []
vid2metaid = {}
for vid_name in expression_datas:
vid_express_data = expression_datas[vid_name]
vid_frames = sorted(vid_express_data['frames'])
vid_len = len(vid_frames)
exp_id_list = sorted(list(vid_express_data['expressions'].keys()))
for exp_id in exp_id_list:
exp_dict = vid_express_data['expressions'][exp_id]
meta = {}
meta['video'] = vid_name
meta['exp'] = exp_dict['exp']
meta['frames'] = vid_frames
meta['exp_id'] = exp_id
meta['length'] = vid_len
metas.append(meta)
if vid_name not in vid2metaid.keys():
vid2metaid[vid_name] = []
vid2metaid[vid_name].append(len(metas) - 1)
if mask_file is not None:
mask_dict = mmengine.load(mask_file)
else:
mask_dict = None
return vid2metaid, metas, mask_dict
def __getitem__(self, index):
video_obj_info = copy.deepcopy(self.text_data[index])
exp = video_obj_info['exp']
data_dict = {}
video_id = video_obj_info['video']
frames_files = video_obj_info['frames']
frames_files = [
os.path.join(self.image_folder,video_id, frame_file + ".jpg") for frame_file in frames_files
]
images = []
ori_width, ori_height = None, None
for frame_idx, frame_path in enumerate(frames_files):
frame_image = Image.open(frame_path).convert('RGB')
if ori_height is None:
ori_width, ori_height = frame_image.size
else:
assert ori_width == frame_image.size[0]
assert ori_height == frame_image.size[1]
images.append(frame_image)
data_dict['type'] = 'video'
data_dict['index'] = index
data_dict['video_id'] = video_id
data_dict['images'] = images
data_dict['exp_id'] = video_obj_info['exp_id']
data_dict['frames'] = video_obj_info['frames']
data_dict['text_prompt'] = SEG_PROMPT.format(exp) if '?' not in exp else exp
data_dict['image_folder'] = self.image_folder
data_dict['length'] = video_obj_info['length']
data_dict['ori_height'] = ori_height
data_dict['ori_width'] = ori_width
return data_dict
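
# Minimal smoke-test sketch (an added example with assumed paths): these paths
# follow the MEVIS entry of DATASETS_INFO in the RefVOS evaluation script;
# adjust them to your setup.
if __name__ == '__main__':
    _ds = RefVOSDataset(
        image_folder='data/video_datas/mevis/valid/JPEGImages',
        expression_file='data/video_datas/mevis/valid/meta_expressions.json',
        mask_file=None,
    )
    _item = _ds[0]
    print(len(_ds), _item['video_id'], _item['exp_id'], len(_item['images']))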
#!/usr/bin/env bash
FILE=$1
MODEL=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-$((28500 + $RANDOM % 2000))}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
if command -v torchrun &> /dev/null
then
echo "Using torchrun mode."
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
  torchrun --nnodes=${NNODES} \
--node_rank=${NODE_RANK} \
--master_addr=${MASTER_ADDR} \
--master_port=${PORT} \
--nproc_per_node=${GPUS} \
${FILE} ${MODEL} --launcher pytorch "${@:4}"
else
echo "Using launch mode."
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
python -m torch.distributed.launch \
--nnodes=${NNODES} \
--node_rank=${NODE_RANK} \
--master_addr=${MASTER_ADDR} \
--master_port=${PORT} \
--nproc_per_node=${GPUS} \
${FILE} ${MODEL} --launcher pytorch "${@:4}"
fi
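
# Example invocation (illustrative; the launcher and eval script names below are
# placeholders for files in this repo):
#   bash dist.sh projects/llava_sam2/evaluation/refvos_eval.py /path/to/hf_model 8 --dataset MEVIS
# Positional arguments are FILE (the eval script), MODEL (HF model path) and GPUS;
# everything after the third argument is forwarded to FILE behind "--launcher pytorch".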
import argparse
import math
import os
import torch
import tqdm
from pycocotools import mask as mask_utils
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
from utils import _init_dist_pytorch, get_dist_info, collect_results_cpu
from PIL import Image
import re
import json
def parse_args():
parser = argparse.ArgumentParser(description='GCG')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--split',
default='val',
help='Specify a split')
parser.add_argument(
'--save_dir',
default='./gcg_pred/',
help='save path')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
IMAGE_FOLDER = './data/glamm_data/images/grandf/val_test/'
class GCGInferenceDataset:
def __init__(self,
image_folder,
save_dir=None,
):
self.image_folder = image_folder
self.images = os.listdir(image_folder)
if save_dir is not None:
# filter evaluated
self.save_dir = save_dir
            existing_files = os.listdir(self.save_dir)
            existing_files = [_file[:-5] for _file in existing_files]
            _images = []
            for item in self.images:
                if item[:-4] not in existing_files:
                    _images.append(item)
            self.images = _images
def __len__(self):
return len(self.images)
def get_questions(self):
question = "Could you please give me a brief description of the image? Please respond with interleaved \
segmentation masks for the corresponding parts of the answer."
return question
def __getitem__(self, index):
data_dict = {}
questions = self.get_questions()
image_file = self.images[index]
data_dict['image_file'] = image_file
image_file = os.path.join(self.image_folder, image_file)
image = Image.open(image_file).convert('RGB')
data_dict['image'] = image
data_dict['text'] = "<image>\n" + questions
data_dict['img_id'] = image_file
return data_dict
def main():
args = parse_args()
if args.launcher != 'none':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
torch.cuda.set_device(rank)
else:
rank = 0
world_size = 1
# build model
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
if not os.path.exists(args.save_dir):
os.mkdir(args.save_dir)
dataset = GCGInferenceDataset(
image_folder=IMAGE_FOLDER,
save_dir=args.save_dir,
)
results = []
n_samples = len(dataset)
per_rank_samples = math.ceil(n_samples / world_size) + 1
per_rank_ids = range(per_rank_samples * rank,
min(n_samples, per_rank_samples * (rank + 1)))
for idx in tqdm.tqdm(per_rank_ids):
data_batch = dataset[idx]
prediction = {'img_id': data_batch['img_id'], 'image_file': data_batch['image_file']}
del data_batch['img_id'], data_batch['image_file']
w, h = data_batch['image'].size
pred_dict = model.predict_forward(**data_batch, tokenizer=tokenizer)
if 'prediction_masks' not in pred_dict.keys() or pred_dict['prediction_masks'] is None or len(pred_dict['prediction_masks']) == 0:
print("No SEG !!!")
prediction['prediction_masks'] = torch.zeros((0, h, w), dtype=torch.bool)
else:
prediction['prediction_masks'] = torch.stack(pred_dict['prediction_masks'], dim=0)[:, 0]
process_and_save_output(
args.save_dir,
prediction['image_file'],
pred_dict['prediction'],
prediction['prediction_masks']
)
results.append(pred_dict['prediction'])
results = collect_results_cpu(results, len(dataset), tmpdir='./gcg_eval_tmp')
def process_and_save_output(output_dir, image_name, text_output, pred_masks):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
    text_output = text_output.replace("<s>", "").replace("\n", "").replace("  ", " ")
text_output = text_output.split("ASSISTANT: ")[-1]
cleaned_str = re.sub(r'<.*?>', '', text_output)
pattern = re.compile(r'<p>(.*?)<\/p>')
phrases = pattern.findall(text_output)
phrases = [p.strip() for p in phrases]
# Remove the [SEG] token
cleaned_str = cleaned_str.replace('[SEG]', '')
# Strip unnecessary spaces
cleaned_str = ' '.join(cleaned_str.split()).strip("'")
cleaned_str = cleaned_str.strip()
# Convert the predicted masks into RLE format
pred_masks_tensor = pred_masks.cpu()
uncompressed_mask_rles = mask_to_rle_pytorch(pred_masks_tensor)
rle_masks = []
for m in uncompressed_mask_rles:
rle_masks.append(coco_encode_rle(m))
# Create results dictionary
# print(f"clean_str: {cleaned_str}")
result_dict = {
"image_id": image_name[:-4],
"caption": cleaned_str,
"phrases": phrases,
"pred_masks": rle_masks
}
# print(cleaned_str)
# print(phrases)
output_path = f"{output_dir}/{image_name[:-4]}.json"
with open(output_path, 'w') as f:
json.dump(result_dict, f)
return
def mask_to_rle_pytorch(tensor: torch.Tensor):
"""
Encodes masks to an uncompressed RLE, in the format expected by
pycoco tools.
"""
# Put in fortran order and flatten h,w
b, h, w = tensor.shape
tensor = tensor.permute(0, 2, 1).flatten(1)
# Compute change indices
diff = tensor[:, 1:] ^ tensor[:, :-1]
change_indices = diff.nonzero()
# Encode run length
out = []
for i in range(b):
cur_idxs = change_indices[change_indices[:, 0] == i, 1]
cur_idxs = torch.cat(
[torch.tensor([0], dtype=cur_idxs.dtype, device=cur_idxs.device), cur_idxs + 1,
torch.tensor([h * w], dtype=cur_idxs.dtype, device=cur_idxs.device), ]
)
btw_idxs = cur_idxs[1:] - cur_idxs[:-1]
counts = [] if tensor[i, 0] == 0 else [0]
counts.extend(btw_idxs.detach().cpu().tolist())
out.append({"size": [h, w], "counts": counts})
return out
def coco_encode_rle(uncompressed_rle):
h, w = uncompressed_rle["size"]
rle = mask_utils.frPyObjects(uncompressed_rle, h, w)
rle["counts"] = rle["counts"].decode("utf-8") # Necessary to serialize with json
return rle
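def _rle_roundtrip_check():
    """Sanity sketch (an added example, not called by the script): encode a toy
    mask with mask_to_rle_pytorch + coco_encode_rle, then decode it back with
    pycocotools. Counts are re-encoded to bytes because ``decode`` expects bytes."""
    toy = torch.zeros((1, 4, 6), dtype=torch.bool)
    toy[0, 1:3, 2:5] = True
    rle = coco_encode_rle(mask_to_rle_pytorch(toy)[0])
    rle_bytes = {"size": rle["size"], "counts": rle["counts"].encode()}
    decoded = mask_utils.decode(rle_bytes)  # (4, 6) uint8 array
    assert (decoded == toy[0].numpy().astype("uint8")).all()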
if __name__ == '__main__':
main()
import os
import json
import argparse
from tqdm import tqdm
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from pycocotools import mask as maskUtils
from pycocoevalcap.eval import COCOEvalCap
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
def parse_args():
parser = argparse.ArgumentParser(description="Training")
parser.add_argument("--split", required=True, help="Evaluation split, options are 'val', 'test'")
parser.add_argument("--prediction_dir_path", required=True, help="The path where the inference results are stored.")
parser.add_argument("--gt_dir_path", required=False, default="./data/glamm_data/annotations/gcg_val_test/",
help="The path containing GranD-f evaluation annotations.")
args = parser.parse_args()
return args
# Load pre-trained model tokenizer and model for evaluation
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
def get_bert_embedding(text):
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
outputs = model(**inputs)
# Use the mean of the last hidden states as sentence embedding
sentence_embedding = torch.mean(outputs.last_hidden_state[0], dim=0).detach().numpy()
return sentence_embedding
def compute_iou(mask1, mask2):
intersection = np.logical_and(mask1, mask2)
union = np.logical_or(mask1, mask2)
iou = np.sum(intersection) / np.sum(union)
return iou
def bbox_to_x1y1x2y2(bbox):
x1, y1, w, h = bbox
bbox = [x1, y1, x1 + w, y1 + h]
return bbox
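# e.g. bbox_to_x1y1x2y2([10, 20, 30, 40]) -> [10, 20, 40, 60]  (xywh -> xyxy)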
def compute_miou(pred_masks, gt_masks):
# Computing mIoU between predicted masks and ground truth masks
iou_matrix = np.zeros((len(pred_masks), len(gt_masks)))
for i, pred_mask in enumerate(pred_masks):
for j, gt_mask in enumerate(gt_masks):
iou_matrix[i, j] = compute_iou(pred_mask, gt_mask)
# One-to-one pairing and mean IoU calculation
paired_iou = []
while iou_matrix.size > 0 and np.max(iou_matrix) > 0:
max_iou_idx = np.unravel_index(np.argmax(iou_matrix, axis=None), iou_matrix.shape)
paired_iou.append(iou_matrix[max_iou_idx])
iou_matrix = np.delete(iou_matrix, max_iou_idx[0], axis=0)
iou_matrix = np.delete(iou_matrix, max_iou_idx[1], axis=1)
return np.mean(paired_iou) if paired_iou else 0.0
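def _miou_toy_example():
    """Worked toy example for the greedy matching above (an added illustration,
    not called during evaluation): one perfect match (IoU 1.0) and one
    half-overlapping pair (IoU 0.5) average to an mIoU of 0.75."""
    a = np.zeros((4, 4), dtype=np.uint8)
    a[:2, :2] = 1
    b = np.zeros((4, 4), dtype=np.uint8)
    b[2:, 2:] = 1
    b_half = np.zeros((4, 4), dtype=np.uint8)
    b_half[2:, 2:3] = 1
    assert abs(compute_miou([a, b_half], [a, b]) - 0.75) < 1e-6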
def evaluate_mask_miou(coco_gt, image_ids, pred_save_path):
# Load predictions
coco_dt = coco_gt.loadRes(pred_save_path)
mious = []
for image_id in tqdm(image_ids):
# Getting ground truth masks
matching_anns = [ann for ann in coco_gt.anns.values() if ann['image_id'] == image_id]
ann_ids = [ann['id'] for ann in matching_anns]
gt_anns = coco_gt.loadAnns(ann_ids)
gt_masks = [maskUtils.decode(ann['segmentation']) for ann in gt_anns if 'segmentation' in ann]
# Getting predicted masks
matching_anns = [ann for ann in coco_dt.anns.values() if ann['image_id'] == image_id]
dt_ann_ids = [ann['id'] for ann in matching_anns]
pred_anns = coco_dt.loadAnns(dt_ann_ids)
pred_masks = [maskUtils.decode(ann['segmentation']) for ann in pred_anns if 'segmentation' in ann]
# Compute and save the mIoU for the current image
mious.append(compute_miou(pred_masks, gt_masks))
# Report mean IoU across all images
mean_miou = np.mean(mious) if mious else 0.0 # If list is empty, return 0.0
print(f"Mean IoU (mIoU) across all images: {mean_miou:.3f}")
def compute_iou_matrix(pred_masks, gt_masks):
iou_matrix = np.zeros((len(pred_masks), len(gt_masks)))
for i, pred_mask in enumerate(pred_masks):
for j, gt_mask in enumerate(gt_masks):
iou_matrix[i, j] = compute_iou(pred_mask, gt_mask)
return iou_matrix
def text_similarity_bert(str1, str2):
emb1 = get_bert_embedding(str1)
emb2 = get_bert_embedding(str2)
return cosine_similarity([emb1], [emb2])[0, 0]
def find_best_matches(gt_anns, gt_labels, dt_anns, dt_labels, iou_threshold, text_sim_threshold, vectorizer=None):
best_matches = []
    # Compute pair-wise IoU
pred_masks = [maskUtils.decode(ann['segmentation']) for ann in dt_anns]
gt_masks = [maskUtils.decode(ann['segmentation']) for ann in gt_anns]
ious = compute_iou_matrix(gt_masks, pred_masks)
text_sims = np.zeros((len(gt_labels), len(dt_labels)))
for i, gt_label in enumerate(gt_labels):
for j, dt_label in enumerate(dt_labels):
text_sims[i, j] = text_similarity_bert(gt_label, dt_label)
# Find one-to-one matches satisfying both IoU and text similarity thresholds
while ious.size > 0:
max_iou_idx = np.unravel_index(np.argmax(ious), ious.shape)
if ious[max_iou_idx] < iou_threshold or text_sims[max_iou_idx] < text_sim_threshold:
break # No admissible pair found
best_matches.append(max_iou_idx)
# Remove selected annotations from consideration
ious[max_iou_idx[0], :] = 0
ious[:, max_iou_idx[1]] = 0
text_sims[max_iou_idx[0], :] = 0
text_sims[:, max_iou_idx[1]] = 0
return best_matches # List of index pairs [(gt_idx, dt_idx), ...]
def evaluate_recall_with_mapping(coco_gt, coco_cap_gt, image_ids, pred_save_path, cap_pred_save_path, iou_threshold=0.5,
text_sim_threshold=0.5):
coco_dt = coco_gt.loadRes(pred_save_path)
coco_cap_dt = coco_cap_gt.loadRes(cap_pred_save_path)
true_positives = 0
actual_positives = 0
for image_id in tqdm(image_ids):
try:
# gt_ann_ids = coco_gt.getAnnIds(imgIds=image_id, iscrowd=None)
matching_anns = [ann for ann in coco_gt.anns.values() if ann['image_id'] == image_id]
gt_ann_ids = [ann['id'] for ann in matching_anns]
gt_anns = coco_gt.loadAnns(gt_ann_ids)
# dt_ann_ids = coco_dt.getAnnIds(imgIds=image_id, iscrowd=None)
matching_anns = [ann for ann in coco_dt.anns.values() if ann['image_id'] == image_id]
dt_ann_ids = [ann['id'] for ann in matching_anns]
dt_anns = coco_dt.loadAnns(dt_ann_ids)
# gt_cap_ann_ids = coco_cap_gt.getAnnIds(imgIds=image_id)
matching_anns = [ann for ann in coco_cap_gt.anns.values() if ann['image_id'] == image_id]
gt_cap_ann_ids = [ann['id'] for ann in matching_anns]
gt_cap_ann = coco_cap_gt.loadAnns(gt_cap_ann_ids)[0]
# dt_cap_ann_ids = coco_cap_dt.getAnnIds(imgIds=image_id)
matching_anns = [ann for ann in coco_cap_dt.anns.values() if ann['image_id'] == image_id]
dt_cap_ann_ids = [ann['id'] for ann in matching_anns]
dt_cap_ann = coco_cap_dt.loadAnns(dt_cap_ann_ids)[0]
gt_labels = gt_cap_ann['labels']
dt_labels = dt_cap_ann['labels']
actual_positives += len(gt_labels)
# Find best matching pairs
best_matches = find_best_matches(gt_anns, gt_labels, dt_anns, dt_labels, iou_threshold, text_sim_threshold)
true_positives += len(best_matches)
except Exception as e:
print(e)
recall = true_positives / actual_positives if actual_positives > 0 else 0
print(f"Recall: {recall:.3f}")
def main():
args = parse_args()
# Set the correct split
split = args.split
assert split == "val" or split == "test" # GCG Evaluation has only val and test splits
gt_mask_path = f"{args.gt_dir_path}/{split}_gcg_coco_mask_gt.json"
gt_cap_path = f"{args.gt_dir_path}/{split}_gcg_coco_caption_gt.json"
print(f"Starting evalution on {split} split.")
# Get the image names of the split
all_images_ids = []
with open(gt_cap_path, 'r') as f:
contents = json.load(f)
for image in contents['images']:
all_images_ids.append(image['id'])
# The directory is used to store intermediate files
tmp_dir_path = f"tmp/{os.path.basename(args.prediction_dir_path)}_{split}"
os.makedirs(tmp_dir_path, exist_ok=True) # Create directory if not exists already
# Create predictions
pred_save_path = f"{tmp_dir_path}/mask_pred_tmp_save.json"
cap_pred_save_path = f"{tmp_dir_path}/cap_pred_tmp_save.json"
coco_pred_file = []
caption_pred_dict = {}
for image_id in all_images_ids:
prediction_path = f"{args.prediction_dir_path}/{image_id}.json"
with open(prediction_path, 'r') as f:
pred = json.load(f)
bu = pred
key = list(pred.keys())[0]
pred = pred[key]
try:
caption_pred_dict[image_id] = {'caption': pred['caption'], 'labels': pred['phrases']}
except Exception as e:
pred = bu
caption_pred_dict[image_id] = {'caption': pred['caption'], 'labels': pred['phrases']}
for rle_mask in pred['pred_masks']:
coco_pred_file.append({"image_id": image_id, "category_id": 1, "segmentation": rle_mask, "score": 1.0})
# Save gcg_coco_predictions
with open(pred_save_path, 'w') as f:
json.dump(coco_pred_file, f)
# Prepare the CAPTION predictions in COCO format
cap_image_ids = []
coco_cap_pred_file = []
for image_id, values in caption_pred_dict.items():
cap_image_ids.append(image_id)
coco_cap_pred_file.append({"image_id": image_id, "caption": values['caption'], "labels": values['labels']})
# Save gcg_caption_coco_predictions
with open(cap_pred_save_path, 'w') as f:
json.dump(coco_cap_pred_file, f)
# # -------------------------------#
# 1. Evaluate AP
# Calculate mask mAP
# Load the ground truth and predictions in COCO format
coco_gt = COCO(gt_mask_path)
coco_dt = coco_gt.loadRes(pred_save_path) # load predictions
# Initialize COCOEval and specify the metric you want to use
coco_eval = COCOeval(coco_gt, coco_dt, "segm") # "segm" for segmentation
# Evaluate on a specific category
coco_eval.params.catIds = [1] # your category ID
# Evaluate
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
# # -------------------------------#
# # 2. Evaluate Caption Quality
try:
coco_cap_gt = COCO(gt_cap_path)
coco_cap_result = coco_cap_gt.loadRes(cap_pred_save_path)
# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco_cap_gt, coco_cap_result)
coco_eval.params['image_id'] = coco_cap_result.getImgIds()
coco_eval.evaluate()
for metric, score in coco_eval.eval.items():
print(f'{metric}: {score:.3f}')
    except Exception:
        # Caption metrics are optional; skip them if pycocoevalcap is unavailable or fails.
        pass
# # -------------------------------#
# 3. Evaluate Mask Mean MIoU
coco_gt = COCO(gt_mask_path) # Load ground truth annotations
evaluate_mask_miou(coco_gt, all_images_ids, pred_save_path)
# # -------------------------------#
# 4. Evaluate Recall
evaluate_recall_with_mapping(coco_gt, coco_cap_gt, all_images_ids, pred_save_path, cap_pred_save_path,
iou_threshold=0.5, text_sim_threshold=0.5)
if __name__ == "__main__":
main()
import argparse
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
def parse_args():
parser = argparse.ArgumentParser(description="GLaMM Inference - Region Captioning")
parser.add_argument("--annotation_file",
default="./data/region_caption/mdetr_annotations/finetune_refcocog_val_captions.json", type=str,
help="Replace with 'data/visual_genome/test_caption.json' for VG.")
parser.add_argument("--results_dir", default="results", type=str, help="The path to save the results.")
return parser.parse_args()
def main():
args = parse_args()
# Load the annotation file
coco = COCO(args.annotation_file)
coco_result = coco.loadRes(args.results_dir)
# Create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)
# Evaluate results
coco_eval.params['image_id'] = coco_result.getImgIds()
coco_eval.evaluate()
# Print and save the output evaluation scores
    output_file_path = "./region_cap_metrics.txt"
    with open(output_file_path, 'w') as f:
        for metric, score in coco_eval.eval.items():
            print(f'{metric}: {score:.3f}')
            f.write(f"{metric}: {score:.3f}\n")
if __name__ == "__main__":
main()
import argparse
import json
import os
import mmengine
import numpy as np
from PIL import Image
import torch
import torch.distributed
import torch.utils.data
import tqdm
from transformers import AutoModel, AutoTokenizer
from projects.llava_sam2.evaluation.dataset import RefVOSDataset
from projects.llava_sam2.evaluation.utils import _init_dist_pytorch, _init_dist_slurm, get_dist_info, get_rank, collect_results_cpu
import concurrent.futures
from pycocotools import mask as cocomask
def async_func(executor, func, **kwargs):
future = executor.submit(func, **kwargs)
return future
def mask_to_rle(mask):
rle = []
for m in mask:
rle.append(cocomask.encode(np.asfortranarray(m.astype(np.uint8))))
rle[-1]['counts'] = rle[-1]['counts'].decode()
return rle
def mask_save(item, mask_prediction, work_dir):
vid_id = item['video_id']
exp_id = item['exp_id']
save_path = os.path.join(work_dir, 'Annotations', vid_id, exp_id)
mmengine.mkdir_or_exist(save_path)
for id_m, mask in enumerate(mask_prediction):
mask = Image.fromarray(mask.astype(np.float32) * 255).convert('L')
file_name = item['frames'][id_m]
save_file = os.path.join(save_path, file_name + ".png")
mask.save(save_file)
DATASETS_INFO = {
'DAVIS': {
'data_root': 'data/video_datas/davis17/',
'image_folder': 'data/video_datas/davis17/valid/JPEGImages/',
'expression_file': 'data/video_datas/davis17/meta_expressions/valid/meta_expressions.json',
'mask_file': 'data/video_datas/davis17/valid/mask_dict.pkl',
},
'MEVIS': {
'data_root': 'data/video_datas/mevis/valid/',
'image_folder': 'data/video_datas/mevis/valid/JPEGImages',
'expression_file': 'data/video_datas/mevis/valid/meta_expressions.json',
'mask_file': None,
},
'MEVIS_U': {
'data_root': 'data/video_datas/mevis/valid_u/',
'image_folder': 'data/video_datas/mevis/valid_u/JPEGImages',
'expression_file': 'data/video_datas/mevis/valid_u/meta_expressions.json',
'mask_file': 'data/video_datas/mevis/valid_u/mask_dict.json',
},
'REFYTVOS': {
'data_root': 'data/video_datas/rvos/',
'image_folder': 'data/video_datas/rvos/valid/JPEGImages/',
'expression_file': 'data/video_datas/rvos/meta_expressions/valid/meta_expressions.json',
'mask_file': None,
},
'REVOS': {
'data_root': 'data/video_datas/revos/',
'image_folder': 'data/video_datas/revos/',
'expression_file': 'data/video_datas/revos/meta_expressions_valid_.json',
'mask_file': None,
}
}
def parse_args():
parser = argparse.ArgumentParser(description='RefVOS')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--dataset',
choices=DATASETS_INFO.keys(),
default='MEVIS',
help='Specify a dataset')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
parser.add_argument('--submit', action='store_true')
parser.add_argument('--work_dir', type=str, default=None)
parser.add_argument('--deepspeed', type=str, default=None) # dummy
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
if __name__ == '__main__':
args = parse_args()
work_dir = args.work_dir
if work_dir is None:
work_dir = 'work_dirs/foobar'
if args.launcher == 'none':
rank = 0
world_size = 1
elif args.launcher == 'pytorch':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
elif args.launcher == 'slurm':
_init_dist_slurm('nccl')
rank, world_size = get_dist_info()
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
dataset_info = DATASETS_INFO[args.dataset]
dataset = RefVOSDataset(
image_folder=dataset_info['image_folder'],
expression_file=dataset_info['expression_file'],
mask_file=dataset_info['mask_file'],
)
sampler = torch.utils.data.DistributedSampler(
dataset,
num_replicas=world_size,
rank=rank,
shuffle=False,
drop_last=False
)
dataloader = torch.utils.data.DataLoader(
dataset,
sampler=sampler,
batch_size=1,
num_workers=2,
pin_memory=False,
collate_fn=lambda x:x[0],
)
results = []
executor = concurrent.futures.ThreadPoolExecutor()
for item in tqdm.tqdm(dataloader):
with torch.no_grad():
result = model.predict_forward(
video=item['images'],
text=item['text_prompt'],
tokenizer=tokenizer,
)
text_idx = 0
text_prediction = result['prediction']
if len(result['prediction_masks']) > 0:
mask_prediction = result['prediction_masks'][text_idx]
else:
print(text_prediction)
mask_prediction = np.zeros((item['length'], item['ori_height'], item['ori_width']), dtype=np.uint8)
if args.submit:
async_func(executor, mask_save, item=item, mask_prediction=mask_prediction, work_dir=work_dir)
encoded_mask = None
else:
encoded_mask = mask_to_rle(mask_prediction)
result = {
'index': item['index'],
'video_id': item['video_id'],
'exp_id': item['exp_id'],
'text_prediction': text_prediction,
'frames': item['frames'],
'exp': item['text_prompt'],
'prediction_masks': encoded_mask,
}
results.append(result)
executor.shutdown(wait=True)
print(f'[Rank {rank}] : Finished.')
if not args.submit:
results = collect_results_cpu(results, len(dataset))
if get_rank() == 0:
final_results = {}
for item in results:
vid_id = item['video_id']
exp_id = item['exp_id']
if vid_id not in final_results:
final_results[vid_id] = {}
assert exp_id not in final_results[vid_id]
final_results[vid_id][exp_id] = item
os.makedirs(work_dir, exist_ok=True)
            with open(f'{work_dir}/results.json', 'w') as f:
                json.dump(final_results, f)
if rank == 0:
print('Done')
import argparse
import copy
import math
import os
import torch
import tqdm
from pycocotools import mask as _mask
import numpy as np
import random
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
from utils import _init_dist_pytorch, get_dist_info, get_rank, collect_results_cpu
from dataset import RESDataset
def parse_args():
parser = argparse.ArgumentParser(description='RefCocoSeg')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--dataset',
choices=DATASETS_ATTRIBUTES.keys(),
default='refcoco',
help='Specify a ref dataset')
parser.add_argument(
'--split',
default='val',
help='Specify a split')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
DATASETS_ATTRIBUTES = {
'refcoco': {'splitBy': "unc", 'dataset_name': 'refcoco'},
'refcoco_plus': {'splitBy': "unc", 'dataset_name': 'refcoco_plus'},
'refcocog': {'splitBy': "umd", 'dataset_name': 'refcocog'},
}
IMAGE_FOLDER = './data/glamm_data/images/coco2014/train2014/'
DATA_PATH = './data/ref_seg/'
def main():
args = parse_args()
if args.launcher != 'none':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
torch.cuda.set_device(rank)
else:
rank = 0
world_size = 1
# build model
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
dataset_info = DATASETS_ATTRIBUTES[args.dataset]
dataset = RESDataset(
image_folder=IMAGE_FOLDER,
dataset_name=dataset_info['dataset_name'],
data_path=DATA_PATH,
split=args.split,
)
results = []
n_samples = len(dataset)
per_rank_samples = math.ceil(n_samples / world_size) + 1
per_rank_ids = range(per_rank_samples * rank,
min(n_samples, per_rank_samples * (rank + 1)))
for idx in tqdm.tqdm(per_rank_ids):
data_batch = dataset[idx]
prediction = {'img_id': data_batch['img_id'], 'gt_masks': data_batch['gt_masks']}
prediction['gt_masks'] = mask_to_rle(prediction['gt_masks'].cpu().numpy())
del data_batch['img_id'], data_batch['gt_masks']
texts = data_batch['text']
del data_batch['text']
pred_masks = []
for text in texts:
_data_batch = copy.deepcopy(data_batch)
_data_batch['text'] = text
pred_mask = model.predict_forward(**_data_batch, tokenizer=tokenizer)['prediction_masks']
if len(pred_mask) == 0:
# give a zero mask
print("No seg pred !!!")
pred_masks.append(None)
else:
_ret_mask = pred_mask[0].cpu().numpy()
_ret_mask = mask_to_rle(_ret_mask)
pred_masks.append(_ret_mask)
prediction.update({'prediction_masks': pred_masks})
results.append(prediction)
tmpdir = './dist_test_temp_res_' + args.dataset + args.split + args.model_path.replace('/', '').replace('.', '')
results = collect_results_cpu(results, len(dataset), tmpdir=tmpdir)
if get_rank() == 0:
metric = dataset.evaluate(results, './work_dirs')
print(metric)
def mask_to_rle(mask):
rle = []
for m in mask:
rle.append(_mask.encode(np.asfortranarray(m.astype(np.uint8))))
rle[-1]['counts'] = rle[-1]['counts'].decode()
return rle
if __name__ == '__main__':
main()
import argparse
import re
import math
import os
import torch
import tqdm
from pycocotools import mask as _mask
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
import json
from utils import _init_dist_pytorch, get_dist_info, get_rank, collect_results_cpu
def parse_args():
parser = argparse.ArgumentParser(description='RefCocog region caption')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--output-path',
default='./region_cap_pred.json',
help='save path of the prediction')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
class RegionCapInferenceDataset:
def __init__(self,
image_folder,
annotation_file=None,
):
self.image_folder = image_folder
self.coco = COCO(annotation_file)
self.image_dict = self.coco.imgs
self.ann_dict = self.coco.anns
self.image_dict_keys = list(self.image_dict.keys())
def __len__(self):
return len(self.image_dict_keys)
def decode_mask(self, annotation, image_info):
flag = False
masks = []
for ann_id in range(1):
ann = {"segmentation": annotation}
if len(ann["segmentation"]) == 0:
m = np.zeros((image_info["height"], image_info["width"])).astype(
np.uint8
)
masks.append(m)
continue
if type(ann["segmentation"][0]) == list: # polygon
rle = _mask.frPyObjects(
ann["segmentation"], image_info["height"], image_info["width"]
)
else:
rle = ann["segmentation"]
for i in range(len(rle)):
if not isinstance(rle[i]["counts"], bytes):
rle[i]["counts"] = rle[i]["counts"].encode()
m = _mask.decode(rle)
            m = np.sum(m, axis=2)  # sometimes there are multiple binary maps (corresponding to multiple segs)
m = m.astype(np.uint8) # convert to np.uint8
masks.append(m)
masks = np.stack(masks, axis=0)
return masks
def get_questions(self):
# question = "<image>\nPlease give me a short description of the region in the picture marked by region1. Please response in a word."
question = "<image>\nPlease give me a short description of the region in the picture marked by region1."
return question
def __getitem__(self, index):
data_dict = {}
image_id = self.image_dict_keys[index]
image_file = self.image_dict[image_id]['file_name']
questions = self.get_questions()
data_dict['image_file'] = image_file
image_file = os.path.join(self.image_folder, image_file)
image = Image.open(image_file).convert('RGB')
masks = self.ann_dict[image_id]['segmentation']
image_info = self.image_dict[image_id]
masks = self.decode_mask(masks, image_info)
data_dict['image'] = image
data_dict['text'] = questions
data_dict['img_id'] = image_id
data_dict['mask_prompts'] = [masks]
return data_dict
ANNOTATION_FILE = './data/region_caption/refcocog/finetune_refcocog_val_with_mask.json'
IMAGE_FOLDER = './data/glamm_data/images/coco2014/train2014/'
def main():
args = parse_args()
if args.launcher != 'none':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
torch.cuda.set_device(rank)
else:
rank = 0
world_size = 1
# build model
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
dataset = RegionCapInferenceDataset(
image_folder=IMAGE_FOLDER,
annotation_file=ANNOTATION_FILE,
)
results = []
n_samples = len(dataset)
per_rank_samples = math.ceil(n_samples / world_size) + 1
per_rank_ids = range(per_rank_samples * rank,
min(n_samples, per_rank_samples * (rank + 1)))
for idx in tqdm.tqdm(per_rank_ids):
data_batch = dataset[idx]
result_dict = {'image_id': data_batch['img_id'], 'image_file': data_batch['image_file']}
del data_batch['img_id'], data_batch['image_file']
prediction = model.predict_forward(**data_batch, tokenizer=tokenizer)['prediction']
text_output = prediction.replace("<s>", "").replace("\n", "") \
.replace("region1", '').replace("Region1", '').replace("The region marked by", "").replace("The region marked as", "").replace("The region marked", "") \
.replace("is", "").replace("shows", "").replace(':', '').replace(" ", " ").replace(" ", " ")
text_output = text_output.split("ASSISTANT: ")[-1]
cleaned_str = re.sub(r'<.*?>', '', text_output)
cleaned_str = cleaned_str.replace('[SEG]', '')
cleaned_str = ' '.join(cleaned_str.split()).strip("'")
cleaned_str = cleaned_str.strip()
result_dict["caption"] = cleaned_str
result_dict["prediction"] = cleaned_str
results.append(result_dict)
tmpdir = './dist_test_temp_regioncap_' + args.model_path.replace('/', '').replace('.', '')
results = collect_results_cpu(results, len(dataset), tmpdir=tmpdir)
if get_rank() == 0:
with open(args.output_path, 'w') as json_file:
json.dump(results, json_file, indent=2)
if __name__ == '__main__':
main()
from .dist import _init_dist_pytorch, get_dist_info, master_only, get_rank, collect_results_cpu, _init_dist_slurm, barrier
from .refcoco_refer import REFER
from .utils_refcoco import AverageMeter, Summary, intersectionAndUnionGPU
from itertools import zip_longest, chain
import os.path as osp
import subprocess
import torch
import os
from torch import distributed as torch_dist
from torch.distributed import ProcessGroup
import functools
from typing import Callable, Optional, Tuple
import pickle
import shutil
def _init_dist_pytorch(backend, **kwargs) -> None:
"""Initialize distributed environment with PyTorch launcher.
Args:
backend (str): Backend of torch.distributed. Supported backends are
'nccl', 'gloo' and 'mpi'. Defaults to 'nccl'.
**kwargs: keyword arguments are passed to ``init_process_group``.
"""
# LOCAL_RANK is set by `torch.distributed.launch` since PyTorch 1.1
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
torch_dist.init_process_group(backend=backend, **kwargs)
def _init_dist_slurm(backend,
port=None,
init_backend='torch',
**kwargs) -> None:
"""Initialize slurm distributed training environment.
If argument ``port`` is not specified, then the master port will be system
environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
environment variable, then a default port ``29500`` will be used.
Args:
backend (str): Backend of torch.distributed.
port (int, optional): Master port. Defaults to None.
"""
proc_id = int(os.environ['SLURM_PROCID'])
ntasks = int(os.environ['SLURM_NTASKS'])
node_list = os.environ['SLURM_NODELIST']
# Not sure when this environment variable could be None, so use a fallback
local_rank_env = os.environ.get('SLURM_LOCALID', None)
if local_rank_env is not None:
local_rank = int(local_rank_env)
else:
num_gpus = torch.cuda.device_count()
local_rank = proc_id % num_gpus
addr = subprocess.getoutput(
f'scontrol show hostname {node_list} | head -n1')
# specify master port
if port is not None:
os.environ['MASTER_PORT'] = str(port)
elif 'MASTER_PORT' in os.environ:
pass # use MASTER_PORT in the environment variable
else:
# 29500 is torch.distributed default port
os.environ['MASTER_PORT'] = '29500'
# use MASTER_ADDR in the environment variable if it already exists
if 'MASTER_ADDR' not in os.environ:
os.environ['MASTER_ADDR'] = addr
os.environ['WORLD_SIZE'] = str(ntasks)
os.environ['LOCAL_RANK'] = str(local_rank)
os.environ['RANK'] = str(proc_id)
torch.cuda.set_device(local_rank)
if init_backend == 'torch':
torch_dist.init_process_group(backend=backend, **kwargs)
elif init_backend == 'deepspeed':
import deepspeed
deepspeed.init_distributed(dist_backend=backend, **kwargs)
elif init_backend == 'colossalai':
import colossalai
colossalai.launch_from_slurm(
backend=backend,
host=os.environ['MASTER_ADDR'],
port=os.environ['MASTER_PORT'],
**kwargs,
)
else:
raise ValueError(
'supported "init_backend" is "torch" or "deepspeed", '
f'but got {init_backend}')
def get_dist_info(group=None) -> Tuple[int, int]:
"""Get distributed information of the given process group.
Note:
Calling ``get_dist_info`` in non-distributed environment will return
(0, 1).
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
tuple[int, int]: Return a tuple containing the ``rank`` and
``world_size``.
"""
world_size = get_world_size(group)
rank = get_rank(group)
return rank, world_size
def get_world_size(group: Optional[ProcessGroup] = None) -> int:
"""Return the number of the given process group.
Note:
Calling ``get_world_size`` in non-distributed environment will return
1.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
int: Return the number of processes of the given process group if in
distributed environment, otherwise 1.
"""
if is_distributed():
# handle low versions of torch like 1.5.0 which does not support
# passing in None for group argument
if group is None:
group = get_default_group()
return torch_dist.get_world_size(group)
else:
return 1
def get_rank(group: Optional[ProcessGroup] = None) -> int:
"""Return the rank of the given process group.
Rank is a unique identifier assigned to each process within a distributed
process group. They are always consecutive integers ranging from 0 to
``world_size``.
Note:
Calling ``get_rank`` in non-distributed environment will return 0.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
int: Return the rank of the process group if in distributed
environment, otherwise 0.
"""
if is_distributed():
# handle low versions of torch like 1.5.0 which does not support
# passing in None for group argument
if group is None:
group = get_default_group()
return torch_dist.get_rank(group)
else:
return 0
def is_distributed() -> bool:
"""Return True if distributed environment has been initialized."""
return torch_dist.is_available() and torch_dist.is_initialized()
def get_default_group() -> Optional[ProcessGroup]:
"""Return default process group."""
return torch_dist.distributed_c10d._get_default_group()
def is_main_process(group: Optional[ProcessGroup] = None) -> bool:
"""Whether the current rank of the given process group is equal to 0.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
bool: Return True if the current rank of the given process group is
equal to 0, otherwise False.
"""
return get_rank(group) == 0
def master_only(func: Callable) -> Callable:
"""Decorate those methods which should be executed in master process.
Args:
func (callable): Function to be decorated.
Returns:
callable: Return decorated function.
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
if is_main_process():
return func(*args, **kwargs)
return wrapper
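# Usage sketch for ``master_only`` (an added illustration): only the rank-0
# process executes the body; every other rank receives ``None`` from the wrapper.
@master_only
def _log_on_master(msg: str) -> None:
    print(f'[rank 0] {msg}')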
def collect_results_cpu(result_part: list,
size: int,
tmpdir='./dist_test_temp'):
"""Collect results under cpu mode.
On cpu mode, this function will save the results on different gpus to
``tmpdir`` and collect them by the rank 0 worker.
Args:
result_part (list): Result list containing result parts
to be collected. Each item of ``result_part`` should be a picklable
object.
size (int): Size of the results, commonly equal to length of
the results.
        tmpdir (str): Temporary directory used to store the collected results
            from different ranks. Defaults to './dist_test_temp'.
Returns:
list or None: The collected results.
"""
rank, world_size = get_dist_info()
if world_size == 1:
return result_part[:size]
# create a tmp dir if it is not specified
if not os.path.exists(tmpdir):
os.mkdir(tmpdir)
# dump the part result to the dir
with open(osp.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f: # type: ignore
pickle.dump(result_part, f, protocol=2)
barrier()
# collect all parts
if rank != 0:
return None
else:
# load results of all parts from tmp dir
part_list = []
for i in range(world_size):
path = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore
if not osp.exists(path):
raise FileNotFoundError(
                    f'{tmpdir} is not a shared directory for '
f'rank {i}, please make sure {tmpdir} is a shared '
'directory for all ranks!')
with open(path, 'rb') as f:
part_list.append(pickle.load(f))
# sort the results
ordered_results = []
zipped_results = zip_longest(*part_list)
ordered_results = [
i for i in chain.from_iterable(zipped_results) if i is not None
]
# the dataloader may pad some samples
ordered_results = ordered_results[:size]
# remove tmp dir
shutil.rmtree(tmpdir) # type: ignore
return ordered_results
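# Typical usage sketch (an added illustration; ``run_inference``, ``evaluate`` and
# ``my_shard`` are hypothetical names): every rank contributes its own shard of
# predictions, and only rank 0 gets back the merged, size-truncated list.
#
#   part = [run_inference(sample) for sample in my_shard]
#   merged = collect_results_cpu(part, size=len(dataset), tmpdir='./dist_test_temp')
#   if get_rank() == 0:
#       evaluate(merged)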
def barrier(group: Optional[ProcessGroup] = None) -> None:
"""Synchronize all processes from the given process group.
This collective blocks processes until the whole group enters this
function.
Note:
Calling ``barrier`` in non-distributed environment will do nothing.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
"""
if is_distributed():
# handle low versions of torch like 1.5.0 which does not support
# passing in None for group argument
if group is None:
group = get_default_group()
torch_dist.barrier(group)
__author__ = "licheng"
"""
This interface provides access to four datasets:
1) refclef
2) refcoco
3) refcoco+
4) refcocog
split by unc and google
The following API functions are defined:
REFER - REFER api class
getRefIds - get ref ids that satisfy given filter conditions.
getAnnIds - get ann ids that satisfy given filter conditions.
getImgIds - get image ids that satisfy given filter conditions.
getCatIds - get category ids that satisfy given filter conditions.
loadRefs - load refs with the specified ref ids.
loadAnns - load anns with the specified ann ids.
loadImgs - load images with the specified image ids.
loadCats - load category names with the specified category ids.
getRefBox - get ref's bounding box [x, y, w, h] given the ref_id
showRef - show image, segmentation or box of the referred object with the ref
getMask - get mask and area of the referred object given ref
showMask - show mask of the referred object given ref
"""
import itertools
import json
import os.path as osp
import pickle
import sys
import time
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import skimage.io as io
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from pycocotools import mask
class REFER:
def __init__(self, data_root, dataset="refcoco", splitBy="unc"):
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
# also provide dataset name and splitBy information
# e.g., dataset = 'refcoco', splitBy = 'unc'
print("loading dataset %s into memory..." % dataset)
self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
self.DATA_DIR = osp.join(data_root, dataset)
if dataset in ["refcoco", "refcoco+", "refcocog"]:
self.IMAGE_DIR = osp.join(data_root, "images/mscoco/images/train2014")
elif dataset == "refclef":
self.IMAGE_DIR = osp.join(data_root, "images/saiapr_tc-12")
else:
print("No refer dataset is called [%s]" % dataset)
sys.exit()
self.dataset = dataset
# load refs from data/dataset/refs(dataset).json
tic = time.time()
ref_file = osp.join(self.DATA_DIR, "refs(" + splitBy + ").p")
print("ref_file: ", ref_file)
self.data = {}
self.data["dataset"] = dataset
self.data["refs"] = pickle.load(open(ref_file, "rb"))
# load annotations from data/dataset/instances.json
instances_file = osp.join(self.DATA_DIR, "instances.json")
instances = json.load(open(instances_file, "rb"))
self.data["images"] = instances["images"]
self.data["annotations"] = instances["annotations"]
self.data["categories"] = instances["categories"]
# create index
self.createIndex()
print("DONE (t=%.2fs)" % (time.time() - tic))
def createIndex(self):
# create sets of mapping
# 1) Refs: {ref_id: ref}
# 2) Anns: {ann_id: ann}
# 3) Imgs: {image_id: image}
# 4) Cats: {category_id: category_name}
# 5) Sents: {sent_id: sent}
# 6) imgToRefs: {image_id: refs}
# 7) imgToAnns: {image_id: anns}
# 8) refToAnn: {ref_id: ann}
# 9) annToRef: {ann_id: ref}
# 10) catToRefs: {category_id: refs}
# 11) sentToRef: {sent_id: ref}
# 12) sentToTokens: {sent_id: tokens}
print("creating index...")
# fetch info from instances
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
for ann in self.data["annotations"]:
Anns[ann["id"]] = ann
imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann]
for img in self.data["images"]:
Imgs[img["id"]] = img
for cat in self.data["categories"]:
Cats[cat["id"]] = cat["name"]
# fetch info from refs
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
Sents, sentToRef, sentToTokens = {}, {}, {}
for ref in self.data["refs"]:
# ids
ref_id = ref["ref_id"]
ann_id = ref["ann_id"]
category_id = ref["category_id"]
image_id = ref["image_id"]
# add mapping related to ref
Refs[ref_id] = ref
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
refToAnn[ref_id] = Anns[ann_id]
annToRef[ann_id] = ref
# add mapping of sent
for sent in ref["sentences"]:
Sents[sent["sent_id"]] = sent
sentToRef[sent["sent_id"]] = ref
sentToTokens[sent["sent_id"]] = sent["tokens"]
# create class members
self.Refs = Refs
self.Anns = Anns
self.Imgs = Imgs
self.Cats = Cats
self.Sents = Sents
self.imgToRefs = imgToRefs
self.imgToAnns = imgToAnns
self.refToAnn = refToAnn
self.annToRef = annToRef
self.catToRefs = catToRefs
self.sentToRef = sentToRef
self.sentToTokens = sentToTokens
print("index created.")
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
refs = self.data["refs"]
else:
if not len(image_ids) == 0:
refs = [ref for image_id in image_ids for ref in self.imgToRefs.get(image_id, [])]
else:
refs = self.data["refs"]
if not len(cat_ids) == 0:
refs = [ref for ref in refs if ref["category_id"] in cat_ids]
if not len(ref_ids) == 0:
refs = [ref for ref in refs if ref["ref_id"] in ref_ids]
if not len(split) == 0:
if split in ["testA", "testB", "testC"]:
refs = [
ref for ref in refs if split[-1] in ref["split"]
] # we also consider testAB, testBC, ...
elif split in ["testAB", "testBC", "testAC"]:
refs = [
ref for ref in refs if ref["split"] == split
] # rarely used I guess...
elif split == "test":
refs = [ref for ref in refs if "test" in ref["split"]]
elif split == "train" or split == "val":
refs = [ref for ref in refs if ref["split"] == split]
else:
print("No such split [%s]" % split)
sys.exit()
ref_ids = [ref["ref_id"] for ref in refs]
return ref_ids
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
ann_ids = [ann["id"] for ann in self.data["annotations"]]
else:
if not len(image_ids) == 0:
lists = [
self.imgToAnns[image_id]
for image_id in image_ids
if image_id in self.imgToAnns
] # list of [anns]
anns = list(itertools.chain.from_iterable(lists))
else:
anns = self.data["annotations"]
if not len(cat_ids) == 0:
anns = [ann for ann in anns if ann["category_id"] in cat_ids]
ann_ids = [ann["id"] for ann in anns]
if not len(ref_ids) == 0:
# restrict to annotations that are actually referred to by the given refs
ids = set(ann_ids).intersection(
set([self.Refs[ref_id]["ann_id"] for ref_id in ref_ids])
)
ann_ids = list(ids)
return ann_ids
def getImgIds(self, ref_ids=[]):
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if not len(ref_ids) == 0:
image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids]))
else:
image_ids = self.Imgs.keys()
return image_ids
def getCatIds(self):
return self.Cats.keys()
def loadRefs(self, ref_ids=[]):
if type(ref_ids) == list:
return [self.Refs[ref_id] for ref_id in ref_ids]
elif type(ref_ids) == int:
return [self.Refs[ref_ids]]
def loadAnns(self, ann_ids=[]):
if type(ann_ids) == list:
return [self.Anns[ann_id] for ann_id in ann_ids]
elif type(ann_ids) == int or type(ann_ids) == str:
return [self.Anns[ann_ids]]
def loadImgs(self, image_ids=[]):
if type(image_ids) == list:
return [self.Imgs[image_id] for image_id in image_ids]
elif type(image_ids) == int:
return [self.Imgs[image_ids]]
def loadCats(self, cat_ids=[]):
if type(cat_ids) == list:
return [self.Cats[cat_id] for cat_id in cat_ids]
elif type(cat_ids) == int:
return [self.Cats[cat_ids]]
def getRefBox(self, ref_id):
ref = self.Refs[ref_id]
ann = self.refToAnn[ref_id]
return ann["bbox"] # [x, y, w, h]
def showRef(self, ref, seg_box="seg"):
ax = plt.gca()
# show image
image = self.Imgs[ref["image_id"]]
I = io.imread(osp.join(self.IMAGE_DIR, image["file_name"]))
ax.imshow(I)
# show refer expression
for sid, sent in enumerate(ref["sentences"]):
print("%s. %s" % (sid + 1, sent["sent"]))
# show segmentations
if seg_box == "seg":
ann_id = ref["ann_id"]
ann = self.Anns[ann_id]
polygons = []
color = []
c = "none"
if type(ann["segmentation"][0]) == list:
# polygon used for refcoco*
for seg in ann["segmentation"]:
poly = np.array(seg).reshape((len(seg) // 2, 2))
polygons.append(Polygon(poly, True, alpha=0.4))
color.append(c)
p = PatchCollection(
polygons,
facecolors=color,
edgecolors=(1, 1, 0, 0),
linewidths=3,
alpha=1,
)
ax.add_collection(p) # thick yellow polygon
p = PatchCollection(
polygons,
facecolors=color,
edgecolors=(1, 0, 0, 0),
linewidths=1,
alpha=1,
)
ax.add_collection(p) # thin red polygon
else:
# mask used for refclef
rle = ann["segmentation"]
m = mask.decode(rle)
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.array([2.0, 166.0, 101.0]) / 255
for i in range(3):
img[:, :, i] = color_mask[i]
ax.imshow(np.dstack((img, m * 0.5)))
# show bounding-box
elif seg_box == "box":
ann_id = ref["ann_id"]
ann = self.Anns[ann_id]
bbox = self.getRefBox(ref["ref_id"])
box_plot = Rectangle(
(bbox[0], bbox[1]),
bbox[2],
bbox[3],
fill=False,
edgecolor="green",
linewidth=3,
)
ax.add_patch(box_plot)
def getMask(self, ref):
# return mask, area and mask-center
ann = self.refToAnn[ref["ref_id"]]
image = self.Imgs[ref["image_id"]]
if type(ann["segmentation"][0]) == list: # polygon
rle = mask.frPyObjects(ann["segmentation"], image["height"], image["width"])
else:
rle = ann["segmentation"]
m = mask.decode(rle)
m = np.sum(
m, axis=2
) # sometimes there are multiple binary maps (one per polygon); merge them
m = m.astype(np.uint8) # convert to np.uint8
# compute area
area = sum(mask.area(rle)) # should be close to ann['area']
return {"mask": m, "area": area}
# # position
# position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style)
# position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style)
# # mass position (if there were multiple regions, we use the largest one.)
# label_m = label(m, connectivity=m.ndim)
# regions = regionprops(label_m)
# if len(regions) > 0:
# largest_id = np.argmax(np.array([props.filled_area for props in regions]))
# largest_props = regions[largest_id]
# mass_y, mass_x = largest_props.centroid
# else:
# mass_x, mass_y = position_x, position_y
# # if centroid is not in mask, we find the closest point to it from mask
# if m[mass_y, mass_x] != 1:
# print('Finding closes mask point ...')
# kernel = np.ones((10, 10),np.uint8)
# me = cv2.erode(m, kernel, iterations = 1)
# points = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style
# points = np.array(points)
# dist = np.sum((points - (mass_y, mass_x))**2, axis=1)
# id = np.argsort(dist)[0]
# mass_y, mass_x = points[id]
# # return
# return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y}
# # show image and mask
# I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
# plt.figure()
# plt.imshow(I)
# ax = plt.gca()
# img = np.ones( (m.shape[0], m.shape[1], 3) )
# color_mask = np.array([2.0,166.0,101.0])/255
# for i in range(3):
# img[:,:,i] = color_mask[i]
# ax.imshow(np.dstack( (img, m*0.5) ))
# plt.show()
def showMask(self, ref):
M = self.getMask(ref)
msk = M["mask"]
ax = plt.gca()
ax.imshow(msk)
if __name__ == "__main__":
refer = REFER(data_root="data", dataset="refcocog", splitBy="google")  # data_root is a placeholder path
ref_ids = refer.getRefIds()
print(len(ref_ids))
print(len(refer.Imgs))
print(len(refer.imgToRefs))
ref_ids = refer.getRefIds(split="train")
print("There are %s training referred objects." % len(ref_ids))
for ref_id in ref_ids:
ref = refer.loadRefs(ref_id)[0]
if len(ref["sentences"]) < 2:
continue
pprint(ref)
print("The label is %s." % refer.Cats[ref["category_id"]])
plt.figure()
refer.showRef(ref, seg_box="box")
plt.show()
# plt.figure()
# refer.showMask(ref)
# plt.show()
from enum import Enum
import numpy as np
import torch
import torch.distributed as dist
class Summary(Enum):
NONE = 0
AVERAGE = 1
SUM = 2
COUNT = 3
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
self.name = name
self.fmt = fmt
self.summary_type = summary_type
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def all_reduce(self):
device = "cuda" if torch.cuda.is_available() else "cpu"
if isinstance(self.sum, np.ndarray):
total = torch.tensor(
self.sum.tolist()
+ [
self.count,
],
dtype=torch.float32,
device=device,
)
else:
total = torch.tensor(
[self.sum, self.count], dtype=torch.float32, device=device
)
dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
if total.shape[0] > 2:
self.sum, self.count = total[:-1].cpu().numpy(), total[-1].cpu().item()
else:
self.sum, self.count = total.tolist()
self.avg = self.sum / (self.count + 1e-5)
def __str__(self):
fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
return fmtstr.format(**self.__dict__)
def summary(self):
fmtstr = ""
if self.summary_type is Summary.NONE:
fmtstr = ""
elif self.summary_type is Summary.AVERAGE:
fmtstr = "{name} {avg:.3f}"
elif self.summary_type is Summary.SUM:
fmtstr = "{name} {sum:.3f}"
elif self.summary_type is Summary.COUNT:
fmtstr = "{name} {count:.3f}"
else:
raise ValueError("invalid summary type %r" % self.summary_type)
return fmtstr.format(**self.__dict__)
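# Hedged usage sketch for AverageMeter (values are illustrative):
#   iou_meter = AverageMeter("gIoU", ":6.3f", Summary.AVERAGE)
#   iou_meter.update(0.5)
#   iou_meter.update(1.0)
#   print(iou_meter.summary())   # -> "gIoU 0.750"
#   # iou_meter.all_reduce()     # only once torch.distributed has been initialized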
def intersectionAndUnionGPU(output, target, K, ignore_index=255):
# 'K' classes, output and target sizes are N or N * L or N * H * W, each value in range 0 to K - 1.
assert output.dim() in [1, 2, 3]
assert output.shape == target.shape
output = output.view(-1)
target = target.view(-1)
output[target == ignore_index] = ignore_index
intersection = output[output == target]
area_intersection = torch.histc(intersection, bins=K, min=0, max=K - 1)
area_output = torch.histc(output, bins=K, min=0, max=K - 1)
area_target = torch.histc(target, bins=K, min=0, max=K - 1)
area_union = area_output + area_target - area_intersection
return area_intersection, area_union, area_target
class ProgressMeter(object):
def __init__(self, num_batches, meters, prefix=""):
self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
self.meters = meters
self.prefix = prefix
def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print("\t".join(entries))
def display_summary(self):
entries = [" *"]
entries += [meter.summary() for meter in self.meters]
print(" ".join(entries))
def _get_batch_fmtstr(self, num_batches):
num_digits = len(str(num_batches))
fmt = "{:" + str(num_digits) + "d}"
return "[" + fmt + "/" + fmt.format(num_batches) + "]"
def dict_to_cuda(input_dict):
for k, v in input_dict.items():
if isinstance(input_dict[k], torch.Tensor):
input_dict[k] = v.cuda(non_blocking=True)
elif isinstance(v, list) and len(v) > 0:
input_dict[k] = [ele.cuda(non_blocking=True) if isinstance(ele, torch.Tensor) else ele for ele in v]
return input_dict
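# The block below is a hedged, self-contained smoke test of the helpers in this
# file; the tensors and the "gIoU" meter name are illustrative and not used
# anywhere else in the repo.
if __name__ == "__main__":
    pred = torch.tensor([0.0, 1.0, 1.0, 0.0])
    gt = torch.tensor([0.0, 1.0, 0.0, 0.0])
    inter, union, _ = intersectionAndUnionGPU(pred, gt, K=2)
    print((inter / (union + 1e-10)).tolist())  # per-class IoU, roughly [0.667, 0.5]
    meter = AverageMeter("gIoU", ":6.3f", Summary.AVERAGE)
    meter.update(float((inter.sum() / (union.sum() + 1e-10)).item()))
    print(meter.summary())  # -> "gIoU 0.600"
    batch = {"labels": torch.zeros(2), "texts": ["a dog", "a cat"]}
    if torch.cuda.is_available():
        batch = dict_to_cuda(batch)  # tensors move to GPU, the list of strings is untouched
    print({k: type(v) for k, v in batch.items()})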
import gradio as gr
import sys
from projects.llava_sam2.gradio.app_utils import\
process_markdown, show_mask_pred, description, preprocess_video,\
show_mask_pred_video, image2video_and_save
import torch
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
import argparse
import os
TORCH_DTYPE_MAP = dict(
fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto')
def parse_args(args):
parser = argparse.ArgumentParser(description="Sa2VA Demo")
parser.add_argument('hf_path', help='Sa2VA hf path.')
return parser.parse_args(args)
def inference(image, video, follow_up, input_str):
input_image = image
if image is not None and (video is not None and os.path.exists(video)):
return image, video, "Error: Please input only an image or a video, not both!"
if image is None and (video is None or not os.path.exists(video)) and not follow_up:
return image, video, "Error: Please input an image or a video!"
if not follow_up:
# reset
print('Log: History responses have been removed!')
global_infos.n_turn = 0
global_infos.inputs = ''
text = input_str
image = input_image
global_infos.image_for_show = image
global_infos.image = image
video = video
global_infos.video = video
if image is not None:
global_infos.input_type = "image"
else:
global_infos.input_type = "video"
else:
text = input_str
image = global_infos.image
video = global_infos.video
input_type = global_infos.input_type
if input_type == "video":
video = preprocess_video(video, global_infos.inputs+input_str)
past_text = global_infos.inputs
if past_text == "" and "<image>" not in text:
text = "<image>" + text
if input_type == "image":
input_dict = {
'image': image,
'text': text,
'past_text': past_text,
'mask_prompts': None,
'tokenizer': tokenizer,
}
else:
input_dict = {
'video': video,
'text': text,
'past_text': past_text,
'mask_prompts': None,
'tokenizer': tokenizer,
}
return_dict = sa2va_model.predict_forward(**input_dict)
global_infos.inputs = return_dict["past_text"]
print(return_dict['past_text'])
if return_dict.get('prediction_masks'):
if input_type == "image":
image_mask_show, selected_colors = show_mask_pred(global_infos.image_for_show, return_dict['prediction_masks'],)
video_mask_show = global_infos.video
else:
image_mask_show = None
video_mask_show, selected_colors = show_mask_pred_video(video, return_dict['prediction_masks'],)
video_mask_show = image2video_and_save(video_mask_show, save_path="./ret_video.mp4")
else:
image_mask_show = global_infos.image_for_show
video_mask_show = global_infos.video
selected_colors = []
predict = return_dict['prediction'].strip()
global_infos.n_turn += 1
predict = process_markdown(predict, selected_colors)
return image_mask_show, video_mask_show, predict
def init_models(args):
model_path = args.hf_path
model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
)
return model, tokenizer
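# global_infos below is a simple module-level state holder: it lets follow-up
# turns in the demo reuse the image/video and the accumulated conversation text
# (inputs) from earlier turns of the same session.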
class global_infos:
inputs = ''
n_turn = 0
image_width = 0
image_height = 0
image_for_show = None
image = None
video = None
input_type = "image" # "image" or "video"
if __name__ == "__main__":
# get parse args and set models
args = parse_args(sys.argv[1:])
sa2va_model, tokenizer = init_models(args)
demo = gr.Interface(
inference,
inputs=[
gr.Image(type="pil", label="Upload Image", height=360),
gr.Video(sources=["upload", "webcam"], label="Upload mp4 video", height=360),
gr.Checkbox(label="Follow up Question"),
gr.Textbox(lines=1, placeholder=None, label="Text Instruction"),],
outputs=[
gr.Image(type="pil", label="Output Image"),
gr.Video(label="Output Video", show_download_button=True, format='mp4'),
gr.Markdown()],
theme=gr.themes.Soft(), allow_flagging="auto", description=description,
title='Sa2VA'
)
demo.queue()
demo.launch(share=True)
import numpy as np
from PIL import Image
import cv2
markdown_default = """
<link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@400;700&display=swap" rel="stylesheet">
<style>
.highlighted-text {
font-family: 'Montserrat', sans-serif;
font-weight: 600;
font-size: 14px;
color: rgb(255, 255, 239);
background-color: rgb(225, 231, 254);
border-radius: 7px;
padding: 5px 7px;
display: inline-block;
}
.regular-text {
font-family: 'Montserrat', sans-serif;
font-weight: 400;
font-size: 14px;
}
.highlighted-response {
font-family: 'Montserrat', sans-serif;
font-weight: 600;
font-size: 14px;
border-radius: 6px;
padding: 3px 4px;
display: inline-block;
}
</style>
<span class="highlighted-text" style='color:rgb(107, 100, 239)'>Sa2VA</span>
"""
description = """
**Usage** : <br>
&ensp;(1) For **Grounded Caption Generation** with interleaved segmentation, use a prompt like: *"Could you provide me with a detailed analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer."* <br>
&ensp;(2) For **Segmentation Output**, use a prompt like: *"Can you please segment xxx in the given image?"* <br>
&ensp;(3) For **Image Captioning** / VQA, use a prompt like: *"Could you please give me a detailed description of the image?"* <br>
&ensp;(4) For **Image Conversation**, input an arbitrary text instruction. <br>
"""
ONE_THIRD = 1.0/3.0
ONE_SIXTH = 1.0/6.0
TWO_THIRD = 2.0/3.0
def desaturate(rgb, factor=0.65):
"""
Desaturate an RGB color by a given factor.
:param rgb: A tuple of (r, g, b) where each value is in [0, 255].
:param factor: The factor by which to reduce the saturation.
0 means completely desaturated, 1 means original color.
:return: A tuple of desaturated (r, g, b) values in [0, 255].
"""
r, g, b = [x / 255.0 for x in rgb]
h, l, s = rgb_to_hls(r, g, b)
l = factor
new_r, new_g, new_b = hls_to_rgb(h, l, s)
return (int(new_r * 255), int(new_g * 255), int(new_b * 255))
def rgb_to_hls(r, g, b):
maxc = max(r, g, b)
minc = min(r, g, b)
sumc = (maxc+minc)
rangec = (maxc-minc)
l = sumc/2.0
if minc == maxc:
return 0.0, l, 0.0
if l <= 0.5:
s = rangec / sumc
else:
s = rangec / (2.0-sumc)
rc = (maxc-r) / rangec
gc = (maxc-g) / rangec
bc = (maxc-b) / rangec
if r == maxc:
h = bc-gc
elif g == maxc:
h = 2.0+rc-bc
else:
h = 4.0+gc-rc
h = (h/6.0) % 1.0
return h, l, s
def hls_to_rgb(h, l, s):
if s == 0.0:
return l, l, l
if l <= 0.5:
m2 = l * (1.0+s)
else:
m2 = l+s-(l*s)
m1 = 2.0*l - m2
return (_v(m1, m2, h+ONE_THIRD), _v(m1, m2, h), _v(m1, m2, h-ONE_THIRD))
def _v(m1, m2, hue):
hue = hue % 1.0
if hue < ONE_SIXTH:
return m1 + (m2-m1)*hue*6.0
if hue < 0.5:
return m2
if hue < TWO_THIRD:
return m1 + (m2-m1)*(TWO_THIRD-hue)*6.0
return m1
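# Note: ONE_THIRD/ONE_SIXTH/TWO_THIRD, rgb_to_hls, hls_to_rgb and _v mirror the
# standard-library colorsys implementations, so they can be sanity-checked
# against it, e.g.:
#   import colorsys
#   assert rgb_to_hls(0.2, 0.4, 0.6) == colorsys.rgb_to_hls(0.2, 0.4, 0.6)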
def process_markdown(output_str, colors):
output_str = output_str.replace("\n", "").replace(" ", " ").replace("<s>", "")\
.replace("<|im_end|>", '').replace("<|end|>", "")
output_str = output_str.split("ASSISTANT: ")[-1]
# markdown_out = output_str.replace('[SEG]', '')
markdown_out = output_str
markdown_out = markdown_out.replace(
"<p>", "<span class='highlighted-response' style='background-color:rgb[COLOR]'>"
)
markdown_out = markdown_out.replace("</p>", "</span>")
for color in colors:
markdown_out = markdown_out.replace("[COLOR]", str(desaturate(tuple(color))), 1)
markdown_out = f"""
{markdown_out}
"""
markdown_out = markdown_default + "<p><span class='regular-text'>" + markdown_out
return markdown_out
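# Hedged example: with colors=[(255, 0, 0)], the string
#   "<p>the cat</p> [SEG] is on the sofa"
# is rewritten so that "the cat" becomes a highlighted span whose background is
# the desaturated red returned by desaturate((255, 0, 0)).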
def show_mask_pred(image, masks):
masks = [mask[:1] for mask in masks]
masks = np.concatenate(masks, axis=0) # (n, h, w)
selected_colors = []
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
(255, 255, 0), (255, 0, 255), (0, 255, 255),
(128, 128, 255), [255, 192, 203], # Pink
[165, 42, 42], # Brown
[255, 165, 0], # Orange
[128, 0, 128], # Purple
[0, 0, 128], # Navy
[128, 0, 0], # Maroon
[128, 128, 0], # Olive
[70, 130, 180], # Steel Blue
[173, 216, 230], # Light Blue
[255, 192, 0], # Gold
[255, 165, 165], # Light Salmon
[255, 20, 147], # Deep Pink
]
_mask_image = np.zeros((masks.shape[1], masks.shape[2], 3), dtype=np.uint8)
for i, mask in enumerate(masks):
color = colors[i % len(colors)]
selected_colors.append(color)
_mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
_mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
_mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]
image = np.array(image)
image = image * 0.5 + _mask_image * 0.5
image = image.astype(np.uint8)
return image, selected_colors
def show_mask_pred_video(video, masks):
ret_video = []
selected_colors = []
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
(255, 255, 0), (255, 0, 255), (0, 255, 255),
(128, 128, 255), [255, 192, 203], # Pink
[165, 42, 42], # Brown
[255, 165, 0], # Orange
[128, 0, 128], # Purple
[0, 0, 128], # Navy
[128, 0, 0], # Maroon
[128, 128, 0], # Olive
[70, 130, 180], # Steel Blue
[173, 216, 230], # Light Blue
[255, 192, 0], # Gold
[255, 165, 165], # Light Salmon
[255, 20, 147], # Deep Pink
]
for i_frame in range(len(video)):
frame_masks = [mask[i_frame:i_frame+1] for mask in masks]
frame_masks = np.concatenate(frame_masks, axis=0)
_mask_image = np.zeros((frame_masks.shape[1], frame_masks.shape[2], 3), dtype=np.uint8)
for i, mask in enumerate(frame_masks):
if i_frame == 0:
color = colors[i % len(colors)]
selected_colors.append(color)
else:
color = selected_colors[i]
_mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
_mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
_mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]
image = np.array(video[i_frame])
image = image * 0.5 + _mask_image * 0.5
image = image.astype(np.uint8)
ret_video.append(image)
return ret_video, selected_colors
def parse_visual_prompts(points):
ret = {'points': [], 'boxes': []}
for item in points:
if item[2] == 1.0:
ret['points'].append([item[0], item[1]])
elif item[2] == 2.0 or item[2] == 3.0:
ret['boxes'].append([item[0], item[1], item[3], item[4]])
else:
raise NotImplementedError
return ret
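# Hedged example of the prompt rows this helper expects (the third value acts
# as a type flag: 1.0 for a click, 2.0/3.0 for a drag):
#   parse_visual_prompts([[10, 20, 1.0, 0, 0],    # click -> point [10, 20]
#                         [5, 5, 2.0, 50, 60]])   # drag  -> box [5, 5, 50, 60]
#   # -> {'points': [[10, 20]], 'boxes': [[5, 5, 50, 60]]}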
def get_video_frames(video_path):
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print("Error: Cannot open video file.")
return
frames = []
frame_id = 0
while True:
ret, frame = cap.read()
if not ret:
break
frames.append(frame)
frame_id += 1
cap.release()
return frames
def get_frames_from_video(video_path, n_frames=5, sample_type="uniform"):
frames = get_video_frames(video_path)
if sample_type == "uniform":
stride = len(frames) / (n_frames + 1e-4)
ret = []
for i in range(n_frames):
idx = int(i * stride)
frame = frames[idx]
frame = frame[:, :, ::-1]
frame_image = Image.fromarray(frame).convert('RGB')
ret.append(frame_image)
else:
ret = []
for frame in frames[:500]:
frame = frame[:, :, ::-1]
frame_image = Image.fromarray(frame).convert('RGB')
ret.append(frame_image)
return ret
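# Hedged usage sketch (the mp4 path is a placeholder):
#   frames = get_frames_from_video("demo.mp4", n_frames=5)            # 5 evenly spaced PIL frames
#   frames = get_frames_from_video("demo.mp4", sample_type="begin")   # up to the first 500 frames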
def preprocess_video(video_path, text):
if "Segment" in text or "segment" in text:
sample_type = 'begin'
else:
sample_type = 'uniform'
return get_frames_from_video(video_path, sample_type=sample_type)
def image2video_and_save(frames, save_path):
success = frames_to_video(frames, save_path)
return save_path
def frames_to_video(
frames,
output_path: str,
fps: int = 24,
) -> bool:
try:
frames = [frame[:, :, ::-1] for frame in frames]
# Use provided frame size or get from first frame
height, width = frames[0].shape[:2]
# Initialize video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
# Process each frame
for frame in frames:
out.write(frame)
# Release video writer
out.release()
print(f"Video saved successfully to {output_path}")
return True
except Exception as e:
print(f"Error converting frames to video: {str(e)}")
return False
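# Hedged usage sketch (frames are assumed to be HxWx3 uint8 RGB arrays; the
# output path is a placeholder):
#   ok = frames_to_video([np.zeros((360, 640, 3), dtype=np.uint8)] * 24, "out.mp4", fps=24)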
import argparse
import copy
import os.path as osp
import torch
from mmengine.dist import (collect_results, get_dist_info, get_rank, init_dist,
master_only)
from xtuner.registry import BUILDER
from xtuner.configs import cfgs_name_path
from xtuner.model.utils import guess_load_checkpoint
from mmengine.config import Config
from mmengine.fileio import PetrelBackend, get_file_backend
from mmengine.config import ConfigDict
import os
def convert_dict2config_dict(input):
input = ConfigDict(**input)
for key in input.keys():
if isinstance(input[key], dict):
input[key] = convert_dict2config_dict(input[key])
return input
TORCH_DTYPE_MAP = dict(
fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto')
def parse_args():
parser = argparse.ArgumentParser(description='toHF script')
parser.add_argument('config', help='config file name or path.')
parser.add_argument('--pth-model', help='pth model file')
parser.add_argument(
'--save-path', type=str, default='./work_dirs/hf_model', help='save folder name')
args = parser.parse_args()
return args
@master_only
def master_print(msg):
print(msg)
def main():
args = parse_args()
# build model
if not osp.isfile(args.config):
try:
args.config = cfgs_name_path[args.config]
except KeyError:
raise FileNotFoundError(f'Cannot find {args.config}')
# load config
cfg = Config.fromfile(args.config)
model = BUILDER.build(cfg.model)
backend = get_file_backend(args.pth_model)
if isinstance(backend, PetrelBackend):
from xtuner.utils.fileio import patch_fileio
with patch_fileio():
state_dict = guess_load_checkpoint(args.pth_model)
else:
state_dict = guess_load_checkpoint(args.pth_model)
model.load_state_dict(state_dict, strict=False)
print(f'Load PTH model from {args.pth_model}')
model._merge_lora()
model.mllm.transfer_to_hf = True
all_state_dict = model.all_state_dict()
name_map = {'mllm.model.': '', '.gamma': '.g_weight'}
all_state_dict_new = {}
for key in all_state_dict.keys():
new_key = copy.deepcopy(key)
for _text in name_map.keys():
new_key = new_key.replace(_text, name_map[_text])
all_state_dict_new[new_key] = all_state_dict[key]
# build the hf format model
from projects.llava_sam2.hf.models.configuration_sa2va_chat import Sa2VAChatConfig
from projects.llava_sam2.hf.models.modeling_sa2va_chat import Sa2VAChatModel
internvl_config = Sa2VAChatConfig.from_pretrained(cfg.path)
config_dict = internvl_config.to_dict()
config_dict['auto_map'] = \
{'AutoConfig': 'configuration_sa2va_chat.Sa2VAChatConfig',
'AutoModel': 'modeling_sa2va_chat.Sa2VAChatModel',
'AutoModelForCausalLM': 'modeling_sa2va_chat.Sa2VAChatModel'}
config_dict["llm_config"]["vocab_size"] = len(model.tokenizer)
config_dict["template"] = cfg.template
sa2va_hf_config = Sa2VAChatConfig(
**config_dict
)
hf_sa2va_model = Sa2VAChatModel(
sa2va_hf_config, vision_model=model.mllm.model.vision_model,
language_model=model.mllm.model.language_model,
)
hf_sa2va_model.load_state_dict(all_state_dict_new)
hf_sa2va_model.save_pretrained(args.save_path)
model.tokenizer.save_pretrained(args.save_path)
print(f"Save the hf model into {args.save_path}")
# copy the files
os.system(f"cp -pr ./projects/llava_sam2/hf/models/* {args.save_path}")
if __name__ == '__main__':
main()