Commit ef30d662 authored by bailuo

init
{
"gradient_accumulation_steps": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_clipping": "auto",
"zero_allow_untested_optimizer": true,
"zero_force_ds_cpu_optimizer": false,
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"allgather_bucket_size": 5368709120,
"reduce_bucket_size": 5368709120,
"reduce_scatter": true,
"sub_group_size": 1e9,
"contiguous_gradients": true,
"allgather_partitions": true
},
"fp16": {
"enabled": false,
"initial_scale_power": 16
},
"bf16": {
"enabled": true
}
}
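# Usage sketch for the ZeRO-2 bf16 config above (illustrative addition, not part of
# the original files): DeepSpeed consumes such a config via ``deepspeed.initialize``,
# either as a dict or as a path to the JSON; the file name below is an assumption.
#
#   import deepspeed
#   from torch import nn
#
#   net = nn.Linear(16, 16)
#   engine, optimizer, _, _ = deepspeed.initialize(
#       model=net,
#       model_parameters=net.parameters(),
#       config='zero2_bf16.json',  # or pass the dict above directly
#   )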
import os
import numpy as np
import torch
from PIL import Image
from pycocotools import mask as _mask
from utils import REFER, Summary, AverageMeter, intersectionAndUnionGPU, master_only
DATASETS_ATTRIBUTES = {
'refcoco': {'splitBy': "unc", 'dataset_name': 'refcoco'},
'refcoco_plus': {'splitBy': "unc", 'dataset_name': 'refcoco+'},
'refcocog': {'splitBy': "umd", 'dataset_name': 'refcocog'},
}
class RESDataset:
METAINFO: dict = dict(name='Referring Expression Segmentation')
def __init__(self,
image_folder,
dataset_name,
data_path=None,
split='val',
):
self.split = split
self._set_attribute(dataset_name)
json_datas = self.json_file_preprocess(data_path)
self.json_datas = json_datas
self.image_folder = image_folder
def _set_attribute(self, dataset_name):
attr_dict = DATASETS_ATTRIBUTES[dataset_name]
self.splitBy = attr_dict['splitBy']
self.dataset_name = attr_dict['dataset_name']
def __len__(self):
return len(self.json_datas)
def real_len(self):
return len(self.json_datas)
def json_file_preprocess(self, data_path):
splitBy = self.splitBy
dataset_name = self.dataset_name
refer_api = REFER(data_path, dataset_name, splitBy)
ref_ids_train = refer_api.getRefIds(split=self.split)
images_ids_train = refer_api.getImgIds(ref_ids=ref_ids_train)
refs_train = refer_api.loadRefs(ref_ids=ref_ids_train)
self.img2refs = self.create_img_to_refs_mapping(refs_train)
image_infos = []
loaded_images = refer_api.loadImgs(image_ids=images_ids_train)
for item in loaded_images:
item = item.copy()
image_infos.append(item)
self.annotations = refer_api.Anns
refs = [self.img2refs[image_info['id']] for image_info in image_infos]
ret = []
for image_info, ref in zip(image_infos, refs):
if len(ref) == 0:
continue
sents = []
ann_ids = []
for _ref in ref:
for sent in _ref["sentences"]:
text = sent["sent"]
sents.append(text)
ann_ids.append(_ref["ann_id"])
            sampled_inds = list(range(len(sents)))
            sampled_sents = [sents[ind] for ind in sampled_inds]
            sampled_ann_ids = [ann_ids[ind] for ind in sampled_inds]
            selected_labels = sampled_sents
ret.append(
{'image_info': image_info,
'sampled_ann_id': sampled_ann_ids,
'selected_labels': selected_labels,
'image': image_info['file_name']
}
)
return ret
def create_img_to_refs_mapping(self, refs_train):
img2refs = {}
for ref in refs_train:
img2refs[ref["image_id"]] = img2refs.get(ref["image_id"], []) + [ref, ]
return img2refs
def decode_mask(self, annotations_ids, image_info):
flag = False
masks = []
for ann_id in annotations_ids:
if isinstance(ann_id, list):
flag = True
if -1 in ann_id:
assert len(ann_id) == 1
m = np.zeros((image_info["height"], image_info["width"])).astype(
np.uint8
)
else:
m_final = np.zeros(
(image_info["height"], image_info["width"])
).astype(np.uint8)
for ann_id_i in ann_id:
ann = self.annotations[ann_id_i]
if len(ann["segmentation"]) == 0:
m = np.zeros(
(image_info["height"], image_info["width"])
).astype(np.uint8)
else:
if type(ann["segmentation"][0]) == list: # polygon
rle = _mask.frPyObjects(
ann["segmentation"], image_info["height"], image_info["width"], )
else:
rle = ann["segmentation"]
for i in range(len(rle)):
if not isinstance(rle[i]["counts"], bytes):
rle[i]["counts"] = rle[i]["counts"].encode()
m = _mask.decode(rle)
m = np.sum(
m, axis=2
                        )  # sometimes there are multiple binary maps (corresponding to multiple segs)
m = m.astype(np.uint8) # convert to np.uint8
m_final = m_final | m
m = m_final
masks.append(m)
continue
ann = self.annotations[ann_id]
if len(ann["segmentation"]) == 0:
m = np.zeros((image_info["height"], image_info["width"])).astype(
np.uint8
)
masks.append(m)
continue
if type(ann["segmentation"][0]) == list: # polygon
rle = _mask.frPyObjects(
ann["segmentation"], image_info["height"], image_info["width"]
)
else:
rle = ann["segmentation"]
for i in range(len(rle)):
if not isinstance(rle[i]["counts"], bytes):
rle[i]["counts"] = rle[i]["counts"].encode()
m = _mask.decode(rle)
            m = np.sum(m, axis=2)  # sometimes there are multiple binary maps (corresponding to multiple segs)
m = m.astype(np.uint8) # convert to np.uint8
masks.append(m)
masks = np.stack(masks, axis=0)
masks = torch.from_numpy(masks)
return masks
def only_get_text_infos(self, json_data):
return {'sampled_sents': json_data['selected_labels']}
def get_questions(self, text_require_infos):
sampled_sents = text_require_infos['sampled_sents']
ret = []
for sent in sampled_sents:
ret.append("<image>\n Please segment {} in this image.".format(sent))
return ret
def filter_data_dict(self, data_dict):
names = ['image', 'text', 'gt_masks', 'img_id']
ret = {name: data_dict[name] for name in names}
return ret
def __getitem__(self, index):
index = index % self.real_len()
data_dict = self.json_datas[index]
text_require_infos = self.only_get_text_infos(data_dict)
questions = self.get_questions(text_require_infos)
assert data_dict.get('image', None) is not None
if data_dict.get('image', None) is not None:
image_file = data_dict['image']
image_file = os.path.join(self.image_folder, image_file)
image = Image.open(image_file).convert('RGB')
# process and get masks for evaluation
masks = self.decode_mask(data_dict['sampled_ann_id'], data_dict['image_info'])
data_dict['gt_masks'] = masks
data_dict['image'] = image
data_dict['text'] = questions
data_dict['img_id'] = str(index)
return self.filter_data_dict(data_dict)
@master_only
def evaluate(self, result, work_dir):
trackers = {
"intersection": AverageMeter("Intersec", ":6.3f", Summary.SUM),
"union": AverageMeter("Union", ":6.3f", Summary.SUM),
"gIoU": AverageMeter("gIoU", ":6.3f", Summary.SUM)
}
for pred_dict in result:
intersection, union, accuracy_iou = 0.0, 0.0, 0.0
masks = pred_dict['prediction_masks']
_masks = []
for mask in masks:
if mask is not None:
mask = rle_to_mask(mask)
_masks.append(mask)
targets = pred_dict['gt_masks']
_targets = rle_to_mask(targets)
for i_item, _mask in enumerate(_masks):
if _mask is None:
continue
_target = _targets[i_item: i_item+1]
for prediction, target in zip(_mask, _target):
prediction = torch.from_numpy(prediction).int().cuda()
target = torch.from_numpy(target).int().cuda()
intersect, union_, _ = intersectionAndUnionGPU(
prediction.contiguous().clone(), target.contiguous(), 2, ignore_index=255
)
intersection += intersect
union += union_
accuracy_iou += intersect / (union_ + 1e-5)
accuracy_iou[union_ == 0] += 1.0
intersection, union = intersection.cpu().numpy(), union.cpu().numpy()
accuracy_iou = accuracy_iou.cpu().numpy() / _targets.shape[0]
trackers["intersection"].update(intersection)
trackers["union"].update(union)
trackers["gIoU"].update(accuracy_iou, n=_targets.shape[0])
cur_results = {'pixel_intersection': trackers["intersection"].sum[1],
'pixel_union': trackers["union"].sum[1],
'gIoU': trackers["gIoU"].avg[1],
'mask_counts': trackers["gIoU"].count,
}
class_iou = cur_results['pixel_intersection'] / (cur_results['pixel_union'] + 1e-10)
global_iou = cur_results['gIoU']
        print('============================================')
        print('CIoU: {}, GIoU: {}'.format(class_iou, global_iou))
        print('============================================')
        print('RES_{}_{} successfully finished evaluating'.format(self.dataset_name, self.split))
return {'Acc': class_iou}
def rle_to_mask(rle):
mask = []
for r in rle:
m = _mask.decode(r)
m = np.uint8(m)
mask.append(m)
mask = np.stack(mask, axis=0)
return mask
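
# Minimal smoke-test sketch (an added example with assumed paths): the image and
# annotation roots below mirror the defaults used by the RES evaluation script
# (IMAGE_FOLDER / DATA_PATH); adjust them to your local layout.
if __name__ == '__main__':
    _ds = RESDataset(
        image_folder='./data/glamm_data/images/coco2014/train2014/',
        dataset_name='refcoco',
        data_path='./data/ref_seg/',
        split='val',
    )
    _sample = _ds[0]
    print(len(_ds), _sample['text'][0], _sample['gt_masks'].shape)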
from .RES import RESDataset
from .refVOS import RefVOSDataset
from torch.utils.data import Dataset
import copy
from collections.abc import Mapping
from typing import Union
from mmengine.config import Config
import logging
from mmengine.fileio import list_from_file
from mmengine.logging import print_log
from abc import abstractmethod
class BaseEvalDataset(Dataset):
METAINFO: dict = dict(name='default')
def __init__(self, metainfo: Union[Mapping, Config, None] = None):
self._metainfo = self._load_metainfo(copy.deepcopy(metainfo))
@classmethod
def _load_metainfo(cls,
metainfo: Union[Mapping, Config, None] = None) -> dict:
"""Collect meta information from the dictionary of meta.
Args:
metainfo (Mapping or Config, optional): Meta information dict.
                If ``metainfo`` contains an existing filename, it will be
                parsed by ``list_from_file``.
Returns:
dict: Parsed meta information.
"""
# avoid `cls.METAINFO` being overwritten by `metainfo`
cls_metainfo = copy.deepcopy(cls.METAINFO)
if metainfo is None:
return cls_metainfo
if not isinstance(metainfo, (Mapping, Config)):
raise TypeError('metainfo should be a Mapping or Config, '
f'but got {type(metainfo)}')
for k, v in metainfo.items():
if isinstance(v, str):
                # If the value is a string and can be loaded from the
                # corresponding backend, it is treated as the file name of a meta file.
try:
cls_metainfo[k] = list_from_file(v)
except (TypeError, FileNotFoundError):
print_log(
f'{v} is not a meta file, simply parsed as meta '
'information',
logger='current',
level=logging.WARNING)
cls_metainfo[k] = v
else:
cls_metainfo[k] = v
return cls_metainfo
@property
def metainfo(self) -> dict:
"""Get meta information of dataset.
Returns:
dict: meta information collected from ``BaseDataset.METAINFO``,
annotation file and metainfo argument during instantiation.
"""
return copy.deepcopy(self._metainfo)
@abstractmethod
def evaluate(self, results, work_dir):
pass
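
# Minimal subclass sketch (an added example): override METAINFO and implement
# ``evaluate``. With ``metainfo={'classes': ['a', 'b']}`` passed at construction
# time, ``self.metainfo`` becomes {'name': 'dummy', 'classes': ['a', 'b']}.
class _DummyEvalDataset(BaseEvalDataset):
    METAINFO: dict = dict(name='dummy')

    def __len__(self):
        return 0

    def evaluate(self, results, work_dir):
        return {}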
import os
import json
import mmengine
from PIL import Image
import copy
from mmengine.dist import master_only
from .base_eval_dataset import BaseEvalDataset
SEG_PROMPT = "<image>\nPlease segment {}."
class RefVOSDataset(BaseEvalDataset):
def __init__(self,
image_folder,
expression_file,
mask_file,
):
super().__init__()
vid2metaid, metas, mask_dict = self.json_file_preprocess(expression_file, mask_file)
self.vid2metaid = vid2metaid
self.videos = list(self.vid2metaid.keys())
self.mask_dict = mask_dict
self.text_data = metas
self.image_folder = image_folder
def __len__(self):
return len(self.text_data)
def real_len(self):
return len(self.text_data)
def json_file_preprocess(self, expression_file, mask_file):
with open(expression_file, 'r') as f:
expression_datas = json.load(f)['videos']
metas = []
vid2metaid = {}
for vid_name in expression_datas:
vid_express_data = expression_datas[vid_name]
vid_frames = sorted(vid_express_data['frames'])
vid_len = len(vid_frames)
exp_id_list = sorted(list(vid_express_data['expressions'].keys()))
for exp_id in exp_id_list:
exp_dict = vid_express_data['expressions'][exp_id]
meta = {}
meta['video'] = vid_name
meta['exp'] = exp_dict['exp']
meta['frames'] = vid_frames
meta['exp_id'] = exp_id
meta['length'] = vid_len
metas.append(meta)
if vid_name not in vid2metaid.keys():
vid2metaid[vid_name] = []
vid2metaid[vid_name].append(len(metas) - 1)
if mask_file is not None:
mask_dict = mmengine.load(mask_file)
else:
mask_dict = None
return vid2metaid, metas, mask_dict
def __getitem__(self, index):
video_obj_info = copy.deepcopy(self.text_data[index])
exp = video_obj_info['exp']
data_dict = {}
video_id = video_obj_info['video']
frames_files = video_obj_info['frames']
frames_files = [
os.path.join(self.image_folder,video_id, frame_file + ".jpg") for frame_file in frames_files
]
images = []
ori_width, ori_height = None, None
for frame_idx, frame_path in enumerate(frames_files):
frame_image = Image.open(frame_path).convert('RGB')
if ori_height is None:
ori_width, ori_height = frame_image.size
else:
assert ori_width == frame_image.size[0]
assert ori_height == frame_image.size[1]
images.append(frame_image)
data_dict['type'] = 'video'
data_dict['index'] = index
data_dict['video_id'] = video_id
data_dict['images'] = images
data_dict['exp_id'] = video_obj_info['exp_id']
data_dict['frames'] = video_obj_info['frames']
data_dict['text_prompt'] = SEG_PROMPT.format(exp) if '?' not in exp else exp
data_dict['image_folder'] = self.image_folder
data_dict['length'] = video_obj_info['length']
data_dict['ori_height'] = ori_height
data_dict['ori_width'] = ori_width
return data_dict
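
# Minimal smoke-test sketch (an added example with assumed paths): these paths
# follow the MEVIS entry of DATASETS_INFO in the RefVOS evaluation script;
# adjust them to your setup.
if __name__ == '__main__':
    _ds = RefVOSDataset(
        image_folder='data/video_datas/mevis/valid/JPEGImages',
        expression_file='data/video_datas/mevis/valid/meta_expressions.json',
        mask_file=None,
    )
    _item = _ds[0]
    print(len(_ds), _item['video_id'], _item['exp_id'], len(_item['images']))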
#!/usr/bin/env bash
FILE=$1
MODEL=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-$((28500 + $RANDOM % 2000))}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
if command -v torchrun &> /dev/null
then
echo "Using torchrun mode."
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
  torchrun --nnodes=${NNODES} \
--node_rank=${NODE_RANK} \
--master_addr=${MASTER_ADDR} \
--master_port=${PORT} \
--nproc_per_node=${GPUS} \
${FILE} ${MODEL} --launcher pytorch "${@:4}"
else
echo "Using launch mode."
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 \
python -m torch.distributed.launch \
--nnodes=${NNODES} \
--node_rank=${NODE_RANK} \
--master_addr=${MASTER_ADDR} \
--master_port=${PORT} \
--nproc_per_node=${GPUS} \
${FILE} ${MODEL} --launcher pytorch "${@:4}"
fi
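
# Example invocation (illustrative; the launcher and eval script names below are
# placeholders for files in this repo):
#   bash dist.sh projects/llava_sam2/evaluation/refvos_eval.py /path/to/hf_model 8 --dataset MEVIS
# Positional arguments are FILE (the eval script), MODEL (HF model path) and GPUS;
# everything after the third argument is forwarded to FILE behind "--launcher pytorch".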
import argparse
import math
import os
import torch
import tqdm
from pycocotools import mask as mask_utils
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
from utils import _init_dist_pytorch, get_dist_info, collect_results_cpu
from PIL import Image
import re
import json
def parse_args():
parser = argparse.ArgumentParser(description='GCG')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--split',
default='val',
help='Specify a split')
parser.add_argument(
'--save_dir',
default='./gcg_pred/',
help='save path')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
IMAGE_FOLDER = './data/glamm_data/images/grandf/val_test/'
class GCGInferenceDataset:
def __init__(self,
image_folder,
save_dir=None,
):
self.image_folder = image_folder
self.images = os.listdir(image_folder)
if save_dir is not None:
# filter evaluated
self.save_dir = save_dir
            existing_files = os.listdir(self.save_dir)
            existing_files = [_file[:-5] for _file in existing_files]
            _images = []
            for item in self.images:
                if item[:-4] not in existing_files:
                    _images.append(item)
            self.images = _images
def __len__(self):
return len(self.images)
def get_questions(self):
question = "Could you please give me a brief description of the image? Please respond with interleaved \
segmentation masks for the corresponding parts of the answer."
return question
def __getitem__(self, index):
data_dict = {}
questions = self.get_questions()
image_file = self.images[index]
data_dict['image_file'] = image_file
image_file = os.path.join(self.image_folder, image_file)
image = Image.open(image_file).convert('RGB')
data_dict['image'] = image
data_dict['text'] = "<image>\n" + questions
data_dict['img_id'] = image_file
return data_dict
def main():
args = parse_args()
if args.launcher != 'none':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
torch.cuda.set_device(rank)
else:
rank = 0
world_size = 1
# build model
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
if not os.path.exists(args.save_dir):
os.mkdir(args.save_dir)
dataset = GCGInferenceDataset(
image_folder=IMAGE_FOLDER,
save_dir=args.save_dir,
)
results = []
n_samples = len(dataset)
per_rank_samples = math.ceil(n_samples / world_size) + 1
per_rank_ids = range(per_rank_samples * rank,
min(n_samples, per_rank_samples * (rank + 1)))
for idx in tqdm.tqdm(per_rank_ids):
data_batch = dataset[idx]
prediction = {'img_id': data_batch['img_id'], 'image_file': data_batch['image_file']}
del data_batch['img_id'], data_batch['image_file']
w, h = data_batch['image'].size
pred_dict = model.predict_forward(**data_batch, tokenizer=tokenizer)
if 'prediction_masks' not in pred_dict.keys() or pred_dict['prediction_masks'] is None or len(pred_dict['prediction_masks']) == 0:
print("No SEG !!!")
prediction['prediction_masks'] = torch.zeros((0, h, w), dtype=torch.bool)
else:
prediction['prediction_masks'] = torch.stack(pred_dict['prediction_masks'], dim=0)[:, 0]
process_and_save_output(
args.save_dir,
prediction['image_file'],
pred_dict['prediction'],
prediction['prediction_masks']
)
results.append(pred_dict['prediction'])
results = collect_results_cpu(results, len(dataset), tmpdir='./gcg_eval_tmp')
def process_and_save_output(output_dir, image_name, text_output, pred_masks):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
    text_output = text_output.replace("<s>", "").replace("\n", "").replace("  ", " ")
text_output = text_output.split("ASSISTANT: ")[-1]
cleaned_str = re.sub(r'<.*?>', '', text_output)
pattern = re.compile(r'<p>(.*?)<\/p>')
phrases = pattern.findall(text_output)
phrases = [p.strip() for p in phrases]
# Remove the [SEG] token
cleaned_str = cleaned_str.replace('[SEG]', '')
# Strip unnecessary spaces
cleaned_str = ' '.join(cleaned_str.split()).strip("'")
cleaned_str = cleaned_str.strip()
# Convert the predicted masks into RLE format
pred_masks_tensor = pred_masks.cpu()
uncompressed_mask_rles = mask_to_rle_pytorch(pred_masks_tensor)
rle_masks = []
for m in uncompressed_mask_rles:
rle_masks.append(coco_encode_rle(m))
# Create results dictionary
# print(f"clean_str: {cleaned_str}")
result_dict = {
"image_id": image_name[:-4],
"caption": cleaned_str,
"phrases": phrases,
"pred_masks": rle_masks
}
# print(cleaned_str)
# print(phrases)
output_path = f"{output_dir}/{image_name[:-4]}.json"
with open(output_path, 'w') as f:
json.dump(result_dict, f)
return
def mask_to_rle_pytorch(tensor: torch.Tensor):
"""
Encodes masks to an uncompressed RLE, in the format expected by
pycoco tools.
"""
# Put in fortran order and flatten h,w
b, h, w = tensor.shape
tensor = tensor.permute(0, 2, 1).flatten(1)
# Compute change indices
diff = tensor[:, 1:] ^ tensor[:, :-1]
change_indices = diff.nonzero()
# Encode run length
out = []
for i in range(b):
cur_idxs = change_indices[change_indices[:, 0] == i, 1]
cur_idxs = torch.cat(
[torch.tensor([0], dtype=cur_idxs.dtype, device=cur_idxs.device), cur_idxs + 1,
torch.tensor([h * w], dtype=cur_idxs.dtype, device=cur_idxs.device), ]
)
btw_idxs = cur_idxs[1:] - cur_idxs[:-1]
counts = [] if tensor[i, 0] == 0 else [0]
counts.extend(btw_idxs.detach().cpu().tolist())
out.append({"size": [h, w], "counts": counts})
return out
def coco_encode_rle(uncompressed_rle):
h, w = uncompressed_rle["size"]
rle = mask_utils.frPyObjects(uncompressed_rle, h, w)
rle["counts"] = rle["counts"].decode("utf-8") # Necessary to serialize with json
return rle
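def _rle_roundtrip_check():
    """Sanity sketch (an added example, not called by the script): encode a toy
    mask with mask_to_rle_pytorch + coco_encode_rle, then decode it back with
    pycocotools. Counts are re-encoded to bytes because ``decode`` expects bytes."""
    toy = torch.zeros((1, 4, 6), dtype=torch.bool)
    toy[0, 1:3, 2:5] = True
    rle = coco_encode_rle(mask_to_rle_pytorch(toy)[0])
    rle_bytes = {"size": rle["size"], "counts": rle["counts"].encode()}
    decoded = mask_utils.decode(rle_bytes)  # (4, 6) uint8 array
    assert (decoded == toy[0].numpy().astype("uint8")).all()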
if __name__ == '__main__':
main()
import os
import json
import argparse
from tqdm import tqdm
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from pycocotools import mask as maskUtils
from pycocoevalcap.eval import COCOEvalCap
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
def parse_args():
parser = argparse.ArgumentParser(description="Training")
parser.add_argument("--split", required=True, help="Evaluation split, options are 'val', 'test'")
parser.add_argument("--prediction_dir_path", required=True, help="The path where the inference results are stored.")
parser.add_argument("--gt_dir_path", required=False, default="./data/glamm_data/annotations/gcg_val_test/",
help="The path containing GranD-f evaluation annotations.")
args = parser.parse_args()
return args
# Load pre-trained model tokenizer and model for evaluation
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
def get_bert_embedding(text):
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
outputs = model(**inputs)
# Use the mean of the last hidden states as sentence embedding
sentence_embedding = torch.mean(outputs.last_hidden_state[0], dim=0).detach().numpy()
return sentence_embedding
def compute_iou(mask1, mask2):
intersection = np.logical_and(mask1, mask2)
union = np.logical_or(mask1, mask2)
iou = np.sum(intersection) / np.sum(union)
return iou
def bbox_to_x1y1x2y2(bbox):
x1, y1, w, h = bbox
bbox = [x1, y1, x1 + w, y1 + h]
return bbox
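# e.g. bbox_to_x1y1x2y2([10, 20, 30, 40]) -> [10, 20, 40, 60]  (xywh -> xyxy)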
def compute_miou(pred_masks, gt_masks):
# Computing mIoU between predicted masks and ground truth masks
iou_matrix = np.zeros((len(pred_masks), len(gt_masks)))
for i, pred_mask in enumerate(pred_masks):
for j, gt_mask in enumerate(gt_masks):
iou_matrix[i, j] = compute_iou(pred_mask, gt_mask)
# One-to-one pairing and mean IoU calculation
paired_iou = []
while iou_matrix.size > 0 and np.max(iou_matrix) > 0:
max_iou_idx = np.unravel_index(np.argmax(iou_matrix, axis=None), iou_matrix.shape)
paired_iou.append(iou_matrix[max_iou_idx])
iou_matrix = np.delete(iou_matrix, max_iou_idx[0], axis=0)
iou_matrix = np.delete(iou_matrix, max_iou_idx[1], axis=1)
return np.mean(paired_iou) if paired_iou else 0.0
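def _miou_toy_example():
    """Worked toy example for the greedy matching above (an added illustration,
    not called during evaluation): one perfect match (IoU 1.0) and one
    half-overlapping pair (IoU 0.5) average to an mIoU of 0.75."""
    a = np.zeros((4, 4), dtype=np.uint8)
    a[:2, :2] = 1
    b = np.zeros((4, 4), dtype=np.uint8)
    b[2:, 2:] = 1
    b_half = np.zeros((4, 4), dtype=np.uint8)
    b_half[2:, 2:3] = 1
    assert abs(compute_miou([a, b_half], [a, b]) - 0.75) < 1e-6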
def evaluate_mask_miou(coco_gt, image_ids, pred_save_path):
# Load predictions
coco_dt = coco_gt.loadRes(pred_save_path)
mious = []
for image_id in tqdm(image_ids):
# Getting ground truth masks
matching_anns = [ann for ann in coco_gt.anns.values() if ann['image_id'] == image_id]
ann_ids = [ann['id'] for ann in matching_anns]
gt_anns = coco_gt.loadAnns(ann_ids)
gt_masks = [maskUtils.decode(ann['segmentation']) for ann in gt_anns if 'segmentation' in ann]
# Getting predicted masks
matching_anns = [ann for ann in coco_dt.anns.values() if ann['image_id'] == image_id]
dt_ann_ids = [ann['id'] for ann in matching_anns]
pred_anns = coco_dt.loadAnns(dt_ann_ids)
pred_masks = [maskUtils.decode(ann['segmentation']) for ann in pred_anns if 'segmentation' in ann]
# Compute and save the mIoU for the current image
mious.append(compute_miou(pred_masks, gt_masks))
# Report mean IoU across all images
mean_miou = np.mean(mious) if mious else 0.0 # If list is empty, return 0.0
print(f"Mean IoU (mIoU) across all images: {mean_miou:.3f}")
def compute_iou_matrix(pred_masks, gt_masks):
iou_matrix = np.zeros((len(pred_masks), len(gt_masks)))
for i, pred_mask in enumerate(pred_masks):
for j, gt_mask in enumerate(gt_masks):
iou_matrix[i, j] = compute_iou(pred_mask, gt_mask)
return iou_matrix
def text_similarity_bert(str1, str2):
emb1 = get_bert_embedding(str1)
emb2 = get_bert_embedding(str2)
return cosine_similarity([emb1], [emb2])[0, 0]
def find_best_matches(gt_anns, gt_labels, dt_anns, dt_labels, iou_threshold, text_sim_threshold, vectorizer=None):
best_matches = []
    # Compute pair-wise IoU
pred_masks = [maskUtils.decode(ann['segmentation']) for ann in dt_anns]
gt_masks = [maskUtils.decode(ann['segmentation']) for ann in gt_anns]
ious = compute_iou_matrix(gt_masks, pred_masks)
text_sims = np.zeros((len(gt_labels), len(dt_labels)))
for i, gt_label in enumerate(gt_labels):
for j, dt_label in enumerate(dt_labels):
text_sims[i, j] = text_similarity_bert(gt_label, dt_label)
# Find one-to-one matches satisfying both IoU and text similarity thresholds
while ious.size > 0:
max_iou_idx = np.unravel_index(np.argmax(ious), ious.shape)
if ious[max_iou_idx] < iou_threshold or text_sims[max_iou_idx] < text_sim_threshold:
break # No admissible pair found
best_matches.append(max_iou_idx)
# Remove selected annotations from consideration
ious[max_iou_idx[0], :] = 0
ious[:, max_iou_idx[1]] = 0
text_sims[max_iou_idx[0], :] = 0
text_sims[:, max_iou_idx[1]] = 0
return best_matches # List of index pairs [(gt_idx, dt_idx), ...]
def evaluate_recall_with_mapping(coco_gt, coco_cap_gt, image_ids, pred_save_path, cap_pred_save_path, iou_threshold=0.5,
text_sim_threshold=0.5):
coco_dt = coco_gt.loadRes(pred_save_path)
coco_cap_dt = coco_cap_gt.loadRes(cap_pred_save_path)
true_positives = 0
actual_positives = 0
for image_id in tqdm(image_ids):
try:
# gt_ann_ids = coco_gt.getAnnIds(imgIds=image_id, iscrowd=None)
matching_anns = [ann for ann in coco_gt.anns.values() if ann['image_id'] == image_id]
gt_ann_ids = [ann['id'] for ann in matching_anns]
gt_anns = coco_gt.loadAnns(gt_ann_ids)
# dt_ann_ids = coco_dt.getAnnIds(imgIds=image_id, iscrowd=None)
matching_anns = [ann for ann in coco_dt.anns.values() if ann['image_id'] == image_id]
dt_ann_ids = [ann['id'] for ann in matching_anns]
dt_anns = coco_dt.loadAnns(dt_ann_ids)
# gt_cap_ann_ids = coco_cap_gt.getAnnIds(imgIds=image_id)
matching_anns = [ann for ann in coco_cap_gt.anns.values() if ann['image_id'] == image_id]
gt_cap_ann_ids = [ann['id'] for ann in matching_anns]
gt_cap_ann = coco_cap_gt.loadAnns(gt_cap_ann_ids)[0]
# dt_cap_ann_ids = coco_cap_dt.getAnnIds(imgIds=image_id)
matching_anns = [ann for ann in coco_cap_dt.anns.values() if ann['image_id'] == image_id]
dt_cap_ann_ids = [ann['id'] for ann in matching_anns]
dt_cap_ann = coco_cap_dt.loadAnns(dt_cap_ann_ids)[0]
gt_labels = gt_cap_ann['labels']
dt_labels = dt_cap_ann['labels']
actual_positives += len(gt_labels)
# Find best matching pairs
best_matches = find_best_matches(gt_anns, gt_labels, dt_anns, dt_labels, iou_threshold, text_sim_threshold)
true_positives += len(best_matches)
except Exception as e:
print(e)
recall = true_positives / actual_positives if actual_positives > 0 else 0
print(f"Recall: {recall:.3f}")
def main():
args = parse_args()
# Set the correct split
split = args.split
assert split == "val" or split == "test" # GCG Evaluation has only val and test splits
gt_mask_path = f"{args.gt_dir_path}/{split}_gcg_coco_mask_gt.json"
gt_cap_path = f"{args.gt_dir_path}/{split}_gcg_coco_caption_gt.json"
print(f"Starting evalution on {split} split.")
# Get the image names of the split
all_images_ids = []
with open(gt_cap_path, 'r') as f:
contents = json.load(f)
for image in contents['images']:
all_images_ids.append(image['id'])
# The directory is used to store intermediate files
tmp_dir_path = f"tmp/{os.path.basename(args.prediction_dir_path)}_{split}"
os.makedirs(tmp_dir_path, exist_ok=True) # Create directory if not exists already
# Create predictions
pred_save_path = f"{tmp_dir_path}/mask_pred_tmp_save.json"
cap_pred_save_path = f"{tmp_dir_path}/cap_pred_tmp_save.json"
coco_pred_file = []
caption_pred_dict = {}
for image_id in all_images_ids:
prediction_path = f"{args.prediction_dir_path}/{image_id}.json"
with open(prediction_path, 'r') as f:
pred = json.load(f)
bu = pred
key = list(pred.keys())[0]
pred = pred[key]
try:
caption_pred_dict[image_id] = {'caption': pred['caption'], 'labels': pred['phrases']}
except Exception as e:
pred = bu
caption_pred_dict[image_id] = {'caption': pred['caption'], 'labels': pred['phrases']}
for rle_mask in pred['pred_masks']:
coco_pred_file.append({"image_id": image_id, "category_id": 1, "segmentation": rle_mask, "score": 1.0})
# Save gcg_coco_predictions
with open(pred_save_path, 'w') as f:
json.dump(coco_pred_file, f)
# Prepare the CAPTION predictions in COCO format
cap_image_ids = []
coco_cap_pred_file = []
for image_id, values in caption_pred_dict.items():
cap_image_ids.append(image_id)
coco_cap_pred_file.append({"image_id": image_id, "caption": values['caption'], "labels": values['labels']})
# Save gcg_caption_coco_predictions
with open(cap_pred_save_path, 'w') as f:
json.dump(coco_cap_pred_file, f)
# # -------------------------------#
# 1. Evaluate AP
# Calculate mask mAP
# Load the ground truth and predictions in COCO format
coco_gt = COCO(gt_mask_path)
coco_dt = coco_gt.loadRes(pred_save_path) # load predictions
# Initialize COCOEval and specify the metric you want to use
coco_eval = COCOeval(coco_gt, coco_dt, "segm") # "segm" for segmentation
# Evaluate on a specific category
coco_eval.params.catIds = [1] # your category ID
# Evaluate
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
# # -------------------------------#
# # 2. Evaluate Caption Quality
try:
coco_cap_gt = COCO(gt_cap_path)
coco_cap_result = coco_cap_gt.loadRes(cap_pred_save_path)
# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco_cap_gt, coco_cap_result)
coco_eval.params['image_id'] = coco_cap_result.getImgIds()
coco_eval.evaluate()
for metric, score in coco_eval.eval.items():
print(f'{metric}: {score:.3f}')
    except Exception:
        # Caption metrics are optional; skip them if pycocoevalcap is unavailable or fails.
        pass
# # -------------------------------#
# 3. Evaluate Mask Mean MIoU
coco_gt = COCO(gt_mask_path) # Load ground truth annotations
evaluate_mask_miou(coco_gt, all_images_ids, pred_save_path)
# # -------------------------------#
# 4. Evaluate Recall
evaluate_recall_with_mapping(coco_gt, coco_cap_gt, all_images_ids, pred_save_path, cap_pred_save_path,
iou_threshold=0.5, text_sim_threshold=0.5)
if __name__ == "__main__":
main()
import argparse
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
def parse_args():
parser = argparse.ArgumentParser(description="GLaMM Inference - Region Captioning")
parser.add_argument("--annotation_file",
default="./data/region_caption/mdetr_annotations/finetune_refcocog_val_captions.json", type=str,
help="Replace with 'data/visual_genome/test_caption.json' for VG.")
parser.add_argument("--results_dir", default="results", type=str, help="The path to save the results.")
return parser.parse_args()
def main():
args = parse_args()
# Load the annotation file
coco = COCO(args.annotation_file)
coco_result = coco.loadRes(args.results_dir)
# Create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)
# Evaluate results
coco_eval.params['image_id'] = coco_result.getImgIds()
coco_eval.evaluate()
# Print and save the output evaluation scores
    output_file_path = "./region_cap_metrics.txt"
    with open(output_file_path, 'w') as f:
        for metric, score in coco_eval.eval.items():
            print(f'{metric}: {score:.3f}')
            f.write(f"{metric}: {score:.3f}\n")
if __name__ == "__main__":
main()
import argparse
import json
import os
import mmengine
import numpy as np
from PIL import Image
import torch
import torch.distributed
import torch.utils.data
import tqdm
from transformers import AutoModel, AutoTokenizer
from projects.llava_sam2.evaluation.dataset import RefVOSDataset
from projects.llava_sam2.evaluation.utils import _init_dist_pytorch, _init_dist_slurm, get_dist_info, get_rank, collect_results_cpu
import concurrent.futures
from pycocotools import mask as cocomask
def async_func(executor, func, **kwargs):
future = executor.submit(func, **kwargs)
return future
def mask_to_rle(mask):
rle = []
for m in mask:
rle.append(cocomask.encode(np.asfortranarray(m.astype(np.uint8))))
rle[-1]['counts'] = rle[-1]['counts'].decode()
return rle
def mask_save(item, mask_prediction, work_dir):
vid_id = item['video_id']
exp_id = item['exp_id']
save_path = os.path.join(work_dir, 'Annotations', vid_id, exp_id)
mmengine.mkdir_or_exist(save_path)
for id_m, mask in enumerate(mask_prediction):
mask = Image.fromarray(mask.astype(np.float32) * 255).convert('L')
file_name = item['frames'][id_m]
save_file = os.path.join(save_path, file_name + ".png")
mask.save(save_file)
DATASETS_INFO = {
'DAVIS': {
'data_root': 'data/video_datas/davis17/',
'image_folder': 'data/video_datas/davis17/valid/JPEGImages/',
'expression_file': 'data/video_datas/davis17/meta_expressions/valid/meta_expressions.json',
'mask_file': 'data/video_datas/davis17/valid/mask_dict.pkl',
},
'MEVIS': {
'data_root': 'data/video_datas/mevis/valid/',
'image_folder': 'data/video_datas/mevis/valid/JPEGImages',
'expression_file': 'data/video_datas/mevis/valid/meta_expressions.json',
'mask_file': None,
},
'MEVIS_U': {
'data_root': 'data/video_datas/mevis/valid_u/',
'image_folder': 'data/video_datas/mevis/valid_u/JPEGImages',
'expression_file': 'data/video_datas/mevis/valid_u/meta_expressions.json',
'mask_file': 'data/video_datas/mevis/valid_u/mask_dict.json',
},
'REFYTVOS': {
'data_root': 'data/video_datas/rvos/',
'image_folder': 'data/video_datas/rvos/valid/JPEGImages/',
'expression_file': 'data/video_datas/rvos/meta_expressions/valid/meta_expressions.json',
'mask_file': None,
},
'REVOS': {
'data_root': 'data/video_datas/revos/',
'image_folder': 'data/video_datas/revos/',
'expression_file': 'data/video_datas/revos/meta_expressions_valid_.json',
'mask_file': None,
}
}
def parse_args():
parser = argparse.ArgumentParser(description='RefVOS')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--dataset',
choices=DATASETS_INFO.keys(),
default='MEVIS',
help='Specify a dataset')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
parser.add_argument('--submit', action='store_true')
parser.add_argument('--work_dir', type=str, default=None)
parser.add_argument('--deepspeed', type=str, default=None) # dummy
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
if __name__ == '__main__':
args = parse_args()
work_dir = args.work_dir
if work_dir is None:
work_dir = 'work_dirs/foobar'
if args.launcher == 'none':
rank = 0
world_size = 1
elif args.launcher == 'pytorch':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
elif args.launcher == 'slurm':
_init_dist_slurm('nccl')
rank, world_size = get_dist_info()
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
dataset_info = DATASETS_INFO[args.dataset]
dataset = RefVOSDataset(
image_folder=dataset_info['image_folder'],
expression_file=dataset_info['expression_file'],
mask_file=dataset_info['mask_file'],
)
sampler = torch.utils.data.DistributedSampler(
dataset,
num_replicas=world_size,
rank=rank,
shuffle=False,
drop_last=False
)
dataloader = torch.utils.data.DataLoader(
dataset,
sampler=sampler,
batch_size=1,
num_workers=2,
pin_memory=False,
collate_fn=lambda x:x[0],
)
results = []
executor = concurrent.futures.ThreadPoolExecutor()
for item in tqdm.tqdm(dataloader):
with torch.no_grad():
result = model.predict_forward(
video=item['images'],
text=item['text_prompt'],
tokenizer=tokenizer,
)
text_idx = 0
text_prediction = result['prediction']
if len(result['prediction_masks']) > 0:
mask_prediction = result['prediction_masks'][text_idx]
else:
print(text_prediction)
mask_prediction = np.zeros((item['length'], item['ori_height'], item['ori_width']), dtype=np.uint8)
if args.submit:
async_func(executor, mask_save, item=item, mask_prediction=mask_prediction, work_dir=work_dir)
encoded_mask = None
else:
encoded_mask = mask_to_rle(mask_prediction)
result = {
'index': item['index'],
'video_id': item['video_id'],
'exp_id': item['exp_id'],
'text_prediction': text_prediction,
'frames': item['frames'],
'exp': item['text_prompt'],
'prediction_masks': encoded_mask,
}
results.append(result)
executor.shutdown(wait=True)
print(f'[Rank {rank}] : Finished.')
if not args.submit:
results = collect_results_cpu(results, len(dataset))
if get_rank() == 0:
final_results = {}
for item in results:
vid_id = item['video_id']
exp_id = item['exp_id']
if vid_id not in final_results:
final_results[vid_id] = {}
assert exp_id not in final_results[vid_id]
final_results[vid_id][exp_id] = item
os.makedirs(work_dir, exist_ok=True)
            with open(f'{work_dir}/results.json', 'w') as f:
                json.dump(final_results, f)
if rank == 0:
print('Done')
import argparse
import copy
import math
import os
import torch
import tqdm
from pycocotools import mask as _mask
import numpy as np
import random
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
from utils import _init_dist_pytorch, get_dist_info, get_rank, collect_results_cpu
from dataset import RESDataset
def parse_args():
parser = argparse.ArgumentParser(description='RefCocoSeg')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--dataset',
choices=DATASETS_ATTRIBUTES.keys(),
default='refcoco',
help='Specify a ref dataset')
parser.add_argument(
'--split',
default='val',
help='Specify a split')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
DATASETS_ATTRIBUTES = {
'refcoco': {'splitBy': "unc", 'dataset_name': 'refcoco'},
'refcoco_plus': {'splitBy': "unc", 'dataset_name': 'refcoco_plus'},
'refcocog': {'splitBy': "umd", 'dataset_name': 'refcocog'},
}
IMAGE_FOLDER = './data/glamm_data/images/coco2014/train2014/'
DATA_PATH = './data/ref_seg/'
def main():
args = parse_args()
if args.launcher != 'none':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
torch.cuda.set_device(rank)
else:
rank = 0
world_size = 1
# build model
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
dataset_info = DATASETS_ATTRIBUTES[args.dataset]
dataset = RESDataset(
image_folder=IMAGE_FOLDER,
dataset_name=dataset_info['dataset_name'],
data_path=DATA_PATH,
split=args.split,
)
results = []
n_samples = len(dataset)
per_rank_samples = math.ceil(n_samples / world_size) + 1
per_rank_ids = range(per_rank_samples * rank,
min(n_samples, per_rank_samples * (rank + 1)))
for idx in tqdm.tqdm(per_rank_ids):
data_batch = dataset[idx]
prediction = {'img_id': data_batch['img_id'], 'gt_masks': data_batch['gt_masks']}
prediction['gt_masks'] = mask_to_rle(prediction['gt_masks'].cpu().numpy())
del data_batch['img_id'], data_batch['gt_masks']
texts = data_batch['text']
del data_batch['text']
pred_masks = []
for text in texts:
_data_batch = copy.deepcopy(data_batch)
_data_batch['text'] = text
pred_mask = model.predict_forward(**_data_batch, tokenizer=tokenizer)['prediction_masks']
if len(pred_mask) == 0:
# give a zero mask
print("No seg pred !!!")
pred_masks.append(None)
else:
_ret_mask = pred_mask[0].cpu().numpy()
_ret_mask = mask_to_rle(_ret_mask)
pred_masks.append(_ret_mask)
prediction.update({'prediction_masks': pred_masks})
results.append(prediction)
tmpdir = './dist_test_temp_res_' + args.dataset + args.split + args.model_path.replace('/', '').replace('.', '')
results = collect_results_cpu(results, len(dataset), tmpdir=tmpdir)
if get_rank() == 0:
metric = dataset.evaluate(results, './work_dirs')
print(metric)
def mask_to_rle(mask):
rle = []
for m in mask:
rle.append(_mask.encode(np.asfortranarray(m.astype(np.uint8))))
rle[-1]['counts'] = rle[-1]['counts'].decode()
return rle
if __name__ == '__main__':
main()
import argparse
import re
import math
import os
import torch
import tqdm
from pycocotools import mask as _mask
import numpy as np
from PIL import Image
from pycocotools.coco import COCO
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
import json
from utils import _init_dist_pytorch, get_dist_info, get_rank, collect_results_cpu
def parse_args():
parser = argparse.ArgumentParser(description='RefCocog region caption')
parser.add_argument('model_path', help='hf model path.')
parser.add_argument(
'--output-path',
default='./region_cap_pred.json',
help='save path of the prediction')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
class RegionCapInferenceDataset:
def __init__(self,
image_folder,
annotation_file=None,
):
self.image_folder = image_folder
self.coco = COCO(annotation_file)
self.image_dict = self.coco.imgs
self.ann_dict = self.coco.anns
self.image_dict_keys = list(self.image_dict.keys())
def __len__(self):
return len(self.image_dict_keys)
def decode_mask(self, annotation, image_info):
flag = False
masks = []
for ann_id in range(1):
ann = {"segmentation": annotation}
if len(ann["segmentation"]) == 0:
m = np.zeros((image_info["height"], image_info["width"])).astype(
np.uint8
)
masks.append(m)
continue
if type(ann["segmentation"][0]) == list: # polygon
rle = _mask.frPyObjects(
ann["segmentation"], image_info["height"], image_info["width"]
)
else:
rle = ann["segmentation"]
for i in range(len(rle)):
if not isinstance(rle[i]["counts"], bytes):
rle[i]["counts"] = rle[i]["counts"].encode()
m = _mask.decode(rle)
            m = np.sum(m, axis=2)  # sometimes there are multiple binary maps (corresponding to multiple segs)
m = m.astype(np.uint8) # convert to np.uint8
masks.append(m)
masks = np.stack(masks, axis=0)
return masks
def get_questions(self):
# question = "<image>\nPlease give me a short description of the region in the picture marked by region1. Please response in a word."
question = "<image>\nPlease give me a short description of the region in the picture marked by region1."
return question
def __getitem__(self, index):
data_dict = {}
image_id = self.image_dict_keys[index]
image_file = self.image_dict[image_id]['file_name']
questions = self.get_questions()
data_dict['image_file'] = image_file
image_file = os.path.join(self.image_folder, image_file)
image = Image.open(image_file).convert('RGB')
masks = self.ann_dict[image_id]['segmentation']
image_info = self.image_dict[image_id]
masks = self.decode_mask(masks, image_info)
data_dict['image'] = image
data_dict['text'] = questions
data_dict['img_id'] = image_id
data_dict['mask_prompts'] = [masks]
return data_dict
ANNOTATION_FILE = './data/region_caption/refcocog/finetune_refcocog_val_with_mask.json'
IMAGE_FOLDER = './data/glamm_data/images/coco2014/train2014/'
def main():
args = parse_args()
if args.launcher != 'none':
_init_dist_pytorch('nccl')
rank, world_size = get_dist_info()
torch.cuda.set_device(rank)
else:
rank = 0
world_size = 1
# build model
model = AutoModel.from_pretrained(
args.model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
args.model_path,
trust_remote_code=True,
)
dataset = RegionCapInferenceDataset(
image_folder=IMAGE_FOLDER,
annotation_file=ANNOTATION_FILE,
)
results = []
n_samples = len(dataset)
per_rank_samples = math.ceil(n_samples / world_size) + 1
per_rank_ids = range(per_rank_samples * rank,
min(n_samples, per_rank_samples * (rank + 1)))
for idx in tqdm.tqdm(per_rank_ids):
data_batch = dataset[idx]
result_dict = {'image_id': data_batch['img_id'], 'image_file': data_batch['image_file']}
del data_batch['img_id'], data_batch['image_file']
prediction = model.predict_forward(**data_batch, tokenizer=tokenizer)['prediction']
text_output = prediction.replace("<s>", "").replace("\n", "") \
.replace("region1", '').replace("Region1", '').replace("The region marked by", "").replace("The region marked as", "").replace("The region marked", "") \
.replace("is", "").replace("shows", "").replace(':', '').replace(" ", " ").replace(" ", " ")
text_output = text_output.split("ASSISTANT: ")[-1]
cleaned_str = re.sub(r'<.*?>', '', text_output)
cleaned_str = cleaned_str.replace('[SEG]', '')
cleaned_str = ' '.join(cleaned_str.split()).strip("'")
cleaned_str = cleaned_str.strip()
result_dict["caption"] = cleaned_str
result_dict["prediction"] = cleaned_str
results.append(result_dict)
tmpdir = './dist_test_temp_regioncap_' + args.model_path.replace('/', '').replace('.', '')
results = collect_results_cpu(results, len(dataset), tmpdir=tmpdir)
if get_rank() == 0:
with open(args.output_path, 'w') as json_file:
json.dump(results, json_file, indent=2)
if __name__ == '__main__':
main()
from .dist import _init_dist_pytorch, get_dist_info, master_only, get_rank, collect_results_cpu, _init_dist_slurm, barrier
from .refcoco_refer import REFER
from .utils_refcoco import AverageMeter, Summary, intersectionAndUnionGPU
from itertools import zip_longest, chain
import os.path as osp
import subprocess
import torch
import os
from torch import distributed as torch_dist
from torch.distributed import ProcessGroup
import functools
from typing import Callable, Optional, Tuple
import pickle
import shutil
def _init_dist_pytorch(backend, **kwargs) -> None:
"""Initialize distributed environment with PyTorch launcher.
Args:
backend (str): Backend of torch.distributed. Supported backends are
'nccl', 'gloo' and 'mpi'. Defaults to 'nccl'.
**kwargs: keyword arguments are passed to ``init_process_group``.
"""
# LOCAL_RANK is set by `torch.distributed.launch` since PyTorch 1.1
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
torch_dist.init_process_group(backend=backend, **kwargs)
def _init_dist_slurm(backend,
port=None,
init_backend='torch',
**kwargs) -> None:
"""Initialize slurm distributed training environment.
If argument ``port`` is not specified, then the master port will be system
environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
environment variable, then a default port ``29500`` will be used.
Args:
backend (str): Backend of torch.distributed.
port (int, optional): Master port. Defaults to None.
"""
proc_id = int(os.environ['SLURM_PROCID'])
ntasks = int(os.environ['SLURM_NTASKS'])
node_list = os.environ['SLURM_NODELIST']
# Not sure when this environment variable could be None, so use a fallback
local_rank_env = os.environ.get('SLURM_LOCALID', None)
if local_rank_env is not None:
local_rank = int(local_rank_env)
else:
num_gpus = torch.cuda.device_count()
local_rank = proc_id % num_gpus
addr = subprocess.getoutput(
f'scontrol show hostname {node_list} | head -n1')
# specify master port
if port is not None:
os.environ['MASTER_PORT'] = str(port)
elif 'MASTER_PORT' in os.environ:
pass # use MASTER_PORT in the environment variable
else:
# 29500 is torch.distributed default port
os.environ['MASTER_PORT'] = '29500'
# use MASTER_ADDR in the environment variable if it already exists
if 'MASTER_ADDR' not in os.environ:
os.environ['MASTER_ADDR'] = addr
os.environ['WORLD_SIZE'] = str(ntasks)
os.environ['LOCAL_RANK'] = str(local_rank)
os.environ['RANK'] = str(proc_id)
torch.cuda.set_device(local_rank)
if init_backend == 'torch':
torch_dist.init_process_group(backend=backend, **kwargs)
elif init_backend == 'deepspeed':
import deepspeed
deepspeed.init_distributed(dist_backend=backend, **kwargs)
elif init_backend == 'colossalai':
import colossalai
colossalai.launch_from_slurm(
backend=backend,
host=os.environ['MASTER_ADDR'],
port=os.environ['MASTER_PORT'],
**kwargs,
)
else:
raise ValueError(
'supported "init_backend" is "torch" or "deepspeed", '
f'but got {init_backend}')
def get_dist_info(group=None) -> Tuple[int, int]:
"""Get distributed information of the given process group.
Note:
Calling ``get_dist_info`` in non-distributed environment will return
(0, 1).
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
tuple[int, int]: Return a tuple containing the ``rank`` and
``world_size``.
"""
world_size = get_world_size(group)
rank = get_rank(group)
return rank, world_size
def get_world_size(group: Optional[ProcessGroup] = None) -> int:
"""Return the number of the given process group.
Note:
Calling ``get_world_size`` in non-distributed environment will return
1.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
int: Return the number of processes of the given process group if in
distributed environment, otherwise 1.
"""
if is_distributed():
# handle low versions of torch like 1.5.0 which does not support
# passing in None for group argument
if group is None:
group = get_default_group()
return torch_dist.get_world_size(group)
else:
return 1
def get_rank(group: Optional[ProcessGroup] = None) -> int:
"""Return the rank of the given process group.
Rank is a unique identifier assigned to each process within a distributed
process group. They are always consecutive integers ranging from 0 to
``world_size``.
Note:
Calling ``get_rank`` in non-distributed environment will return 0.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
int: Return the rank of the process group if in distributed
environment, otherwise 0.
"""
if is_distributed():
# handle low versions of torch like 1.5.0 which does not support
# passing in None for group argument
if group is None:
group = get_default_group()
return torch_dist.get_rank(group)
else:
return 0
def is_distributed() -> bool:
"""Return True if distributed environment has been initialized."""
return torch_dist.is_available() and torch_dist.is_initialized()
def get_default_group() -> Optional[ProcessGroup]:
"""Return default process group."""
return torch_dist.distributed_c10d._get_default_group()
def is_main_process(group: Optional[ProcessGroup] = None) -> bool:
"""Whether the current rank of the given process group is equal to 0.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
Returns:
bool: Return True if the current rank of the given process group is
equal to 0, otherwise False.
"""
return get_rank(group) == 0
def master_only(func: Callable) -> Callable:
"""Decorate those methods which should be executed in master process.
Args:
func (callable): Function to be decorated.
Returns:
callable: Return decorated function.
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
if is_main_process():
return func(*args, **kwargs)
return wrapper
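# Usage sketch for ``master_only`` (an added illustration): only the rank-0
# process executes the body; every other rank receives ``None`` from the wrapper.
@master_only
def _log_on_master(msg: str) -> None:
    print(f'[rank 0] {msg}')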
def collect_results_cpu(result_part: list,
size: int,
tmpdir='./dist_test_temp'):
"""Collect results under cpu mode.
On cpu mode, this function will save the results on different gpus to
``tmpdir`` and collect them by the rank 0 worker.
Args:
result_part (list): Result list containing result parts
to be collected. Each item of ``result_part`` should be a picklable
object.
size (int): Size of the results, commonly equal to length of
the results.
        tmpdir (str): Temporary directory used to store the collected results
            from different ranks. Defaults to './dist_test_temp'.
Returns:
list or None: The collected results.
"""
rank, world_size = get_dist_info()
if world_size == 1:
return result_part[:size]
# create a tmp dir if it is not specified
if not os.path.exists(tmpdir):
os.mkdir(tmpdir)
# dump the part result to the dir
with open(osp.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f: # type: ignore
pickle.dump(result_part, f, protocol=2)
barrier()
# collect all parts
if rank != 0:
return None
else:
# load results of all parts from tmp dir
part_list = []
for i in range(world_size):
path = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore
if not osp.exists(path):
raise FileNotFoundError(
                    f'{tmpdir} is not a shared directory for '
f'rank {i}, please make sure {tmpdir} is a shared '
'directory for all ranks!')
with open(path, 'rb') as f:
part_list.append(pickle.load(f))
# sort the results
ordered_results = []
zipped_results = zip_longest(*part_list)
ordered_results = [
i for i in chain.from_iterable(zipped_results) if i is not None
]
# the dataloader may pad some samples
ordered_results = ordered_results[:size]
# remove tmp dir
shutil.rmtree(tmpdir) # type: ignore
return ordered_results
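# Typical usage sketch (an added illustration; ``run_inference``, ``evaluate`` and
# ``my_shard`` are hypothetical names): every rank contributes its own shard of
# predictions, and only rank 0 gets back the merged, size-truncated list.
#
#   part = [run_inference(sample) for sample in my_shard]
#   merged = collect_results_cpu(part, size=len(dataset), tmpdir='./dist_test_temp')
#   if get_rank() == 0:
#       evaluate(merged)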
def barrier(group: Optional[ProcessGroup] = None) -> None:
"""Synchronize all processes from the given process group.
This collective blocks processes until the whole group enters this
function.
Note:
Calling ``barrier`` in non-distributed environment will do nothing.
Args:
group (ProcessGroup, optional): The process group to work on. If None,
the default process group will be used. Defaults to None.
"""
if is_distributed():
# handle low versions of torch like 1.5.0 which does not support
# passing in None for group argument
if group is None:
group = get_default_group()
torch_dist.barrier(group)
__author__ = "licheng"
"""
This interface provides access to four datasets:
1) refclef
2) refcoco
3) refcoco+
4) refcocog
split by unc and google
The following API functions are defined:
REFER - REFER api class
getRefIds - get ref ids that satisfy given filter conditions.
getAnnIds - get ann ids that satisfy given filter conditions.
getImgIds - get image ids that satisfy given filter conditions.
getCatIds - get category ids that satisfy given filter conditions.
loadRefs - load refs with the specified ref ids.
loadAnns - load anns with the specified ann ids.
loadImgs - load images with the specified image ids.
loadCats - load category names with the specified category ids.
getRefBox - get ref's bounding box [x, y, w, h] given the ref_id
showRef - show image, segmentation or box of the referred object with the ref
getMask - get mask and area of the referred object given ref
showMask - show mask of the referred object given ref
"""
import itertools
import json
import os.path as osp
import pickle
import sys
import time
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import skimage.io as io
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from pycocotools import mask
class REFER:
def __init__(self, data_root, dataset="refcoco", splitBy="unc"):
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
# also provide dataset name and splitBy information
# e.g., dataset = 'refcoco', splitBy = 'unc'
print("loading dataset %s into memory..." % dataset)
self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
self.DATA_DIR = osp.join(data_root, dataset)
if dataset in ["refcoco", "refcoco+", "refcocog"]:
self.IMAGE_DIR = osp.join(data_root, "images/mscoco/images/train2014")
elif dataset == "refclef":
self.IMAGE_DIR = osp.join(data_root, "images/saiapr_tc-12")
else:
print("No refer dataset is called [%s]" % dataset)
sys.exit()
self.dataset = dataset
# load refs from data/dataset/refs(dataset).json
tic = time.time()
ref_file = osp.join(self.DATA_DIR, "refs(" + splitBy + ").p")
print("ref_file: ", ref_file)
self.data = {}
self.data["dataset"] = dataset
self.data["refs"] = pickle.load(open(ref_file, "rb"))
# load annotations from data/dataset/instances.json
instances_file = osp.join(self.DATA_DIR, "instances.json")
instances = json.load(open(instances_file, "rb"))
self.data["images"] = instances["images"]
self.data["annotations"] = instances["annotations"]
self.data["categories"] = instances["categories"]
# create index
self.createIndex()
print("DONE (t=%.2fs)" % (time.time() - tic))
def createIndex(self):
# create sets of mapping
# 1) Refs: {ref_id: ref}
# 2) Anns: {ann_id: ann}
# 3) Imgs: {image_id: image}
# 4) Cats: {category_id: category_name}
# 5) Sents: {sent_id: sent}
# 6) imgToRefs: {image_id: refs}
# 7) imgToAnns: {image_id: anns}
# 8) refToAnn: {ref_id: ann}
# 9) annToRef: {ann_id: ref}
# 10) catToRefs: {category_id: refs}
# 11) sentToRef: {sent_id: ref}
# 12) sentToTokens: {sent_id: tokens}
print("creating index...")
# fetch info from instances
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
for ann in self.data["annotations"]:
Anns[ann["id"]] = ann
imgToAnns[ann["image_id"]] = imgToAnns.get(ann["image_id"], []) + [ann]
for img in self.data["images"]:
Imgs[img["id"]] = img
for cat in self.data["categories"]:
Cats[cat["id"]] = cat["name"]
# fetch info from refs
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
Sents, sentToRef, sentToTokens = {}, {}, {}
for ref in self.data["refs"]:
# ids
ref_id = ref["ref_id"]
ann_id = ref["ann_id"]
category_id = ref["category_id"]
image_id = ref["image_id"]
# add mapping related to ref
Refs[ref_id] = ref
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
refToAnn[ref_id] = Anns[ann_id]
annToRef[ann_id] = ref
# add mapping of sent
for sent in ref["sentences"]:
Sents[sent["sent_id"]] = sent
sentToRef[sent["sent_id"]] = ref
sentToTokens[sent["sent_id"]] = sent["tokens"]
# create class members
self.Refs = Refs
self.Anns = Anns
self.Imgs = Imgs
self.Cats = Cats
self.Sents = Sents
self.imgToRefs = imgToRefs
self.imgToAnns = imgToAnns
self.refToAnn = refToAnn
self.annToRef = annToRef
self.catToRefs = catToRefs
self.sentToRef = sentToRef
self.sentToTokens = sentToTokens
print("index created.")
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=""):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
refs = self.data["refs"]
else:
if not len(image_ids) == 0:
refs = [ref for image_id in image_ids for ref in self.imgToRefs.get(image_id, [])]
else:
refs = self.data["refs"]
if not len(cat_ids) == 0:
refs = [ref for ref in refs if ref["category_id"] in cat_ids]
if not len(ref_ids) == 0:
refs = [ref for ref in refs if ref["ref_id"] in ref_ids]
if not len(split) == 0:
if split in ["testA", "testB", "testC"]:
refs = [
ref for ref in refs if split[-1] in ref["split"]
] # we also consider testAB, testBC, ...
elif split in ["testAB", "testBC", "testAC"]:
refs = [
ref for ref in refs if ref["split"] == split
] # rarely used I guess...
elif split == "test":
refs = [ref for ref in refs if "test" in ref["split"]]
elif split == "train" or split == "val":
refs = [ref for ref in refs if ref["split"] == split]
else:
print("No such split [%s]" % split)
sys.exit()
ref_ids = [ref["ref_id"] for ref in refs]
return ref_ids
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
image_ids = image_ids if type(image_ids) == list else [image_ids]
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
ann_ids = [ann["id"] for ann in self.data["annotations"]]
else:
if not len(image_ids) == 0:
lists = [
self.imgToAnns[image_id]
for image_id in image_ids
if image_id in self.imgToAnns
] # list of [anns]
anns = list(itertools.chain.from_iterable(lists))
else:
anns = self.data["annotations"]
if not len(cat_ids) == 0:
anns = [ann for ann in anns if ann["category_id"] in cat_ids]
ann_ids = [ann["id"] for ann in anns]
if not len(ref_ids) == 0:
# restrict to annotations that are actually referred to by the given refs
ids = set(ann_ids).intersection(
set([self.Refs[ref_id]["ann_id"] for ref_id in ref_ids])
)
ann_ids = list(ids)
return ann_ids
def getImgIds(self, ref_ids=[]):
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
if not len(ref_ids) == 0:
image_ids = list(set([self.Refs[ref_id]["image_id"] for ref_id in ref_ids]))
else:
image_ids = self.Imgs.keys()
return image_ids
def getCatIds(self):
return self.Cats.keys()
def loadRefs(self, ref_ids=[]):
if type(ref_ids) == list:
return [self.Refs[ref_id] for ref_id in ref_ids]
elif type(ref_ids) == int:
return [self.Refs[ref_ids]]
def loadAnns(self, ann_ids=[]):
if type(ann_ids) == list:
return [self.Anns[ann_id] for ann_id in ann_ids]
elif type(ann_ids) == int or type(ann_ids) == str:
return [self.Anns[ann_ids]]
def loadImgs(self, image_ids=[]):
if type(image_ids) == list:
return [self.Imgs[image_id] for image_id in image_ids]
elif type(image_ids) == int:
return [self.Imgs[image_ids]]
def loadCats(self, cat_ids=[]):
if type(cat_ids) == list:
return [self.Cats[cat_id] for cat_id in cat_ids]
elif type(cat_ids) == int:
return [self.Cats[cat_ids]]
def getRefBox(self, ref_id):
ref = self.Refs[ref_id]
ann = self.refToAnn[ref_id]
return ann["bbox"] # [x, y, w, h]
def showRef(self, ref, seg_box="seg"):
ax = plt.gca()
# show image
image = self.Imgs[ref["image_id"]]
I = io.imread(osp.join(self.IMAGE_DIR, image["file_name"]))
ax.imshow(I)
# show refer expression
for sid, sent in enumerate(ref["sentences"]):
print("%s. %s" % (sid + 1, sent["sent"]))
# show segmentations
if seg_box == "seg":
ann_id = ref["ann_id"]
ann = self.Anns[ann_id]
polygons = []
color = []
c = "none"
if type(ann["segmentation"][0]) == list:
# polygon used for refcoco*
for seg in ann["segmentation"]:
poly = np.array(seg).reshape((len(seg) // 2, 2))
polygons.append(Polygon(poly, True, alpha=0.4))
color.append(c)
p = PatchCollection(
polygons,
facecolors=color,
edgecolors=(1, 1, 0, 0),
linewidths=3,
alpha=1,
)
ax.add_collection(p) # thick yellow polygon
p = PatchCollection(
polygons,
facecolors=color,
edgecolors=(1, 0, 0, 0),
linewidths=1,
alpha=1,
)
ax.add_collection(p) # thin red polygon
else:
# mask used for refclef
rle = ann["segmentation"]
m = mask.decode(rle)
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.array([2.0, 166.0, 101.0]) / 255
for i in range(3):
img[:, :, i] = color_mask[i]
ax.imshow(np.dstack((img, m * 0.5)))
# show bounding-box
elif seg_box == "box":
ann_id = ref["ann_id"]
ann = self.Anns[ann_id]
bbox = self.getRefBox(ref["ref_id"])
box_plot = Rectangle(
(bbox[0], bbox[1]),
bbox[2],
bbox[3],
fill=False,
edgecolor="green",
linewidth=3,
)
ax.add_patch(box_plot)
def getMask(self, ref):
# return mask, area and mask-center
ann = self.refToAnn[ref["ref_id"]]
image = self.Imgs[ref["image_id"]]
if type(ann["segmentation"][0]) == list: # polygon
rle = mask.frPyObjects(ann["segmentation"], image["height"], image["width"])
else:
rle = ann["segmentation"]
m = mask.decode(rle)
m = np.sum(
m, axis=2
) # sometimes there are multiple binary maps (one per polygon); merge them
m = m.astype(np.uint8) # convert to np.uint8
# compute area
area = sum(mask.area(rle)) # should be close to ann['area']
return {"mask": m, "area": area}
# # position
# position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style)
# position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style)
# # mass position (if there were multiple regions, we use the largest one.)
# label_m = label(m, connectivity=m.ndim)
# regions = regionprops(label_m)
# if len(regions) > 0:
# largest_id = np.argmax(np.array([props.filled_area for props in regions]))
# largest_props = regions[largest_id]
# mass_y, mass_x = largest_props.centroid
# else:
# mass_x, mass_y = position_x, position_y
# # if centroid is not in mask, we find the closest point to it from mask
# if m[mass_y, mass_x] != 1:
# print('Finding closes mask point ...')
# kernel = np.ones((10, 10),np.uint8)
# me = cv2.erode(m, kernel, iterations = 1)
# points = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style
# points = np.array(points)
# dist = np.sum((points - (mass_y, mass_x))**2, axis=1)
# id = np.argsort(dist)[0]
# mass_y, mass_x = points[id]
# # return
# return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y}
# # show image and mask
# I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
# plt.figure()
# plt.imshow(I)
# ax = plt.gca()
# img = np.ones( (m.shape[0], m.shape[1], 3) )
# color_mask = np.array([2.0,166.0,101.0])/255
# for i in range(3):
# img[:,:,i] = color_mask[i]
# ax.imshow(np.dstack( (img, m*0.5) ))
# plt.show()
def showMask(self, ref):
M = self.getMask(ref)
msk = M["mask"]
ax = plt.gca()
ax.imshow(msk)
if __name__ == "__main__":
refer = REFER(data_root="data", dataset="refcocog", splitBy="google")  # data_root is a placeholder path
ref_ids = refer.getRefIds()
print(len(ref_ids))
print(len(refer.Imgs))
print(len(refer.imgToRefs))
ref_ids = refer.getRefIds(split="train")
print("There are %s training referred objects." % len(ref_ids))
for ref_id in ref_ids:
ref = refer.loadRefs(ref_id)[0]
if len(ref["sentences"]) < 2:
continue
pprint(ref)
print("The label is %s." % refer.Cats[ref["category_id"]])
plt.figure()
refer.showRef(ref, seg_box="box")
plt.show()
# plt.figure()
# refer.showMask(ref)
# plt.show()
from enum import Enum
import numpy as np
import torch
import torch.distributed as dist
class Summary(Enum):
NONE = 0
AVERAGE = 1
SUM = 2
COUNT = 3
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=":f", summary_type=Summary.AVERAGE):
self.name = name
self.fmt = fmt
self.summary_type = summary_type
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def all_reduce(self):
device = "cuda" if torch.cuda.is_available() else "cpu"
if isinstance(self.sum, np.ndarray):
total = torch.tensor(
self.sum.tolist()
+ [
self.count,
],
dtype=torch.float32,
device=device,
)
else:
total = torch.tensor(
[self.sum, self.count], dtype=torch.float32, device=device
)
dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
if total.shape[0] > 2:
self.sum, self.count = total[:-1].cpu().numpy(), total[-1].cpu().item()
else:
self.sum, self.count = total.tolist()
self.avg = self.sum / (self.count + 1e-5)
def __str__(self):
fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
return fmtstr.format(**self.__dict__)
def summary(self):
fmtstr = ""
if self.summary_type is Summary.NONE:
fmtstr = ""
elif self.summary_type is Summary.AVERAGE:
fmtstr = "{name} {avg:.3f}"
elif self.summary_type is Summary.SUM:
fmtstr = "{name} {sum:.3f}"
elif self.summary_type is Summary.COUNT:
fmtstr = "{name} {count:.3f}"
else:
raise ValueError("invalid summary type %r" % self.summary_type)
return fmtstr.format(**self.__dict__)
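# Hedged usage sketch for AverageMeter (values are illustrative):
#   iou_meter = AverageMeter("gIoU", ":6.3f", Summary.AVERAGE)
#   iou_meter.update(0.5)
#   iou_meter.update(1.0)
#   print(iou_meter.summary())   # -> "gIoU 0.750"
#   # iou_meter.all_reduce()     # only once torch.distributed has been initialized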
def intersectionAndUnionGPU(output, target, K, ignore_index=255):
# 'K' classes, output and target sizes are N or N * L or N * H * W, each value in range 0 to K - 1.
assert output.dim() in [1, 2, 3]
assert output.shape == target.shape
output = output.view(-1)
target = target.view(-1)
output[target == ignore_index] = ignore_index
intersection = output[output == target]
area_intersection = torch.histc(intersection, bins=K, min=0, max=K - 1)
area_output = torch.histc(output, bins=K, min=0, max=K - 1)
area_target = torch.histc(target, bins=K, min=0, max=K - 1)
area_union = area_output + area_target - area_intersection
return area_intersection, area_union, area_target
class ProgressMeter(object):
def __init__(self, num_batches, meters, prefix=""):
self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
self.meters = meters
self.prefix = prefix
def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print("\t".join(entries))
def display_summary(self):
entries = [" *"]
entries += [meter.summary() for meter in self.meters]
print(" ".join(entries))
def _get_batch_fmtstr(self, num_batches):
num_digits = len(str(num_batches))
fmt = "{:" + str(num_digits) + "d}"
return "[" + fmt + "/" + fmt.format(num_batches) + "]"
def dict_to_cuda(input_dict):
for k, v in input_dict.items():
if isinstance(input_dict[k], torch.Tensor):
input_dict[k] = v.cuda(non_blocking=True)
elif isinstance(v, list) and len(v) > 0:
input_dict[k] = [ele.cuda(non_blocking=True) if isinstance(ele, torch.Tensor) else ele for ele in v]
return input_dict
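# The block below is a hedged, self-contained smoke test of the helpers in this
# file; the tensors and the "gIoU" meter name are illustrative and not used
# anywhere else in the repo.
if __name__ == "__main__":
    pred = torch.tensor([0.0, 1.0, 1.0, 0.0])
    gt = torch.tensor([0.0, 1.0, 0.0, 0.0])
    inter, union, _ = intersectionAndUnionGPU(pred, gt, K=2)
    print((inter / (union + 1e-10)).tolist())  # per-class IoU, roughly [0.667, 0.5]
    meter = AverageMeter("gIoU", ":6.3f", Summary.AVERAGE)
    meter.update(float((inter.sum() / (union.sum() + 1e-10)).item()))
    print(meter.summary())  # -> "gIoU 0.600"
    batch = {"labels": torch.zeros(2), "texts": ["a dog", "a cat"]}
    if torch.cuda.is_available():
        batch = dict_to_cuda(batch)  # tensors move to GPU, the list of strings is untouched
    print({k: type(v) for k, v in batch.items()})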
import gradio as gr
import sys
from projects.llava_sam2.gradio.app_utils import\
process_markdown, show_mask_pred, description, preprocess_video,\
show_mask_pred_video, image2video_and_save
import torch
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
BitsAndBytesConfig, CLIPImageProcessor,
CLIPVisionModel, GenerationConfig)
import argparse
import os
TORCH_DTYPE_MAP = dict(
fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto')
def parse_args(args):
parser = argparse.ArgumentParser(description="Sa2VA Demo")
parser.add_argument('hf_path', help='Sa2VA hf path.')
return parser.parse_args(args)
def inference(image, video, follow_up, input_str):
input_image = image
if image is not None and (video is not None and os.path.exists(video)):
return image, video, "Error: Please input only an image or a video, not both!"
if image is None and (video is None or not os.path.exists(video)) and not follow_up:
return image, video, "Error: Please input an image or a video!"
if not follow_up:
# reset
print('Log: History responses have been removed!')
global_infos.n_turn = 0
global_infos.inputs = ''
text = input_str
image = input_image
global_infos.image_for_show = image
global_infos.image = image
video = video
global_infos.video = video
if image is not None:
global_infos.input_type = "image"
else:
global_infos.input_type = "video"
else:
text = input_str
image = global_infos.image
video = global_infos.video
input_type = global_infos.input_type
if input_type == "video":
video = preprocess_video(video, global_infos.inputs+input_str)
past_text = global_infos.inputs
if past_text == "" and "<image>" not in text:
text = "<image>" + text
if input_type == "image":
input_dict = {
'image': image,
'text': text,
'past_text': past_text,
'mask_prompts': None,
'tokenizer': tokenizer,
}
else:
input_dict = {
'video': video,
'text': text,
'past_text': past_text,
'mask_prompts': None,
'tokenizer': tokenizer,
}
return_dict = sa2va_model.predict_forward(**input_dict)
global_infos.inputs = return_dict["past_text"]
print(return_dict['past_text'])
if return_dict.get('prediction_masks'):
if input_type == "image":
image_mask_show, selected_colors = show_mask_pred(global_infos.image_for_show, return_dict['prediction_masks'],)
video_mask_show = global_infos.video
else:
image_mask_show = None
video_mask_show, selected_colors = show_mask_pred_video(video, return_dict['prediction_masks'],)
video_mask_show = image2video_and_save(video_mask_show, save_path="./ret_video.mp4")
else:
image_mask_show = global_infos.image_for_show
video_mask_show = global_infos.video
selected_colors = []
predict = return_dict['prediction'].strip()
global_infos.n_turn += 1
predict = process_markdown(predict, selected_colors)
return image_mask_show, video_mask_show, predict
def init_models(args):
model_path = args.hf_path
model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(
model_path,
trust_remote_code=True,
)
return model, tokenizer
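# global_infos below is a simple module-level state holder: it lets follow-up
# turns in the demo reuse the image/video and the accumulated conversation text
# (inputs) from earlier turns of the same session.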
class global_infos:
inputs = ''
n_turn = 0
image_width = 0
image_height = 0
image_for_show = None
image = None
video = None
input_type = "image" # "image" or "video"
if __name__ == "__main__":
# get parse args and set models
args = parse_args(sys.argv[1:])
sa2va_model, tokenizer = init_models(args)
demo = gr.Interface(
inference,
inputs=[
gr.Image(type="pil", label="Upload Image", height=360),
gr.Video(sources=["upload", "webcam"], label="Upload mp4 video", height=360),
gr.Checkbox(label="Follow up Question"),
gr.Textbox(lines=1, placeholder=None, label="Text Instruction"),],
outputs=[
gr.Image(type="pil", label="Output Image"),
gr.Video(label="Output Video", show_download_button=True, format='mp4'),
gr.Markdown()],
theme=gr.themes.Soft(), allow_flagging="auto", description=description,
title='Sa2VA'
)
demo.queue()
demo.launch(share=True)
import numpy as np
from PIL import Image
import cv2
markdown_default = """
<link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@400;700&display=swap" rel="stylesheet">
<style>
.highlighted-text {
font-family: 'Montserrat', sans-serif;
font-weight: 600;
font-size: 14px;
color: rgb(255, 255, 239);
background-color: rgb(225, 231, 254);
border-radius: 7px;
padding: 5px 7px;
display: inline-block;
}
.regular-text {
font-family: 'Montserrat', sans-serif;
font-weight: 400;
font-size: 14px;
}
.highlighted-response {
font-family: 'Montserrat', sans-serif;
font-weight: 600;
font-size: 14px;
border-radius: 6px;
padding: 3px 4px;
display: inline-block;
}
</style>
<span class="highlighted-text" style='color:rgb(107, 100, 239)'>Sa2VA</span>
"""
description = """
**Usage** : <br>
&ensp;(1) For **Grounded Caption Generation** with interleaved segmentation, use a prompt like: *"Could you provide me with a detailed analysis of this photo? Please output with interleaved segmentation masks for the corresponding parts of the answer."* <br>
&ensp;(2) For **Segmentation Output**, use a prompt like: *"Can you please segment xxx in the given image?"* <br>
&ensp;(3) For **Image Captioning** / VQA, use a prompt like: *"Could you please give me a detailed description of the image?"* <br>
&ensp;(4) For **Image Conversation**, input an arbitrary text instruction. <br>
"""
ONE_THIRD = 1.0/3.0
ONE_SIXTH = 1.0/6.0
TWO_THIRD = 2.0/3.0
def desaturate(rgb, factor=0.65):
"""
Desaturate an RGB color by a given factor.
:param rgb: A tuple of (r, g, b) where each value is in [0, 255].
:param factor: The factor by which to reduce the saturation.
0 means completely desaturated, 1 means original color.
:return: A tuple of desaturated (r, g, b) values in [0, 255].
"""
r, g, b = [x / 255.0 for x in rgb]
h, l, s = rgb_to_hls(r, g, b)
l = factor
new_r, new_g, new_b = hls_to_rgb(h, l, s)
return (int(new_r * 255), int(new_g * 255), int(new_b * 255))
def rgb_to_hls(r, g, b):
maxc = max(r, g, b)
minc = min(r, g, b)
sumc = (maxc+minc)
rangec = (maxc-minc)
l = sumc/2.0
if minc == maxc:
return 0.0, l, 0.0
if l <= 0.5:
s = rangec / sumc
else:
s = rangec / (2.0-sumc)
rc = (maxc-r) / rangec
gc = (maxc-g) / rangec
bc = (maxc-b) / rangec
if r == maxc:
h = bc-gc
elif g == maxc:
h = 2.0+rc-bc
else:
h = 4.0+gc-rc
h = (h/6.0) % 1.0
return h, l, s
def hls_to_rgb(h, l, s):
if s == 0.0:
return l, l, l
if l <= 0.5:
m2 = l * (1.0+s)
else:
m2 = l+s-(l*s)
m1 = 2.0*l - m2
return (_v(m1, m2, h+ONE_THIRD), _v(m1, m2, h), _v(m1, m2, h-ONE_THIRD))
def _v(m1, m2, hue):
hue = hue % 1.0
if hue < ONE_SIXTH:
return m1 + (m2-m1)*hue*6.0
if hue < 0.5:
return m2
if hue < TWO_THIRD:
return m1 + (m2-m1)*(TWO_THIRD-hue)*6.0
return m1
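# Note: ONE_THIRD/ONE_SIXTH/TWO_THIRD, rgb_to_hls, hls_to_rgb and _v mirror the
# standard-library colorsys implementations, so they can be sanity-checked
# against it, e.g.:
#   import colorsys
#   assert rgb_to_hls(0.2, 0.4, 0.6) == colorsys.rgb_to_hls(0.2, 0.4, 0.6)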
def process_markdown(output_str, colors):
output_str = output_str.replace("\n", "").replace(" ", " ").replace("<s>", "")\
.replace("<|im_end|>", '').replace("<|end|>", "")
output_str = output_str.split("ASSISTANT: ")[-1]
# markdown_out = output_str.replace('[SEG]', '')
markdown_out = output_str
markdown_out = markdown_out.replace(
"<p>", "<span class='highlighted-response' style='background-color:rgb[COLOR]'>"
)
markdown_out = markdown_out.replace("</p>", "</span>")
for color in colors:
markdown_out = markdown_out.replace("[COLOR]", str(desaturate(tuple(color))), 1)
markdown_out = f"""
{markdown_out}
"""
markdown_out = markdown_default + "<p><span class='regular-text'>" + markdown_out
return markdown_out
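# Hedged example: with colors=[(255, 0, 0)], the string
#   "<p>the cat</p> [SEG] is on the sofa"
# is rewritten so that "the cat" becomes a highlighted span whose background is
# the desaturated red returned by desaturate((255, 0, 0)).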
def show_mask_pred(image, masks):
masks = [mask[:1] for mask in masks]
masks = np.concatenate(masks, axis=0) # (n, h, w)
selected_colors = []
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
(255, 255, 0), (255, 0, 255), (0, 255, 255),
(128, 128, 255), [255, 192, 203], # Pink
[165, 42, 42], # Brown
[255, 165, 0], # Orange
[128, 0, 128], # Purple
[0, 0, 128], # Navy
[128, 0, 0], # Maroon
[128, 128, 0], # Olive
[70, 130, 180], # Steel Blue
[173, 216, 230], # Light Blue
[255, 192, 0], # Gold
[255, 165, 165], # Light Salmon
[255, 20, 147], # Deep Pink
]
_mask_image = np.zeros((masks.shape[1], masks.shape[2], 3), dtype=np.uint8)
for i, mask in enumerate(masks):
color = colors[i % len(colors)]
selected_colors.append(color)
_mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
_mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
_mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]
image = np.array(image)
image = image * 0.5 + _mask_image * 0.5
image = image.astype(np.uint8)
return image, selected_colors
def show_mask_pred_video(video, masks):
ret_video = []
selected_colors = []
colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255),
(255, 255, 0), (255, 0, 255), (0, 255, 255),
(128, 128, 255), [255, 192, 203], # Pink
[165, 42, 42], # Brown
[255, 165, 0], # Orange
[128, 0, 128], # Purple
[0, 0, 128], # Navy
[128, 0, 0], # Maroon
[128, 128, 0], # Olive
[70, 130, 180], # Steel Blue
[173, 216, 230], # Light Blue
[255, 192, 0], # Gold
[255, 165, 165], # Light Salmon
[255, 20, 147], # Deep Pink
]
for i_frame in range(len(video)):
frame_masks = [mask[i_frame:i_frame+1] for mask in masks]
frame_masks = np.concatenate(frame_masks, axis=0)
_mask_image = np.zeros((frame_masks.shape[1], frame_masks.shape[2], 3), dtype=np.uint8)
for i, mask in enumerate(frame_masks):
if i_frame == 0:
color = colors[i % len(colors)]
selected_colors.append(color)
else:
color = selected_colors[i]
_mask_image[:, :, 0] = _mask_image[:, :, 0] + mask.astype(np.uint8) * color[0]
_mask_image[:, :, 1] = _mask_image[:, :, 1] + mask.astype(np.uint8) * color[1]
_mask_image[:, :, 2] = _mask_image[:, :, 2] + mask.astype(np.uint8) * color[2]
image = np.array(video[i_frame])
image = image * 0.5 + _mask_image * 0.5
image = image.astype(np.uint8)
ret_video.append(image)
return ret_video, selected_colors
def parse_visual_prompts(points):
ret = {'points': [], 'boxes': []}
for item in points:
if item[2] == 1.0:
ret['points'].append([item[0], item[1]])
elif item[2] == 2.0 or item[2] == 3.0:
ret['boxes'].append([item[0], item[1], item[3], item[4]])
else:
raise NotImplementedError
return ret
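# Hedged example of the prompt rows this helper expects (the third value acts
# as a type flag: 1.0 for a click, 2.0/3.0 for a drag):
#   parse_visual_prompts([[10, 20, 1.0, 0, 0],    # click -> point [10, 20]
#                         [5, 5, 2.0, 50, 60]])   # drag  -> box [5, 5, 50, 60]
#   # -> {'points': [[10, 20]], 'boxes': [[5, 5, 50, 60]]}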
def get_video_frames(video_path):
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print("Error: Cannot open video file.")
return
frames = []
frame_id = 0
while True:
ret, frame = cap.read()
if not ret:
break
frames.append(frame)
frame_id += 1
cap.release()
return frames
def get_frames_from_video(video_path, n_frames=5, sample_type="uniform"):
frames = get_video_frames(video_path)
if sample_type == "uniform":
stride = len(frames) / (n_frames + 1e-4)
ret = []
for i in range(n_frames):
idx = int(i * stride)
frame = frames[idx]
frame = frame[:, :, ::-1]
frame_image = Image.fromarray(frame).convert('RGB')
ret.append(frame_image)
else:
ret = []
for frame in frames[:500]:
frame = frame[:, :, ::-1]
frame_image = Image.fromarray(frame).convert('RGB')
ret.append(frame_image)
return ret
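# Hedged usage sketch (the mp4 path is a placeholder):
#   frames = get_frames_from_video("demo.mp4", n_frames=5)            # 5 evenly spaced PIL frames
#   frames = get_frames_from_video("demo.mp4", sample_type="begin")   # up to the first 500 frames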
def preprocess_video(video_path, text):
if "Segment" in text or "segment" in text:
sample_type = 'begin'
else:
sample_type = 'uniform'
return get_frames_from_video(video_path, sample_type=sample_type)
def image2video_and_save(frames, save_path):
success = frames_to_video(frames, save_path)
return save_path
def frames_to_video(
frames,
output_path: str,
fps: int = 24,
) -> bool:
try:
frames = [frame[:, :, ::-1] for frame in frames]
# Use provided frame size or get from first frame
height, width = frames[0].shape[:2]
# Initialize video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
# Process each frame
for frame in frames:
out.write(frame)
# Release video writer
out.release()
print(f"Video saved successfully to {output_path}")
return True
except Exception as e:
print(f"Error converting frames to video: {str(e)}")
return False
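# Hedged usage sketch (frames are assumed to be HxWx3 uint8 RGB arrays; the
# output path is a placeholder):
#   ok = frames_to_video([np.zeros((360, 640, 3), dtype=np.uint8)] * 24, "out.mp4", fps=24)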
import argparse
import copy
import os.path as osp
import torch
from mmengine.dist import (collect_results, get_dist_info, get_rank, init_dist,
master_only)
from xtuner.registry import BUILDER
from xtuner.configs import cfgs_name_path
from xtuner.model.utils import guess_load_checkpoint
from mmengine.config import Config
from mmengine.fileio import PetrelBackend, get_file_backend
from mmengine.config import ConfigDict
import os
def convert_dict2config_dict(input):
input = ConfigDict(**input)
for key in input.keys():
if isinstance(input[key], dict):
input[key] = convert_dict2config_dict(input[key])
return input
TORCH_DTYPE_MAP = dict(
fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto')
def parse_args():
parser = argparse.ArgumentParser(description='toHF script')
parser.add_argument('config', help='config file name or path.')
parser.add_argument('--pth-model', help='pth model file')
parser.add_argument(
'--save-path', type=str, default='./work_dirs/hf_model', help='save folder name')
args = parser.parse_args()
return args
@master_only
def master_print(msg):
print(msg)
def main():
args = parse_args()
# build model
if not osp.isfile(args.config):
try:
args.config = cfgs_name_path[args.config]
except KeyError:
raise FileNotFoundError(f'Cannot find {args.config}')
# load config
cfg = Config.fromfile(args.config)
model = BUILDER.build(cfg.model)
backend = get_file_backend(args.pth_model)
if isinstance(backend, PetrelBackend):
from xtuner.utils.fileio import patch_fileio
with patch_fileio():
state_dict = guess_load_checkpoint(args.pth_model)
else:
state_dict = guess_load_checkpoint(args.pth_model)
model.load_state_dict(state_dict, strict=False)
print(f'Load PTH model from {args.pth_model}')
model._merge_lora()
model.mllm.transfer_to_hf = True
all_state_dict = model.all_state_dict()
name_map = {'mllm.model.': '', '.gamma': '.g_weight'}
all_state_dict_new = {}
for key in all_state_dict.keys():
new_key = copy.deepcopy(key)
for _text in name_map.keys():
new_key = new_key.replace(_text, name_map[_text])
all_state_dict_new[new_key] = all_state_dict[key]
# build the hf format model
from projects.llava_sam2.hf.models.configuration_sa2va_chat import Sa2VAChatConfig
from projects.llava_sam2.hf.models.modeling_sa2va_chat import Sa2VAChatModel
internvl_config = Sa2VAChatConfig.from_pretrained(cfg.path)
config_dict = internvl_config.to_dict()
config_dict['auto_map'] = \
{'AutoConfig': 'configuration_sa2va_chat.Sa2VAChatConfig',
'AutoModel': 'modeling_sa2va_chat.Sa2VAChatModel',
'AutoModelForCausalLM': 'modeling_sa2va_chat.Sa2VAChatModel'}
config_dict["llm_config"]["vocab_size"] = len(model.tokenizer)
config_dict["template"] = cfg.template
sa2va_hf_config = Sa2VAChatConfig(
**config_dict
)
hf_sa2va_model = Sa2VAChatModel(
sa2va_hf_config, vision_model=model.mllm.model.vision_model,
language_model=model.mllm.model.language_model,
)
hf_sa2va_model.load_state_dict(all_state_dict_new)
hf_sa2va_model.save_pretrained(args.save_path)
model.tokenizer.save_pretrained(args.save_path)
print(f"Save the hf model into {args.save_path}")
# copy the files
os.system(f"cp -pr ./projects/llava_sam2/hf/models/* {args.save_path}")
if __name__ == '__main__':
main()