#!/usr/bin/env python
def iou(interval_1, interval_2):
"""
interval: list (2 float elements)
"""
eps = 1e-8 # to avoid zero division
(s_1, e_1) = interval_1
(s_2, e_2) = interval_2
intersection = max(0., min(e_1, e_2) - max(s_1, s_2))
union = min(max(e_1, e_2) - min(s_1, s_2), e_1 - s_1 + e_2 - s_2)
iou = intersection / (union + eps)
return iou
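# Illustrative example (added note, not part of the original evaluation code):
# iou([0.0, 5.0], [3.0, 8.0]) -> intersection 2.0, union 8.0, result ~0.25.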
def remove_nonascii(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
from .eval_dvc import eval_dvc
from .eval_soda import eval_soda
# --------------------------------------------------------
# evaluation scripts for dense video captioning; supports Python 3
# Modified from https://github.com/ranjaykrishna/densevid_eval/tree/9d4045aced3d827834a5d2da3c9f0692e3f33c1c
# --------------------------------------------------------
# Dense-Captioning Events in Videos Eval
# Copyright (c) 2017 Ranjay Krishna
# Licensed under The MIT License [see LICENSE for details]
# Written by Ranjay Krishna
# --------------------------------------------------------
import argparse
import json
import random
import string
import sys
import time
# sys.path.insert(0, './coco-caption') # Hack to allow the import of pycocoeval
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
Set = set  # compatibility alias; used by import_ground_truths below
import numpy as np
def random_string(string_length):
letters = string.ascii_lowercase
return ''.join(random.choice(letters) for i in range(string_length))
def remove_nonascii(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
class ANETcaptions(object):
PREDICTION_FIELDS = ['results', 'version', 'external_data']
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
tious=None, distances=[1, 3, 5, 10, 30, 60], max_proposals=1000,
prediction_fields=PREDICTION_FIELDS, verbose=False, no_lang_eval=False):
# Check that the gt and submission files exist and load them
if len(tious) == 0:
raise IOError('Please input a valid tIoU.')
if not ground_truth_filenames:
raise IOError('Please input a valid ground truth file.')
if not prediction_filename:
raise IOError('Please input a valid prediction file.')
self.verbose = verbose
self.no_lang_eval = no_lang_eval
self.tious = tious
self.distances = distances
self.max_proposals = max_proposals
self.pred_fields = prediction_fields
self.ground_truths = self.import_ground_truths(ground_truth_filenames)
self.prediction = self.import_prediction(prediction_filename)
self.ground_truths_keys = [vid for gt in self.ground_truths for vid in gt]
print('available video number', len(set(self.ground_truths_keys) & set(self.prediction.keys())))
# Set up scorers
if not self.no_lang_eval:
self.tokenizer = PTBTokenizer()
self.scorers = [
(Meteor(), "METEOR"),
(Cider(), "CIDEr"),
(Rouge(), "Rouge-L"),
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
]
def import_prediction(self, prediction_filename):
if self.verbose:
print("| Loading submission...")
if isinstance(prediction_filename, dict):
submission = prediction_filename
else:
submission = json.load(open(prediction_filename))
# if not all([field in submission.keys() for field in self.pred_fields]):
# raise IOError('Please input a valid ground truth file.')
# Ensure that every video is limited to the correct maximum number of proposals.
results = {}
for vid_id in submission['results']:
results[vid_id] = submission['results'][vid_id][:self.max_proposals]
return results
def import_ground_truths(self, filenames):
gts = []
self.n_ref_vids = Set()
for filename in filenames:
if isinstance(filename, dict):
gt = filename
else:
gt = json.load(open(filename))
self.n_ref_vids.update(gt.keys())
gts.append(gt)
if self.verbose:
print("| Loading GT. #files: %d, #videos: %d" % (len(filenames), len(self.n_ref_vids)))
return gts
def iou(self, interval_1, interval_2):
start_i, end_i = interval_1[0], interval_1[1]
start, end = interval_2[0], interval_2[1]
intersection = max(0, min(end, end_i) - max(start, start_i))
union = min(max(end, end_i) - min(start, start_i), end - start + end_i - start_i)
iou = float(intersection) / (union + 1e-8)
return iou
def check_gt_exists(self, vid_id):
for gt in self.ground_truths:
if vid_id in gt:
return True
return False
def get_gt_vid_ids(self):
vid_ids = set([])
for gt in self.ground_truths:
vid_ids |= set(gt.keys())
return list(vid_ids)
def evaluate(self):
aggregator = {}
self.scores = {}
if not self.no_lang_eval:
for tiou in self.tious:
scores = self.evaluate_tiou(tiou)
for metric, score in scores.items():
if metric not in self.scores:
self.scores[metric] = []
self.scores[metric].append(score)
if True:  # originally "if self.verbose:"; localization metrics are now always computed
self.scores['Recall'] = []
self.scores['Precision'] = []
self.scores['F1'] = []
for tiou in self.tious:
precision, recall = self.evaluate_detection(tiou)
self.scores['Recall'].append(recall)
self.scores['Precision'].append(precision)
self.scores['F1'].append(2 * recall * precision / (recall + precision) if recall + precision else 0.)
for tiou in self.distances:
precision, recall = self.evaluate_navigation(tiou)
self.scores['Recall'].append(recall)
self.scores['Precision'].append(precision)
self.scores['F1'].append(2 * recall * precision / (recall + precision) if recall + precision else 0.)
def evaluate_detection(self, tiou):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = []
precision = []
for vid_i, vid_id in enumerate(gt_vid_ids):
if vid_id not in self.prediction: # missing video
continue
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred_timestamp, ref_timestamp) > tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)
new_precision = float(len(pred_set_covered)) / max(len(self.prediction[vid_id]), 1)
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
best_recall = max(best_recall, new_recall)
recall.append(best_recall)
precision.append(best_precision)
return sum(precision) / len(precision), sum(recall) / len(recall)
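# Added note: for each video, precision is the fraction of predicted segments whose IoU with
# some reference segment exceeds the tIoU threshold, and recall is the fraction of reference
# segments matched by some prediction; the best value across reference files is kept and the
# returned scores are the means over all videos that have predictions.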
def evaluate_navigation(self, tiou):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = []
precision = []
for vid_i, vid_id in enumerate(gt_vid_ids):
if vid_id not in self.prediction: # missing video
continue
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if abs(pred_timestamp[0] - ref_timestamp[0]) < tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)
new_precision = float(len(pred_set_covered)) / max(len(self.prediction[vid_id]), 1)
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
best_recall = max(best_recall, new_recall)
recall.append(best_recall)
precision.append(best_precision)
return sum(precision) / len(precision), sum(recall) / len(recall)
def evaluate_tiou(self, tiou):
# This method computes the caption metrics (METEOR, BLEU, etc.) over tIoU-matched prediction/reference pairs and averages them across videos
res = {}
gts = {}
gt_vid_ids = self.get_gt_vid_ids()
unique_index = 0
# video id to unique caption ids mapping
vid2capid = {}
cur_res = {}
cur_gts = {}
for vid_id in gt_vid_ids:
# If the video does not have a prediction, then we give it no matches
# We set it to empty, and use this as a sanity check later on
if vid_id not in self.prediction: # missing video
continue
# If we do have a prediction, then we find the scores based on all the
# valid tIoU overlaps.
else:
vid2capid[vid_id] = []
# For each prediction, we look at the tIoU with ground truth.
for pred in self.prediction[vid_id]:
has_added = False
for gt in self.ground_truths:
if vid_id not in gt:
continue
gt_captions = gt[vid_id]
for caption_idx, caption_timestamp in enumerate(gt_captions['timestamps']):
if self.iou(pred['timestamp'], caption_timestamp) >= tiou:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [
{'caption': remove_nonascii(gt_captions['sentences'][caption_idx])}]
vid2capid[vid_id].append(unique_index)
unique_index += 1
has_added = True
# If the predicted caption does not overlap with any ground truth,
# we should compare it with garbage.
if not has_added:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [{'caption': random_string(random.randint(10, 20))}]
vid2capid[vid_id].append(unique_index)
unique_index += 1
# Each scorer will compute across all videos and take average score
output = {}
for scorer, method in self.scorers:
if self.verbose:
print('computing %s score...' % (scorer.method()))
# For each video, take all the valid pairs (based from tIoU) and compute the score
all_scores = {}
# call tokenizer here for all predictions and gts
tokenize_res = self.tokenizer.tokenize(cur_res)
tokenize_gts = self.tokenizer.tokenize(cur_gts)
# reshape back
for vid in vid2capid.keys():
res[vid] = {index: tokenize_res[index] for index in vid2capid[vid]}
gts[vid] = {index: tokenize_gts[index] for index in vid2capid[vid]}
for vid_id in gt_vid_ids:
if vid_id not in self.prediction: # missing video
continue
if len(res[vid_id]) == 0 or len(gts[vid_id]) == 0:
if type(method) == list:
score = [0] * len(method)
else:
score = 0
else:
score, scores = scorer.compute_score(gts[vid_id], res[vid_id])
all_scores[vid_id] = score
# import ipdb;ipdb.set_trace()
# print(all_scores.values())
if type(method) == list:
scores = np.mean(list(all_scores.values()), axis=0)
for m in range(len(method)):
output[method[m]] = scores[m]
if self.verbose:
print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method[m], output[method[m]]))
else:
output[method] = np.mean(list(all_scores.values()))
if self.verbose:
print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, output[method]))
return output
def eval_dvc(submission, references, tious=[0.3, 0.5, 0.7, 0.9], distances=[1, 3, 5, 10, 30, 60], max_proposals_per_video=1000, verbose=False, no_lang_eval=False):
# Call coco eval
evaluator = ANETcaptions(ground_truth_filenames=references,
prediction_filename=submission,
tious=tious,
distances=distances,
max_proposals=max_proposals_per_video,
verbose=verbose, no_lang_eval=no_lang_eval)
evaluator.evaluate()
score = evaluator.scores
# print(score)
loc_score = {}
for i, x in enumerate(tious):
for y in ["Recall", "Precision", "F1"]:
loc_score[y + "@" + str(x)] = score[y][i]
for y in ["Recall", "Precision", "F1"]:
loc_score[y] = np.array([score[y][i] for i in range(len(tious))]).mean()
if distances:
for i, x in enumerate(distances):
for y in ["Recall", "Precision", "F1"]:
loc_score[y + "@" + str(x) + "s"] = score[y][len(tious) + i]
avg_eval_score = {key: np.array(value).mean() for key, value in score.items() if key not in ["Recall", "Precision", "F1"]}
avg_eval_score.update(loc_score)
return avg_eval_score
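# Illustrative sketch of the expected inputs (added; field names are taken from the code above,
# the values are made up): the submission has the form
#   {"results": {"video_id": [{"timestamp": [start_sec, end_sec], "sentence": "a caption"}, ...]}}
# and each reference file maps video ids to dicts with parallel "timestamps" and "sentences" lists.
# Note that the __main__ block below assumes pred_path and references are defined before it is run.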
if __name__ == '__main__':
eval_dvc(pred_path, references,
tious=[0.3, 0.5, 0.7, 0.9],
max_proposals_per_video=1000,
verbose=False,
no_lang_eval=False)
eval_soda(pred_path, references, verbose=False)
import numpy as np
from .SODA.soda import SODA
from .SODA.dataset import ANETCaptions
def eval_tool(prediction, references=None, metric='Meteor', soda_type='c', verbose=False):
args = type('args', (object,), {})()
args.prediction = prediction
args.references = references
args.metric = metric
args.soda_type = soda_type
args.tious = [0.3, 0.5, 0.7, 0.9]
args.verbose = verbose
args.multi_reference = False
data = ANETCaptions.from_load_files(args.references,
args.prediction,
multi_reference=args.multi_reference,
verbose=args.verbose,
)
data.preprocess()
if args.soda_type == 'a':
tious = args.tious
else:
tious = None
evaluator = SODA(data,
soda_type=args.soda_type,
tious=tious,
scorer=args.metric,
verbose=args.verbose
)
result = evaluator.evaluate()
return result
def eval_soda(p, ref_list, verbose=False):
score_sum = []
for ref in ref_list:
r = eval_tool(prediction=p, references=[ref], verbose=verbose, soda_type='c')
score_sum.append(r['Meteor'])
soda_avg = np.mean(score_sum, axis=0) #[avg_pre, avg_rec, avg_f1]
soda_c_avg = soda_avg[-1]
results = {'soda_c': soda_c_avg}
return results
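# Added note: per the avg_pre / avg_rec / avg_f1 comment above, each eval_tool result stores
# (precision, recall, F1) under 'Meteor', so soda_avg[-1] selects the F1 term reported as soda_c.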
import os
root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")
import sys
sys.path.append(root_dir)
import clip
import re
import argparse
import torch
import json
import numpy as np
from tqdm import tqdm
from torchvision.transforms import Compose, Resize, CenterCrop, Normalize
from vtimellm.model.builder import load_pretrained_model
from vtimellm.utils import disable_torch_init
from vtimellm.mm_utils import VideoExtractor
from vtimellm.inference import inference
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
from PIL import Image
BICUBIC = Image.BICUBIC
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--clip_path", type=str, default="checkpoints/clip/ViT-L-14.pt")
parser.add_argument("--pretrain_mm_mlp_adapter", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage1/mm_projector.bin")
parser.add_argument("--stage2", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage2")
parser.add_argument("--stage3", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage3")
parser.add_argument("--model_base", type=str, default="/path/to/vicuna-7b-v1.5")
parser.add_argument("--data_path", type=str, default="vtimellm/eval/data_example.json")
parser.add_argument("--feat_folder", type=str, default=None)
parser.add_argument("--video_folder", type=str, default=None)
parser.add_argument("--task", type=str, default='all', choices=['all', 'grounding', 'captioning'])
parser.add_argument("--log_path", type=str, default='vtimellm/eval/log/example_log.txt')
args = parser.parse_args()
return args
def iou(outputs, gt):
matches = re.search(r"(\d{2}) (to|and) (\d{2})", outputs)
if not matches:
return 0
from_number = float(matches.group(1)) / 100
to_number = float(matches.group(3)) / 100
s, e = gt
intersection = max(0, min(to_number, e) - max(from_number, s))
union = max(to_number, e) - min(from_number, s)
iou = intersection / union
return round(iou, 2)
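# Illustrative example (added): the regex expects two two-digit numbers, e.g. for the answer
# "from 20 to 40" and gt = (0.25, 0.5) the predicted span becomes (0.20, 0.40), giving
# intersection 0.15, union 0.30 and an IoU of 0.5; answers without a "XX to/and XX" pattern score 0.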
def write_log(log_path, video_id, task, query_id, answer, info=None):
log = {
'video_id': video_id,
'task': task,
'query_id': query_id,
'answer': answer
}
if info is not None:
log['info'] = info
with open(log_path, 'a') as f:
f.write(json.dumps(log) + '\n')
questions = {
'grounding': ['During which frames can we see {}?'],
'captioning': ['Could you please describe the events in the video in detail? Be specific about the activities of individuals, their surroundings, and interactions with others. The output should be in JSON format, structured as follows: {"event": "xx", "timestamps": "from xx to xx"}.']
}
if __name__ == "__main__":
args = parse_args()
disable_torch_init()
tokenizer, model, context_len = load_pretrained_model(args, args.stage2, args.stage3)
model = model.cuda()
model.to(torch.float16)
if args.video_folder is not None:
clip_model, _ = clip.load(args.clip_path)
clip_model.eval()
clip_model = clip_model.cuda()
video_loader = VideoExtractor(N=100)
transform = Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
js = json.load(open(args.data_path))
for id, data in tqdm(js.items()):
features = None
if args.feat_folder is not None:
feat_path = os.path.join(args.feat_folder, f"{id}.npy")
if os.path.isfile(feat_path):
features = torch.from_numpy(np.load(feat_path)).cuda()
if features is None and args.video_folder is not None:
for ext in ['mp4', 'mkv', 'webm']:
video_path = os.path.join(args.video_folder, f"{id}.{ext}")
if os.path.isfile(video_path):
_, images = video_loader.extract({'id': None, 'video': video_path})
images = transform(images / 255.0)
images = images.to(torch.float16)
with torch.no_grad():
features = clip_model.encode_image(images.to('cuda'))
if features is None:
print(f'Cannot find video {id}')
continue
if args.task in ['captioning', 'all']:
for query_id, query in enumerate(questions['captioning']):
answer = inference(model, features, "<video>\n " + query, tokenizer)
write_log(args.log_path, id, 'captioning', query_id, answer)
if args.task in ['grounding', 'all']:
for sentence_id, (timestamps, sentence) in enumerate(zip(data['timestamps'], data['sentences'])):
sentence = sentence.strip().lower()
if sentence.endswith("."):
sentence = sentence[:-1]
for query_id, query in enumerate(questions['grounding']):
answer = inference(model, features, "<video>\n" + query.format(sentence), tokenizer)
gt = (timestamps[0] / data['duration'], timestamps[1] / data['duration'])
u = iou(answer, gt)
write_log(args.log_path, id, 'grounding', query_id, answer, info={"sentence_id": sentence_id, 'iou': u})
{"video_id": "v_bXdq2zI1Ms0", "task": "captioning", "query_id": 0, "answer": "Here's the requested JSON format with the events and their corresponding timestamps:\n[{\"event\": \"A man is seen talking to the camera while a group of men are seen standing in front of him.\", \"timestamps\": \"from 00 to 29\"}, {\"event\": \"The man then leads the group in a series of martial arts moves.\", \"timestamps\": \"from 29 to 71\"}, {\"event\": \"The man then leads the group in a series of martial arts moves.\", \"timestamps\": \"from 71 to 99\"}]"}
{"video_id": "v_bXdq2zI1Ms0", "task": "grounding", "query_id": 0, "answer": "The three men are standing on a mat from 17 to 34.", "info": {"sentence_id": 0, "iou": 0.2}}
{"video_id": "v_bXdq2zI1Ms0", "task": "grounding", "query_id": 0, "answer": "The man in front begins to do karate on the mat from 28 to 71.", "info": {"sentence_id": 1, "iou": 0.2}}
{"video_id": "v_bXdq2zI1Ms0", "task": "grounding", "query_id": 0, "answer": "The man gets down on the ground and flips around from 60 to 99.", "info": {"sentence_id": 2, "iou": 0.4}}
{"video_id": "v_CN01Gm2Yc4k", "task": "captioning", "query_id": 0, "answer": "Here's the requested JSON format with the events and their corresponding timestamps:\n[{\"event\": \"A woman is seen hanging from a bar in a gym.\", \"timestamps\": \"from 00 to 29\"}, {\"event\": \"She then swings her legs up and down while hanging from the bar.\", \"timestamps\": \"from 29 to 71\"}, {\"event\": \"She then swings her legs up and down while hanging from the bar.\", \"timestamps\": \"from 71 to 99\"}]"}
{"video_id": "v_CN01Gm2Yc4k", "task": "grounding", "query_id": 0, "answer": "The young lady is gripping the punching bag between her legs from 00 to 17.", "info": {"sentence_id": 0, "iou": 0.6}}
{"video_id": "v_CN01Gm2Yc4k", "task": "grounding", "query_id": 0, "answer": "The woman begins doing a set of crunches by pulling herself up from 21 to 71.", "info": {"sentence_id": 1, "iou": 0.82}}
{"video_id": "v_CN01Gm2Yc4k", "task": "grounding", "query_id": 0, "answer": "The woman sits up and makes punches out into the air from 22 to 43.", "info": {"sentence_id": 2, "iou": 0.0}}
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from dvc_eval import eval_dvc, eval_soda
import json
import argparse
import re
import difflib
def print_metrics(metrics):
for k, v in metrics.items():
print(f"{k}: {v:.2f}")
def merge_similar_sentences(data):
if not data: return data
merged_data = []
current_sentence = data[0]["sentence"]
current_timestamp = data[0]["timestamp"]
for i in range(1, len(data)):
next_sentence = data[i]["sentence"]
next_timestamp = data[i]["timestamp"]
if difflib.SequenceMatcher(None, current_sentence, next_sentence).ratio() > 0.98 and -1 <= next_timestamp[0] - current_timestamp[1] <= 1:
current_timestamp = [current_timestamp[0], next_timestamp[1]]
else:
merged_data.append({"sentence": current_sentence, "timestamp": current_timestamp})
current_sentence = next_sentence
current_timestamp = next_timestamp
merged_data.append({"sentence": current_sentence, "timestamp": current_timestamp})
return merged_data
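# Illustrative example (added): consecutive near-identical captions whose segments touch are merged,
#   [{"sentence": "a man runs", "timestamp": [0, 10]}, {"sentence": "a man runs", "timestamp": [10, 20]}]
# becomes [{"sentence": "a man runs", "timestamp": [0, 20]}].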
def captioning_metrics(all_logs, data_path):
logs = [x for x in all_logs if x['task'] == 'captioning']
pred = {}
for log in logs:
id = log['video_id']
answer = log['answer']
pred[id] = []
try:
items = json.loads(re.search(r'\[.*\]', answer).group(0))
for item in items:
pred[id].append({
'timestamp': [int(item['timestamps'][5:7]), int(item['timestamps'][-2:])],
'sentence': item['event'],
})
except Exception as e:
print("Error", e, answer)
gt_js = json.load(open(data_path))
gt_js = {k: v for k, v in gt_js.items() if k in pred.keys()}
for id, items in list(pred.items()):
items = merge_similar_sentences(items)
duration = gt_js[id]['duration']
for item in items:
item['timestamp'][0] = item['timestamp'][0] * duration / 100
item['timestamp'][1] = (item['timestamp'][1] + 1) * duration / 100
pred[id] = items
pred_result = {'results': pred}
metrics = eval_soda(pred_result, [gt_js])
metrics.update(eval_dvc(pred_result, [gt_js],
tious=[0.3, 0.5, 0.7, 0.9],
distances=[],
max_proposals_per_video=1000,
verbose=False,
no_lang_eval=False))
print(f"Found {len(pred)} logs")
metrics = {k: v * 100 for k, v in metrics.items() if k in ['soda_c', 'METEOR', 'CIDEr']}
return metrics
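# Added note: the slicing in captioning_metrics assumes timestamp strings of the form "from XX to YY"
# with two-digit values, e.g. "from 29 to 71" gives [5:7] == "29" and [-2:] == "71"; these
# percentages are then rescaled to seconds using the ground-truth duration.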
def grounding_metrics(all_logs):
ious = [x['info']['iou'] for x in all_logs if x['task'] == 'grounding']
l = len(ious)
print(f"Found {l} logs")
if l == 0: return
metrics = {
"mIoU": sum(ious) / l * 100
}
for m in [0.3, 0.5, 0.7]:
metrics[f"R1@{m}"] = sum(iou >= m for iou in ious) / l * 100
return metrics
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--log_path", type=str, default='vtimellm/eval/log/example_log.txt')
parser.add_argument("--task", type=str, default='all', choices=['all', 'grounding', 'captioning'])
parser.add_argument("--data_path", type=str, default='vtimellm/eval/data_example.json')
args = parser.parse_args()
logs = []
with open(args.log_path) as f:
for line in f:
try:
json_data = json.loads(line)
logs.append(json_data)
except Exception as e:
print(e, line)
if args.task in ['captioning', 'all']:
print("====================== Captioning =====================")
print_metrics(captioning_metrics(logs, args.data_path))
if args.task in ['grounding', 'all']:
print("====================== Grounding ======================")
print_metrics(grounding_metrics(logs))
import os
import sys
import argparse
import torch
from vtimellm.constants import IMAGE_TOKEN_INDEX
from vtimellm.conversation import conv_templates, SeparatorStyle
from vtimellm.model.builder import load_pretrained_model, load_lora
from vtimellm.utils import disable_torch_init
from vtimellm.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria, VideoExtractor
from PIL import Image
import requests
from io import BytesIO
from transformers import TextStreamer
from easydict import EasyDict as edict
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
from PIL import Image
BICUBIC = Image.BICUBIC
from torchvision.transforms import Compose, Resize, CenterCrop, Normalize
import numpy as np
import clip
def inference(model, image, query, tokenizer):
conv = conv_templates["v1"].copy()
conv.append_message(conv.roles[0], query)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image[None,].cuda(),
do_sample=True,
temperature=0.05,
num_beams=1,
# no_repeat_ngram_size=3,
max_new_tokens=1024,
use_cache=True)
# https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py#L1295
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
def parse_args():
parser = argparse.ArgumentParser(description="Demo")
parser.add_argument("--clip_path", type=str, default="checkpoints/clip/ViT-L-14.pt")
parser.add_argument("--model_base", type=str, default="/path/to/vicuna-7b-v1.5")
parser.add_argument("--pretrain_mm_mlp_adapter", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage1/mm_projector.bin")
parser.add_argument("--stage2", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage2")
parser.add_argument("--stage3", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage3")
parser.add_argument("--video_path", type=str, default="images/demo.mp4")
args = parser.parse_args()
return args
if __name__ == "__main__":
args = parse_args()
disable_torch_init()
tokenizer, model, context_len = load_pretrained_model(args, args.stage2, args.stage3)
model = model.cuda()
# model.get_model().mm_projector.to(torch.float16)
model.to(torch.float16)
clip_model, _ = clip.load(args.clip_path)
clip_model.eval()
clip_model = clip_model.cuda()
video_loader = VideoExtractor(N=100)
_, images = video_loader.extract({'id': None, 'video': args.video_path})
transform = Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
# print(images.shape) # <N, 3, H, W>
images = transform(images / 255.0)
images = images.to(torch.float16)
with torch.no_grad():
features = clip_model.encode_image(images.to('cuda'))
query = "describe the video."
print("query: ", query)
answer = inference(model, features, "<video>\n " + query, tokenizer)
print("answer: ", answer)
from PIL import Image
from io import BytesIO
import base64
import numpy as np
import torch
import decord
from transformers import StoppingCriteria
from vtimellm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
def load_image_from_base64(image):
return Image.open(BytesIO(base64.b64decode(image)))
def process_images(images, image_processor, model_cfg):
return image_processor(images, return_tensors='pt')['pixel_values']
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(DEFAULT_IMAGE_TOKEN)]
def insert_separator(X, sep):
return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
input_ids = []
offset = 0
if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
offset = 1
input_ids.append(prompt_chunks[0][0])
elif tokenizer.name == "GLMTokenizer":
offset = 2
input_ids = prompt_chunks[0][:2]
for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
input_ids.extend(x[offset:])
if return_tensors is not None:
if return_tensors == 'pt':
return torch.tensor(input_ids, dtype=torch.long)
raise ValueError(f'Unsupported tensor type: {return_tensors}')
return input_ids
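# Added sketch (assumption: DEFAULT_IMAGE_TOKEN is the "<video>" placeholder that inference.py
# prepends to queries): the prompt is split on the placeholder, each chunk is tokenized separately,
# and IMAGE_TOKEN_INDEX is inserted between the chunks as a sentinel marking where the projected
# video features are later spliced in.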
def get_model_name_from_path(model_path):
model_path = model_path.strip("/")
model_paths = model_path.split("/")
if model_paths[-1].startswith('checkpoint-'):
return model_paths[-2] + "_" + model_paths[-1]
else:
return model_paths[-1]
class KeywordsStoppingCriteria(StoppingCriteria):
def __init__(self, keywords, tokenizer, input_ids):
self.keywords = keywords
self.keyword_ids = []
for keyword in keywords:
cur_keyword_ids = tokenizer(keyword).input_ids
if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
cur_keyword_ids = cur_keyword_ids[1:]
self.keyword_ids.append(torch.tensor(cur_keyword_ids))
self.tokenizer = tokenizer
self.start_len = input_ids.shape[1]
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
assert output_ids.shape[0] == 1, "Only batch size 1 is supported for now"  # TODO
offset = min(output_ids.shape[1] - self.start_len, 3)
self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
for keyword_id in self.keyword_ids:
if output_ids[0, -keyword_id.shape[0]:].equal(keyword_id):
return True
outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
for keyword in self.keywords:
if keyword in outputs:
return True
return False
def print_trainable_parameters(model):
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
# print(_, param.requires_grad, param.numel())
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
)
class VideoExtractor():
"""Dataset for supervised fine-tuning."""
def __init__(self, N=100):
self.N = N
def extract(self, data):
video_path = data['video']
id = data['id']
try:
video_reader = decord.VideoReader(video_path)
total_frames = len(video_reader)
start = 0
end = total_frames - 1
split = data.get('split', None)
if split is not None:
fps = video_reader.get_avg_fps()
start = max(int(fps * split[0]), 0)
end = min(int(fps * split[1]), total_frames - 1)
sampled_indices = np.linspace(start, end, self.N, dtype=np.int32)
sampled_frames = video_reader.get_batch(sampled_indices).asnumpy()
except Exception as e:
print(e)
return None, torch.zeros(1)
images = torch.from_numpy(sampled_frames.transpose((0, 3, 1, 2)))
return id, images
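# Illustrative usage (added; the path is made up): the extractor samples self.N frames uniformly
# between the start and end of the video (or of the optional 'split' window) and returns them as a
# tensor of shape (N, 3, H, W); on a decoding error it returns (None, torch.zeros(1)).
#   vid_id, frames = VideoExtractor(N=100).extract({'id': 'demo', 'video': 'demo.mp4'})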
from .vtimellm_llama import VTimeLLMLlamaForCausalLM
from .vtimellm_chatglm import VTimeLLMChatGLMForCausalLM
import os
import shutil
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
import torch
from vtimellm.model import *
from peft import PeftModel
def load_lora(model, lora_path):
non_lora_trainables_path = os.path.join(lora_path, 'non_lora_trainables.bin')
if os.path.exists(non_lora_trainables_path):
non_lora_trainables = torch.load(non_lora_trainables_path, map_location='cpu')
non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
if any(k.startswith('model.model.') for k in non_lora_trainables):
non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
model.load_state_dict(non_lora_trainables, strict=False)
print('Loading LoRA weights...')
model = PeftModel.from_pretrained(model, lora_path)
return model
def load_pretrained_model(args, stage2=None, stage3=None):
kwargs = {'torch_dtype': torch.float16}
# model_path = os.path.expanduser(args.model_path)
model_base = args.model_base
# lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
print('Loading VTimeLLM from base model...')
if 'chatglm' in model_base:
tokenizer = AutoTokenizer.from_pretrained(model_base, trust_remote_code=True)
model = VTimeLLMChatGLMForCausalLM.from_pretrained(model_base)
else:
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
model = VTimeLLMLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
if model.lm_head.weight.shape[0] != token_num:
model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
# load stage1:
model.get_model().initialize_vision_modules(args)
if stage2 is not None:
print('Loading stage2 weights...')
model = load_lora(model, stage2)
print('Merging stage2 weights...')
model = model.merge_and_unload()
if stage3 is not None:
print('Loading stage3 weights...')
model = load_lora(model, stage3)
print('Merging stage3 weights...')
model = model.merge_and_unload()
if hasattr(model.config, "max_sequence_length"):
context_len = model.config.max_sequence_length
else:
context_len = 2048
return tokenizer, model, context_len
from .configuration_chatglm import ChatGLMConfig
from .modeling_chatglm import ChatGLMModel, ChatGLMForConditionalGeneration
from transformers import PretrainedConfig
class ChatGLMConfig(PretrainedConfig):
model_type = "chatglm"
def __init__(
self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
classifier_dropout=None,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs
):
self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.kv_channels = kv_channels
self.num_attention_heads = num_attention_heads
self.seq_length = seq_length
self.hidden_dropout = hidden_dropout
self.classifier_dropout = classifier_dropout
self.attention_dropout = attention_dropout
self.layernorm_epsilon = layernorm_epsilon
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.post_layer_norm = post_layer_norm
self.add_bias_linear = add_bias_linear
self.add_qkv_bias = add_qkv_bias
self.bias_dropout_fusion = bias_dropout_fusion
self.multi_query_attention = multi_query_attention
self.multi_query_group_num = multi_query_group_num
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.fp32_residual_connection = fp32_residual_connection
self.quantization_bit = quantization_bit
self.pre_seq_len = pre_seq_len
self.prefix_projection = prefix_projection
super().__init__(**kwargs)
from torch.nn import Linear
from torch.nn.parameter import Parameter
import bz2
import torch
import base64
import ctypes
from transformers.utils import logging
from typing import List
from functools import partial
logger = logging.get_logger(__name__)
try:
from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
class Kernel:
def __init__(self, code: bytes, function_names: List[str]):
self.code = code
self._function_names = function_names
self._cmodule = LazyKernelCModule(self.code)
for name in self._function_names:
setattr(self, name, KernelFunction(self._cmodule, name))
quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+
9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpi
x+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
kernels = Kernel(
bz2.decompress(base64.b64decode(quantization_code)),
[
"int4WeightCompression",
"int4WeightExtractionFloat",
"int4WeightExtractionHalf",
"int8WeightExtractionFloat",
"int8WeightExtractionHalf",
],
)
except Exception as exception:
kernels = None
logger.warning("Failed to load cpm_kernels:" + str(exception))
class W8A16Linear(torch.autograd.Function):
@staticmethod
def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
ctx.inp_shape = inp.size()
ctx.weight_bit_width = weight_bit_width
out_features = quant_w.size(0)
inp = inp.contiguous().view(-1, inp.size(-1))
weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
ctx.weight_shape = weight.size()
output = inp.mm(weight.t())
ctx.save_for_backward(inp, quant_w, scale_w)
return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
@staticmethod
def backward(ctx, grad_output: torch.Tensor):
inp, quant_w, scale_w = ctx.saved_tensors
weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
grad_output = grad_output.contiguous().view(-1, weight.size(0))
grad_input = grad_output.mm(weight)
grad_weight = grad_output.t().mm(inp)
return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
def compress_int4_weight(weight: torch.Tensor): # (n, m)
with torch.cuda.device(weight.device):
n, m = weight.size(0), weight.size(1)
assert m % 2 == 0
m = m // 2
out = torch.empty(n, m, dtype=torch.int8, device="cuda")
stream = torch.cuda.current_stream()
gridDim = (n, 1, 1)
blockDim = (min(round_up(m, 32), 1024), 1, 1)
kernels.int4WeightCompression(
gridDim,
blockDim,
0,
stream,
[ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
)
return out
def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
assert scale_list.dtype in [torch.half, torch.bfloat16]
assert weight.dtype in [torch.int8]
if source_bit_width == 8:
return weight.to(scale_list.dtype) * scale_list[:, None]
elif source_bit_width == 4:
func = (
kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16
)
else:
assert False, "Unsupported bit-width"
with torch.cuda.device(weight.device):
n, m = weight.size(0), weight.size(1)
out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda")
stream = torch.cuda.current_stream()
gridDim = (n, 1, 1)
blockDim = (min(round_up(m, 32), 1024), 1, 1)
func(
gridDim,
blockDim,
0,
stream,
[
ctypes.c_void_p(weight.data_ptr()),
ctypes.c_void_p(scale_list.data_ptr()),
ctypes.c_void_p(out.data_ptr()),
ctypes.c_int32(n),
ctypes.c_int32(m),
],
)
return out
class QuantizedLinear(torch.nn.Module):
def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
**kwargs):
super().__init__()
self.weight_bit_width = weight_bit_width
shape = weight.shape
if weight is None or empty_init:
self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
else:
self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
if weight_bit_width == 4:
self.weight = compress_int4_weight(self.weight)
self.weight = Parameter(self.weight.to(device), requires_grad=False)
self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
def forward(self, input):
output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
if self.bias is not None:
output = output + self.bias
return output
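# Illustrative example (added): for 8-bit quantization the per-row scale is max(|w|) / 127, so a row
# whose largest magnitude is 0.254 gets scale ~0.002 and each weight is stored as round(w / scale)
# in int8; W8A16Linear dequantizes the rows back to the scale dtype on the fly before the matmul.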
def quantize(model, weight_bit_width, empty_init=False, device=None):
"""Replace fp16 linear with quantized linear"""
for layer in model.layers:
layer.self_attention.query_key_value = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
bias=layer.self_attention.query_key_value.bias,
dtype=layer.self_attention.query_key_value.weight.dtype,
device=layer.self_attention.query_key_value.weight.device if device is None else device,
empty_init=empty_init
)
layer.self_attention.dense = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
bias=layer.self_attention.dense.bias,
dtype=layer.self_attention.dense.weight.dtype,
device=layer.self_attention.dense.weight.device if device is None else device,
empty_init=empty_init
)
layer.mlp.dense_h_to_4h = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
bias=layer.mlp.dense_h_to_4h.bias,
dtype=layer.mlp.dense_h_to_4h.weight.dtype,
device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
empty_init=empty_init
)
layer.mlp.dense_4h_to_h = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
bias=layer.mlp.dense_4h_to_h.bias,
dtype=layer.mlp.dense_4h_to_h.weight.dtype,
device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
empty_init=empty_init
)
return model
import json
import os
import re
from typing import List, Optional, Union, Dict
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
class SPTokenizer:
def __init__(self, model_path: str):
# reload tokenizer
assert os.path.isfile(model_path), model_path
self.sp_model = SentencePieceProcessor(model_file=model_path)
# BOS / EOS token IDs
self.n_words: int = self.sp_model.vocab_size()
self.bos_id: int = self.sp_model.bos_id()
self.eos_id: int = self.sp_model.eos_id()
self.pad_id: int = self.sp_model.unk_id()
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
self.special_tokens = {}
self.index_special_tokens = {}
for token in special_tokens:
self.special_tokens[token] = self.n_words
self.index_special_tokens[self.n_words] = token
self.n_words += 1
self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
def tokenize(self, s: str, encode_special_tokens=False):
if encode_special_tokens:
last_index = 0
t = []
for match in re.finditer(self.role_special_token_expression, s):
if last_index < match.start():
t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
t.append(s[match.start():match.end()])
last_index = match.end()
if last_index < len(s):
t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
return t
else:
return self.sp_model.EncodeAsPieces(s)
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
assert type(s) is str
t = self.sp_model.encode(s)
if bos:
t = [self.bos_id] + t
if eos:
t = t + [self.eos_id]
return t
def decode(self, t: List[int]) -> str:
text, buffer = "", []
for token in t:
if token in self.index_special_tokens:
if buffer:
text += self.sp_model.decode(buffer)
buffer = []
text += self.index_special_tokens[token]
else:
buffer.append(token)
if buffer:
text += self.sp_model.decode(buffer)
return text
def decode_tokens(self, tokens: List[str]) -> str:
text = self.sp_model.DecodePieces(tokens)
return text
def convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
if token in self.special_tokens:
return self.special_tokens[token]
return self.sp_model.PieceToId(token)
def convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.index_special_tokens:
return self.index_special_tokens[index]
if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
return ""
return self.sp_model.IdToPiece(index)
class ChatGLMTokenizer(PreTrainedTokenizer):
vocab_files_names = {"vocab_file": "tokenizer.model"}
model_input_names = ["input_ids", "attention_mask", "position_ids"]
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
**kwargs):
self.name = "GLMTokenizer"
self.vocab_file = vocab_file
self.tokenizer = SPTokenizer(vocab_file)
self.special_tokens = {
"<bos>": self.tokenizer.bos_id,
"<eos>": self.tokenizer.eos_id,
"<pad>": self.tokenizer.pad_id
}
self.encode_special_tokens = encode_special_tokens
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
encode_special_tokens=encode_special_tokens,
**kwargs)
def get_command(self, token):
if token in self.special_tokens:
return self.special_tokens[token]
assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
return self.tokenizer.special_tokens[token]
@property
def unk_token(self) -> str:
return "<unk>"
@property
def pad_token(self) -> str:
return "<unk>"
@property
def pad_token_id(self):
return self.get_command("<pad>")
@property
def eos_token(self) -> str:
return "</s>"
@property
def eos_token_id(self):
return self.get_command("<eos>")
@property
def vocab_size(self):
return self.tokenizer.n_words
def get_vocab(self):
""" Returns vocab as a dict """
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text, **kwargs):
return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.tokenizer.convert_token_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.tokenizer.convert_id_to_token(index)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return self.tokenizer.decode_tokens(tokens)
def save_vocabulary(self, save_directory, filename_prefix=None):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
filename_prefix (`str`, *optional*):
An optional prefix to add to the name of the saved files.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, self.vocab_files_names["vocab_file"]
)
else:
vocab_file = save_directory
with open(self.vocab_file, 'rb') as fin:
proto_str = fin.read()
with open(vocab_file, "wb") as writer:
writer.write(proto_str)
return (vocab_file,)
def get_prefix_tokens(self):
prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
return prefix_tokens
def build_single_message(self, role, metadata, message):
assert role in ["system", "user", "assistant", "observation"], role
role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
message_tokens = self.tokenizer.encode(message)
tokens = role_tokens + message_tokens
return tokens
def build_chat_input(self, query, history=None, role="user"):
if history is None:
history = []
input_ids = []
for item in history:
content = item["content"]
if item["role"] == "system" and "tools" in item:
content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
input_ids.extend(self.build_single_message(role, "", query))
input_ids.extend([self.get_command("<|assistant|>")])
return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a single sequence or a pair of sequences by prepending the ChatGLM prefix
tokens returned by `get_prefix_tokens`. The resulting format is:
- single sequence: `[gMASK] sop X`
- pair of sequences: `[gMASK] sop A B <eos>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
prefix_tokens = self.get_prefix_tokens()
token_ids_0 = prefix_tokens + token_ids_0
if token_ids_1 is not None:
token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
return token_ids_0
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args:
encoded_inputs:
Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
# Load from model defaults
assert self.padding_side == "left"
required_input = encoded_inputs[self.model_input_names[0]]
seq_length = len(required_input)
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
# Initialize attention mask if not present.
if "attention_mask" not in encoded_inputs:
encoded_inputs["attention_mask"] = [1] * seq_length
if "position_ids" not in encoded_inputs:
encoded_inputs["position_ids"] = list(range(seq_length))
if needs_to_be_padded:
difference = max_length - len(required_input)
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "position_ids" in encoded_inputs:
encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
return encoded_inputs
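# Illustrative sketch (not part of the original file): mirrors the left-padding convention
# implemented in `_pad` above, using plain Python lists instead of a tokenizer instance.
# The token ids and pad_token_id below are made-up placeholders; the real values come from
# the loaded SentencePiece vocabulary.
if __name__ == "__main__":
    encoded = {"input_ids": [101, 102, 103], "attention_mask": [1, 1, 1], "position_ids": [0, 1, 2]}
    max_length = 6
    pad_token_id = 0  # assumed placeholder
    difference = max_length - len(encoded["input_ids"])
    # Left padding: zeros are prepended to the attention mask and position ids,
    # and pad tokens are prepended to the input ids.
    encoded["attention_mask"] = [0] * difference + encoded["attention_mask"]
    encoded["position_ids"] = [0] * difference + encoded["position_ids"]
    encoded["input_ids"] = [pad_token_id] * difference + encoded["input_ids"]
    print(encoded)
    # {'input_ids': [0, 0, 0, 101, 102, 103], 'attention_mask': [0, 0, 0, 1, 1, 1], 'position_ids': [0, 0, 0, 0, 1, 2]}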
import torch
import torch.nn as nn
from vtimellm.constants import IMAGE_TOKEN_INDEX, IGNORE_INDEX
from abc import ABC, abstractmethod
class VTimeLLMMetaModel:
    def initialize_vision_modules(self, model_args):
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
        # A single linear layer projects the 768-dimensional frame features into the LM hidden space.
        if not hasattr(self, 'mm_projector'):
            self.mm_projector = nn.Linear(768, self.config.hidden_size)
        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')

            def get_w(weights, keyword):
                # Keep only the projector weights and strip the "mm_projector." key prefix,
                # e.g. "mm_projector.weight" -> "weight".
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
            print("load mlp:", pretrain_mm_mlp_adapter)
class VTimeLLMMetaForCausalLM(ABC):
@abstractmethod
def get_model(self):
pass
def prepare_inputs_labels_for_multimodal(
self, input_ids, position_ids, attention_mask, past_key_values, labels, images
):
# print(position_ids, attention_mask)
# if past_key_values:
# print(past_key_values[-1][-1].shape)
# print(input_ids.shape, position_ids.shape, attention_mask.shape, past_key_values.shape, images)
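        # Incremental decoding shortcut: with a KV cache only the newest token is passed in, so the
        # image features are already part of past_key_values; just grow the attention mask to cover
        # the cached multimodal positions and recompute position_ids from it.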
if images is None or input_ids.shape[1] == 1:
if past_key_values is not None and images is not None and input_ids.shape[1] == 1:
if self.get_model().config.model_type == 'chatglm':
target_shape = past_key_values[-1][-1].shape[0] + 1
else:
target_shape = past_key_values[-1][-1].shape[-2] + 1
attention_mask = torch.cat((attention_mask, torch.ones(
(attention_mask.shape[0], target_shape - attention_mask.shape[1]),
dtype=attention_mask.dtype,
device=attention_mask.device
)), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
return input_ids, position_ids, attention_mask, past_key_values, None, labels
if type(images) is list:
concat_images = torch.cat([image for image in images], dim=0)
image_features = self.get_model().mm_projector(concat_images)
split_sizes = [image.shape[0] for image in images]
image_features = torch.split(image_features, split_sizes, dim=0)
# image_features = [x.flatten(0, 1) for x in image_features]
else:
image_features = self.get_model().mm_projector(images)
# print([image.shape for image in image_features])
_labels = labels
_position_ids = position_ids
_attention_mask = attention_mask
if attention_mask is None:
attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
else:
attention_mask = attention_mask.bool()
if position_ids is None:
position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
if labels is None:
labels = torch.full_like(input_ids, IGNORE_INDEX)
# remove the padding using attention_mask -- TODO: double check
input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
new_input_embeds = []
new_labels = []
cur_image_idx = 0
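        # For every sample, split the ids around each IMAGE_TOKEN_INDEX placeholder, embed the text
        # pieces, and interleave the projected frame features; label positions that correspond to
        # image features are set to IGNORE_INDEX so they do not contribute to the loss.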
for batch_idx, cur_input_ids in enumerate(input_ids):
num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
if num_images == 0:
cur_image_features = image_features[cur_image_idx]
cur_input_embeds_1 = self.get_model().get_input_embeddings()(cur_input_ids)
cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
new_input_embeds.append(cur_input_embeds)
new_labels.append(labels[batch_idx])
cur_image_idx += 1
continue
image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
cur_input_ids_noim = []
cur_labels = labels[batch_idx]
cur_labels_noim = []
for i in range(len(image_token_indices) - 1):
cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
split_sizes = [x.shape[0] for x in cur_labels_noim]
cur_input_embeds = self.get_model().get_input_embeddings()(torch.cat(cur_input_ids_noim))
cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
cur_new_input_embeds = []
cur_new_labels = []
for i in range(num_images + 1):
cur_new_input_embeds.append(cur_input_embeds_no_im[i])
cur_new_labels.append(cur_labels_noim[i])
if i < num_images:
cur_image_features = image_features[cur_image_idx]
cur_image_idx += 1
cur_new_input_embeds.append(cur_image_features)
cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
cur_new_input_embeds = torch.cat(cur_new_input_embeds)
cur_new_labels = torch.cat(cur_new_labels)
new_input_embeds.append(cur_new_input_embeds)
new_labels.append(cur_new_labels)
# Truncate sequences to max length as image embeddings can make the sequence longer
tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
if tokenizer_model_max_length is not None:
new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
# Combine them
max_len = max(x.shape[0] for x in new_input_embeds)
batch_size = len(new_input_embeds)
new_input_embeds_padded = []
new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
cur_len = cur_new_embed.shape[0]
if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
new_input_embeds_padded.append(torch.cat((
torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
cur_new_embed
), dim=0))
if cur_len > 0:
new_labels_padded[i, -cur_len:] = cur_new_labels
attention_mask[i, -cur_len:] = True
position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
else:
new_input_embeds_padded.append(torch.cat((
cur_new_embed,
torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
), dim=0))
if cur_len > 0:
new_labels_padded[i, :cur_len] = cur_new_labels
attention_mask[i, :cur_len] = True
position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
if _labels is None:
new_labels = None
else:
new_labels = new_labels_padded
if _attention_mask is None:
attention_mask = None
else:
attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
if _position_ids is None:
position_ids = None
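        # The ChatGLM path packs the batch differently: a placeholder input_ids tensor is created
        # (the real content lives in new_input_embeds), the attention mask is cast to int8, and the
        # embeddings are transposed to the sequence-first layout the ChatGLM backbone expects.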
if self.get_model().config.model_type == 'chatglm':
fake_input_ids = torch.full((new_input_embeds.shape[0], new_input_embeds.shape[1]), -10000,
dtype=new_input_embeds.dtype, device=new_input_embeds.device)
attention_mask = attention_mask.to(torch.int8)
new_input_embeds = new_input_embeds.transpose(0, 1).contiguous()
else:
fake_input_ids = None
# print(position_ids, attention_mask)
return fake_input_ids, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
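# Illustrative sketch (not part of the original file): shows, on toy tensors, how
# `prepare_inputs_labels_for_multimodal` splices projected image features into the text
# embedding sequence at each IMAGE_TOKEN_INDEX placeholder. All sizes are made up.
if __name__ == "__main__":
    hidden_size, vocab_size, num_frames = 8, 32, 4
    embed = nn.Embedding(vocab_size, hidden_size)          # stand-in for the LM's input embeddings
    image_features = torch.randn(num_frames, hidden_size)  # stand-in for the mm_projector output

    cur_input_ids = torch.tensor([3, 5, IMAGE_TOKEN_INDEX, 7, 9])
    cur_labels = torch.tensor([3, 5, IMAGE_TOKEN_INDEX, 7, 9])

    # Split the sequence around the image placeholder, embed the text pieces,
    # and interleave the image features between them (same logic as the method above).
    idx = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
    pieces_ids = [cur_input_ids[idx[i] + 1:idx[i + 1]] for i in range(len(idx) - 1)]
    pieces_labels = [cur_labels[idx[i] + 1:idx[i + 1]] for i in range(len(idx) - 1)]

    new_embeds, new_labels = [], []
    for i, (ids, labs) in enumerate(zip(pieces_ids, pieces_labels)):
        new_embeds.append(embed(ids))
        new_labels.append(labs)
        if i < len(pieces_ids) - 1:
            new_embeds.append(image_features)
            new_labels.append(torch.full((num_frames,), IGNORE_INDEX, dtype=labs.dtype))
    new_embeds = torch.cat(new_embeds)  # shape: (2 + num_frames + 2, hidden_size)
    new_labels = torch.cat(new_labels)  # image positions are masked with IGNORE_INDEX
    print(new_embeds.shape, new_labels)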
import torch
import torch.nn as nn
from typing import List, Optional, Tuple, Union
from transformers import AutoConfig, AutoModelForCausalLM
from .chatglm import ChatGLMConfig, ChatGLMModel, ChatGLMForConditionalGeneration
from .vtimellm_arch import VTimeLLMMetaModel, VTimeLLMMetaForCausalLM
class VTimeLLMChatGLMConfig(ChatGLMConfig):
model_type = "VTimeLLM_ChatGLM"
class VTimeLLMChatGLMModel(ChatGLMModel, VTimeLLMMetaModel):
config_class = VTimeLLMChatGLMConfig
def __init__(self, config, empty_init=True, device=None):
super(VTimeLLMChatGLMModel, self).__init__(config, empty_init=empty_init, device=device)
class VTimeLLMChatGLMForCausalLM(ChatGLMForConditionalGeneration, VTimeLLMMetaForCausalLM):
config_class = VTimeLLMChatGLMConfig
def __init__(self, config, empty_init=True, device=None):
super(ChatGLMForConditionalGeneration, self).__init__(config)
self.transformer = VTimeLLMChatGLMModel(config, empty_init=empty_init, device=device)
self.max_sequence_length = config.max_length
self.config = config
self.quantized = False
# Initialize weights and apply final processing
self.post_init()
def get_model(self):
return self.transformer
def forward(
self,
input_ids: torch.LongTensor = None,
position_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
return_last_logit: Optional[bool] = False,
images: Optional[torch.FloatTensor] = None,
):
if inputs_embeds is None:
(
input_ids,
position_ids,
attention_mask,
past_key_values,
inputs_embeds,
labels
) = self.prepare_inputs_labels_for_multimodal(
input_ids,
position_ids,
attention_mask,
past_key_values,
labels,
images
)
return super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
labels=labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict
)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
images = kwargs.pop("images", None)
_inputs = super().prepare_inputs_for_generation(
input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
)
if images is not None:
_inputs['images'] = images
return _inputs
AutoConfig.register("VTimeLLM_ChatGLM", VTimeLLMChatGLMConfig)
AutoModelForCausalLM.register(VTimeLLMChatGLMConfig, VTimeLLMChatGLMForCausalLM)
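# Illustrative sketch (not part of the original file): with the registrations above, a checkpoint
# whose config.json declares `"model_type": "VTimeLLM_ChatGLM"` resolves through the Auto classes.
# The checkpoint path below is hypothetical; substitute a directory containing such a config.
if __name__ == "__main__":
    config = AutoConfig.from_pretrained("/path/to/vtimellm_chatglm_checkpoint")  # hypothetical path
    model = AutoModelForCausalLM.from_config(config)  # resolves to VTimeLLMChatGLMForCausalLM
    print(type(model).__name__)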
import torch
import torch.nn as nn
from typing import List, Optional, Tuple, Union
from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast
from .vtimellm_arch import VTimeLLMMetaModel, VTimeLLMMetaForCausalLM
class VTimeLLMConfig(LlamaConfig):
model_type = "VTimeLLM"
class VTimeLLMLlamaModel(LlamaModel, VTimeLLMMetaModel):
config_class = VTimeLLMConfig
def __init__(self, config: LlamaConfig):
super(VTimeLLMLlamaModel, self).__init__(config)
class VTimeLLMLlamaForCausalLM(LlamaForCausalLM, VTimeLLMMetaForCausalLM):
config_class = VTimeLLMConfig
def __init__(self, config):
super(LlamaForCausalLM, self).__init__(config)
self.model = VTimeLLMLlamaModel(config)
self.pretraining_tp = config.pretraining_tp
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_model(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
images: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
if inputs_embeds is None:
(
input_ids,
position_ids,
attention_mask,
past_key_values,
inputs_embeds,
labels
) = self.prepare_inputs_labels_for_multimodal(
input_ids,
position_ids,
attention_mask,
past_key_values,
labels,
images
)
return super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
labels=labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict
)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
images = kwargs.pop("images", None)
_inputs = super().prepare_inputs_for_generation(
input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
)
if images is not None:
_inputs['images'] = images
return _inputs
AutoConfig.register("VTimeLLM", VTimeLLMConfig)
AutoModelForCausalLM.register(VTimeLLMConfig, VTimeLLMLlamaForCausalLM)
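# Illustrative sketch (not part of the original file): builds a tiny, randomly initialised
# VTimeLLM-LLaMA model, attaches the 768 -> hidden_size projector, and runs one multimodal
# forward pass. The layer sizes are made up; IMAGE_TOKEN_INDEX comes from vtimellm.constants.
if __name__ == "__main__":
    import types
    from vtimellm.constants import IMAGE_TOKEN_INDEX

    config = VTimeLLMConfig(vocab_size=128, hidden_size=64, intermediate_size=128,
                            num_hidden_layers=2, num_attention_heads=4)
    model = VTimeLLMLlamaForCausalLM(config)
    # No pretrained adapter: initialize_vision_modules just creates the Linear(768, hidden_size) projector.
    model.get_model().initialize_vision_modules(types.SimpleNamespace(pretrain_mm_mlp_adapter=None))

    input_ids = torch.tensor([[1, 5, IMAGE_TOKEN_INDEX, 7, 9]])  # one image placeholder in the prompt
    images = torch.randn(1, 16, 768)                             # 16 frame features per video
    out = model(input_ids=input_ids, images=images, use_cache=False)
    print(out.logits.shape)  # (1, 4 + 16, vocab_size): text tokens plus spliced frame features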