#!/usr/bin/env python
def iou(interval_1, interval_2):
"""
interval: list (2 float elements)
"""
eps = 1e-8 # to avoid zero division
(s_1, e_1) = interval_1
(s_2, e_2) = interval_2
intersection = max(0., min(e_1, e_2) - max(s_1, s_2))
union = min(max(e_1, e_2) - min(s_1, s_2), e_1 - s_1 + e_2 - s_2)
iou = intersection / (union + eps)
return iou
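# Illustrative example (added note, not part of the original evaluation code):
# iou([0.0, 5.0], [3.0, 8.0]) -> intersection 2.0, union 8.0, result ~0.25.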
def remove_nonascii(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
from .eval_dvc import eval_dvc
from .eval_soda import eval_soda
# --------------------------------------------------------
# evaluation scripts for dense video captioning; supports Python 3
# Modified from https://github.com/ranjaykrishna/densevid_eval/tree/9d4045aced3d827834a5d2da3c9f0692e3f33c1c
# --------------------------------------------------------
# Dense-Captioning Events in Videos Eval
# Copyright (c) 2017 Ranjay Krishna
# Licensed under The MIT License [see LICENSE for details]
# Written by Ranjay Krishna
# --------------------------------------------------------
import argparse
import json
import random
import string
import sys
import time
# sys.path.insert(0, './coco-caption') # Hack to allow the import of pycocoeval
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
Set = set  # compatibility alias; used by import_ground_truths below
import numpy as np
def random_string(string_length):
letters = string.ascii_lowercase
return ''.join(random.choice(letters) for i in range(string_length))
def remove_nonascii(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
class ANETcaptions(object):
PREDICTION_FIELDS = ['results', 'version', 'external_data']
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
tious=None, distances=[1, 3, 5, 10, 30, 60], max_proposals=1000,
prediction_fields=PREDICTION_FIELDS, verbose=False, no_lang_eval=False):
# Check that the gt and submission files exist and load them
if len(tious) == 0:
raise IOError('Please input a valid tIoU.')
if not ground_truth_filenames:
raise IOError('Please input a valid ground truth file.')
if not prediction_filename:
raise IOError('Please input a valid prediction file.')
self.verbose = verbose
self.no_lang_eval = no_lang_eval
self.tious = tious
self.distances = distances
self.max_proposals = max_proposals
self.pred_fields = prediction_fields
self.ground_truths = self.import_ground_truths(ground_truth_filenames)
self.prediction = self.import_prediction(prediction_filename)
self.ground_truths_keys = [vid for gt in self.ground_truths for vid in gt]
print('available video number', len(set(self.ground_truths_keys) & set(self.prediction.keys())))
# Set up scorers
if not self.no_lang_eval:
self.tokenizer = PTBTokenizer()
self.scorers = [
(Meteor(), "METEOR"),
(Cider(), "CIDEr"),
(Rouge(), "Rouge-L"),
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
]
def import_prediction(self, prediction_filename):
if self.verbose:
print("| Loading submission...")
if isinstance(prediction_filename, dict):
submission = prediction_filename
else:
submission = json.load(open(prediction_filename))
# if not all([field in submission.keys() for field in self.pred_fields]):
# raise IOError('Please input a valid ground truth file.')
# Ensure that every video is limited to the correct maximum number of proposals.
results = {}
for vid_id in submission['results']:
results[vid_id] = submission['results'][vid_id][:self.max_proposals]
return results
def import_ground_truths(self, filenames):
gts = []
self.n_ref_vids = Set()
for filename in filenames:
if isinstance(filename, dict):
gt = filename
else:
gt = json.load(open(filename))
self.n_ref_vids.update(gt.keys())
gts.append(gt)
if self.verbose:
print("| Loading GT. #files: %d, #videos: %d" % (len(filenames), len(self.n_ref_vids)))
return gts
def iou(self, interval_1, interval_2):
start_i, end_i = interval_1[0], interval_1[1]
start, end = interval_2[0], interval_2[1]
intersection = max(0, min(end, end_i) - max(start, start_i))
union = min(max(end, end_i) - min(start, start_i), end - start + end_i - start_i)
iou = float(intersection) / (union + 1e-8)
return iou
def check_gt_exists(self, vid_id):
for gt in self.ground_truths:
if vid_id in gt:
return True
return False
def get_gt_vid_ids(self):
vid_ids = set([])
for gt in self.ground_truths:
vid_ids |= set(gt.keys())
return list(vid_ids)
def evaluate(self):
aggregator = {}
self.scores = {}
if not self.no_lang_eval:
for tiou in self.tious:
scores = self.evaluate_tiou(tiou)
for metric, score in scores.items():
if metric not in self.scores:
self.scores[metric] = []
self.scores[metric].append(score)
if True:  # originally "if self.verbose:"; localization metrics are now always computed
self.scores['Recall'] = []
self.scores['Precision'] = []
self.scores['F1'] = []
for tiou in self.tious:
precision, recall = self.evaluate_detection(tiou)
self.scores['Recall'].append(recall)
self.scores['Precision'].append(precision)
self.scores['F1'].append(2 * recall * precision / (recall + precision) if recall + precision else 0.)
for tiou in self.distances:
precision, recall = self.evaluate_navigation(tiou)
self.scores['Recall'].append(recall)
self.scores['Precision'].append(precision)
self.scores['F1'].append(2 * recall * precision / (recall + precision) if recall + precision else 0.)
def evaluate_detection(self, tiou):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = []
precision = []
for vid_i, vid_id in enumerate(gt_vid_ids):
if vid_id not in self.prediction: # missing video
continue
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred_timestamp, ref_timestamp) > tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)
new_precision = float(len(pred_set_covered)) / max(len(self.prediction[vid_id]), 1)
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
best_recall = max(best_recall, new_recall)
recall.append(best_recall)
precision.append(best_precision)
return sum(precision) / len(precision), sum(recall) / len(recall)
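# Added note: for each video, precision is the fraction of predicted segments whose IoU with
# some reference segment exceeds the tIoU threshold, and recall is the fraction of reference
# segments matched by some prediction; the best value across reference files is kept and the
# returned scores are the means over all videos that have predictions.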
def evaluate_navigation(self, tiou):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = []
precision = []
for vid_i, vid_id in enumerate(gt_vid_ids):
if vid_id not in self.prediction: # missing video
continue
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if abs(pred_timestamp[0] - ref_timestamp[0]) < tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)
new_precision = float(len(pred_set_covered)) / max(len(self.prediction[vid_id]), 1)
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
best_recall = max(best_recall, new_recall)
recall.append(best_recall)
precision.append(best_precision)
return sum(precision) / len(precision), sum(recall) / len(recall)
def evaluate_tiou(self, tiou):
# This method computes the caption metrics (METEOR, BLEU, etc.) over tIoU-matched prediction/reference pairs and averages them across videos
res = {}
gts = {}
gt_vid_ids = self.get_gt_vid_ids()
unique_index = 0
# video id to unique caption ids mapping
vid2capid = {}
cur_res = {}
cur_gts = {}
for vid_id in gt_vid_ids:
# If the video does not have a prediction, then we give it no matches
# We set it to empty, and use this as a sanity check later on
if vid_id not in self.prediction: # missing video
continue
# If we do have a prediction, then we find the scores based on all the
# valid tIoU overlaps.
else:
vid2capid[vid_id] = []
# For each prediction, we look at the tIoU with ground truth.
for pred in self.prediction[vid_id]:
has_added = False
for gt in self.ground_truths:
if vid_id not in gt:
continue
gt_captions = gt[vid_id]
for caption_idx, caption_timestamp in enumerate(gt_captions['timestamps']):
if self.iou(pred['timestamp'], caption_timestamp) >= tiou:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [
{'caption': remove_nonascii(gt_captions['sentences'][caption_idx])}]
vid2capid[vid_id].append(unique_index)
unique_index += 1
has_added = True
# If the predicted caption does not overlap with any ground truth,
# we should compare it with garbage.
if not has_added:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [{'caption': random_string(random.randint(10, 20))}]
vid2capid[vid_id].append(unique_index)
unique_index += 1
# Each scorer will compute across all videos and take average score
output = {}
for scorer, method in self.scorers:
if self.verbose:
print('computing %s score...' % (scorer.method()))
# For each video, take all the valid pairs (based from tIoU) and compute the score
all_scores = {}
# call tokenizer here for all predictions and gts
tokenize_res = self.tokenizer.tokenize(cur_res)
tokenize_gts = self.tokenizer.tokenize(cur_gts)
# reshape back
for vid in vid2capid.keys():
res[vid] = {index: tokenize_res[index] for index in vid2capid[vid]}
gts[vid] = {index: tokenize_gts[index] for index in vid2capid[vid]}
for vid_id in gt_vid_ids:
if vid_id not in self.prediction: # missing video
continue
if len(res[vid_id]) == 0 or len(gts[vid_id]) == 0:
if type(method) == list:
score = [0] * len(method)
else:
score = 0
else:
score, scores = scorer.compute_score(gts[vid_id], res[vid_id])
all_scores[vid_id] = score
# import ipdb;ipdb.set_trace()
# print(all_scores.values())
if type(method) == list:
scores = np.mean(list(all_scores.values()), axis=0)
for m in range(len(method)):
output[method[m]] = scores[m]
if self.verbose:
print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method[m], output[method[m]]))
else:
output[method] = np.mean(list(all_scores.values()))
if self.verbose:
print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, output[method]))
return output
def eval_dvc(submission, references, tious=[0.3, 0.5, 0.7, 0.9], distances=[1, 3, 5, 10, 30, 60], max_proposals_per_video=1000, verbose=False, no_lang_eval=False):
# Call coco eval
evaluator = ANETcaptions(ground_truth_filenames=references,
prediction_filename=submission,
tious=tious,
distances=distances,
max_proposals=max_proposals_per_video,
verbose=verbose, no_lang_eval=no_lang_eval)
evaluator.evaluate()
score = evaluator.scores
# print(score)
loc_score = {}
for i, x in enumerate(tious):
for y in ["Recall", "Precision", "F1"]:
loc_score[y + "@" + str(x)] = score[y][i]
for y in ["Recall", "Precision", "F1"]:
loc_score[y] = np.array([score[y][i] for i in range(len(tious))]).mean()
if distances:
for i, x in enumerate(distances):
for y in ["Recall", "Precision", "F1"]:
loc_score[y + "@" + str(x) + "s"] = score[y][len(tious) + i]
avg_eval_score = {key: np.array(value).mean() for key, value in score.items() if key not in ["Recall", "Precision", "F1"]}
avg_eval_score.update(loc_score)
return avg_eval_score
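# Illustrative sketch of the expected inputs (added; field names are taken from the code above,
# the values are made up): the submission has the form
#   {"results": {"video_id": [{"timestamp": [start_sec, end_sec], "sentence": "a caption"}, ...]}}
# and each reference file maps video ids to dicts with parallel "timestamps" and "sentences" lists.
# Note that the __main__ block below assumes pred_path and references are defined before it is run.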
if __name__ == '__main__':
eval_dvc(pred_path, references,
tious=[0.3, 0.5, 0.7, 0.9],
max_proposals_per_video=1000,
verbose=False,
no_lang_eval=False)
eval_soda(pred_path, references, verbose=False)
import numpy as np
from .SODA.soda import SODA
from .SODA.dataset import ANETCaptions
def eval_tool(prediction, references=None, metric='Meteor', soda_type='c', verbose=False):
args = type('args', (object,), {})()
args.prediction = prediction
args.references = references
args.metric = metric
args.soda_type = soda_type
args.tious = [0.3, 0.5, 0.7, 0.9]
args.verbose = verbose
args.multi_reference = False
data = ANETCaptions.from_load_files(args.references,
args.prediction,
multi_reference=args.multi_reference,
verbose=args.verbose,
)
data.preprocess()
if args.soda_type == 'a':
tious = args.tious
else:
tious = None
evaluator = SODA(data,
soda_type=args.soda_type,
tious=tious,
scorer=args.metric,
verbose=args.verbose
)
result = evaluator.evaluate()
return result
def eval_soda(p, ref_list, verbose=False):
score_sum = []
for ref in ref_list:
r = eval_tool(prediction=p, references=[ref], verbose=verbose, soda_type='c')
score_sum.append(r['Meteor'])
soda_avg = np.mean(score_sum, axis=0) #[avg_pre, avg_rec, avg_f1]
soda_c_avg = soda_avg[-1]
results = {'soda_c': soda_c_avg}
return results
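# Added note: per the avg_pre / avg_rec / avg_f1 comment above, each eval_tool result stores
# (precision, recall, F1) under 'Meteor', so soda_avg[-1] selects the F1 term reported as soda_c.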
import os
root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")
import sys
sys.path.append(root_dir)
import clip
import re
import argparse
import torch
import json
import numpy as np
from tqdm import tqdm
from torchvision.transforms import Compose, Resize, CenterCrop, Normalize
from vtimellm.model.builder import load_pretrained_model
from vtimellm.utils import disable_torch_init
from vtimellm.mm_utils import VideoExtractor
from vtimellm.inference import inference
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
from PIL import Image
BICUBIC = Image.BICUBIC
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--clip_path", type=str, default="checkpoints/clip/ViT-L-14.pt")
parser.add_argument("--pretrain_mm_mlp_adapter", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage1/mm_projector.bin")
parser.add_argument("--stage2", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage2")
parser.add_argument("--stage3", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage3")
parser.add_argument("--model_base", type=str, default="/path/to/vicuna-7b-v1.5")
parser.add_argument("--data_path", type=str, default="vtimellm/eval/data_example.json")
parser.add_argument("--feat_folder", type=str, default=None)
parser.add_argument("--video_folder", type=str, default=None)
parser.add_argument("--task", type=str, default='all', choices=['all', 'grounding', 'captioning'])
parser.add_argument("--log_path", type=str, default='vtimellm/eval/log/example_log.txt')
args = parser.parse_args()
return args
def iou(outputs, gt):
matches = re.search(r"(\d{2}) (to|and) (\d{2})", outputs)
if not matches:
return 0
from_number = float(matches.group(1)) / 100
to_number = float(matches.group(3)) / 100
s, e = gt
intersection = max(0, min(to_number, e) - max(from_number, s))
union = max(to_number, e) - min(from_number, s)
iou = intersection / union
return round(iou, 2)
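# Illustrative example (added): the regex expects two two-digit numbers, e.g. for the answer
# "from 20 to 40" and gt = (0.25, 0.5) the predicted span becomes (0.20, 0.40), giving
# intersection 0.15, union 0.30 and an IoU of 0.5; answers without a "XX to/and XX" pattern score 0.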
def write_log(log_path, video_id, task, query_id, answer, info=None):
log = {
'video_id': video_id,
'task': task,
'query_id': query_id,
'answer': answer
}
if info is not None:
log['info'] = info
with open(log_path, 'a') as f:
f.write(json.dumps(log) + '\n')
questions = {
'grounding': ['During which frames can we see {}?'],
'captioning': ['Could you please describe the events in the video in detail? Be specific about the activities of individuals, their surroundings, and interactions with others. The output should be in JSON format, structured as follows: {"event": "xx", "timestamps": "from xx to xx"}.']
}
if __name__ == "__main__":
args = parse_args()
disable_torch_init()
tokenizer, model, context_len = load_pretrained_model(args, args.stage2, args.stage3)
model = model.cuda()
model.to(torch.float16)
if args.video_folder is not None:
clip_model, _ = clip.load(args.clip_path)
clip_model.eval()
clip_model = clip_model.cuda()
video_loader = VideoExtractor(N=100)
transform = Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
js = json.load(open(args.data_path))
for id, data in tqdm(js.items()):
features = None
if args.feat_folder is not None:
feat_path = os.path.join(args.feat_folder, f"{id}.npy")
if os.path.isfile(feat_path):
features = torch.from_numpy(np.load(feat_path)).cuda()
if features is None and args.video_folder is not None:
for ext in ['mp4', 'mkv', 'webm']:
video_path = os.path.join(args.video_folder, f"{id}.{ext}")
if os.path.isfile(video_path):
_, images = video_loader.extract({'id': None, 'video': video_path})
images = transform(images / 255.0)
images = images.to(torch.float16)
with torch.no_grad():
features = clip_model.encode_image(images.to('cuda'))
if features is None:
print(f'Cannot find video {id}')
continue
if args.task in ['captioning', 'all']:
for query_id, query in enumerate(questions['captioning']):
answer = inference(model, features, "<video>\n " + query, tokenizer)
write_log(args.log_path, id, 'captioning', query_id, answer)
if args.task in ['grounding', 'all']:
for sentence_id, (timestamps, sentence) in enumerate(zip(data['timestamps'], data['sentences'])):
sentence = sentence.strip().lower()
if sentence.endswith("."):
sentence = sentence[:-1]
for query_id, query in enumerate(questions['grounding']):
answer = inference(model, features, "<video>\n" + query.format(sentence), tokenizer)
gt = (timestamps[0] / data['duration'], timestamps[1] / data['duration'])
u = iou(answer, gt)
write_log(args.log_path, id, 'grounding', query_id, answer, info={"sentence_id": sentence_id, 'iou': u})
{"video_id": "v_bXdq2zI1Ms0", "task": "captioning", "query_id": 0, "answer": "Here's the requested JSON format with the events and their corresponding timestamps:\n[{\"event\": \"A man is seen talking to the camera while a group of men are seen standing in front of him.\", \"timestamps\": \"from 00 to 29\"}, {\"event\": \"The man then leads the group in a series of martial arts moves.\", \"timestamps\": \"from 29 to 71\"}, {\"event\": \"The man then leads the group in a series of martial arts moves.\", \"timestamps\": \"from 71 to 99\"}]"}
{"video_id": "v_bXdq2zI1Ms0", "task": "grounding", "query_id": 0, "answer": "The three men are standing on a mat from 17 to 34.", "info": {"sentence_id": 0, "iou": 0.2}}
{"video_id": "v_bXdq2zI1Ms0", "task": "grounding", "query_id": 0, "answer": "The man in front begins to do karate on the mat from 28 to 71.", "info": {"sentence_id": 1, "iou": 0.2}}
{"video_id": "v_bXdq2zI1Ms0", "task": "grounding", "query_id": 0, "answer": "The man gets down on the ground and flips around from 60 to 99.", "info": {"sentence_id": 2, "iou": 0.4}}
{"video_id": "v_CN01Gm2Yc4k", "task": "captioning", "query_id": 0, "answer": "Here's the requested JSON format with the events and their corresponding timestamps:\n[{\"event\": \"A woman is seen hanging from a bar in a gym.\", \"timestamps\": \"from 00 to 29\"}, {\"event\": \"She then swings her legs up and down while hanging from the bar.\", \"timestamps\": \"from 29 to 71\"}, {\"event\": \"She then swings her legs up and down while hanging from the bar.\", \"timestamps\": \"from 71 to 99\"}]"}
{"video_id": "v_CN01Gm2Yc4k", "task": "grounding", "query_id": 0, "answer": "The young lady is gripping the punching bag between her legs from 00 to 17.", "info": {"sentence_id": 0, "iou": 0.6}}
{"video_id": "v_CN01Gm2Yc4k", "task": "grounding", "query_id": 0, "answer": "The woman begins doing a set of crunches by pulling herself up from 21 to 71.", "info": {"sentence_id": 1, "iou": 0.82}}
{"video_id": "v_CN01Gm2Yc4k", "task": "grounding", "query_id": 0, "answer": "The woman sits up and makes punches out into the air from 22 to 43.", "info": {"sentence_id": 2, "iou": 0.0}}
import os
import sys
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from dvc_eval import eval_dvc, eval_soda
import json
import argparse
import re
import difflib
def print_metrics(metrics):
for k, v in metrics.items():
print(f"{k}: {v:.2f}")
def merge_similar_sentences(data):
if not data: return data
merged_data = []
current_sentence = data[0]["sentence"]
current_timestamp = data[0]["timestamp"]
for i in range(1, len(data)):
next_sentence = data[i]["sentence"]
next_timestamp = data[i]["timestamp"]
if difflib.SequenceMatcher(None, current_sentence, next_sentence).ratio() > 0.98 and -1 <= next_timestamp[0] - current_timestamp[1] <= 1:
current_timestamp = [current_timestamp[0], next_timestamp[1]]
else:
merged_data.append({"sentence": current_sentence, "timestamp": current_timestamp})
current_sentence = next_sentence
current_timestamp = next_timestamp
merged_data.append({"sentence": current_sentence, "timestamp": current_timestamp})
return merged_data
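# Illustrative example (added): consecutive near-identical captions whose segments touch are merged,
#   [{"sentence": "a man runs", "timestamp": [0, 10]}, {"sentence": "a man runs", "timestamp": [10, 20]}]
# becomes [{"sentence": "a man runs", "timestamp": [0, 20]}].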
def captioning_metrics(all_logs, data_path):
logs = [x for x in all_logs if x['task'] == 'captioning']
pred = {}
for log in logs:
id = log['video_id']
answer = log['answer']
pred[id] = []
try:
items = json.loads(re.search(r'\[.*\]', answer).group(0))
for item in items:
pred[id].append({
'timestamp': [int(item['timestamps'][5:7]), int(item['timestamps'][-2:])],
'sentence': item['event'],
})
except Exception as e:
print("Error", e, answer)
gt_js = json.load(open(data_path))
gt_js = {k: v for k, v in gt_js.items() if k in pred.keys()}
for id, items in list(pred.items()):
items = merge_similar_sentences(items)
duration = gt_js[id]['duration']
for item in items:
item['timestamp'][0] = item['timestamp'][0] * duration / 100
item['timestamp'][1] = (item['timestamp'][1] + 1) * duration / 100
pred[id] = items
pred_result = {'results': pred}
metrics = eval_soda(pred_result, [gt_js])
metrics.update(eval_dvc(pred_result, [gt_js],
tious=[0.3, 0.5, 0.7, 0.9],
distances=[],
max_proposals_per_video=1000,
verbose=False,
no_lang_eval=False))
print(f"Found {len(pred)} logs")
metrics = {k: v * 100 for k, v in metrics.items() if k in ['soda_c', 'METEOR', 'CIDEr']}
return metrics
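# Added note: the slicing in captioning_metrics assumes timestamp strings of the form "from XX to YY"
# with two-digit values, e.g. "from 29 to 71" gives [5:7] == "29" and [-2:] == "71"; these
# percentages are then rescaled to seconds using the ground-truth duration.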
def grounding_metrics(all_logs):
ious = [x['info']['iou'] for x in all_logs if x['task'] == 'grounding']
l = len(ious)
print(f"Found {l} logs")
if l == 0: return
metrics = {
"mIoU": sum(ious) / l * 100
}
for m in [0.3, 0.5, 0.7]:
metrics[f"R1@{m}"] = sum(iou >= m for iou in ious) / l * 100
return metrics
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--log_path", type=str, default='vtimellm/eval/log/example_log.txt')
parser.add_argument("--task", type=str, default='all', choices=['all', 'grounding', 'captioning'])
parser.add_argument("--data_path", type=str, default='vtimellm/eval/data_example.json')
args = parser.parse_args()
logs = []
with open(args.log_path) as f:
for line in f:
try:
json_data = json.loads(line)
logs.append(json_data)
except Exception as e:
print(e, line)
if args.task in ['captioning', 'all']:
print("====================== Captioning =====================")
print_metrics(captioning_metrics(logs, args.data_path))
if args.task in ['grounding', 'all']:
print("====================== Grounding ======================")
print_metrics(grounding_metrics(logs))
import os
import sys
import argparse
import torch
from vtimellm.constants import IMAGE_TOKEN_INDEX
from vtimellm.conversation import conv_templates, SeparatorStyle
from vtimellm.model.builder import load_pretrained_model, load_lora
from vtimellm.utils import disable_torch_init
from vtimellm.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria, VideoExtractor
from PIL import Image
import requests
from io import BytesIO
from transformers import TextStreamer
from easydict import EasyDict as edict
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
from PIL import Image
BICUBIC = Image.BICUBIC
from torchvision.transforms import Compose, Resize, CenterCrop, Normalize
import numpy as np
import clip
def inference(model, image, query, tokenizer):
conv = conv_templates["v1"].copy()
conv.append_message(conv.roles[0], query)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=image[None,].cuda(),
do_sample=True,
temperature=0.05,
num_beams=1,
# no_repeat_ngram_size=3,
max_new_tokens=1024,
use_cache=True)
# https://github.com/huggingface/transformers/blob/main/src/transformers/generation/utils.py#L1295
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
def parse_args():
parser = argparse.ArgumentParser(description="Demo")
parser.add_argument("--clip_path", type=str, default="checkpoints/clip/ViT-L-14.pt")
parser.add_argument("--model_base", type=str, default="/path/to/vicuna-7b-v1.5")
parser.add_argument("--pretrain_mm_mlp_adapter", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage1/mm_projector.bin")
parser.add_argument("--stage2", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage2")
parser.add_argument("--stage3", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage3")
parser.add_argument("--video_path", type=str, default="images/demo.mp4")
args = parser.parse_args()
return args
if __name__ == "__main__":
args = parse_args()
disable_torch_init()
tokenizer, model, context_len = load_pretrained_model(args, args.stage2, args.stage3)
model = model.cuda()
# model.get_model().mm_projector.to(torch.float16)
model.to(torch.float16)
clip_model, _ = clip.load(args.clip_path)
clip_model.eval()
clip_model = clip_model.cuda()
video_loader = VideoExtractor(N=100)
_, images = video_loader.extract({'id': None, 'video': args.video_path})
transform = Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
# print(images.shape) # <N, 3, H, W>
images = transform(images / 255.0)
images = images.to(torch.float16)
with torch.no_grad():
features = clip_model.encode_image(images.to('cuda'))
query = "describe the video."
print("query: ", query)
answer = inference(model, features, "<video>\n " + query, tokenizer)
print("answer: ", answer)
from PIL import Image
from io import BytesIO
import base64
import numpy as np
import torch
import decord
from transformers import StoppingCriteria
from vtimellm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
def load_image_from_base64(image):
return Image.open(BytesIO(base64.b64decode(image)))
def process_images(images, image_processor, model_cfg):
return image_processor(images, return_tensors='pt')['pixel_values']
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split(DEFAULT_IMAGE_TOKEN)]
def insert_separator(X, sep):
return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
input_ids = []
offset = 0
if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
offset = 1
input_ids.append(prompt_chunks[0][0])
elif tokenizer.name == "GLMTokenizer":
offset = 2
input_ids = prompt_chunks[0][:2]
for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
input_ids.extend(x[offset:])
if return_tensors is not None:
if return_tensors == 'pt':
return torch.tensor(input_ids, dtype=torch.long)
raise ValueError(f'Unsupported tensor type: {return_tensors}')
return input_ids
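# Added sketch (assumption: DEFAULT_IMAGE_TOKEN is the "<video>" placeholder that inference.py
# prepends to queries): the prompt is split on the placeholder, each chunk is tokenized separately,
# and IMAGE_TOKEN_INDEX is inserted between the chunks as a sentinel marking where the projected
# video features are later spliced in.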
def get_model_name_from_path(model_path):
model_path = model_path.strip("/")
model_paths = model_path.split("/")
if model_paths[-1].startswith('checkpoint-'):
return model_paths[-2] + "_" + model_paths[-1]
else:
return model_paths[-1]
class KeywordsStoppingCriteria(StoppingCriteria):
def __init__(self, keywords, tokenizer, input_ids):
self.keywords = keywords
self.keyword_ids = []
for keyword in keywords:
cur_keyword_ids = tokenizer(keyword).input_ids
if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
cur_keyword_ids = cur_keyword_ids[1:]
self.keyword_ids.append(torch.tensor(cur_keyword_ids))
self.tokenizer = tokenizer
self.start_len = input_ids.shape[1]
def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
assert output_ids.shape[0] == 1, "Only batch size 1 is supported for now"  # TODO
offset = min(output_ids.shape[1] - self.start_len, 3)
self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
for keyword_id in self.keyword_ids:
if output_ids[0, -keyword_id.shape[0]:].equal(keyword_id):
return True
outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
for keyword in self.keywords:
if keyword in outputs:
return True
return False
def print_trainable_parameters(model):
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
all_param += param.numel()
# print(_, param.requires_grad, param.numel())
if param.requires_grad:
trainable_params += param.numel()
print(
f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
)
class VideoExtractor():
"""Dataset for supervised fine-tuning."""
def __init__(self, N=100):
self.N = N
def extract(self, data):
video_path = data['video']
id = data['id']
try:
video_reader = decord.VideoReader(video_path)
total_frames = len(video_reader)
start = 0
end = total_frames - 1
split = data.get('split', None)
if split is not None:
fps = video_reader.get_avg_fps()
start = max(int(fps * split[0]), 0)
end = min(int(fps * split[1]), total_frames - 1)
sampled_indices = np.linspace(start, end, self.N, dtype=np.int32)
sampled_frames = video_reader.get_batch(sampled_indices).asnumpy()
except Exception as e:
print(e)
return None, torch.zeros(1)
images = torch.from_numpy(sampled_frames.transpose((0, 3, 1, 2)))
return id, images
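# Illustrative usage (added; the path is made up): the extractor samples self.N frames uniformly
# between the start and end of the video (or of the optional 'split' window) and returns them as a
# tensor of shape (N, 3, H, W); on a decoding error it returns (None, torch.zeros(1)).
#   vid_id, frames = VideoExtractor(N=100).extract({'id': 'demo', 'video': 'demo.mp4'})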
from .vtimellm_llama import VTimeLLMLlamaForCausalLM
from .vtimellm_chatglm import VTimeLLMChatGLMForCausalLM
import os
import shutil
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
import torch
from vtimellm.model import *
from peft import PeftModel
def load_lora(model, lora_path):
non_lora_trainables_path = os.path.join(lora_path, 'non_lora_trainables.bin')
if os.path.exists(non_lora_trainables_path):
non_lora_trainables = torch.load(non_lora_trainables_path, map_location='cpu')
non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
if any(k.startswith('model.model.') for k in non_lora_trainables):
non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
model.load_state_dict(non_lora_trainables, strict=False)
print('Loading LoRA weights...')
model = PeftModel.from_pretrained(model, lora_path)
return model
def load_pretrained_model(args, stage2=None, stage3=None):
kwargs = {'torch_dtype': torch.float16}
# model_path = os.path.expanduser(args.model_path)
model_base = args.model_base
# lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
print('Loading VTimeLLM from base model...')
if 'chatglm' in model_base:
tokenizer = AutoTokenizer.from_pretrained(model_base, trust_remote_code=True)
model = VTimeLLMChatGLMForCausalLM.from_pretrained(model_base)
else:
tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
model = VTimeLLMLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
if model.lm_head.weight.shape[0] != token_num:
model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
# load stage1:
model.get_model().initialize_vision_modules(args)
if stage2 is not None:
print('Loading stage2 weights...')
model = load_lora(model, stage2)
print('Merging stage2 weights...')
model = model.merge_and_unload()
if stage3 is not None:
print('Loading stage3 weights...')
model = load_lora(model, stage3)
print('Merging stage3 weights...')
model = model.merge_and_unload()
if hasattr(model.config, "max_sequence_length"):
context_len = model.config.max_sequence_length
else:
context_len = 2048
return tokenizer, model, context_len
from .configuration_chatglm import ChatGLMConfig
from .modeling_chatglm import ChatGLMModel, ChatGLMForConditionalGeneration
from transformers import PretrainedConfig
class ChatGLMConfig(PretrainedConfig):
model_type = "chatglm"
def __init__(
self,
num_layers=28,
padded_vocab_size=65024,
hidden_size=4096,
ffn_hidden_size=13696,
kv_channels=128,
num_attention_heads=32,
seq_length=2048,
hidden_dropout=0.0,
classifier_dropout=None,
attention_dropout=0.0,
layernorm_epsilon=1e-5,
rmsnorm=True,
apply_residual_connection_post_layernorm=False,
post_layer_norm=True,
add_bias_linear=False,
add_qkv_bias=False,
bias_dropout_fusion=True,
multi_query_attention=False,
multi_query_group_num=1,
apply_query_key_layer_scaling=True,
attention_softmax_in_fp32=True,
fp32_residual_connection=False,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs
):
self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.kv_channels = kv_channels
self.num_attention_heads = num_attention_heads
self.seq_length = seq_length
self.hidden_dropout = hidden_dropout
self.classifier_dropout = classifier_dropout
self.attention_dropout = attention_dropout
self.layernorm_epsilon = layernorm_epsilon
self.rmsnorm = rmsnorm
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
self.post_layer_norm = post_layer_norm
self.add_bias_linear = add_bias_linear
self.add_qkv_bias = add_qkv_bias
self.bias_dropout_fusion = bias_dropout_fusion
self.multi_query_attention = multi_query_attention
self.multi_query_group_num = multi_query_group_num
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.fp32_residual_connection = fp32_residual_connection
self.quantization_bit = quantization_bit
self.pre_seq_len = pre_seq_len
self.prefix_projection = prefix_projection
super().__init__(**kwargs)
from torch.nn import Linear
from torch.nn.parameter import Parameter
import bz2
import torch
import base64
import ctypes
from transformers.utils import logging
from typing import List
from functools import partial
logger = logging.get_logger(__name__)
try:
from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up
class Kernel:
def __init__(self, code: bytes, function_names: List[str]):
self.code = code
self._function_names = function_names
self._cmodule = LazyKernelCModule(self.code)
for name in self._function_names:
setattr(self, name, KernelFunction(self._cmodule, name))
quantization_code = "$QlpoOTFBWSZTWU9yuJUAQHN//////////f/n/8/n///n//bt4dTidcVx8X3V9FV/92/v4B7/AD5FBQFAAAChSgKpFCFAFVSigUAAAEKhSgUUqgFBKigqVREQAABQBQIANDTTIGI00BkZBkNGE0A0BkBkGQGRkaNAaAGQNBoGgDIAAYIGTI0DQAQAaGmmQMRpoDIyDIaMJoBoDIDIMgMjI0aA0AMgaDQNAGQAAwQMmRoGgAgA0NNMgYjTQGRkGQ0YTQDQGQGQZAZGRo0BoAZA0GgaAMgABggZMjQNABABoaaZAxGmgMjIMhowmgGgMgMgyAyMjRoDQAyBoNA0AZAADBAyZGgaAAmqU1NEgJqnptU/Sn4jRR6J6epk2pqb1Q/SgAPUGgyNNGjQ2SBpoAZAAGg0NB6mgDIAAAAA2oaApSREBNAARhGiYEaEwU8pvImlP0k2aam1GaGqbFNM1MHpTwmkepmyU9R6nqPKekHqNNPUxNGhp6n6p6QaZ6o9TG1GMqcoV9ly6nRanHlq6zPNbnGZNi6HSug+2nPiZ13XcnFYZW+45W11CumhzYhchOJ2GLLV1OBjBjGf4TptOddTSOcVxhqYZMYwZXZZY00zI1paX5X9J+b+f4e+x43RXSxXPOdquiGpduatGyXneN696M9t4HU2eR5XX/kPhP261NTx3JO1Ow7LyuDmeo9a7d351T1ZxnvnrvYnrXv/hXxPCeuYx2XsNmO003eg9J3Z6U7b23meJ4ri01OdzTk9BNO96brz+qT5nuvvH3ds/G+m/JcG/F2XYuhXlvO+jP7U3XgrzPN/lr8Sf1n6j4j7jZs+s/T0tNaNNYzTs12rxjwztHlnire3Nzc3N1wuBwOBwXBvZfoHpD7rFmR99V5vj3aXza3xdBbXMalubTg/jIv5dfAi54Pdc75j4z412n3Npj3Ld/ENm7a3b/Cod6h/ret1/5vn/C+l+gdslMvgPSLJ8d8q+U66fevYn/tW1chleEtNTGlcHCbLRlq0tHzF5tsbbZZfHjjLgZu42XCuC3NrdjTasZGNzgxPIrGqp7r3p7L2p5XjnpPSmTd5XtzqnB6U87zzg1Ol0zd0zsLszxR6lkxp35u6/teL0L0W922cR7Lu1lpL9CsHirzuM2T+BgsyViT6LHcm0/Vr6U/7LGGyJeqTEjt0PHWhF5mCT7R9mtlDwriYv0Tyr/OxYt6qp5r0mPVT0608TqnqMZaarU2nFwrTzzlrs1ed7z1ux60wyr4ydCaTi3enW8x68x0zU7tXSlcmPSW1mGpWJMg4zmPC2lK96tp0OE80y4MfEvnZj8zGluR6b22ki1Ou9V2nCd9xovcPvcYMZYy0lvN60ScZ45vN6yeCeeXFb1lVjnnCar5fwXwE2bzJ4HI1XVPXfXZMm44GUsMpYsmLB65TuVdm0cl0b+i/wGNN66XjeV7zuPpHcnK/juhhjdfId5jMdE5nN0dGmmm2zZs2cexD5n9p/dY352XsvXHaZNWWsmmS1atjR452nYudzvqv2HMRyvNNnlMcDl3R2+yx2uVrBubTW9icHDVtbNXlZm7jma1rM4VurZZd2y6nUau7ZXZ7bVU+mnoOVxZGMrVmvX60605JwmzGZhhhjTWtaaaMaaGTGmNMZasY0iX8VMUl8eepaIrzGSpemWOQyZORk2bNpjUybMmxqYmknCGCFynutfksaZpjTNMaaatM0xsxcGR0sociNqxNSmhhR1ZJPbsn8qyF0t2qH6iYBclclalbtTTcHTDsPaX6rlnElph2Jyumumtynv2Kk8GI7rsvXbIcJgHJOSaSXnnGaI3m87RtVXJOZ/YtgdTE6Wpha6ZlE8ayXkef1fh602r2WwvfMXtMdLlkfnLFdYYwYso+bWqm7yJqHXZGw2nrS5ZanSYnWlxBxMF1V940K2wdrI7R6OYf7DGGamMmTSbRhlS45xmVOumF1EyPCmHrrN8wwZOOrdNtLeMtzFzDlWnfTBxMk2NaXIZHBYxYLD4w8yju0ao65Vz1OIXoS9dLanwCe1PWrYuWMqf1if1z2k2yYfKJ741PDgno1ZQ8DRqvUny3mNoWTzGO6m1DkrJI8JiR5cSd+vZdGOO8nrMoc5+NDUFsMSXaZJeNlMmGLtJsovOsUp7I9S5VojKxF6bTVEelXqlfJobQr3LozSh2Jk7VcrVMfhXqszGWMzNqGhqZY0OadxkyyMssKugZR0KNFXBHlqwmJgTE/BNVMk6ItJXZMR0H47GpXv/DMOvNkmVuaV1PRfEdxuqc7Hcd+ZV/zTLaRxWk0nl9CdCeM6mn5rstHIBcpiuwmUZXeq81DacHI2rmrZ5SuE5mOZd6LQrZg9mx32TprA8BMo5jKN6yLTCi3WzQaZSuhzTtM1fUTGVpG8Tw+KXI0tjEpiWxtLYynOlktSbVlaI5kxP8TDH8kx50xoxi5KcA4pcja8KWLRlO/Ks6q06ergnvm1ca3Tq8Uw7LTUsmWyctXPWmpitl/uvGcWTGXGuAXDfhqazGmjkxcJW5hMMMMpYsXl2TZYtVOddG3XCarUt6Ptq9CZXSNzyuRzqRZOjsxdBbFVz6OA5HI43r1jityVlVpVkxmOsyaYWE1NTGq1sOVh36mHMcxtSvcy70edG0ZGR3I1Go1GRlV7mWWo1G0ZGRqlvH40l7o4m5xMWLLLYyNjnqc8556mdPqLJ31n/1nWOncxzG1tizrHs/Z+d2vP/B/l8wdJ6rHUn2nbbDq4p6htFtYzMMMTaZis1K5GKzGNmxhmUx2DDlZ/qNnIx41xnaMfCZWYaZWtNLTNW8ND4Fw1MyZOCdM428suKG1ehW8TesOydg7J+YYcD4cYR+8dFK6M4E3HM9ZfRNNL+Sn6rsl4DsrDl2HpPCnfxjGXtbZtYys1ttlyJ4T+BvexjGWRjMszK4Jpc77D3GyuVD7q0+G8m9G+2+rGm7cOR2y7FdtY2XUYx/oNlfRYxhMYyYZkyyg55enna9Kt/FFi6GMMwYwdwxWgxGMLKYmUyGExTKMZkMFhkymKuh0NOBNnBu+23LdwDoZYYzGGMxtORaTU1pjTGWTTGGtMrNWUsyyTTLLG1qy2ZjbK2DBllWqxMtBMaYZQmcE7zvvRcTkclUwdkxTaSdyySt/7fpL+T1v516Ji97fwr5JbLu305zMn5+GMTTZ9F+y7ExwmGVfG44yxn3dLv6l5i+Wth1jCrDq21nW9LqvvDzz3Vf3LLH/O/32TJ/erx3bXftO4eF+G956D952K/An4NfvOpjFjExjevP/UmE0fIoZXx6/w6lX/no3D0bLt+ixjieBM6ksRd0yB4Lt2SwYNE+gd1detlZWUnpiZfGfFaK+4PyCa/v18V8X75pe9fLXzp7l3VjF76vWZmHwGz1IZNWT7b8yddJ4q5kyrVdfru6atWc7bVYztL9Jf4GXvT+Y8m9/YsXP6H018a8D4XVOqvfzqeR+6yZOD8dPv0+U7/q5Pl+2dNb0MjzGVH5p6MNQ7cOWvw62U9aHE8DprDek+McLyvDz+te+
9Zhq5+YTruufMcWMabqysTmZVWjKPfnK0wyVcrsuhjZRdLkHNvD72b9abriOSGIxiLixMOoalNPXzy+wT/tf+U6HHONfsz+xe8ufHBdQWWGWLA9if0rsnmrxK5LvRZQeWsTCsrmOYy8VteVfuRfcVTtDLItLIsMYxZLdU/DbtSemxF6Z6Zo5WBXE4tFdCyVMMXMTEMZXVlS6Xec2T4e0tHsRcEuWshcJ2YsNF5rUx1E8ifCq6Z+ZP7qdCeu/aTwFd53l16/o0NOw6O3dLavP4Hbi4RdmuDk6DoYaninC0+o4uZjbJ7Rxeu0/FbuFg+q7DVS6fQe0rZ6NDGUNNU6DEqOaLTicKnYZMnBWruljQxoaS3dZhocDge0bSTyOvdAbG5hxe2xji7E/L55xX13wWNDi6HCekcFxfCPGxY0MXC+s7afWaMdDyjyr+o8Rudm/NabOZvdl274zH4f5XK9z6On1Pe/K5TdPAslg77BjuO6Y3eO7GqvOPG/stknp1leyvLL0Z7bl9I4noMvLkzytLhWYzrOZzLXCORe028rORzOg4N/L0HlMOQ3Pgmnbb6KczlabORpu980q37TBqRu0/p3PO6234Bl03Ynuz+9W7gnsEcmvYaYY3aMYY0wx3pYd+ujsXauWdaY5Xkbtl23fPzFHiDB/QMo0yFjBllYxTQYYyxkrwn7JufwJ/PfgJ+C83X69ni6zvXcnyXabv0ncbLwsceS+RNlyN2mnneJtX0ngYO0+e+0+UnA+Wch3ji8hj5an4h+i6XBySU4n+R0roVcbw5yvHrmr4Yw8Y7x6c+9POPYHI5HI5HI5HI5HGXGww4nE4nrVyOR8XeqPEO7PLOiukYa3Novk5hV4cdtYZLI93e+uxff2jRo0aNGjRo0aNG1bVtW1dy3m83m8+tQ5ZzHw3nObwOu8La9Rc1dtkdS8A3eTk823tnktXWlxN6Oixe06zrN70Isd9jiOgZFq9yfkPqP/SLhN2Myl8jDM43bl1nbcb4cO57jlh8Jow6pzXZdL4dyODTuuhu77FyO27DdwdRxmvO+O+3N2+BdqyTwLHVczDVY4UPE4O66/ZO2cx1LFzVdSXtF7G4HMbrauOHRw6c8FdZ5m9fHZHYZXfTlZquyynSyTTKke6vcffSD9pzPA/G7n7jxPmuhc1DHMynPMrGL6AdewYmwu5ko+UUyTwrMv27rPH1v1nGqd87+p6N6LU8k3NEng53xXyHS97+44OSg/sy/hn+Se6yfYNjW0/uTgP+PvWYzLMmjhcLB/gGpri6H83/84eUXWT6T9Hsv7785z/7z4icpW+zfXypuR7rx/gMdZb1/wC678pcs8/2a3mDitGHxl9mfPlll5MafWWqxk/eYuTDgcNMzDGWLWvsuglNxs53GtN6uWpktlW1tZZYcuinMMWmnNnJydze3b2Y1McBxrBkXw799izLMZZYyy0TkbsGM4p03S2uVu5s/XXUdSdec6smVxZYYGpVmT8A+8ajuEyV5FatkvVru2x6uxGXXbH4A+jvgP4GMYy3iPLXzq/6z65+E005ey+cwMZD3fZcqc6xpjTFjQ0P3U+e++cPYmTIwj0nrK5NPTfl3WvpfLtXDcb2HQMudYOxFXQBor4L4T6vrOauFctYXJQ++NUWmJe5bmx1jDiZS1dTqWxo4GR8jm3fttpmPHppk9PEyv4/y8/sO07XacOmcqc0x2Vi9BvNJvN5oW8x4mOsydpidRxMYJPx06m1bqPzq9KtK8sxXNXFodD/+MYYaJTLwOhc9brCsV18oOR1i4tXChyTkq4lf4y1Ke+9axjDHqs1mfBbMXuP4Hzi+X7t8vzv7bHerrUPgPCxhjre4fXdfLNtNM+Jd+Zdh8xd8wP87uNPoPgv4W7/5P2BuxfsMabNnMnza+54Pdi5U671GPZY8CehX8Voeoo7FHpkeEc6715FwHZrIrUrHaviPUbPZHND+IhczrP6FcYvhOZ0Di/ETt0OI+YwNWR9r7tpf6WDeZKZDB1+z2IthOl1mPyb5FluvEx9h9d0NnM0Y1XPFkWIsk1WotJ0PBMmkvjvQTd0e71tfeV+8r8lQ/tpzpsmxJ+InrI/dj2UajUajVTUajatRqNRtGo1Go1Go4wjeMpZFMVV9CHbofPraLsJ3JpWV2XOoanCuFky4y3PPNxucK2uKC1Lbdb1eo+m5XomN6HfeZsabHLHRX/K+offtNGGmHWctcVcG44MdSqsOLY9VzX+Zxfxn2HPdWTpzWvkrtJ8M5zorrKcquRytJ5N5DZmcaW02l76nWO+BqPXm1A2Ry/0q71dH/mqrqeFjkYxjEXtsX8qubTk67rGycyqsdm4tZx5D6D5hhi0waaWmiaMP81Yjii5qxPlPuU/GfTL1Y5E6Jyfiq63qTa39A4J0sOGDgO9WF9bOXl0XfPRbsY2bPNKPy1YrFYrFYmRhhlTIyMjJWJYZHXuCXI8OoXsvfljGLFicNifpp2XunoPiG1wtx3p1Tah+/DD66OnVtVXP9rKbVxOnL0tR/rHtqB5UDErUVcl11D4qqvjpOcxX7armUNJB3LpW6bxVvD08e8h3odKKvyCFZBdSh2FVcST9xV3n3T8t1j7Kr9qgrqXg+13Pt5U7JCvFXVIV1YG5lRhkVYZJYYDDD4KOIMoHCp26WS8GB7uBh2zIdgq/PKyInjV2STShuoapUdCpX1yTwqq/z1VvET7Kh5nVPkO8YyxjLt2MaaMmWTLQvx3qnzltnXW0p2jxgbEtSny/Osv8Y9pLMXYoHVPAhkVdWVeODhR6q9/Sxe2liwwZWMVvFXfRkeIDxAePUPIrdJ4ey6yquzH+PD/bUOWAu05qVHtFd8rrKHSoeNIOUqrYr3FXyToqfYJgwmJdKpXXOwYYegNNGMzfZPp/t3t/DVs4zjNTN61rRqaWaa4NYbRjTa0tWwy2Y2tGN8ZO8ofNKq4j9SL7I+cSm4/6ovLV5HNXLI0jJidwrtk6ynCaP6Z++GjRlWS3tLeW129Mi9evxU9mtz6s5J3Z7M2ngTgnKvmpomxpaLCzPfmx0JWE+m3NLDDGOX47RctdYYNK5jakdqLkRlI39n590T5zctGSwwZZDJj6kW8XSi6ot2MmWWJ0DUT3nuvebBudScjZ79g8cWJ8av0k+/bE5WKd5MdbFpbDVMxu1DVMmtNZGJvq1mtRbn6M+g/kP0FwDwr7quZs7xosNGpbscyxhhd9TyJyFwbLcxlTasg75vW7TsV5K7ji44XPMMrdoj+Y3rT0Hie62nlYV/pwczzOmdLqLhYkzGMzCZWGMQzGMSsZYY6Di1t4nlJ+Em63mJxrVLxPbYxNEdgc1dU2iOKyoYYWjNrEeHTYybVk0atSa7ehuwsWMWTqn1TrnS6hYsi71d1+s+k+ic70e20fzE/VaTdxT9ZtU4GIXdeNx3X77guYYfpHeTQjaMX6brOu4OY4K7Y2d9mbHarI5ox3p4GpJ2Vd/Tst60f7j999pppjR+Q/Qf8J/VaORs3cji7FfFuN61+ui9s8hix1OCh5KGVV23BPXvZfz3CLyHpi
x+exi8z/KnCnosY2eunor+cxyPO/xJ0vKey9OvE9VjqaYu0x3Z3jd6o2b1T12D+F8l232lwaaacD5LE8LBxu7WTlbWraWpew8Xexjel3E+wWD4APITdNqR8F3R3T0lunCQ4GaE9R37DxeCYfcHi4xci5ovKfxVs55y2hf+65E/Xdp6jR5nrebTmi5incpkyOjs50JvrZwstbbW6kfuuQw+2mykf/EXNFzxfKTrxew929TR6bWnGL//F3JFOFCQT3K4lQ"
kernels = Kernel(
bz2.decompress(base64.b64decode(quantization_code)),
[
"int4WeightCompression",
"int4WeightExtractionFloat",
"int4WeightExtractionHalf",
"int8WeightExtractionFloat",
"int8WeightExtractionHalf",
],
)
except Exception as exception:
kernels = None
logger.warning("Failed to load cpm_kernels:" + str(exception))
class W8A16Linear(torch.autograd.Function):
@staticmethod
def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
ctx.inp_shape = inp.size()
ctx.weight_bit_width = weight_bit_width
out_features = quant_w.size(0)
inp = inp.contiguous().view(-1, inp.size(-1))
weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
ctx.weight_shape = weight.size()
output = inp.mm(weight.t())
ctx.save_for_backward(inp, quant_w, scale_w)
return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
@staticmethod
def backward(ctx, grad_output: torch.Tensor):
inp, quant_w, scale_w = ctx.saved_tensors
weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
grad_output = grad_output.contiguous().view(-1, weight.size(0))
grad_input = grad_output.mm(weight)
grad_weight = grad_output.t().mm(inp)
return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None, None
def compress_int4_weight(weight: torch.Tensor): # (n, m)
with torch.cuda.device(weight.device):
n, m = weight.size(0), weight.size(1)
assert m % 2 == 0
m = m // 2
out = torch.empty(n, m, dtype=torch.int8, device="cuda")
stream = torch.cuda.current_stream()
gridDim = (n, 1, 1)
blockDim = (min(round_up(m, 32), 1024), 1, 1)
kernels.int4WeightCompression(
gridDim,
blockDim,
0,
stream,
[ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)],
)
return out
def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
assert scale_list.dtype in [torch.half, torch.bfloat16]
assert weight.dtype in [torch.int8]
if source_bit_width == 8:
return weight.to(scale_list.dtype) * scale_list[:, None]
elif source_bit_width == 4:
func = (
kernels.int4WeightExtractionHalf if scale_list.dtype == torch.half else kernels.int4WeightExtractionBFloat16
)
else:
assert False, "Unsupported bit-width"
with torch.cuda.device(weight.device):
n, m = weight.size(0), weight.size(1)
out = torch.empty(n, m * (8 // source_bit_width), dtype=scale_list.dtype, device="cuda")
stream = torch.cuda.current_stream()
gridDim = (n, 1, 1)
blockDim = (min(round_up(m, 32), 1024), 1, 1)
func(
gridDim,
blockDim,
0,
stream,
[
ctypes.c_void_p(weight.data_ptr()),
ctypes.c_void_p(scale_list.data_ptr()),
ctypes.c_void_p(out.data_ptr()),
ctypes.c_int32(n),
ctypes.c_int32(m),
],
)
return out
class QuantizedLinear(torch.nn.Module):
def __init__(self, weight_bit_width: int, weight, bias=None, device="cpu", dtype=None, empty_init=False, *args,
**kwargs):
super().__init__()
self.weight_bit_width = weight_bit_width
shape = weight.shape
if weight is None or empty_init:
self.weight = torch.empty(shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=device)
self.weight_scale = torch.empty(shape[0], dtype=dtype, device=device)
else:
self.weight_scale = weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)
self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8)
if weight_bit_width == 4:
self.weight = compress_int4_weight(self.weight)
self.weight = Parameter(self.weight.to(device), requires_grad=False)
self.weight_scale = Parameter(self.weight_scale.to(device), requires_grad=False)
self.bias = Parameter(bias.to(device), requires_grad=False) if bias is not None else None
def forward(self, input):
output = W8A16Linear.apply(input, self.weight, self.weight_scale, self.weight_bit_width)
if self.bias is not None:
output = output + self.bias
return output
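# Illustrative example (added): for 8-bit quantization the per-row scale is max(|w|) / 127, so a row
# whose largest magnitude is 0.254 gets scale ~0.002 and each weight is stored as round(w / scale)
# in int8; W8A16Linear dequantizes the rows back to the scale dtype on the fly before the matmul.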
def quantize(model, weight_bit_width, empty_init=False, device=None):
"""Replace fp16 linear with quantized linear"""
for layer in model.layers:
layer.self_attention.query_key_value = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.self_attention.query_key_value.weight.to(torch.cuda.current_device()),
bias=layer.self_attention.query_key_value.bias,
dtype=layer.self_attention.query_key_value.weight.dtype,
device=layer.self_attention.query_key_value.weight.device if device is None else device,
empty_init=empty_init
)
layer.self_attention.dense = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.self_attention.dense.weight.to(torch.cuda.current_device()),
bias=layer.self_attention.dense.bias,
dtype=layer.self_attention.dense.weight.dtype,
device=layer.self_attention.dense.weight.device if device is None else device,
empty_init=empty_init
)
layer.mlp.dense_h_to_4h = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()),
bias=layer.mlp.dense_h_to_4h.bias,
dtype=layer.mlp.dense_h_to_4h.weight.dtype,
device=layer.mlp.dense_h_to_4h.weight.device if device is None else device,
empty_init=empty_init
)
layer.mlp.dense_4h_to_h = QuantizedLinear(
weight_bit_width=weight_bit_width,
weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()),
bias=layer.mlp.dense_4h_to_h.bias,
dtype=layer.mlp.dense_4h_to_h.weight.dtype,
device=layer.mlp.dense_4h_to_h.weight.device if device is None else device,
empty_init=empty_init
)
return model
import json
import os
import re
from typing import List, Optional, Union, Dict
from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.utils import logging, PaddingStrategy
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
class SPTokenizer:
def __init__(self, model_path: str):
# reload tokenizer
assert os.path.isfile(model_path), model_path
self.sp_model = SentencePieceProcessor(model_file=model_path)
# BOS / EOS token IDs
self.n_words: int = self.sp_model.vocab_size()
self.bos_id: int = self.sp_model.bos_id()
self.eos_id: int = self.sp_model.eos_id()
self.pad_id: int = self.sp_model.unk_id()
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
self.special_tokens = {}
self.index_special_tokens = {}
for token in special_tokens:
self.special_tokens[token] = self.n_words
self.index_special_tokens[self.n_words] = token
self.n_words += 1
self.role_special_token_expression = "|".join([re.escape(token) for token in role_special_tokens])
def tokenize(self, s: str, encode_special_tokens=False):
if encode_special_tokens:
last_index = 0
t = []
for match in re.finditer(self.role_special_token_expression, s):
if last_index < match.start():
t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
t.append(s[match.start():match.end()])
last_index = match.end()
if last_index < len(s):
t.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
return t
else:
return self.sp_model.EncodeAsPieces(s)
def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
assert type(s) is str
t = self.sp_model.encode(s)
if bos:
t = [self.bos_id] + t
if eos:
t = t + [self.eos_id]
return t
def decode(self, t: List[int]) -> str:
text, buffer = "", []
for token in t:
if token in self.index_special_tokens:
if buffer:
text += self.sp_model.decode(buffer)
buffer = []
text += self.index_special_tokens[token]
else:
buffer.append(token)
if buffer:
text += self.sp_model.decode(buffer)
return text
def decode_tokens(self, tokens: List[str]) -> str:
text = self.sp_model.DecodePieces(tokens)
return text
def convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
if token in self.special_tokens:
return self.special_tokens[token]
return self.sp_model.PieceToId(token)
def convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.index_special_tokens:
return self.index_special_tokens[index]
if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
return ""
return self.sp_model.IdToPiece(index)
class ChatGLMTokenizer(PreTrainedTokenizer):
vocab_files_names = {"vocab_file": "tokenizer.model"}
model_input_names = ["input_ids", "attention_mask", "position_ids"]
def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, encode_special_tokens=False,
**kwargs):
self.name = "GLMTokenizer"
self.vocab_file = vocab_file
self.tokenizer = SPTokenizer(vocab_file)
self.special_tokens = {
"<bos>": self.tokenizer.bos_id,
"<eos>": self.tokenizer.eos_id,
"<pad>": self.tokenizer.pad_id
}
self.encode_special_tokens = encode_special_tokens
super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
encode_special_tokens=encode_special_tokens,
**kwargs)
def get_command(self, token):
if token in self.special_tokens:
return self.special_tokens[token]
assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
return self.tokenizer.special_tokens[token]
@property
def unk_token(self) -> str:
return "<unk>"
@property
def pad_token(self) -> str:
return "<unk>"
@property
def pad_token_id(self):
return self.get_command("<pad>")
@property
def eos_token(self) -> str:
return "</s>"
@property
def eos_token_id(self):
return self.get_command("<eos>")
@property
def vocab_size(self):
return self.tokenizer.n_words
def get_vocab(self):
""" Returns vocab as a dict """
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def _tokenize(self, text, **kwargs):
return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.tokenizer.convert_token_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.tokenizer.convert_id_to_token(index)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return self.tokenizer.decode_tokens(tokens)
def save_vocabulary(self, save_directory, filename_prefix=None):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
filename_prefix (`str`, *optional*):
An optional prefix to add to the name of the saved files.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if os.path.isdir(save_directory):
vocab_file = os.path.join(
save_directory, self.vocab_files_names["vocab_file"]
)
else:
vocab_file = save_directory
with open(self.vocab_file, 'rb') as fin:
proto_str = fin.read()
with open(vocab_file, "wb") as writer:
writer.write(proto_str)
return (vocab_file,)
def get_prefix_tokens(self):
prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
return prefix_tokens
def build_single_message(self, role, metadata, message):
assert role in ["system", "user", "assistant", "observation"], role
role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
message_tokens = self.tokenizer.encode(message)
tokens = role_tokens + message_tokens
return tokens
def build_chat_input(self, query, history=None, role="user"):
if history is None:
history = []
input_ids = []
for item in history:
content = item["content"]
if item["role"] == "system" and "tools" in item:
content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
input_ids.extend(self.build_single_message(role, "", query))
input_ids.extend([self.get_command("<|assistant|>")])
return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a single sequence or a pair of sequences by prepending the ChatGLM prefix
tokens returned by `get_prefix_tokens`. The resulting format is:
- single sequence: `[gMASK] sop X`
- pair of sequences: `[gMASK] sop A B <eos>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
prefix_tokens = self.get_prefix_tokens()
token_ids_0 = prefix_tokens + token_ids_0
if token_ids_1 is not None:
token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
return token_ids_0
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args:
encoded_inputs:
Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
# Load from model defaults
assert self.padding_side == "left"
required_input = encoded_inputs[self.model_input_names[0]]
seq_length = len(required_input)
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
# Initialize attention mask if not present.
if "attention_mask" not in encoded_inputs:
encoded_inputs["attention_mask"] = [1] * seq_length
if "position_ids" not in encoded_inputs:
encoded_inputs["position_ids"] = list(range(seq_length))
if needs_to_be_padded:
difference = max_length - len(required_input)
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
if "position_ids" in encoded_inputs:
encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
return encoded_inputs
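# Illustrative sketch (not part of the original file): mirrors the left-padding convention
# implemented in `_pad` above, using plain Python lists instead of a tokenizer instance.
# The token ids and pad_token_id below are made-up placeholders; the real values come from
# the loaded SentencePiece vocabulary.
if __name__ == "__main__":
    encoded = {"input_ids": [101, 102, 103], "attention_mask": [1, 1, 1], "position_ids": [0, 1, 2]}
    max_length = 6
    pad_token_id = 0  # assumed placeholder
    difference = max_length - len(encoded["input_ids"])
    # Left padding: zeros are prepended to the attention mask and position ids,
    # and pad tokens are prepended to the input ids.
    encoded["attention_mask"] = [0] * difference + encoded["attention_mask"]
    encoded["position_ids"] = [0] * difference + encoded["position_ids"]
    encoded["input_ids"] = [pad_token_id] * difference + encoded["input_ids"]
    print(encoded)
    # {'input_ids': [0, 0, 0, 101, 102, 103], 'attention_mask': [0, 0, 0, 1, 1, 1], 'position_ids': [0, 0, 0, 0, 1, 2]}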
import torch
import torch.nn as nn
from vtimellm.constants import IMAGE_TOKEN_INDEX, IGNORE_INDEX
from abc import ABC, abstractmethod
class VTimeLLMMetaModel:
    def initialize_vision_modules(self, model_args):
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
        # A single linear layer projects the 768-dimensional frame features into the LM hidden space.
        if not hasattr(self, 'mm_projector'):
            self.mm_projector = nn.Linear(768, self.config.hidden_size)
        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')

            def get_w(weights, keyword):
                # Keep only the projector weights and strip the "mm_projector." key prefix,
                # e.g. "mm_projector.weight" -> "weight".
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}

            self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
            print("load mlp:", pretrain_mm_mlp_adapter)
class VTimeLLMMetaForCausalLM(ABC):
@abstractmethod
def get_model(self):
pass
def prepare_inputs_labels_for_multimodal(
self, input_ids, position_ids, attention_mask, past_key_values, labels, images
):
# print(position_ids, attention_mask)
# if past_key_values:
# print(past_key_values[-1][-1].shape)
# print(input_ids.shape, position_ids.shape, attention_mask.shape, past_key_values.shape, images)
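        # Incremental decoding shortcut: with a KV cache only the newest token is passed in, so the
        # image features are already part of past_key_values; just grow the attention mask to cover
        # the cached multimodal positions and recompute position_ids from it.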
if images is None or input_ids.shape[1] == 1:
if past_key_values is not None and images is not None and input_ids.shape[1] == 1:
if self.get_model().config.model_type == 'chatglm':
target_shape = past_key_values[-1][-1].shape[0] + 1
else:
target_shape = past_key_values[-1][-1].shape[-2] + 1
attention_mask = torch.cat((attention_mask, torch.ones(
(attention_mask.shape[0], target_shape - attention_mask.shape[1]),
dtype=attention_mask.dtype,
device=attention_mask.device
)), dim=1)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
return input_ids, position_ids, attention_mask, past_key_values, None, labels
if type(images) is list:
concat_images = torch.cat([image for image in images], dim=0)
image_features = self.get_model().mm_projector(concat_images)
split_sizes = [image.shape[0] for image in images]
image_features = torch.split(image_features, split_sizes, dim=0)
# image_features = [x.flatten(0, 1) for x in image_features]
else:
image_features = self.get_model().mm_projector(images)
# print([image.shape for image in image_features])
_labels = labels
_position_ids = position_ids
_attention_mask = attention_mask
if attention_mask is None:
attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
else:
attention_mask = attention_mask.bool()
if position_ids is None:
position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
if labels is None:
labels = torch.full_like(input_ids, IGNORE_INDEX)
# remove the padding using attention_mask -- TODO: double check
input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
new_input_embeds = []
new_labels = []
cur_image_idx = 0
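        # For every sample, split the ids around each IMAGE_TOKEN_INDEX placeholder, embed the text
        # pieces, and interleave the projected frame features; label positions that correspond to
        # image features are set to IGNORE_INDEX so they do not contribute to the loss.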
for batch_idx, cur_input_ids in enumerate(input_ids):
num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
if num_images == 0:
cur_image_features = image_features[cur_image_idx]
cur_input_embeds_1 = self.get_model().get_input_embeddings()(cur_input_ids)
cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
new_input_embeds.append(cur_input_embeds)
new_labels.append(labels[batch_idx])
cur_image_idx += 1
continue
image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
cur_input_ids_noim = []
cur_labels = labels[batch_idx]
cur_labels_noim = []
for i in range(len(image_token_indices) - 1):
cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
split_sizes = [x.shape[0] for x in cur_labels_noim]
cur_input_embeds = self.get_model().get_input_embeddings()(torch.cat(cur_input_ids_noim))
cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
cur_new_input_embeds = []
cur_new_labels = []
for i in range(num_images + 1):
cur_new_input_embeds.append(cur_input_embeds_no_im[i])
cur_new_labels.append(cur_labels_noim[i])
if i < num_images:
cur_image_features = image_features[cur_image_idx]
cur_image_idx += 1
cur_new_input_embeds.append(cur_image_features)
cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
cur_new_input_embeds = torch.cat(cur_new_input_embeds)
cur_new_labels = torch.cat(cur_new_labels)
new_input_embeds.append(cur_new_input_embeds)
new_labels.append(cur_new_labels)
# Truncate sequences to max length as image embeddings can make the sequence longer
tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
if tokenizer_model_max_length is not None:
new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
# Combine them
max_len = max(x.shape[0] for x in new_input_embeds)
batch_size = len(new_input_embeds)
new_input_embeds_padded = []
new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
cur_len = cur_new_embed.shape[0]
if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
new_input_embeds_padded.append(torch.cat((
torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
cur_new_embed
), dim=0))
if cur_len > 0:
new_labels_padded[i, -cur_len:] = cur_new_labels
attention_mask[i, -cur_len:] = True
position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
else:
new_input_embeds_padded.append(torch.cat((
cur_new_embed,
torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
), dim=0))
if cur_len > 0:
new_labels_padded[i, :cur_len] = cur_new_labels
attention_mask[i, :cur_len] = True
position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
if _labels is None:
new_labels = None
else:
new_labels = new_labels_padded
if _attention_mask is None:
attention_mask = None
else:
attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
if _position_ids is None:
position_ids = None
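        # The ChatGLM path packs the batch differently: a placeholder input_ids tensor is created
        # (the real content lives in new_input_embeds), the attention mask is cast to int8, and the
        # embeddings are transposed to the sequence-first layout the ChatGLM backbone expects.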
if self.get_model().config.model_type == 'chatglm':
fake_input_ids = torch.full((new_input_embeds.shape[0], new_input_embeds.shape[1]), -10000,
dtype=new_input_embeds.dtype, device=new_input_embeds.device)
attention_mask = attention_mask.to(torch.int8)
new_input_embeds = new_input_embeds.transpose(0, 1).contiguous()
else:
fake_input_ids = None
# print(position_ids, attention_mask)
return fake_input_ids, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
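# Illustrative sketch (not part of the original file): shows, on toy tensors, how
# `prepare_inputs_labels_for_multimodal` splices projected image features into the text
# embedding sequence at each IMAGE_TOKEN_INDEX placeholder. All sizes are made up.
if __name__ == "__main__":
    hidden_size, vocab_size, num_frames = 8, 32, 4
    embed = nn.Embedding(vocab_size, hidden_size)          # stand-in for the LM's input embeddings
    image_features = torch.randn(num_frames, hidden_size)  # stand-in for the mm_projector output

    cur_input_ids = torch.tensor([3, 5, IMAGE_TOKEN_INDEX, 7, 9])
    cur_labels = torch.tensor([3, 5, IMAGE_TOKEN_INDEX, 7, 9])

    # Split the sequence around the image placeholder, embed the text pieces,
    # and interleave the image features between them (same logic as the method above).
    idx = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
    pieces_ids = [cur_input_ids[idx[i] + 1:idx[i + 1]] for i in range(len(idx) - 1)]
    pieces_labels = [cur_labels[idx[i] + 1:idx[i + 1]] for i in range(len(idx) - 1)]

    new_embeds, new_labels = [], []
    for i, (ids, labs) in enumerate(zip(pieces_ids, pieces_labels)):
        new_embeds.append(embed(ids))
        new_labels.append(labs)
        if i < len(pieces_ids) - 1:
            new_embeds.append(image_features)
            new_labels.append(torch.full((num_frames,), IGNORE_INDEX, dtype=labs.dtype))
    new_embeds = torch.cat(new_embeds)  # shape: (2 + num_frames + 2, hidden_size)
    new_labels = torch.cat(new_labels)  # image positions are masked with IGNORE_INDEX
    print(new_embeds.shape, new_labels)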
import torch
import torch.nn as nn
from typing import List, Optional, Tuple, Union
from transformers import AutoConfig, AutoModelForCausalLM
from .chatglm import ChatGLMConfig, ChatGLMModel, ChatGLMForConditionalGeneration
from .vtimellm_arch import VTimeLLMMetaModel, VTimeLLMMetaForCausalLM
class VTimeLLMChatGLMConfig(ChatGLMConfig):
model_type = "VTimeLLM_ChatGLM"
class VTimeLLMChatGLMModel(ChatGLMModel, VTimeLLMMetaModel):
config_class = VTimeLLMChatGLMConfig
def __init__(self, config, empty_init=True, device=None):
super(VTimeLLMChatGLMModel, self).__init__(config, empty_init=empty_init, device=device)
class VTimeLLMChatGLMForCausalLM(ChatGLMForConditionalGeneration, VTimeLLMMetaForCausalLM):
config_class = VTimeLLMChatGLMConfig
def __init__(self, config, empty_init=True, device=None):
super(ChatGLMForConditionalGeneration, self).__init__(config)
self.transformer = VTimeLLMChatGLMModel(config, empty_init=empty_init, device=device)
self.max_sequence_length = config.max_length
self.config = config
self.quantized = False
# Initialize weights and apply final processing
self.post_init()
def get_model(self):
return self.transformer
def forward(
self,
input_ids: torch.LongTensor = None,
position_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
return_last_logit: Optional[bool] = False,
images: Optional[torch.FloatTensor] = None,
):
if inputs_embeds is None:
(
input_ids,
position_ids,
attention_mask,
past_key_values,
inputs_embeds,
labels
) = self.prepare_inputs_labels_for_multimodal(
input_ids,
position_ids,
attention_mask,
past_key_values,
labels,
images
)
return super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
labels=labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict
)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
images = kwargs.pop("images", None)
_inputs = super().prepare_inputs_for_generation(
input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
)
if images is not None:
_inputs['images'] = images
return _inputs
AutoConfig.register("VTimeLLM_ChatGLM", VTimeLLMChatGLMConfig)
AutoModelForCausalLM.register(VTimeLLMChatGLMConfig, VTimeLLMChatGLMForCausalLM)
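# Illustrative sketch (not part of the original file): with the registrations above, a checkpoint
# whose config.json declares `"model_type": "VTimeLLM_ChatGLM"` resolves through the Auto classes.
# The checkpoint path below is hypothetical; substitute a directory containing such a config.
if __name__ == "__main__":
    config = AutoConfig.from_pretrained("/path/to/vtimellm_chatglm_checkpoint")  # hypothetical path
    model = AutoModelForCausalLM.from_config(config)  # resolves to VTimeLLMChatGLMForCausalLM
    print(type(model).__name__)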
import torch
import torch.nn as nn
from typing import List, Optional, Tuple, Union
from transformers import AutoConfig, AutoModelForCausalLM, LlamaConfig, LlamaModel, LlamaForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast
from .vtimellm_arch import VTimeLLMMetaModel, VTimeLLMMetaForCausalLM
class VTimeLLMConfig(LlamaConfig):
model_type = "VTimeLLM"
class VTimeLLMLlamaModel(LlamaModel, VTimeLLMMetaModel):
config_class = VTimeLLMConfig
def __init__(self, config: LlamaConfig):
super(VTimeLLMLlamaModel, self).__init__(config)
class VTimeLLMLlamaForCausalLM(LlamaForCausalLM, VTimeLLMMetaForCausalLM):
config_class = VTimeLLMConfig
def __init__(self, config):
super(LlamaForCausalLM, self).__init__(config)
self.model = VTimeLLMLlamaModel(config)
self.pretraining_tp = config.pretraining_tp
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_model(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
images: Optional[torch.FloatTensor] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
if inputs_embeds is None:
(
input_ids,
position_ids,
attention_mask,
past_key_values,
inputs_embeds,
labels
) = self.prepare_inputs_labels_for_multimodal(
input_ids,
position_ids,
attention_mask,
past_key_values,
labels,
images
)
return super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
labels=labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict
)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
images = kwargs.pop("images", None)
_inputs = super().prepare_inputs_for_generation(
input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
)
if images is not None:
_inputs['images'] = images
return _inputs
AutoConfig.register("VTimeLLM", VTimeLLMConfig)
AutoModelForCausalLM.register(VTimeLLMConfig, VTimeLLMLlamaForCausalLM)
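# Illustrative sketch (not part of the original file): builds a tiny, randomly initialised
# VTimeLLM-LLaMA model, attaches the 768 -> hidden_size projector, and runs one multimodal
# forward pass. The layer sizes are made up; IMAGE_TOKEN_INDEX comes from vtimellm.constants.
if __name__ == "__main__":
    import types
    from vtimellm.constants import IMAGE_TOKEN_INDEX

    config = VTimeLLMConfig(vocab_size=128, hidden_size=64, intermediate_size=128,
                            num_hidden_layers=2, num_attention_heads=4)
    model = VTimeLLMLlamaForCausalLM(config)
    # No pretrained adapter: initialize_vision_modules just creates the Linear(768, hidden_size) projector.
    model.get_model().initialize_vision_modules(types.SimpleNamespace(pretrain_mm_mlp_adapter=None))

    input_ids = torch.tensor([[1, 5, IMAGE_TOKEN_INDEX, 7, 9]])  # one image placeholder in the prompt
    images = torch.randn(1, 16, 768)                             # 16 frame features per video
    out = model(input_ids=input_ids, images=images, use_cache=False)
    print(out.logits.shape)  # (1, 4 + 16, vocab_size): text tokens plus spliced frame features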