"vscode:/vscode.git/clone" did not exist on "802ef8b7837d06ef0a4a69d09a49d8ddae263f13"
Commit 3a0371d5 authored by suily's avatar suily
Browse files

Initial commit

parents
Pipeline #2002 failed with stages
in 0 seconds
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"steps_per_print": 1e5,
"wall_clock_breakdown": false
}
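Both ZeRO-3 configs above leave most sizes and rates set to "auto"; when such a file is handed to the HuggingFace Trainer, the DeepSpeed integration fills those entries in from the training arguments. A minimal sketch of that wiring, assuming the second config is saved as `zero3_offload.json` (a placeholder name, not one fixed by this repository):

```python
# Minimal sketch: hand a ZeRO-3 JSON config like the ones above to the
# HuggingFace Trainer. The "auto" entries (batch sizes, lr, bucket sizes,
# fp16/bf16) are resolved from TrainingArguments by the DeepSpeed integration.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints/stage1",   # placeholder output path
    per_device_train_batch_size=8,       # fills train_micro_batch_size_per_gpu
    gradient_accumulation_steps=4,       # fills gradient_accumulation_steps
    learning_rate=1e-3,                  # fills the optimizer/scheduler "auto" lrs
    deepspeed="zero3_offload.json",      # placeholder name for the config above
)
# trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
# trainer.train()
```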
from .model import VTimeLLMLlamaForCausalLM
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15
LOGDIR = "."
# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<video>"
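`IGNORE_INDEX` marks label positions excluded from the loss, while `IMAGE_TOKEN_INDEX` is the sentinel id spliced into the token sequence wherever `DEFAULT_IMAGE_TOKEN` ("<video>") appears in a prompt; the model later swaps that sentinel for the projected video features. A simplified sketch of the splicing idea (the real helper is `tokenizer_image_token` in `vtimellm.mm_utils`; this stand-in only illustrates the mechanism):

```python
# Illustrative sketch only: split the prompt on the <video> placeholder and
# insert IMAGE_TOKEN_INDEX between the tokenized chunks.
def splice_video_token(prompt, tokenizer,
                       image_token="<video>", image_token_index=-200):
    chunks = [tokenizer(chunk, add_special_tokens=False).input_ids
              for chunk in prompt.split(image_token)]
    input_ids = []
    for i, chunk in enumerate(chunks):
        if i > 0:
            input_ids.append(image_token_index)  # sentinel for the video features
        input_ids.extend(chunk)
    return input_ids
```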
import dataclasses
from enum import auto, Enum
from typing import List, Tuple
class SeparatorStyle(Enum):
"""Different separator style."""
SINGLE = auto()
TWO = auto()
MPT = auto()
PLAIN = auto()
LLAMA_2 = auto()
@dataclasses.dataclass
class Conversation:
"""A class that keeps all conversation history."""
system: str
roles: List[str]
messages: List[List[str]]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
sep: str = "###"
sep2: str = None
version: str = "Unknown"
skip_next: bool = False
def get_prompt(self):
messages = self.messages
if len(messages) > 0 and type(messages[0][1]) is tuple:
messages = self.messages.copy()
init_role, init_msg = messages[0].copy()
init_msg = init_msg[0].replace("<image>", "").strip()
if 'mmtag' in self.version:
messages[0] = (init_role, init_msg)
messages.insert(0, (self.roles[0], "<Image><image></Image>"))
messages.insert(1, (self.roles[1], "Received."))
else:
messages[0] = (init_role, "<image>\n" + init_msg)
if self.sep_style == SeparatorStyle.SINGLE:
ret = self.system + self.sep
for role, message in messages:
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + ": " + message + self.sep
else:
ret += role + ":"
elif self.sep_style == SeparatorStyle.TWO:
seps = [self.sep, self.sep2]
ret = self.system + seps[0]
for i, (role, message) in enumerate(messages):
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + ": " + message + seps[i % 2]
else:
ret += role + ":"
elif self.sep_style == SeparatorStyle.MPT:
ret = self.system + self.sep
for role, message in messages:
if message:
if type(message) is tuple:
message, _, _ = message
ret += role + message + self.sep
else:
ret += role
elif self.sep_style == SeparatorStyle.LLAMA_2:
wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
ret = ""
for i, (role, message) in enumerate(messages):
if i == 0:
assert message, "first message should not be none"
assert role == self.roles[0], "first message should come from user"
if message:
if type(message) is tuple:
message, _, _ = message
if i == 0: message = wrap_sys(self.system) + message
if i % 2 == 0:
message = wrap_inst(message)
ret += self.sep + message
else:
ret += " " + message + " " + self.sep2
else:
ret += ""
ret = ret.lstrip(self.sep)
elif self.sep_style == SeparatorStyle.PLAIN:
seps = [self.sep, self.sep2]
ret = self.system
for i, (role, message) in enumerate(messages):
if message:
if type(message) is tuple:
message, _, _ = message
ret += message + seps[i % 2]
else:
ret += ""
else:
raise ValueError(f"Invalid style: {self.sep_style}")
return ret
def append_message(self, role, message):
self.messages.append([role, message])
def get_images(self, return_pil=False):
images = []
for i, (role, msg) in enumerate(self.messages[self.offset:]):
if i % 2 == 0:
if type(msg) is tuple:
import base64
from io import BytesIO
from PIL import Image
msg, image, image_process_mode = msg
if image_process_mode == "Pad":
def expand2square(pil_img, background_color=(122, 116, 104)):
width, height = pil_img.size
if width == height:
return pil_img
elif width > height:
result = Image.new(pil_img.mode, (width, width), background_color)
result.paste(pil_img, (0, (width - height) // 2))
return result
else:
result = Image.new(pil_img.mode, (height, height), background_color)
result.paste(pil_img, ((height - width) // 2, 0))
return result
image = expand2square(image)
elif image_process_mode == "Crop":
pass
elif image_process_mode == "Resize":
image = image.resize((336, 336))
else:
raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 800, 400
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
longest_edge = int(shortest_edge * aspect_ratio)
W, H = image.size
if H > W:
H, W = longest_edge, shortest_edge
else:
H, W = shortest_edge, longest_edge
image = image.resize((W, H))
if return_pil:
images.append(image)
else:
buffered = BytesIO()
image.save(buffered, format="PNG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
images.append(img_b64_str)
return images
def to_gradio_chatbot(self):
ret = []
for i, (role, msg) in enumerate(self.messages[self.offset:]):
if i % 2 == 0:
if type(msg) is tuple:
import base64
from io import BytesIO
msg, image, image_process_mode = msg
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 800, 400
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
longest_edge = int(shortest_edge * aspect_ratio)
W, H = image.size
if H > W:
H, W = longest_edge, shortest_edge
else:
H, W = shortest_edge, longest_edge
image = image.resize((W, H))
buffered = BytesIO()
image.save(buffered, format="JPEG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
ret.append([img_str, None])
msg = msg.replace('<image>', '').strip()
if len(msg) > 0:
ret.append([msg, None])
else:
ret.append([msg, None])
else:
ret[-1][-1] = msg
return ret
def copy(self):
return Conversation(
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
sep2=self.sep2,
version=self.version)
def dict(self):
if len(self.get_images()) > 0:
return {
"system": self.system,
"roles": self.roles,
"messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
"offset": self.offset,
"sep": self.sep,
"sep2": self.sep2,
}
return {
"system": self.system,
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
"sep": self.sep,
"sep2": self.sep2,
}
conv_vicuna_v0 = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("Human", "Assistant"),
messages=(
("Human", "What are the key differences between renewable and non-renewable energy sources?"),
("Assistant",
"Renewable energy sources are those that can be replenished naturally in a relatively "
"short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
"Non-renewable energy sources, on the other hand, are finite and will eventually be "
"depleted, such as coal, oil, and natural gas. Here are some key differences between "
"renewable and non-renewable energy sources:\n"
"1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
"energy sources are finite and will eventually run out.\n"
"2. Environmental impact: Renewable energy sources have a much lower environmental impact "
"than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
"and other negative effects.\n"
"3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
"have lower operational costs than non-renewable sources.\n"
"4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
"locations than non-renewable sources.\n"
"5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
"situations and needs, while non-renewable sources are more rigid and inflexible.\n"
"6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
"non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
),
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
conv_vicuna_v1 = Conversation(
system="A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
roles=("USER", "ASSISTANT"),
version="v1",
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
)
conv_llama_2 = Conversation(
system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
roles=("USER", "ASSISTANT"),
version="llama_v2",
messages=(),
offset=0,
sep_style=SeparatorStyle.LLAMA_2,
sep="<s>",
sep2="</s>",
)
conv_llava_llama_2 = Conversation(
system="You are a helpful language and vision assistant. "
"You are able to understand the visual content that the user provides, "
"and assist the user with a variety of tasks using natural language.",
roles=("USER", "ASSISTANT"),
version="llama_v2",
messages=(),
offset=0,
sep_style=SeparatorStyle.LLAMA_2,
sep="<s>",
sep2="</s>",
)
conv_mpt = Conversation(
system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
version="mpt",
messages=(),
offset=0,
sep_style=SeparatorStyle.MPT,
sep="<|im_end|>",
)
conv_llava_plain = Conversation(
system="",
roles=("", ""),
messages=(
),
offset=0,
sep_style=SeparatorStyle.PLAIN,
sep="\n",
)
conv_llava_v0 = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("Human", "Assistant"),
messages=(
("Human", "Hi!"),
("Assistant", "Hi there! How can I help you today?")
),
offset=2,
sep_style=SeparatorStyle.SINGLE,
sep="###",
)
conv_llava_v0_mmtag = Conversation(
system="A chat between a curious user and an artificial intelligence assistant. "
"The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
"The visual content will be provided with the following format: <Image>visual content</Image>.",
roles=("Human", "Assistant"),
messages=(
),
offset=0,
sep_style=SeparatorStyle.SINGLE,
sep="###",
version="v0_mmtag",
)
conv_llava_v1 = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("USER", "ASSISTANT"),
version="v1",
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
)
conv_llava_v1_mmtag = Conversation(
system="A chat between a curious user and an artificial intelligence assistant. "
"The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
"The visual content will be provided with the following format: <Image>visual content</Image>.",
roles=("USER", "ASSISTANT"),
messages=(),
offset=0,
sep_style=SeparatorStyle.TWO,
sep=" ",
sep2="</s>",
version="v1_mmtag",
)
default_conversation = conv_vicuna_v1
conv_templates = {
"default": conv_vicuna_v0,
"v0": conv_vicuna_v0,
"v1": conv_vicuna_v1,
"vicuna_v1": conv_vicuna_v1,
"llama_2": conv_llama_2,
"plain": conv_llava_plain,
"v0_plain": conv_llava_plain,
"llava_v0": conv_llava_v0,
"v0_mmtag": conv_llava_v0_mmtag,
"llava_v1": conv_llava_v1,
"v1_mmtag": conv_llava_v1_mmtag,
"llava_llama_2": conv_llava_llama_2,
"mpt": conv_mpt,
}
if __name__ == "__main__":
print(default_conversation.get_prompt())
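A quick usage sketch for the templates above, mirroring how the demo and evaluation code build prompts (the question text is just an example):

```python
# Usage sketch: copy a template, add one user turn plus an empty assistant
# slot, and render the prompt string.
conv = conv_templates["v1"].copy()
conv.append_message(conv.roles[0], "<video>\nDuring which frames can we see the dog?")
conv.append_message(conv.roles[1], None)   # leave the assistant reply open
prompt = conv.get_prompt()                 # "... USER: <video>\n... ASSISTANT:"
```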
"""
Adapted from: https://github.com/Vision-CAIR/MiniGPT-4/blob/main/demo.py
"""
import argparse
import os
root_dir = os.path.join(os.getcwd(), "..")
import sys
sys.path.append(root_dir)
import torch
import gradio as gr
import decord
decord.bridge.set_bridge('torch')
from vtimellm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from vtimellm.conversation import conv_templates, SeparatorStyle
from vtimellm.model.builder import load_pretrained_model
from vtimellm.utils import disable_torch_init
from vtimellm.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria, VideoExtractor
from PIL import Image
from transformers import TextStreamer
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
from PIL import Image
BICUBIC = Image.BICUBIC
from torchvision.transforms import Compose, Resize, CenterCrop, Normalize
import clip
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--gpu_id", type=int, default=0, help="specify the gpu to load the model.")
parser.add_argument("--model_base", type=str, required=True, help="Path to your vicuna-7b-v1.5 huggingface checkpoint")
parser.add_argument("--clip_path", type=str, default=os.path.join(root_dir, "checkpoints/clip/ViT-L-14.pt"))
parser.add_argument("--pretrain_mm_mlp_adapter", type=str, default=os.path.join(root_dir, "checkpoints/vtimellm-vicuna-v1-5-7b-stage1/mm_projector.bin"))
parser.add_argument("--stage2", type=str, default=os.path.join(root_dir, "checkpoints/vtimellm-vicuna-v1-5-7b-stage2"))
parser.add_argument("--stage3", type=str, default=os.path.join(root_dir, "checkpoints/vtimellm-vicuna-v1-5-7b-stage3"))
parser.add_argument("--share", action='store_true')
args = parser.parse_args()
return args
# ========================================
# Model Initialization
# ========================================
args = parse_args()
device = f'cuda:{args.gpu_id}'
disable_torch_init()
tokenizer, model, context_len = load_pretrained_model(args, args.stage2, args.stage3)
model = model.to(torch.float16).to(device)
clip_model, _ = clip.load(args.clip_path)
clip_model.eval()
clip_model = clip_model.to(device)
transform = Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
print('Initialization Finished')
# ========================================
# Gradio Setting
# ========================================
TEXT_PLACEHOLDER = 'Upload your video first, or directly click the examples at the bottom of the page.'
def gradio_reset(chat_state, video_features_state, conv_state):
if chat_state is not None:
chat_state.messages = []
video_features_state = None
conv_state = {}
return None, gr.update(value=None, interactive=True), gr.update(value='', placeholder=TEXT_PLACEHOLDER, interactive=False), gr.update(value="Upload & Start Chat", interactive=True), chat_state, video_features_state, conv_state
def upload_video(gr_video, chat_state, video_features_state, conv_state, chatbot):
if not gr_video:
return None, None, gr.update(interactive=True), chat_state, video_features_state, conv_state, None
else:
print(gr_video)
video_loader = VideoExtractor(N=100)
_, images = video_loader.extract({'id': None, 'video': gr_video})
transform = Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
# print(images.shape) # <N, 3, H, W>
images = transform(images / 255.0)
images = images.to(torch.float16)
with torch.no_grad():
video_features_state = clip_model.encode_image(images.to('cuda'))
chatbot = chatbot + [((gr_video,), None)]
chat_state = conv_templates["v1"].copy()
conv_state['first'] = True
return gr.update(interactive=False), gr.update(interactive=True, placeholder='Type and press Enter'), gr.update(value="Start Chatting", interactive=False), chat_state, video_features_state, conv_state, chatbot
def gradio_ask(user_message, chatbot, chat_state, conv_state):
if len(user_message) == 0:
conv_state['should_answer'] = False
return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state, conv_state
conv_state['should_answer'] = True
chatbot = chatbot + [[user_message, None]]
if conv_state['first']:
user_message = DEFAULT_IMAGE_TOKEN + '\n' + user_message
conv_state['first'] = False
chat_state.append_message(chat_state.roles[0], user_message)
chat_state.append_message(chat_state.roles[1], None)
return '', chatbot, chat_state, conv_state
def gradio_answer(chatbot, chat_state, video_features_state, conv_state, temperature):
if not conv_state['should_answer']:
return chatbot, chat_state
prompt = chat_state.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)
stop_str = chat_state.sep if chat_state.sep_style != SeparatorStyle.TWO else chat_state.sep2 # plain:sep(###) v1:sep2(None)
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=video_features_state[None,].to(device),
do_sample=True,
temperature=temperature,
max_new_tokens=1024,
streamer=streamer,
use_cache=True,
stopping_criteria=[stopping_criteria]
)
outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
chat_state.messages[-1][-1] = outputs
chatbot[-1][1] = outputs
print(chat_state.get_prompt())
print(chat_state)
return chatbot, chat_state
with gr.Blocks() as demo:
gr.Markdown('''# Demo for VTimeLLM''')
with gr.Row():
with gr.Column(scale=0.5):
video = gr.Video()
upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
clear = gr.Button("Reset")
temperature = gr.Slider(
minimum=0,
maximum=2.0,
value=0.05,
step=0.01,
interactive=True,
label="Temperature",
)
with gr.Column():
chat_state = gr.State()
video_features_state = gr.State()
conv_state = gr.State({})
chatbot = gr.Chatbot(label='VTimeLLM')
text_input = gr.Textbox(label='User', placeholder=TEXT_PLACEHOLDER, interactive=False)
with gr.Column():
gr.Examples(examples=[
[os.path.join(root_dir, f"images/demo.mp4"), "Explain why the video is funny."],
], inputs=[video, text_input])
upload_button.click(upload_video, [video, chat_state, video_features_state, conv_state, chatbot], [video, text_input, upload_button, chat_state, video_features_state, conv_state, chatbot])
text_input.submit(gradio_ask, [text_input, chatbot, chat_state, conv_state], [text_input, chatbot, chat_state, conv_state]).then(gradio_answer, [chatbot, chat_state, video_features_state, conv_state, temperature], [chatbot, chat_state])
clear.click(gradio_reset, [chat_state, video_features_state, conv_state], [chatbot, video, text_input, upload_button, chat_state, video_features_state, conv_state], queue=False)
demo.queue().launch(share=args.share)
{
"v_bXdq2zI1Ms0": {
"duration": 73.1,
"timestamps": [
[6.94, 69.08],
[37.28, 43.49],
[43.13, 55.55]
],
"sentences": ["Three men are standing on a mat.", " The man in front begins to do karate on the mat.", " He gets down on the ground and flips around."]
},
"v_CN01Gm2Yc4k": {
"duration": 17.56,
"timestamps": [
[0, 5],
[5, 12.2],
[12.2, 17.56]
],
"sentences": ["A young lady is gripping a black and silver punching bag between her legs.", "Once she has secured herself on the bag,she begins doing a set of crunches by pulling herself up.", "In between the crunches,she sits up and makes punches out into the air,before going back down."]
}
}
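The ground-truth format above pairs a video duration with per-event timestamps and aligned sentences; this is exactly what the grounding evaluation later in this repository consumes. A small reading sketch (the file name is a placeholder for a file in this format):

```python
import json

# Sketch: iterate ActivityNet-style annotations like the example above.
with open("data_example.json") as f:   # placeholder path
    annotations = json.load(f)

for vid, data in annotations.items():
    duration = data["duration"]
    for (start, end), sentence in zip(data["timestamps"], data["sentences"]):
        # normalized [0, 1] span, as used by the grounding IoU later on
        print(vid, round(start / duration, 2), round(end / duration, 2), sentence.strip())
```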
SOFTWARE LICENSE AGREEMENT FOR EVALUATION
This SOFTWARE EVALUATION LICENSE AGREEMENT (this "Agreement") is a legal contract between a person who uses or otherwise accesses or installs the Software ("User(s)"), and Nippon Telegraph and Telephone corporation ("NTT").
READ THE TERMS AND CONDITIONS OF THIS AGREEMENT CAREFULLY BEFORE INSTALLING OR OTHERWISE ACCESSING OR USING NTT'S PROPRIETARY SOFTWARE ACCOMPANIED BY THIS AGREEMENT (the "SOFTWARE"). THE SOFTWARE IS COPYRIGHTED AND IT IS LICENSED TO USER UNDER THIS AGREEMENT, NOT SOLD TO USER. BY INSTALLING OR OTHERWISE ACCESSING OR USING THE SOFTWARE, USER ACKNOWLEDGES THAT USER HAS READ THIS AGREEMENT, THAT USER UNDERSTANDS IT, AND THAT USER ACCEPTS AND AGREES TO BE BOUND BY ITS TERMS. IF AT ANY TIME USER IS NOT WILLING TO BE BOUND BY THE TERMS OF THIS AGREEMENT, USER SHOULD TERMINATE THE INSTALLATION PROCESS, IMMEDIATELY CEASE AND REFRAIN FROM ACCESSING OR USING THE SOFTWARE AND DELETE ANY COPIES USER MAY HAVE. THIS AGREEMENT REPRESENTS THE ENTIRE AGREEMENT BETWEEN USER AND NTT CONCERNING THE SOFTWARE.
BACKGROUND
A. NTT is the owner of all rights, including all patent rights, copyrights and trade secret rights, in and to the Software and related documentation listed in Exhibit A to this Agreement.
B. User wishes to obtain a royalty free license to use the Software to enable User to evaluate, and NTT wishes to grant such a license to User, pursuant and subject to the terms and conditions of this Agreement.
C. As a condition to NTT's provision of the Software to User, NTT has required User to execute this Agreement.
In consideration of these premises, and the mutual promises and conditions in this Agreement, the parties hereby agree as follows:
1. Grant of Evaluation License. NTT hereby grants to User, and User hereby accepts, under the terms and conditions of this Agreement, a royalty free, nontransferable and nonexclusive license to use the Software internally for the purposes of testing, analyzing, and evaluating the methods or mechanisms as shown in the research paper submitted by NTT to a certain academy. User may make a reasonable number of backup copies of the Software solely for User's internal use pursuant to the license granted in this Section 1.
2. Shipment and Installation. NTT will ship or deliver the Software by any method that NTT deems appropriate. User shall be solely responsible for proper installation of the Software.
3. Term. This Agreement is effective whichever is earlier (i) upon User's acceptance of the Agreement, or (ii) upon User's installing, accessing, and using the Software, even if User has not expressly accepted this Agreement. Without prejudice to any other rights, NTT may terminate this Agreement without notice to User (i) if User breaches or fails to comply with any of the limitations or other requirements described herein, and (ii) if NTT receives a notice from the academy stating that the research paper would not be published, and in any such case User agrees that NTT may, in addition to any other remedies it may have at law or in equity, remotely disable the Software. User may terminate this Agreement at any time by User's decision to terminate the Agreement to NTT and ceasing use of the Software. Upon any termination or expiration of this Agreement for any reason, User agrees to uninstall the Software and either return to NTT the Software and all copies thereof, or to destroy all such materials and provide written verification of such destruction to NTT.
4. Proprietary Rights
(a) The Software is the valuable, confidential, and proprietary property of NTT, and NTT shall retain exclusive title to this property both during the term and after the termination of this Agreement. Without limitation, User acknowledges that all patent rights, copyrights and trade secret rights in the Software shall remain the exclusive property of NTT at all times. User shall use not less than reasonable care in safeguarding the confidentiality of the Software.
(b) USER SHALL NOT, IN WHOLE OR IN PART, AT ANY TIME DURING THE TERM OF OR AFTER THE TERMINATION OF THIS AGREEMENT: (i) SELL, ASSIGN, LEASE, DISTRIBUTE, OR OTHERWISE TRANSFER THE SOFTWARE TO ANY THIRD PARTY; (ii) EXCEPT AS OTHERWISE PROVIDED HEREIN, COPY OR REPRODUCE THE SOFTWARE IN ANY MANNER; (iii) DISCLOSE THE SOFTWARE TO ANY THIRD PARTY, EXCEPT TO USER'S EMPLOYEES WHO REQUIRE ACCESS TO THE SOFTWARE FOR THE PURPOSES OF THIS AGREEMENT; (iv) MODIFY, DISASSEMBLE, DECOMPILE, REVERSE ENGINEER OR TRANSLATE THE SOFTWARE; OR (v) ALLOW ANY PERSON OR ENTITY TO COMMIT ANY OF THE ACTIONS DESCRIBED IN (i) THROUGH (iv) ABOVE.
(c) User shall take appropriate action, by instruction, agreement, or otherwise, with respect to its employees permitted under this Agreement to have access to the Software to ensure that all of User's obligations under this Section 4 shall be satisfied.
5. Indemnity. User shall defend, indemnify and hold harmless NTT, its agents and employees, from any loss, damage, or liability arising in connection with User's improper or unauthorized use of the Software. NTT SHALL HAVE THE SOLE RIGHT TO CONDUCT AND DEFEND ANY ACTION RELATING TO THE SOFTWARE.
6. Disclaimer. THE SOFTWARE IS LICENSED TO USER "AS IS," WITHOUT ANY TRAINING, MAINTENANCE, OR SERVICE OBLIGATIONS WHATSOEVER ON THE PART OF NTT. NTT MAKES NO EXPRESS OR IMPLIED WARRANTIES OF ANY TYPE WHATSOEVER, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF MERCHANTABILITY, OF FITNESS FOR A PARTICULAR PURPOSE AND OF NON-INFRINGEMENT ON COPYRIGHT OR ANY OTHER RIGHT OF THIRD PARTIES. USER ASSUMES ALL RISKS ASSOCIATED WITH ITS USE OF THE SOFTWARE, INCLUDING WITHOUT LIMITATION RISKS RELATING TO QUALITY, PERFORMANCE, DATA LOSS, AND UTILITY IN A PRODUCTION ENVIRONMENT.
7. Limitation of Liability. IN NO EVENT SHALL NTT BE LIABLE TO USER OR TO ANY THIRD PARTY FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING BUT NOT LIMITED TO DAMAGES FOR PERSONAL INJURY, PROPERTY DAMAGE, LOST PROFITS, OR OTHER ECONOMIC LOSS, ARISING IN CONNECTION WITH USER'S USE OF OR INABILITY TO USE THE SOFTWARE, IN CONNECTION WITH NTT'S PROVISION OF OR FAILURE TO PROVIDE SERVICES PERTAINING TO THE SOFTWARE, OR AS A RESULT OF ANY DEFECT IN THE SOFTWARE. THIS DISCLAIMER OF LIABILITY SHALL APPLY REGARDLESS OF THE FORM OF ACTION THAT MAY BE BROUGHT AGAINST NTT, WHETHER IN CONTRACT OR TORT, INCLUDING WITHOUT LIMITATION ANY ACTION FOR NEGLIGENCE. USER'S SOLE REMEDY IN THE EVENT OF ANY BREACH OF THIS AGREEMENT BY NTT SHALL BE TERMINATION PURSUANT TO SECTION 3.
8. No Assignment or Sublicense. Neither this Agreement nor any right or license under this Agreement, nor the Software, may be sublicensed, assigned, or otherwise transferred by User without NTT's prior written consent.
9. General
(a) If any provision, or part of a provision, of this Agreement is or becomes illegal, unenforceable, or invalidated, by operation of law or otherwise, that provision or part shall to that extent be deemed omitted, and the remainder of this Agreement shall remain in full force and effect.
(b) This Agreement is the complete and exclusive statement of the agreement between the parties with respect to the subject matter hereof, and supersedes all written and oral contracts, proposals, and other communications between the parties relating to that subject matter.
(c) Subject to Section 8, this Agreement shall be binding on, and shall inure to the benefit of, the respective successors and assigns of NTT and User.
(d) If either party to this Agreement initiates a legal action or proceeding to enforce or interpret any part of this Agreement, the prevailing party in such action shall be entitled to recover, as an element of the costs of such action and not as damages, its attorneys' fees and other costs associated with such action or proceeding.
(e) This Agreement shall be governed by and interpreted under the laws of Japan, without reference to conflicts of law principles. All disputes arising out of or in connection with this Agreement shall be finally settled by arbitration in Tokyo in accordance with the Commercial Arbitration Rules of the Japan Commercial Arbitration Association. The arbitration shall be conducted by three (3) arbitrators and in Japanese. The award rendered by the arbitrators shall be final and binding upon the parties. Judgment upon the award may be entered in any court having jurisdiction thereof.
(f) NTT shall not be liable to the User or to any third party for any delay or failure to perform NTT's obligation set forth under this Agreement due to any cause beyond NTT's reasonable control.
EXHIBIT A
# SODA
This repository is the implementation of "SODA: Story Oriented Dense Video Captioning Evaluation Framework", published at ECCV 2020 [pdf](https://fujiso.github.io/publications/ECCV2020_soda.pdf).
SODA measures the performance of video story description systems.
## Update
v1.1 (2021/5)
* Added a new option `--multi_reference` to deal with multiple references.
  SODA selects the reference that has the maximum F1 score for each video, and returns macro-averaged scores.
* Fixed BertScore import
## Requirements
python 3.6+ (developed with 3.7)
* Numpy
* tqdm
* [pycocoevalcap (Python3 version)](https://github.com/salaniz/pycocoevalcap)
* BERTScore (optional)
## Usage
You can run SODA by specifying the path to the system output and the path to the ground truth.
Both files should be in the JSON format used for ActivityNet Captions.
```bash
python soda.py -s path/to/submission.json -r path/to/ground_truth.json
```
You can run in the multi-reference setting with the `--multi_reference` option.
```bash
python soda.py --multi_reference -s path/to/submission.json -r path/to/ground_truth1.json path/to/ground_truth2.json
```
You can try other sentence evaluation metrics, e.g. CIDEr and BERTScore, with the `-m` option.
```bash
python soda.py -s path/to/submission.json -m BERTScore
```
## Sample input file
Please use the same format as the [ActivityNet Challenge](http://activity-net.org/index.html).
```json
{
  "version": "VERSION 1.0",
  "results": {
    "sample_id": [
      {
        "sentence": "This is a sample caption.",
        "timestamp": [1.23, 4.56]
      },
      {
        "sentence": "This is a sample caption 2.",
        "timestamp": [7.89, 19.87]
      }
    ]
  },
  "external_data": {
    "used": false
  }
}
```
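For reference, a short Python sketch that builds a submission in this format (the video id, captions, and timestamps are illustrative placeholders):

```python
import json

# Sketch: write a minimal submission for the evaluators in this repository.
submission = {
    "version": "VERSION 1.0",
    "results": {
        "sample_id": [
            {"sentence": "This is a sample caption.", "timestamp": [1.23, 4.56]},
            {"sentence": "This is a sample caption 2.", "timestamp": [7.89, 19.87]},
        ]
    },
    "external_data": {"used": False},
}

with open("submission.json", "w") as f:   # placeholder output path
    json.dump(submission, f)
```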
## Reference
```
@inproceedings{Fujita2020soda,
  title={SODA: Story Oriented Dense Video Captioning Evaluation Framework},
  author={Soichiro Fujita and Tsutomu Hirao and Hidetaka Kamigaito and Manabu Okumura and Masaaki Nagata},
  booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
  month={August},
  year={2020},
}
```
## LICENSE
NTT License
According to the license, creating pull requests is not allowed.
Please feel free to open issues instead.
import numpy as np
import json
from collections import defaultdict
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from .utils import iou, remove_nonascii
class ANETCaptions:
def __init__(self, preds, gts, gt_vid, verbose=False):
self.pred_keys = ['results']
# self.pred_keys = ['results', 'version', 'external_data']
self.verbose = verbose
self.preds = preds
self.gts = gts
self.gt_vids = gt_vid
self.tokenizer = PTBTokenizer()
@classmethod
def from_load_files(cls, gt_file, pred_file, multi_reference=True, verbose=False):
gts, gt_vid = cls.load_ground_truth(gt_file, multi_reference=multi_reference, verbose=verbose)
preds = cls.load_prediction(pred_file, verbose=verbose)
# missing video
gt_vid = [x for x in gt_vid if x in preds]
gt_vid = cls.check_videos(gt_vid, preds.keys(),verbose=verbose)
return cls(preds, gts, gt_vid, verbose=verbose)
@classmethod
def from_prediction(cls, gt_file, preds, multi_reference=True, verbose=False):
results = {}
for vid in preds['results']:
results[vid] = sorted(preds["results"][vid], key=lambda x: x["timestamp"][0])
gts, gt_vid = cls.load_ground_truth(gt_file, multi_reference=multi_reference)
gt_vid = cls.check_videos(gt_vid, results.keys(),verbose=verbose)
return cls(results, gts, gt_vid, verbose=verbose)
@staticmethod
def load_ground_truth(filenames, multi_reference=False, verbose=False):
if verbose:
print(f"| Loading ground truths: {filenames}.")
if isinstance(filenames, str):
filenames = [filenames]
gt_vids = set()
gt = defaultdict(dict)
gts = []
for filename in filenames:
if isinstance(filename, dict):
_gt = filename
else:
with open(filename, "r") as f:
_gt = json.load(f)
gt_vids.update(_gt.keys())
gts.append(_gt)
if multi_reference is False:
for vid in gt_vids:
t, s = [], []
for _g in gts:
if vid not in _g:
continue
t += _g[vid]["timestamps"]
s += _g[vid]["sentences"]
sort_t, sort_s = list(zip(*sorted(zip(t, s), key=lambda x: x[0][0])))
gt[vid]["timestamps"] = sort_t
gt[vid]["sentences"] = sort_s
gts = [gt]
if verbose:
print(f"stats:\n\t n_files: {len(filenames)}, n_videos: {len(gt_vids)}")
return gts, gt_vids
@staticmethod
def load_prediction(filename, verbose=False):
if verbose: print(f"\n| Loading predictions: {filename}.")
if isinstance(filename, dict):
pred = filename
else:
with open(filename, 'r') as f:
pred = json.load(f)
        # If the json file doesn't have the required attributes
# if not all([key in pred.keys() for key in ["results"]]):
# raise IOError('Please input a correct format prediction file.')
results = {}
for vid in pred['results']:
# if vid not in self.gt_vids: continue
results[vid] = sorted(pred["results"][vid], key=lambda x: x["timestamp"][0])
return results
def preprocess(self):
if self.verbose: print("\n| Preprocessing captions...")
n_ref = len(self.gts)
p_spliter = [0]
g_spliter = [[0] for i in range(n_ref)]
times = {}
cur_preds = {}
cur_gts = [{} for i in range(n_ref)]
for i, vid in enumerate(self.gt_vids):
cur_preds.update({j+p_spliter[-1]:[{"caption": remove_nonascii(p["sentence"])}] for j,p in enumerate(self.preds[vid])})
times[i] = [p["timestamp"] for p in self.preds[vid]]
p_spliter.append(p_spliter[-1] + len(times[i]))
for n in range(n_ref):
if vid not in self.gts[n]:
g_spliter[n].append(g_spliter[n][-1])
continue
cur_gts[n].update({j+g_spliter[n][-1]:[{"caption": remove_nonascii(p)}] for j,p in enumerate(self.gts[n][vid]["sentences"])})
g_spliter[n].append(g_spliter[n][-1] + len(self.gts[n][vid]["sentences"]))
tokenize_preds = self.tokenizer.tokenize(cur_preds)
tokenize_gts = [self.tokenizer.tokenize(j) for j in cur_gts]
for i, vid in enumerate(self.gt_vids):
_p = [tokenize_preds[j] for j in range(p_spliter[i],p_spliter[i+1])]
self.preds[vid] = {"timestamps":times[i], "sentences":_p}
for n in range(n_ref):
if vid not in self.gts[n]: continue
_g = [tokenize_gts[n][j] for j in range(g_spliter[n][i],g_spliter[n][i+1])]
self.gts[n][vid]["sentences"] = _g
@staticmethod
def check_videos(gold_vid, pred_vid, verbose=True):
not_appear = set(gold_vid) - set(pred_vid)
if len(not_appear) > 0 and verbose:
print((f"Warning: some videos in ground truth file are not appeared in prediction file!\n"
f"\t{len(not_appear)} videos are not predicted: {not_appear}"))
return list(set(gold_vid) & set(pred_vid))
#!/usr/bin/env python
from bert_score.scorer import BERTScorer
class BertScore:
# def __init__(self, lang="en", model_type="bert-large-uncased"):
def __init__(self, lang="en", model_type=None):
self.lang = lang
self.model_type = model_type
self.bert = BERTScorer(model_type=model_type, lang=lang)
def compute_score(self, gts, res):
assert gts.keys() == res.keys()
# convert dict to list of str
cands = list(map(lambda x: x[0], res.values()))
refs = list(map(lambda x: x[0], gts.values()))
(P, R, F), hashname = self.bert.score(cands, refs, return_hash=True)
# print(f'{hashname}: P={P.mean().item():.6f} R={R.mean().item():.6f} F={F.mean().item():.6f}')
F = F.numpy()
return F.mean(), F
def method(self):
return "BertScore"
#!/usr/bin/env python
from bert_score.scorer import BERTScorer
class BertScore:
def __init__(self, lang="en", model_type="roberta-large"):
self.lang = lang
self.model_type = model_type
self.bert = BERTScorer(model_type=model_type, lang=lang)
def compute_score(self, gts, res):
assert gts.keys() == res.keys()
# convert dict to list of str
cands = list(map(lambda x: x[0], res.values()))
refs = list(map(lambda x: x[0], gts.values()))
(P, R, F), hashname = self.bert.score(cands, refs, return_hash=True)
#print(f'{hashname}: P={P.mean().item():.6f} R={R.mean().item():.6f} F={F.mean().item():.6f}')
R = R.numpy()
return R.mean(), R
def method(self):
return "BertScore"
#!/usr/bin/env python
import numpy as np
#from moverscore_v2 import get_idf_dict, word_mover_score
from moverscore import get_idf_dict, word_mover_score
from collections import defaultdict
class MoverScore:
def __init__(self, lang="en", model_type=None):
self.lang = lang
self.model_type=model_type
#self.model = load_model(model_type=model_type, lang=lang)
self.idf_dict_ref = None
self.idf_dict_hyp = None
def compute_score(self, gts, res):
assert gts.keys()==res.keys()
        assert self.idf_dict_ref is not None and self.idf_dict_hyp is not None  # make_dict must be called first
# convert dict to list of str
cands = list(map(lambda x:x[0], res.values()))
refs = list(map(lambda x:x[0], gts.values()))
scores = word_mover_score(refs, cands, self.idf_dict_ref, self.idf_dict_hyp, \
stop_words=[], n_gram=1, remove_subwords=True)
#print(np.mean(scores), max(scores))
return np.mean(scores), scores
def make_dict(self, all_gts, all_res, vids):
gold = []
pred = []
for vid in vids:
gold.extend(all_gts[vid]["sentences"])
pred.extend([pred["sentence"] for pred in all_res[vid]])
self.idf_dict_ref = get_idf_dict(gold)
self.idf_dict_hyp = get_idf_dict(pred)
#print(self.idf_dict_hyp)
def method(self):
return "MoverScore"
#!/usr/bin/env python
import argparse
import json
from tqdm import tqdm
import numpy as np
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from .dataset import ANETCaptions
from .utils import iou, remove_nonascii
class SODA:
def __init__(self, data, soda_type="c", tious=None, scorer="Meteor", verbose=False):
#self.data = data
self.preds = data.preds
self.gts = data.gts
self.gt_vids = data.gt_vids
self.soda_type = soda_type
self.tious = [0.0] if tious is None else tious
self.tokenizer = PTBTokenizer()
if scorer == "BertScore":
from nlpeval.bert_r_score import BertScore
self.scorer = eval(scorer)()
self.scorer_name = scorer
self.verbose = verbose
if soda_type == "a": # averaging F-measure scores with IoU threshold = 0.9, 0.7, 0.5, 0.3
self.soda_func = self.soda_a
elif soda_type == "b": # F-measure, where IoU threshold is set to 0.
self.soda_func = self.soda_b
elif soda_type == "c": # F-measure, utilizing the IoU x METEOR score
self.soda_func = self.soda_c
elif soda_type == "d": # F-measure of IoU score
self.soda_func = self.soda_d
class Dummy:
def compute_score(self, x, y):
return [0, 0]
self.scorer = Dummy()
else:
raise NotImplementedError
@classmethod
def build(cls, preds, gts, gt_vids, soda_type="c", tious=[0.0], scorer="Meteor", verbose=False):
data = ANETCaptions(preds, gts, gt_vids)
data.preprocess()
return cls(data, soda_type, tious, scorer, verbose)
@classmethod
def build_from_prediction(cls, preds, gt_files, soda_type="c", tious=[0.0], scorer="Meteor", verbose=False):
data = ANETCaptions.from_prediction(gt_files, preds)
data.preprocess()
return cls(data, soda_type, tious, scorer, verbose)
def calc_iou_matrix(self, preds, golds):
#print(preds["timestamps"], gt["timestamps"])
return np.array([[iou(pred, ct) for pred in preds["timestamps"]] for ct in golds['timestamps']])
def calc_score_matrix(self, preds, golds):
# Reformat to fit the input of pycocoevalcap scorers.
p_sent, g_sent = preds["sentences"], golds["sentences"]
res = {index: p for index, p in enumerate(p_sent)}
gts = [{index: g for index in range(len(p_sent))} for i, g in enumerate(g_sent)]
return np.array([self.scorer.compute_score(res, gt)[1] for gt in gts])
def evaluate(self,):
if self.verbose:
print(f"\n| Running SODA {self.soda_type}.")
tious = self.tious
p_best = [[] for i in range(len(tious))]
r_best = [[] for i in range(len(tious))]
f_best = [[] for i in range(len(tious))]
n_pred = []
for vid in tqdm(self.gt_vids, disable=not self.verbose):
_p = [[] for i in range(len(tious))]
_r = [[] for i in range(len(tious))]
_f = [[] for i in range(len(tious))]
pred = self.preds[vid]
n_pred.append(len(pred["sentences"]))
# empty pred
if not pred['sentences']:
for i, tiou in enumerate(tious):
p_best[i].append(0)
r_best[i].append(0)
f_best[i].append(0)
continue
for gt in self.gts:
if vid not in gt:
continue
gold = gt[vid]
# create matrix
_iou = self.calc_iou_matrix(pred, gold)
scores = self.calc_score_matrix(pred, gold)
for i, tiou in enumerate(tious):
iou = np.copy(_iou)
iou[iou < tiou] = 0.0
try:
max_score, pairs = self.soda_func(iou, scores)
except: # RecursionError
max_score, pairs = 0., None
(n_g, n_p) = iou.shape
p = max_score / n_p
r = max_score / n_g
_p[i].append(p)
_r[i].append(r)
_f[i].append(2 * p * r / (p + r) if p+r > 0 else 0)
best_idx = np.argmax(_f, axis=1)
for i, tiou in enumerate(tious):
p_best[i].append(_p[i][best_idx[i]])
r_best[i].append(_r[i][best_idx[i]])
f_best[i].append(_f[i][best_idx[i]])
precision = np.mean(p_best, axis=1)
recall = np.mean(r_best, axis=1)
f1 = np.mean(f_best, axis=1)
print(f"avg. outputs: {np.mean(n_pred)}")
# average scores across all the tIoUs
if self.verbose:
for i, tiou in enumerate(tious):
partial_result = {self.scorer_name: [precision[i], recall[i], f1[i]]}
print_score(partial_result, description=f"tIoU: {tiou}")
final_scores = [np.mean(precision), np.mean(recall), np.mean(f1)]
result = {self.scorer_name: final_scores}
return result
def soda_a(self, iou, scores):
_, pairs = self.chased_dp_assignment(iou)
r, c = (*zip(*pairs),)
max_score = np.sum(scores[r, c])
return max_score, pairs
def soda_b(self, iou, scores):
# same as soda_a
_, pairs = self.chased_dp_assignment(iou)
r, c = (*zip(*pairs),)
max_score = np.sum(scores[r, c])
return max_score, pairs
def soda_c(self, iou, scores):
max_score, pairs = self.chased_dp_assignment(iou*scores)
return max_score, pairs
def soda_d(self, iou, scores):
max_score, pairs = self.chased_dp_assignment(iou)
return max_score, pairs
def chased_dp_assignment(self, scores):
"""
Run dp matching
Recurrence:
dp[i,j] =
max(dp[i-1,j], dp[i-1,j-1] + scores[i,j], dp[i,j-1])
"""
M, N = scores.shape
dp = - np.ones((M, N))
path = np.zeros((M, N))
def transition(i, j):
if dp[i, j] >= 0:
return dp[i, j]
elif i == 0 and j == 0:
state = [-1, -1, scores[i, j]]
elif i == 0:
state = [-1, transition(i, j-1), scores[i, j]]
elif j == 0:
state = [transition(i-1, j), -1, scores[i, j]]
else:
state = [transition(i-1, j), transition(i, j-1), transition(i-1, j-1) + scores[i, j]]
dp[i, j] = np.max(state)
path[i, j] = np.argmax(state)
return dp[i, j]
def get_pairs(i, j):
p = np.where(path[i][:j+1] == 2)[0]
if i != 0 and len(p) == 0:
return get_pairs(i-1, j)
elif i == 0 or p[-1] == 0:
return [(i, p[-1])]
else:
return get_pairs(i-1, p[-1]-1) + [(i, p[-1])]
N, M = scores.shape
max_score = transition(N-1, M-1)
pairs = get_pairs(N-1, M-1)
return max_score, pairs
def print_score(result, description="SODA result"):
prf = ["precision", "recall", "f1_score"]
print('-' * 80)
print(description)
print('-' * 80)
for scorer_name, score in result.items():
print(f'| scorer:{scorer_name}')
for k, v in zip(prf, score):
print(f"\t{k}:{v*100:2.4f}")
def main(args):
# Call coco eval
data = ANETCaptions.from_load_files(args.references,
args.prediction,
multi_reference=args.multi_reference,
verbose=args.verbose,
)
data.preprocess()
if args.soda_type == 'a':
tious = args.tious
else:
tious = None
evaluator = SODA(data,
soda_type=args.soda_type,
tious=tious,
scorer=args.metric,
verbose=args.verbose
)
result = evaluator.evaluate()
print_score(result)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--prediction', type=str, required=True, default='sample.json',
help='system output file with json format for ActivityNet Challenge')
parser.add_argument('-r', '--references', type=str, nargs='+', default=['./data/val_1.json', './data/val_2.json'],
help='reference files with ground truth captions')
parser.add_argument('-m', '--metric', type=str, default="Meteor", choices=['Meteor', 'Cider', 'BertScore'],
help='choice evaluation metrics for SODA')
parser.add_argument('-s', '--soda_type', type=str, default="c", choices=['a', 'b', 'c', 'd'],
help='choice evaluation metrics for SODA')
parser.add_argument('--tious', type=float, nargs='+', default=[0.3, 0.5, 0.7, 0.9],
help='list of the tIoUs (only for SODA-a)')
parser.add_argument('-mr', '--multi_reference', action='store_true',
help='print details')
parser.add_argument('-v', '--verbose', action='store_true',
help='print details')
args = parser.parse_args()
main(args)
#!/usr/bin/env python
def iou(interval_1, interval_2):
"""
interval: list (2 float elements)
"""
eps = 1e-8 # to avoid zero division
(s_1, e_1) = interval_1
(s_2, e_2) = interval_2
intersection = max(0., min(e_1, e_2) - max(s_1, s_2))
union = min(max(e_1, e_2) - min(s_1, s_2), e_1 - s_1 + e_2 - s_2)
iou = intersection / (union + eps)
return iou
def remove_nonascii(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
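A quick check of the interval IoU above, using one of the spans from the data example earlier:

```python
# iou([0, 10], [5, 15]): intersection 5, union 15 -> ~0.333
print(iou([0.0, 10.0], [5.0, 15.0]))
# nested interval from the data example: 6.21 / 62.14 -> ~0.10
print(iou([6.94, 69.08], [37.28, 43.49]))
```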
from .eval_dvc import eval_dvc
from .eval_soda import eval_soda
# --------------------------------------------------------
# evaluation scripts for dense video captioning, support python 3
# Modified from https://github.com/ranjaykrishna/densevid_eval/tree/9d4045aced3d827834a5d2da3c9f0692e3f33c1c
# --------------------------------------------------------
# Dense-Captioning Events in Videos Eval
# Copyright (c) 2017 Ranjay Krishna
# Licensed under The MIT License [see LICENSE for details]
# Written by Ranjay Krishna
# --------------------------------------------------------
import argparse
import json
import random
import string
import sys
import time
# sys.path.insert(0, './coco-caption') # Hack to allow the import of pycocoeval
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
Set = set
import numpy as np
def random_string(string_length):
letters = string.ascii_lowercase
return ''.join(random.choice(letters) for i in range(string_length))
def remove_nonascii(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])
class ANETcaptions(object):
PREDICTION_FIELDS = ['results', 'version', 'external_data']
def __init__(self, ground_truth_filenames=None, prediction_filename=None,
tious=None, distances=[1, 3, 5, 10, 30, 60], max_proposals=1000,
prediction_fields=PREDICTION_FIELDS, verbose=False, no_lang_eval=False):
# Check that the gt and submission files exist and load them
if len(tious) == 0:
raise IOError('Please input a valid tIoU.')
if not ground_truth_filenames:
raise IOError('Please input a valid ground truth file.')
if not prediction_filename:
raise IOError('Please input a valid prediction file.')
self.verbose = verbose
self.no_lang_eval = no_lang_eval
self.tious = tious
self.distances = distances
self.max_proposals = max_proposals
self.pred_fields = prediction_fields
self.ground_truths = self.import_ground_truths(ground_truth_filenames)
self.prediction = self.import_prediction(prediction_filename)
self.ground_truths_keys = [vid for gt in self.ground_truths for vid in gt]
print('available video number', len(set(self.ground_truths_keys) & set(self.prediction.keys())))
# Set up scorers
if not self.no_lang_eval:
self.tokenizer = PTBTokenizer()
self.scorers = [
(Meteor(), "METEOR"),
(Cider(), "CIDEr"),
(Rouge(), "Rouge-L"),
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
]
def import_prediction(self, prediction_filename):
if self.verbose:
print("| Loading submission...")
if isinstance(prediction_filename, dict):
submission = prediction_filename
else:
submission = json.load(open(prediction_filename))
# if not all([field in submission.keys() for field in self.pred_fields]):
# raise IOError('Please input a valid ground truth file.')
# Ensure that every video is limited to the correct maximum number of proposals.
results = {}
for vid_id in submission['results']:
results[vid_id] = submission['results'][vid_id][:self.max_proposals]
return results
def import_ground_truths(self, filenames):
gts = []
self.n_ref_vids = Set()
for filename in filenames:
if isinstance(filename, dict):
gt = filename
else:
gt = json.load(open(filename))
self.n_ref_vids.update(gt.keys())
gts.append(gt)
if self.verbose:
print("| Loading GT. #files: %d, #videos: %d" % (len(filenames), len(self.n_ref_vids)))
return gts
def iou(self, interval_1, interval_2):
start_i, end_i = interval_1[0], interval_1[1]
start, end = interval_2[0], interval_2[1]
intersection = max(0, min(end, end_i) - max(start, start_i))
union = min(max(end, end_i) - min(start, start_i), end - start + end_i - start_i)
iou = float(intersection) / (union + 1e-8)
return iou
def check_gt_exists(self, vid_id):
for gt in self.ground_truths:
if vid_id in gt:
return True
return False
def get_gt_vid_ids(self):
vid_ids = set([])
for gt in self.ground_truths:
vid_ids |= set(gt.keys())
return list(vid_ids)
def evaluate(self):
aggregator = {}
self.scores = {}
if not self.no_lang_eval:
for tiou in self.tious:
scores = self.evaluate_tiou(tiou)
for metric, score in scores.items():
if metric not in self.scores:
self.scores[metric] = []
self.scores[metric].append(score)
if True:
# if self.verbose:
self.scores['Recall'] = []
self.scores['Precision'] = []
self.scores['F1'] = []
for tiou in self.tious:
precision, recall = self.evaluate_detection(tiou)
self.scores['Recall'].append(recall)
self.scores['Precision'].append(precision)
self.scores['F1'].append(2 * recall * precision / (recall + precision) if recall + precision else 0.)
for tiou in self.distances:
precision, recall = self.evaluate_navigation(tiou)
self.scores['Recall'].append(recall)
self.scores['Precision'].append(precision)
self.scores['F1'].append(2 * recall * precision / (recall + precision) if recall + precision else 0.)
def evaluate_detection(self, tiou):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = []
precision = []
for vid_i, vid_id in enumerate(gt_vid_ids):
if vid_id not in self.prediction: # missing video
continue
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred_timestamp, ref_timestamp) > tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)
new_precision = float(len(pred_set_covered)) / max(len(self.prediction[vid_id]), 1)
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
best_recall = max(best_recall, new_recall)
recall.append(best_recall)
precision.append(best_precision)
return sum(precision) / len(precision), sum(recall) / len(recall)
def evaluate_navigation(self, tiou):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = []
precision = []
for vid_i, vid_id in enumerate(gt_vid_ids):
if vid_id not in self.prediction: # missing video
continue
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if abs(pred_timestamp[0] - ref_timestamp[0]) < tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)
new_precision = float(len(pred_set_covered)) / max(len(self.prediction[vid_id]), 1)
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
best_recall = max(best_recall, new_recall)
recall.append(best_recall)
precision.append(best_precision)
return sum(precision) / len(precision), sum(recall) / len(recall)
def evaluate_tiou(self, tiou):
# This method averages the tIoU precision from METEOR, Bleu, etc. across videos
res = {}
gts = {}
gt_vid_ids = self.get_gt_vid_ids()
unique_index = 0
# video id to unique caption ids mapping
vid2capid = {}
cur_res = {}
cur_gts = {}
for vid_id in gt_vid_ids:
# If the video does not have a prediction, then we give it no matches
# We set it to empty, and use this as a sanity check later on
if vid_id not in self.prediction: # missing video
continue
# If we do have a prediction, then we find the scores based on all the
# valid tIoU overlaps.
else:
vid2capid[vid_id] = []
# For each prediction, we look at the tIoU with ground truth.
for pred in self.prediction[vid_id]:
has_added = False
for gt in self.ground_truths:
if vid_id not in gt:
continue
gt_captions = gt[vid_id]
for caption_idx, caption_timestamp in enumerate(gt_captions['timestamps']):
if self.iou(pred['timestamp'], caption_timestamp) >= tiou:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [
{'caption': remove_nonascii(gt_captions['sentences'][caption_idx])}]
vid2capid[vid_id].append(unique_index)
unique_index += 1
has_added = True
# If the predicted caption does not overlap with any ground truth,
# we should compare it with garbage.
if not has_added:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [{'caption': random_string(random.randint(10, 20))}]
vid2capid[vid_id].append(unique_index)
unique_index += 1
# Each scorer will compute across all videos and take average score
output = {}
for scorer, method in self.scorers:
if self.verbose:
print('computing %s score...' % (scorer.method()))
# For each video, take all the valid pairs (based from tIoU) and compute the score
all_scores = {}
# call tokenizer here for all predictions and gts
tokenize_res = self.tokenizer.tokenize(cur_res)
tokenize_gts = self.tokenizer.tokenize(cur_gts)
# reshape back
for vid in vid2capid.keys():
res[vid] = {index: tokenize_res[index] for index in vid2capid[vid]}
gts[vid] = {index: tokenize_gts[index] for index in vid2capid[vid]}
for vid_id in gt_vid_ids:
if vid_id not in self.prediction: # missing video
continue
if len(res[vid_id]) == 0 or len(gts[vid_id]) == 0:
if type(method) == list:
score = [0] * len(method)
else:
score = 0
else:
score, scores = scorer.compute_score(gts[vid_id], res[vid_id])
all_scores[vid_id] = score
# import ipdb;ipdb.set_trace()
# print(all_scores.values())
if type(method) == list:
scores = np.mean(list(all_scores.values()), axis=0)
for m in range(len(method)):
output[method[m]] = scores[m]
if self.verbose:
print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method[m], output[method[m]]))
else:
output[method] = np.mean(list(all_scores.values()))
if self.verbose:
print("Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, output[method]))
return output
def eval_dvc(submission, references, tious=[0.3, 0.5, 0.7, 0.9], distances=[1, 3, 5, 10, 30, 60], max_proposals_per_video=1000, verbose=False, no_lang_eval=False):
# Call coco eval
evaluator = ANETcaptions(ground_truth_filenames=references,
prediction_filename=submission,
tious=tious,
distances=distances,
max_proposals=max_proposals_per_video,
verbose=verbose, no_lang_eval=no_lang_eval)
evaluator.evaluate()
score = evaluator.scores
# print(score)
loc_score = {}
for i, x in enumerate(tious):
for y in ["Recall", "Precision", "F1"]:
loc_score[y + "@" + str(x)] = score[y][i]
for y in ["Recall", "Precision", "F1"]:
loc_score[y] = np.array([score[y][i] for i in range(len(tious))]).mean()
if distances:
for i, x in enumerate(distances):
for y in ["Recall", "Precision", "F1"]:
loc_score[y + "@" + str(x) + "s"] = score[y][len(tious) + i]
avg_eval_score = {key: np.array(value).mean() for key, value in score.items() if key not in ["Recall", "Precision", "F1"]}
avg_eval_score.update(loc_score)
return avg_eval_score
if __name__ == '__main__':
    # Standalone usage sketch: the paths below are placeholders.
    # eval_soda is defined in the sibling module eval_soda.py and is
    # re-exported by the package __init__.
    pred_path = 'path/to/submission.json'
    references = ['path/to/ground_truth.json']
    eval_dvc(pred_path, references,
             tious=[0.3, 0.5, 0.7, 0.9],
             max_proposals_per_video=1000,
             verbose=False,
             no_lang_eval=False)
    # eval_soda(pred_path, references, verbose=False)
import numpy as np
from .SODA.soda import SODA
from .SODA.dataset import ANETCaptions
def eval_tool(prediction, references=None, metric='Meteor', soda_type='c', verbose=False):
args = type('args', (object,), {})()
args.prediction = prediction
    args.references = references
args.metric = metric
args.soda_type = soda_type
args.tious = [0.3, 0.5, 0.7, 0.9]
args.verbose = verbose
args.multi_reference = False
data = ANETCaptions.from_load_files(args.references,
args.prediction,
multi_reference=args.multi_reference,
verbose=args.verbose,
)
data.preprocess()
if args.soda_type == 'a':
tious = args.tious
else:
tious = None
evaluator = SODA(data,
soda_type=args.soda_type,
tious=tious,
scorer=args.metric,
verbose=args.verbose
)
result = evaluator.evaluate()
return result
def eval_soda(p, ref_list,verbose=False):
score_sum = []
for ref in ref_list:
        r = eval_tool(prediction=p, references=[ref], verbose=verbose, soda_type='c')
score_sum.append(r['Meteor'])
soda_avg = np.mean(score_sum, axis=0) #[avg_pre, avg_rec, avg_f1]
soda_c_avg = soda_avg[-1]
results = {'soda_c': soda_c_avg}
return results
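A usage sketch for the two entry points re-exported by the package `__init__` above (paths are placeholders; both functions accept either a prediction dict or a file path):

```python
import json

# Sketch: run both evaluators on one submission.
# Assumes eval_dvc and eval_soda have been imported from this package.
references = ["./data/val_1.json", "./data/val_2.json"]   # placeholder reference files
with open("submission.json") as f:                        # placeholder submission
    submission = json.load(f)

dvc_scores = eval_dvc(submission, references,
                      tious=[0.3, 0.5, 0.7, 0.9],
                      distances=[1, 3, 5, 10, 30, 60])
soda_scores = eval_soda(submission, references)           # -> {'soda_c': ...}
print({**dvc_scores, **soda_scores})
```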
import os
root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")
import sys
sys.path.append(root_dir)
import clip
import re
import argparse
import torch
import json
import numpy as np
from tqdm import tqdm
from torchvision.transforms import Compose, Resize, CenterCrop, Normalize
from vtimellm.model.builder import load_pretrained_model
from vtimellm.utils import disable_torch_init
from vtimellm.mm_utils import VideoExtractor
from vtimellm.inference import inference
try:
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC
except ImportError:
from PIL import Image
BICUBIC = Image.BICUBIC
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--clip_path", type=str, default="checkpoints/clip/ViT-L-14.pt")
parser.add_argument("--pretrain_mm_mlp_adapter", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage1/mm_projector.bin")
parser.add_argument("--stage2", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage2")
parser.add_argument("--stage3", type=str, default="checkpoints/vtimellm-vicuna-v1-5-7b-stage3")
parser.add_argument("--model_base", type=str, default="/path/to/vicuna-7b-v1.5")
parser.add_argument("--data_path", type=str, default="vtimellm/eval/data_example.json")
parser.add_argument("--feat_folder", type=str, default=None)
parser.add_argument("--video_folder", type=str, default=None)
parser.add_argument("--task", type=str, default='all', choices=['all', 'grounding', 'captioning'])
parser.add_argument("--log_path", type=str, default='vtimellm/eval/log/example_log.txt')
args = parser.parse_args()
return args
def iou(outputs, gt):
matches = re.search(r"(\d{2}) (to|and) (\d{2})", outputs)
if not matches:
return 0
from_number = float(matches.group(1)) / 100
to_number = float(matches.group(3)) / 100
s, e = gt
intersection = max(0, min(to_number, e) - max(from_number, s))
union = max(to_number, e) - min(from_number, s)
iou = intersection / union
return round(iou, 2)
def write_log(log_path, video_id, task, query_id, answer, info=None):
log = {
'video_id': video_id,
'task': task,
'query_id': query_id,
'answer': answer
}
if info is not None:
log['info'] = info
with open(log_path, 'a') as f:
f.write(json.dumps(log) + '\n')
questions = {
'grounding': ['During which frames can we see {}?'],
'captioning': ['Could you please describe the events in the video in detail? Be specific about the activities of individuals, their surroundings, and interactions with others. The output should be in JSON format, structured as follows: {"event": "xx", "timestamps": "from xx to xx"}.']
}
if __name__ == "__main__":
args = parse_args()
disable_torch_init()
tokenizer, model, context_len = load_pretrained_model(args, args.stage2, args.stage3)
model = model.cuda()
model.to(torch.float16)
if args.video_folder is not None:
clip_model, _ = clip.load(args.clip_path)
clip_model.eval()
clip_model = clip_model.cuda()
video_loader = VideoExtractor(N=100)
transform = Compose([
Resize(224, interpolation=BICUBIC),
CenterCrop(224),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
js = json.load(open(args.data_path))
for id, data in tqdm(js.items()):
features = None
if args.feat_folder is not None:
feat_path = os.path.join(args.feat_folder, f"{id}.npy")
if os.path.isfile(feat_path):
features = torch.from_numpy(np.load(feat_path)).cuda()
if features is None and args.video_folder is not None:
for ext in ['mp4', 'mkv', 'webm']:
video_path = os.path.join(args.video_folder, f"{id}.{ext}")
if os.path.isfile(video_path):
_, images = video_loader.extract({'id': None, 'video': video_path})
images = transform(images / 255.0)
images = images.to(torch.float16)
with torch.no_grad():
features = clip_model.encode_image(images.to('cuda'))
if features is None:
print(f'Can not find video {id}')
continue
if args.task in ['captioning', 'all']:
for query_id, query in enumerate(questions['captioning']):
answer = inference(model, features, "<video>\n " + query, tokenizer)
write_log(args.log_path, id, 'captioning', query_id, answer)
if args.task in ['grounding', 'all']:
for sentence_id, (timestamps, sentence) in enumerate(zip(data['timestamps'], data['sentences'])):
sentence = sentence.strip().lower()
if sentence.endswith("."):
sentence = sentence[:-1]
for query_id, query in enumerate(questions['grounding']):
answer = inference(model, features, "<video>\n" + query.format(sentence), tokenizer)
gt = (timestamps[0] / data['duration'], timestamps[1] / data['duration'])
u = iou(answer, gt)
write_log(args.log_path, id, 'grounding', query_id, answer, info={"sentence_id": sentence_id, 'iou': u})
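The log produced by `write_log` above is JSON Lines, one record per query. A small sketch of aggregating the grounding records into mean IoU and recall at common thresholds (the log path below is the script's default):

```python
import json

# Sketch: summarize the grounding log written by write_log above.
ious = []
with open("vtimellm/eval/log/example_log.txt") as f:
    for line in f:
        record = json.loads(line)
        if record["task"] == "grounding":
            ious.append(record["info"]["iou"])

if ious:
    print("mIoU:", sum(ious) / len(ious))
    for threshold in (0.3, 0.5, 0.7):
        print(f"R@{threshold}:", sum(i >= threshold for i in ious) / len(ious))
```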