Commit 112bf76b authored by chenzk

v1.0

import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 9500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = NaturalCap + OCRCap + VideoCap + NaturalQA
# datasets = VideoQA + HumanCentric + NLP
# datasets = [SGInternvid0]
datasets = [TextSFT, TextSFT2_0]
out_file_name = "debug.json"
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
long_json = []
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
if use_thumbnail and blocks != 1:
blocks += 1
return blocks
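# Worked example from the constants above: with max_num=12 and use_thumbnail=True,
# dynamic_preprocess returns at most 12 + 1 = 13 blocks, so a single image contributes
# at most 13 * image_token_num = 13 * 256 = 3328 tokens to the totals computed below.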
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def process_item(item, conv, roles, tokenizer):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
if isinstance(image_file, str):
image_file = [image_file]
set_id = item["set"]
if isinstance(set_id, str):
set_id = [set_id]
for k, img_file in enumerate(image_file):
if set_id[k] not in NoPatchSets:
image_directory = FolderDict[set_id[k]]
image = Image.open(
os.path.join(image_directory, img_file.replace("\\", "/"))
).convert("RGB")
num_patches = dynamic_preprocess(image)
else:
num_patches = 1
item_token_num += num_patches * image_token_num
total_duration = 0
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
if isinstance(audio_files, str):
audio_files = [audio_files]
assert isinstance(audio_files, list)
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
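# Assumed conversion: roughly 12.5 audio tokens per second of (even-length-padded) audio,
# so the combined clips add ceil(total_duration * 12.5) tokens to the item count below.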
item_token_num += math.ceil(total_duration * 12.5)
if item_token_num > token_thre:
print(f"item_token_num: {item_token_num}")
if "image" in item and len(item["image"]) >= 16:
print(f"num_patches: {num_patches}")
print(f"total_duration: {total_duration}")
long_json.append(item)
print(item)
return item_token_num
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
len_list = []
with ThreadPoolExecutor() as executor:
# give each task its own conversation copy so worker threads do not share mutable state
futures = [executor.submit(process_item, item, conv.copy(), roles, tokenizer) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
len_list.append(future.result())
assert len(len_list) == len(data)
distribution = {
"0-100": 0,
"100-200": 0,
"200-300": 0,
"300-400": 0,
"400-500": 0,
"500-600": 0,
"600-700": 0,
"700-800": 0,
"800-900": 0,
"900-1000": 0,
"1000-1500": 0,
"1500-2000": 0,
"2000-2500": 0,
"2500-3000": 0,
"3000-3500": 0,
"3500-4000": 0,
"4000-4500": 0,
"4500-5000": 0,
"5000-5500": 0,
"5500-6000": 0,
"6000-6500": 0,
"6500-7000": 0,
"7000-7500": 0,
"7500-8000": 0,
"8000-8500": 0,
"8500-9000": 0,
"9000-9500": 0,
"9500-10000": 0,
">10000": 0,
}
for length in len_list:
if length <= 100:
distribution["0-100"] += 1
elif length <= 200:
distribution["100-200"] += 1
elif length <= 300:
distribution["200-300"] += 1
elif length <= 400:
distribution["300-400"] += 1
elif length <= 500:
distribution["400-500"] += 1
elif length <= 600:
distribution["500-600"] += 1
elif length <= 700:
distribution["600-700"] += 1
elif length <= 800:
distribution["700-800"] += 1
elif length <= 900:
distribution["800-900"] += 1
elif length <= 1000:
distribution["900-1000"] += 1
elif length <= 1500:
distribution["1000-1500"] += 1
elif length <= 2000:
distribution["1500-2000"] += 1
elif length <= 2500:
distribution["2000-2500"] += 1
elif length <= 3000:
distribution["2500-3000"] += 1
elif length <= 3500:
distribution["3000-3500"] += 1
elif length <= 4000:
distribution["3500-4000"] += 1
elif length <= 4500:
distribution["4000-4500"] += 1
elif length <= 5000:
distribution["4500-5000"] += 1
elif length <= 5500:
distribution["5000-5500"] += 1
elif length <= 6000:
distribution["5500-6000"] += 1
elif length <= 6500:
distribution["6000-6500"] += 1
elif length <= 7000:
distribution["6500-7000"] += 1
elif length <= 7500:
distribution["7000-7500"] += 1
elif length <= 8000:
distribution["7500-8000"] += 1
elif length <= 8500:
distribution["8000-8500"] += 1
elif length <= 9000:
distribution["8500-9000"] += 1
elif length <= 9500:
distribution["9000-9500"] += 1
elif length <= 10000:
distribution["9500-10000"] += 1
else:
distribution[">10000"] += 1
print(f"Length distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
# with open(out_file_name, 'w', encoding='utf-8') as file:
# json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"Done: items longer than {token_thre} tokens have been saved to {out_file_name}")
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from decord import VideoReader, cpu
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import (
DEFAULT_AUDIO_TOKEN,
DEFAULT_IMAGE_TOKEN,
DEFAULT_VIDEO_TOKEN,
GLOBAL_WEIGHTS_PATH,
IGNORE_INDEX,
MAX_IMAGE_LENGTH,
MIN_IMAGE_LENGTH,
)
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 9500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = NaturalCap + OCRCap + VideoCap + NaturalQA
# datasets = VideoQA + HumanCentric + NLP
# datasets = [SGInternvid0]
# datasets = [TextSFT, TextSFT2_0]
datasets = VideoCap
out_file_name = "debug.json"
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
long_json = []
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
if use_thumbnail and blocks != 1:
blocks += 1
return blocks
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def get_video_frame(
video_path,
max_frames=MAX_IMAGE_LENGTH,
min_frames=MIN_IMAGE_LENGTH,
video_framerate=1,
s=None,
e=None,
):
if s is None:
start_time, end_time = None, None
else:
start_time = int(s)
end_time = int(e)
start_time = start_time if start_time >= 0.0 else 0.0
end_time = end_time if end_time >= 0.0 else 0.0
if start_time > end_time:
start_time, end_time = end_time, start_time
elif start_time == end_time:
end_time = start_time + 1
if os.path.exists(video_path):
vreader = VideoReader(video_path, ctx=cpu(0))
else:
print(video_path)
raise FileNotFoundError
fps = vreader.get_avg_fps()
f_start = 0 if start_time is None else int(start_time * fps)
f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
num_frames = f_end - f_start + 1
if num_frames > 0:
# T x 3 x H x W
sample_fps = int(video_framerate)
t_stride = int(round(float(fps) / sample_fps))
all_pos = list(range(f_start, f_end + 1, t_stride))
if len(all_pos) > max_frames:
sample_pos = [
all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)
]
elif len(all_pos) < min_frames:
sample_pos = [
all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=min_frames, dtype=int)
]
else:
sample_pos = all_pos
return len(sample_pos)
# no frames fall inside the requested range: contribute zero sampled frames
return 0
def process_item(item, conv, roles, tokenizer):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
if isinstance(image_file, str):
image_file = [image_file]
set_id = item["set"]
if isinstance(set_id, str):
set_id = [set_id]
for k, img_file in enumerate(image_file):
if set_id[k] not in NoPatchSets:
image_directory = FolderDict[set_id[k]]
image = Image.open(
os.path.join(image_directory, img_file.replace("\\", "/"))
).convert("RGB")
num_patches = dynamic_preprocess(image)
else:
num_patches = 1
item_token_num += num_patches * image_token_num
if "video" in item:
video_file = item["video"]
if isinstance(video_file, str):
video_file = [video_file]
set_id = item["set"]
if isinstance(set_id, str):
set_id = [set_id]
for k, video_file_name in enumerate(video_file):
video_directory = FolderDict[set_id[k]]
video_file_path = os.path.join(video_directory, video_file_name)
num_frame = get_video_frame(video_file_path)
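# Each sampled video frame is budgeted at image_token_num (256) tokens, the same
# per-block cost used for still images above.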
item_token_num += num_frame * image_token_num
total_duration = 0
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
if isinstance(audio_files, str):
audio_files = [audio_files]
assert isinstance(audio_files, list)
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
if item_token_num > token_thre:
print(f"item_token_num: {item_token_num}")
if "image" in item and len(item["image"]) >= 16:
print(f"num_patches: {num_patches}")
print(f"total_duration: {total_duration}")
long_json.append(item)
print(item)
return item_token_num
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
len_list = []
with ThreadPoolExecutor() as executor:
# give each task its own conversation copy so worker threads do not share mutable state
futures = [executor.submit(process_item, item, conv.copy(), roles, tokenizer) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
len_list.append(future.result())
assert len(len_list) == len(data)
distribution = {
"0-100": 0,
"100-200": 0,
"200-300": 0,
"300-400": 0,
"400-500": 0,
"500-600": 0,
"600-700": 0,
"700-800": 0,
"800-900": 0,
"900-1000": 0,
"1000-1500": 0,
"1500-2000": 0,
"2000-2500": 0,
"2500-3000": 0,
"3000-3500": 0,
"3500-4000": 0,
"4000-4500": 0,
"4500-5000": 0,
"5000-5500": 0,
"5500-6000": 0,
"6000-6500": 0,
"6500-7000": 0,
"7000-7500": 0,
"7500-8000": 0,
"8000-8500": 0,
"8500-9000": 0,
"9000-9500": 0,
"9500-10000": 0,
">10000": 0,
}
for length in len_list:
if length <= 100:
distribution["0-100"] += 1
elif length <= 200:
distribution["100-200"] += 1
elif length <= 300:
distribution["200-300"] += 1
elif length <= 400:
distribution["300-400"] += 1
elif length <= 500:
distribution["400-500"] += 1
elif length <= 600:
distribution["500-600"] += 1
elif length <= 700:
distribution["600-700"] += 1
elif length <= 800:
distribution["700-800"] += 1
elif length <= 900:
distribution["800-900"] += 1
elif length <= 1000:
distribution["900-1000"] += 1
elif length <= 1500:
distribution["1000-1500"] += 1
elif length <= 2000:
distribution["1500-2000"] += 1
elif length <= 2500:
distribution["2000-2500"] += 1
elif length <= 3000:
distribution["2500-3000"] += 1
elif length <= 3500:
distribution["3000-3500"] += 1
elif length <= 4000:
distribution["3500-4000"] += 1
elif length <= 4500:
distribution["4000-4500"] += 1
elif length <= 5000:
distribution["4500-5000"] += 1
elif length <= 5500:
distribution["5000-5500"] += 1
elif length <= 6000:
distribution["5500-6000"] += 1
elif length <= 6500:
distribution["6000-6500"] += 1
elif length <= 7000:
distribution["6500-7000"] += 1
elif length <= 7500:
distribution["7000-7500"] += 1
elif length <= 8000:
distribution["7500-8000"] += 1
elif length <= 8500:
distribution["8000-8500"] += 1
elif length <= 9000:
distribution["8500-9000"] += 1
elif length <= 9500:
distribution["9000-9500"] += 1
elif length <= 10000:
distribution["9500-10000"] += 1
else:
distribution[">10000"] += 1
print(f"Length distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
# with open(out_file_name, 'w', encoding='utf-8') as file:
# json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"Done: items longer than {token_thre} tokens have been saved to {out_file_name}")
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-ubuntu22.04-dtk24.04.2-py3.10
ENV DEBIAN_FRONTEND=noninteractive
# RUN yum update && yum install -y git cmake wget build-essential
# RUN source /opt/dtk-24.04.2/env.sh
# # Install pip dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
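# A minimal usage sketch for this Dockerfile (the image tag, mount path, and shm size
# are illustrative assumptions, not values taken from this repo):
# docker build -t vita-dcu:v1.0 .
# docker run --rm -it --shm-size=16g -v $(pwd):/workspace/VITA vita-dcu:v1.0 bash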
accelerate==0.30.1
decord==0.6.0
#deepspeed==0.9.5
Jinja2==3.1.4
ninja==1.11.1.1
numpy==1.26.4
#torch==2.3.1
#torchaudio==2.3.1
#torchvision
tqdm==4.66.4
transformers==4.41.1
#xformers
timm
soundfile==0.12.1
icon.png (53.8 KB, binary image asset)
# Text query
HIP_VISIBLE_DEVICES=0,1 python video_audio_demo.py --model_path VITA/VITA_ckpt --image_path asset/vita_log2.png --model_type mixtral-8x7b --conv_mode mixtral_two --question "请描述这张图片。"
# Audio query
# HIP_VISIBLE_DEVICES=0,1 python video_audio_demo.py --model_path VITA/VITA_ckpt --image_path asset/vita_log2.png --model_type mixtral-8x7b --conv_mode mixtral_two --audio_path asset/q1.wav
# Noisy audio query
# HIP_VISIBLE_DEVICES=0,1 python video_audio_demo.py --model_path VITA/VITA_ckpt --image_path asset/vita_log2.png --model_type mixtral-8x7b --conv_mode mixtral_two --audio_path asset/q2.wav
# -*- coding: utf-8 -*-
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vita.constants import GLOBAL_WEIGHTS_PATH
model_dir = f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# The given token ID list
id_list = [
1,
1587,
28747,
29383,
28971,
28518,
32350,
33702,
28944,
13,
28733,
28705,
29383,
28971,
32569,
32730,
32606,
28914,
29050,
35267,
32315,
28944,
29383,
28914,
32626,
39797,
28971,
32311,
29041,
41993,
29958,
46454,
28944,
13,
28733,
28705,
29383,
32585,
32474,
32599,
32683,
28914,
29292,
29824,
35267,
32100,
44797,
33089,
29457,
38038,
32599,
28914,
32509,
28944,
13,
28733,
47068,
32599,
38201,
29383,
37676,
28914,
34559,
35845,
28924,
29383,
29179,
29478,
32599,
41534,
29457,
29551,
32599,
35702,
34415,
28914,
35845,
28944,
2,
28705,
13,
1838,
28747,
]
id_list = [
28991,
34275,
29105,
33216,
30344,
29675,
28914,
46018,
29131,
29086,
28944,
29087,
29960,
28991,
34700,
43072,
28914,
28971,
28518,
29046,
]
id_list = [
28705,
13,
2,
28705,
13,
10093,
28747,
51497,
40994,
30162,
32980,
39944,
29105,
28518,
41772,
28914,
34796,
32703,
28924,
29450,
28991,
34275,
29105,
33216,
30344,
29675,
28914,
46018,
29131,
29086,
28944,
29087,
29960,
28991,
34700,
43072,
28914,
28971,
28518,
29046,
29003,
28835,
4712,
28743,
12673,
28838,
28914,
46018,
28924,
29450,
33778,
31224,
29222,
29146,
33280,
29010,
36599,
28914,
49363,
29054,
28944,
32641,
46018,
29074,
29450,
34526,
28914,
32626,
40497,
28924,
32590,
28518,
30308,
29251,
30912,
29677,
29131,
28518,
35545,
28914,
51009,
29169,
28944,
13,
29010,
33292,
28991,
28924,
32012,
32924,
29450,
29440,
34051,
46018,
28924,
33837,
46018,
33421,
32587,
28914,
33103,
28944,
29450,
28991,
28518,
46018,
28998,
28518,
36101,
28914,
33778,
28924,
29746,
31127,
28518,
29310,
35348,
30163,
32813,
28914,
31249,
31861,
28944,
32663,
46018,
29054,
28914,
33114,
29302,
29010,
32155,
33053,
28924,
41192,
29992,
30163,
42747,
28924,
29746,
41192,
29310,
30150,
29010,
49460,
29169,
49565,
28944,
13,
33238,
33015,
29458,
29366,
29366,
28914,
41261,
29061,
28914,
36599,
38437,
30131,
30631,
28924,
34249,
29065,
48245,
29746,
32850,
28914,
33857,
28944,
33257,
32031,
41772,
28924,
44169,
28969,
29824,
34239,
30266,
28924,
33837,
35115,
29460,
39676,
40016,
29074,
33158,
35523,
29276,
28914,
43604,
28944,
36286,
28991,
28914,
36096,
32557,
28971,
37478,
28914,
28924,
33070,
35155,
49059,
49550,
28914,
36096,
47444,
28924,
29118,
36101,
29131,
32813,
28914,
33778,
28944,
44488,
28914,
29367,
29051,
33151,
33647,
29176,
28971,
28518,
36059,
32710,
28914,
32703,
32854,
28924,
49323,
29010,
32857,
35049,
29276,
32789,
28944,
2,
]
# Convert the ID list to a PyTorch tensor
id_tensor = torch.tensor(id_list)
# Decode the IDs with the tokenizer
decoded_text = tokenizer.decode(id_tensor, skip_special_tokens=True)
print(f"Decoded text: {decoded_text}")
# -*- coding: utf-8 -*-
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vita.constants import GLOBAL_WEIGHTS_PATH
model_dir = f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_modVocab/mg2hg"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
system_prompt = "你是一个人工智能机器人。\n- 你是研究社区开发的大语言模型。你的设计宗旨是有益、诚实且无害。\n- 你支持使用用户选择的多种语言流利地进行交流并解答用户的问题。\n- 如果用户更正你生成的错误答案,你会向用户致歉并与用户探讨正确的答案。"
question = "请详细介绍一下火星。"
chat_template = "system:{system_prompt}</s>\nuser:{question}</s>\nbot:"
text = chat_template.format(system_prompt=system_prompt, question=question)
input_ids = tokenizer(text, return_tensors="pt")["input_ids"]
input_ids = input_ids.to("cuda")
model = AutoModelForCausalLM.from_pretrained(
# model_dir, torch_dtype=torch.float16, device_map="auto",attn_implementation="flash_attention_2").eval()
model_dir,
torch_dtype=torch.float16,
device_map="auto",
).eval()
start_time = time.time()
outputs = model.generate(input_ids, max_new_tokens=10)
time_consume = time.time() - start_time
outputs = outputs.cpu().numpy()[0]
outputs = outputs[len(input_ids[0]) :]
output_text = tokenizer.decode(outputs, skip_special_tokens=True)
print(output_text)
print(f"Time consume: {time_consume}")
# Model code
modelCode=1068
# Model name
modelName=vita_pytorch
# Model description
modelDescription=VITA can process video, images, text, and audio, delivering an advanced multimodal interaction experience that can be activated without wake words or buttons.
# Application scenarios
appScenario=inference,dialogue Q&A,manufacturing,broadcast media,finance,energy,healthcare,smart home,education
# Framework type
frameType=pytorch
accelerate==0.30.1
decord==0.6.0
#deepspeed==0.9.5
Jinja2==3.1.4
ninja==1.11.1.1
numpy==1.26.4
#torch==2.3.1
#torchaudio==2.3.1
#torchvision
tqdm==4.66.4
transformers==4.41.1
#xformers
timm
soundfile==0.12.1
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 32,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps" : "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 5e8,
"stage3_max_reuse_distance": 5e8,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 32,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps" : "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 5e8,
"stage3_max_reuse_distance": 5e8,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 32,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps" : "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 5e8,
"stage3_max_reuse_distance": 5e8,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"train_micro_batch_size_per_gpu": "auto",
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"zero_optimization": {
"stage": 2,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto"
}
}
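# A minimal sketch (not taken from this repo) of wiring a DeepSpeed JSON config such as
# the ones above into a Hugging Face Trainer run; the output_dir and the filename
# "ds_config_zero2.json" are assumptions for illustration. The "auto" entries in the
# JSON are resolved from these training arguments at launch time.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints",        # assumed output path
    per_device_train_batch_size=1,     # fills "train_micro_batch_size_per_gpu": "auto"
    gradient_accumulation_steps=8,     # fills "gradient_accumulation_steps": "auto"
    deepspeed="ds_config_zero2.json",  # assumed filename for one of the JSON configs above
)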