Commit 112bf76b authored by chenzk (v1.0)
# Mixtral
CUDA_VISIBLE_DEVICES=7 python mixtral_inference.py
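The JSON below appears to be a sample annotation entry in the format the data-check and concatenation scripts in this commit consume: a record with `set`, `id`, and `conversations` fields plus optional `image` and `audio` file references.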
[
{
"set": "sharegpt4",
"id": "000000000164",
"conversations": [
{
"from": "human",
"value": "<image>\ninput_wavs/promp0.wav\n"
},
{
"from": "gpt",
"value": "This is a well-organized kitchen with a clean, modern aesthetic. The kitchen features a white countertop against a white wall, creating a bright and airy atmosphere. "
}
],
"image": "coco/images/train2017/000000000164.jpg",
"audio": [
"audio0.wav"
]
}
]
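# ============================================================================
# Next script (boundary inferred from the concatenated commit view): audio
# integrity check. Verifies that every audio file referenced by the selected
# datasets exists under AudioFolder/audio and that its duration lies within
# [dur_thre2, dur_thre1] seconds; offending names go to lost_file_name.txt.
# ============================================================================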
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
dur_thre1 = 30
dur_thre2 = 0.5
# Put all dataset dicts into one list
# datasets = NLP + HumanCentric + VideoQA + NaturalQA +VideoCap + OCRCap + NaturalCap
# datasets = NaturalCap + VideoCap + OCRCap + NaturalQA + VideoQA + HumanCentric + [TextSFT]
datasets = NaturalCap + VideoCap
datasets = OCRCap + NaturalQA
datasets = VideoQA + HumanCentric + [TextSFT]
datasets = [TextSFT]
# Initialize a list to store the names of missing files
lost_files = []
lock = threading.Lock()
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def check_audio(audio_file_name, audio_directory):
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
if not os.path.exists(audio_file_path):
print(f"{audio_file_path} lost!!!!!!!!")
return audio_file_name
else:
try:
duration = get_wav_duration(audio_file_path)
if duration > dur_thre1 or duration < dur_thre2:
print(f"{audio_file_path} duration {duration}, too long!!!!!!!")
return audio_file_name
except Exception as e:
print(f"{audio_file_path} is broken!!!!!!!!")
return audio_file_name
return None
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
print(json_file_path)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
audio_files = item.get("audio")
audio_directory = AudioFolder
            # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
            # If audio_files is a list, check each file
if isinstance(audio_files, list):
for audio_file_name in audio_files:
futures.append(executor.submit(check_audio, audio_file_name, audio_directory))
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
with lock:
lost_files.append(result)
# Write the missing file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
from vita.config import FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
# Put all dataset dicts into one list
datasets = [ShareGPT4V]
# Initialize a list to store the names of missing files
lost_files = []
lock = threading.Lock()
def check_image(image_file_name, image_directory):
image_file_path = os.path.join(image_directory, image_file_name)
if not os.path.exists(image_file_path):
return image_file_name
else:
try:
with Image.open(image_file_path) as img:
img.convert("RGB")
except Exception as e:
return image_file_name
return None
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
print(json_file_path)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
if "image" in item:
image_files = item.get("image")
set_id = item["set"]
if type(set_id) is list:
set_id = set_id[0]
image_directory = FolderDict[set_id]
                # If image_files is a string, convert it to a list
if isinstance(image_files, str):
image_files = [image_files]
                # If image_files is a list, check each file
if isinstance(image_files, list):
for image_file_name in image_files:
futures.append(
executor.submit(check_image, image_file_name, image_directory)
)
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
with lock:
lost_files.append(result)
print(f"file lost: {result}")
# Write the missing file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
from vita.config import FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "long_image_file_name.txt"
ratio_thre = 12
# Put all dataset dicts into one list
# datasets = [AnyWord_20to50, RCTW2019, RCTW2019QA, RCTW2017, OpenChart, SCID, K12, TabRECSet, DigChat, iFlyTab]
datasets = [AnyWord_20to50, DyChart_iresearch]
datasets = [RCTW2019, RCTW2019QA, RCTW2017]
datasets = [OpenChart, SCID]
datasets = [K12]
# datasets = [TabRECSet, DigChat, iFlyTab]
# Initialize a list to store the names of offending files
lost_files = []
lock = threading.Lock()
def check_image(image_file_name, image_directory):
image_file_path = os.path.join(image_directory, image_file_name)
if not os.path.exists(image_file_path):
print(f"{image_file_path} not exist!!!!!!!!!!")
return image_file_name
else:
try:
with Image.open(image_file_path) as img:
img.convert("RGB")
size_ratio = img.size[0] / img.size[1]
if size_ratio < 1 / ratio_thre or size_ratio > ratio_thre:
print(f"{image_file_path} ratio is too big!!!!!!!!!!!!!!")
return image_file_name
except Exception as e:
print(f"{image_file_path} is broken!!!!!!!!!!!!")
return image_file_name
return None
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
image_files = item.get("image")
set_id = item["set"]
image_directory = FolderDict[set_id]
            # If image_files is a string, convert it to a list
if isinstance(image_files, str):
image_files = [image_files]
            # If image_files is a list, check each file
if isinstance(image_files, list):
for image_file_name in image_files:
futures.append(executor.submit(check_image, image_file_name, image_directory))
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
with lock:
lost_files.append(result)
# Write the offending file names to the output file
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Put all dataset dicts into one list
datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = [Webvid]
# Iterate over each dataset dict
for dataset in datasets:
dur_list = []
keys = list(dataset.keys())
input_file_name = dataset["chat_path"]
    # Read the JSON file
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
print(f"check {input_file_name}")
    # Iterate over each item
for item in tqdm(data):
        # check that the item has a set id
        assert "set" in item, f"{input_file_name} does not have set_id: {item} !!!!!!!!!!"
        # check that the item is not empty
        assert len(item) > 0, f"{input_file_name} has an empty item!!!!!!!!!!"
        # check that no key has an empty value
for key in item.keys():
if type(item[key]) is not int and key != "id":
assert (
len(item[key]) > 0
), f"{input_file_name}, item {item} have null key!!!!!!!!!!{key}"
        # check for empty values in item['conversations']
for conv in item["conversations"]:
text = conv["value"]
if len(text) == 0:
print(f"{input_file_name}, item {item} has null speaking!!!")
        # check that the numbers of image/video paths, set_ids, and placeholders are consistent
count_image_ph = 0
count_video_ph = 0
count_audio_ph = 0
count_image_path = 0
count_video_path = 0
count_audio_path = 0
text_all = ""
for conv in item["conversations"]:
text = conv["value"]
text_all += text
count_image_ph = text_all.count("<image>")
count_video_ph = text_all.count("<video>")
count_audio_ph = text_all.count("<audio>")
if "image" in item:
image_path = item["image"]
assert isinstance(image_path[0], str)
if type(image_path) is not list:
assert isinstance(image_path, str)
image_path = [image_path]
count_image_path = len(image_path)
if "video" in item:
video_path = item["video"]
assert isinstance(video_path[0], str)
if type(video_path) is not list:
assert isinstance(video_path, str)
video_path = [video_path]
count_video_path = len(video_path)
if "audio" in item:
audio_path = item["audio"]
assert isinstance(audio_path[0], str)
if type(audio_path) is not list:
assert isinstance(audio_path, str)
audio_path = [audio_path]
count_audio_path = len(audio_path)
# assert count_image_path == count_image_ph, f"{input_file_name}, item {item} image place holder number NOT equal image file number"
# assert count_video_path == count_video_ph, f"{input_file_name}, item {item} video place holder number NOT equal video file number"
# assert count_audio_path == count_audio_ph, f"{input_file_name}, item {item} audio place holder number NOT equal audio file number"
if count_image_path != count_image_ph:
print(
f"{input_file_name}, item {item} image place holder number NOT equal image file number"
)
if count_video_path != count_video_ph:
print(
f"{input_file_name}, item {item} video place holder number NOT equal video file number"
)
if count_audio_path != count_audio_ph:
print(
f"{input_file_name}, item {item} audio place holder number NOT equal audio file number"
)
set_id = item["set"]
if type(set_id) is not list:
set_id = [set_id]
if "image" in item or "video" in item:
if set_id[0] != "sqa":
assert (
len(set_id) == count_image_path + count_video_path
), f"{input_file_name}, item {item} set_id numer Not correct"
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu
from vita.config import FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
# Put all dataset dicts into one list
# datasets = [Webvid, K400]
# datasets = [VIDEOChatGPT, K700Split, VC2Internvid]
# datasets = [EgoGesture, Literature, CopyWrite, MovingFashion]
# datasets = [NoHarm]
datasets = [SGInternvid0]
# Initialize a list to store the names of missing files
lost_files = []
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
def check_video_file(item):
video_file_name = item.get("video")
if video_file_name:
video_directory = FolderDict[item["set"]]
video_file_path = os.path.join(video_directory, video_file_name)
if not os.path.exists(video_file_path):
print(f"file lost: {video_file_path}")
return video_file_name
else:
sample_pos = [0, 10]
try:
vreader = VideoReader(video_file_path, ctx=cpu(0))
patch_images = [
Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()
]
except Exception as e:
print(f"file broken: {video_file_path}")
return video_file_name
return None
    # Process items in parallel with a ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(check_video_file, item) for item in data]
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
lost_files.append(result)
# Write the missing file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import math
import os
import random
import torch
import transformers
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
concat_size = 4500
datasets = [ShareGPT4V]
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
for dataset in datasets:
input_file_name = dataset["chat_path"]
base_name, ext = os.path.splitext(input_file_name)
suffix = f"-concat{concat_size}"
out_file_name = f"{base_name}{suffix}{ext}"
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
random.shuffle(data)
# data = data[:100]
    # Iterate over each item
len_list = []
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
len_list = []
# Apply prompt templates
for item in tqdm(data):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
# import pdb; pdb.set_trace()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
num_images = (input_ids == IMAGE_TOKEN_INDEX).sum()
item_token_num = input_ids.shape[0] + num_images * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
            # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
            # If audio_files is a list, process each file
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
len_list.append(item_token_num)
assert len(len_list) == len(data)
def concat_item(items):
temp_set_id = []
temp_conversations = []
temp_ids = []
temp_images = []
temp_audios = []
for item in items:
temp_set_id.append(item["set"])
temp_conversations.extend(item["conversations"])
if "id" in item:
temp_ids.append(item["id"])
if "image" in item:
temp_images.append(item["image"])
if "audio" in item:
audio = item["audio"]
if type(audio) is not list:
audio = [audio]
temp_audios += audio
if len(temp_images) > 0:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"image": temp_images,
"conversations": temp_conversations,
}
else:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"conversations": temp_conversations,
}
if len(temp_audios) > 0:
merged_item["audio"] = temp_audios
return merged_item
merged_data = []
i = 0
while i < len(data):
len_token = len_list[i]
k = 1
while True:
if sum(len_list[i : i + k]) > concat_size:
if k > 1:
k -= 1
break
if i + k == len(data):
break
k += 1
merged_item = concat_item(data[i : i + k])
merged_data.append(merged_item)
# print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
i = i + k
with open(out_file_name, "w", encoding="utf-8") as f:
json.dump(merged_data, f, ensure_ascii=False, indent=4)
print(f"save {out_file_name}")
import json
import math
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
concat_size = 6000
datasets = [ShareGPT4V0]
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
def dynamic_preprocess(
image, min_num=2, max_num=12, image_size=448, use_thumbnail=False, img_mean=0
):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
    # round each element of target_aspect_ratio up to an even number
new_target_aspect_ratio = [e if e % 2 == 0 else e + 1 for e in target_aspect_ratio]
blocks_big = int(0.5 * new_target_aspect_ratio[0] * 0.5 * new_target_aspect_ratio[1])
return blocks_big
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def concat_item(items):
temp_set_id = []
temp_conversations = []
temp_ids = []
temp_images = []
temp_audios = []
for item in items:
temp_set_id.append(item["set"])
temp_conversations.extend(item["conversations"])
if "id" in item:
temp_ids.append(item["id"])
if "image" in item:
temp_images.append(item["image"])
if "audio" in item:
audio = item["audio"]
if type(audio) is not list:
audio = [audio]
temp_audios += audio
if len(temp_images) > 0:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"image": temp_images,
"conversations": temp_conversations,
}
else:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"conversations": temp_conversations,
}
if len(temp_audios) > 0:
merged_item["audio"] = temp_audios
return merged_item
def compute_item_token_num(item):
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
source = item["conversations"]
conv.messages = []
modality = "lang"
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
if "<image>" in sentence["value"]:
modality = "image"
prompt = conv.get_prompt(modality)
# import pdb; pdb.set_trace()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
set_id = item["set"]
image_directory = FolderDict[set_id]
image = Image.open(os.path.join(image_directory, image_file.replace("\\", "/"))).convert(
"RGB"
)
num_patches = dynamic_preprocess(image)
item_token_num = item_token_num + num_patches * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
        # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
        # If audio_files is a list, process each file
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
item["token_len"] = item_token_num
for dataset in datasets:
input_file_name = dataset["chat_path"]
base_name, ext = os.path.splitext(input_file_name)
suffix = f"-FrameConcat{concat_size}"
out_file_name = f"{base_name}{suffix}{ext}"
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
random.shuffle(data)
# data = data[:100]
# for item in tqdm(data):
# compute_item_token_num(item)
with ThreadPoolExecutor() as executor:
futures = [executor.submit(compute_item_token_num, item) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
future.result()
merged_data = []
i = 0
while i < len(data):
len_token = data[i]["token_len"]
k = 1
while True:
if sum([item["token_len"] for item in data[i : i + k]]) > concat_size:
if k > 1:
k -= 1
break
if i + k == len(data):
break
k += 1
merged_item = concat_item(data[i : i + k])
merged_data.append(merged_item)
# print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
i = i + k
with open(out_file_name, "w", encoding="utf-8") as f:
json.dump(merged_data, f, ensure_ascii=False, indent=4)
print(f"save {out_file_name}")
import json
import math
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
concat_size = 6000
datasets = [ShareGPT4V]
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
if use_thumbnail and blocks != 1:
blocks += 1
return blocks
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def concat_item(items):
temp_set_id = []
temp_conversations = []
temp_ids = []
temp_images = []
temp_audios = []
for item in items:
temp_set_id.append(item["set"])
temp_conversations.extend(item["conversations"])
if "id" in item:
temp_ids.append(item["id"])
if "image" in item:
temp_images.append(item["image"])
if "audio" in item:
audio = item["audio"]
if type(audio) is not list:
audio = [audio]
temp_audios += audio
if len(temp_images) > 0:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"image": temp_images,
"conversations": temp_conversations,
}
else:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"conversations": temp_conversations,
}
if len(temp_audios) > 0:
merged_item["audio"] = temp_audios
return merged_item
def compute_item_token_num(item):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
# import pdb; pdb.set_trace()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
set_id = item["set"]
image_directory = FolderDict[set_id]
image = Image.open(os.path.join(image_directory, image_file.replace("\\", "/"))).convert(
"RGB"
)
num_patches = dynamic_preprocess(image)
item_token_num = item_token_num + num_patches * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
        # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
        # If audio_files is a list, process each file
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
item["token_len"] = item_token_num
for dataset in datasets:
input_file_name = dataset["chat_path"]
base_name, ext = os.path.splitext(input_file_name)
suffix = f"-PatchConcat{concat_size}"
out_file_name = f"{base_name}{suffix}{ext}"
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
random.shuffle(data)
# data = data[:100]
# for item in tqdm(data):
# compute_item_token_num(item)
with ThreadPoolExecutor() as executor:
futures = [executor.submit(compute_item_token_num, item) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
future.result()
merged_data = []
i = 0
while i < len(data):
len_token = data[i]["token_len"]
k = 1
while True:
if sum([item["token_len"] for item in data[i : i + k]]) > concat_size:
if k > 1:
k -= 1
break
if i + k == len(data):
break
k += 1
merged_item = concat_item(data[i : i + k])
merged_data.append(merged_item)
# print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
i = i + k
with open(out_file_name, "w", encoding="utf-8") as f:
json.dump(merged_data, f, ensure_ascii=False, indent=4)
print(f"save {out_file_name}")
import json
import os
from vita.constants import GLOBAL_WEIGHTS_PATH
# Define file paths
lost_file_path = "lost_file_name.txt"
json_list = [""]
for json_file_path in json_list:
output_json_file_path = json_file_path
    # Read the names of missing files
with open(lost_file_path, "r", encoding="utf-8") as f:
lost_files = set(line.strip() for line in f)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Filter the data, dropping items that reference missing files
filtered_data = []
for item in data:
audio_OK = True
if "audio" in item:
assert type(item["audio"]) is list
for audio_filename in item["audio"]:
if audio_filename in lost_files:
audio_OK = False
if audio_OK:
filtered_data.append(item)
    # Write the updated data to a new JSON file
with open(output_json_file_path, "w", encoding="utf-8") as f:
json.dump(filtered_data, f, ensure_ascii=False, indent=4)
print(f"更新完成,共删除了 {len(data) - len(filtered_data)} 条数据,结果已保存到 {output_json_file_path}")
import json
from vita.constants import GLOBAL_WEIGHTS_PATH
# Define file paths
# lost_file_path = 'lost_file_name.txt'
lost_file_path = "long_image_file_name.txt"
json_list = [""]
for json_file_path in json_list:
output_json_file_path = json_file_path
    # Read the names of missing files
with open(lost_file_path, "r", encoding="utf-8") as f:
lost_files = set(line.strip() for line in f)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Filter the data, dropping items that reference missing files
filtered_data = []
for item in data:
image_OK = True
if "image" in item:
image_file = item["image"]
if type(image_file) is str:
image_file = [image_file]
assert type(image_file) is list
for image_filename in image_file:
if image_filename in lost_files:
image_OK = False
break
if image_OK:
filtered_data.append(item)
    # Write the updated data to a new JSON file
with open(output_json_file_path, "w", encoding="utf-8") as f:
json.dump(filtered_data, f, ensure_ascii=False, indent=4)
print(f"更新完成,共删除了 {len(data) - len(filtered_data)} 条数据,结果已保存到 {output_json_file_path}")
import json
from vita.constants import GLOBAL_WEIGHTS_PATH
# Define file paths
lost_file_path = "lost_file_name.txt"
json_list = []
for json_file_path in json_list:
output_json_file_path = json_file_path
with open(lost_file_path, "r") as file:
lost_files = set(file.read().splitlines())
# Load the JSON data
with open(json_file_path, "r") as file:
data = json.load(file)
    # Filter the data, dropping items that reference missing files
filtered_data = []
for item in data:
video_OK = True
if "video" in item:
video_filename = item["video"]
if video_filename in lost_files:
video_OK = False
if video_OK:
filtered_data.append(item)
# Save the filtered data back to a new JSON file
with open(output_json_file_path, "w", encoding="utf-8") as file:
json.dump(filtered_data, file, indent=2, ensure_ascii=False)
    print(
        f"Removed {len(data)-len(filtered_data)} items from the JSON data; saved to {output_json_file_path}"
    )
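# ============================================================================
# Next script (boundary inferred): audio duration statistics. Loads every
# referenced audio file, reports clips longer than 200 s, and prints a
# per-dataset histogram of clip durations.
# ============================================================================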
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
# Put all dataset dicts into one list
# datasets = NLP+HumanCentric+VideoQA+NaturalQA
datasets = VideoCap + OCRCap + NaturalCap
# Initialize a list to store the names of missing files
lock = threading.Lock()
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def check_audio(audio_file_name, audio_directory):
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
if duration > 200:
print(audio_file_path, duration)
return duration
# Iterate over each dataset dict
for dataset in datasets:
dur_list = []
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
print(json_file_path)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
audio_files = item.get("audio")
audio_directory = AudioFolder
            # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
            # If audio_files is a list, check each file
if isinstance(audio_files, list):
for audio_file_name in audio_files:
futures.append(executor.submit(check_audio, audio_file_name, audio_directory))
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
duration = future.result()
dur_list.append(duration)
    # Initialize the duration-bucket counters
distribution = {
"0-1": 0,
"1-5": 0,
"5-10": 0,
"10-15": 0,
"15-20": 0,
"20-25": 0,
"25-30": 0,
"30-60": 0,
"60-200": 0,
">200": 0,
}
    # Count how many durations fall in each bucket
for length in dur_list:
if length <= 1:
distribution["0-1"] += 1
elif length <= 5:
distribution["1-5"] += 1
elif length <= 10:
distribution["5-10"] += 1
elif length <= 15:
distribution["10-15"] += 1
elif length <= 20:
distribution["15-20"] += 1
elif length <= 25:
distribution["20-25"] += 1
elif length <= 30:
distribution["25-30"] += 1
elif length <= 60:
distribution["30-60"] += 1
elif length <= 200:
distribution["60-200"] += 1
else:
distribution[">200"] += 1
    # Print the distribution
print(f"duration distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 4500
# datasets = NaturalCap + VideoCap + OCRCap + NaturalQA + VideoQA + HumanCentric + NLP
datasets = (
NaturalCap0
+ OCRCap0
+ VideoCap0
+ NaturalQA0
+ VideoQA0
+ [EgoGesture0, Literature0, CopyWrite0, MovingFashion0]
)
num_data_neg_audio = 0
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
num_data_audio = 0
num_data_conv = 0
num_data_qs_qudio = 0
num_data_qs_text = 0
for item in data:
conversations = item["conversations"]
assert len(conversations) % 2 == 0
num_conv = len(conversations) // 2
num_data_conv += num_conv
num_qs_audio = 0
num_qs_text = 0
for conv in conversations:
if conv["from"] == "human":
qs = conv["value"]
if "<audio>" in qs:
num_qs_audio += 1
else:
num_qs_text += 1
num_data_qs_qudio += num_qs_audio
num_data_qs_text += num_qs_text
num_audio = 0
audio_files = item.get("audio")
audio_directory = AudioFolder
        # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
        # If audio_files is a list, process each file
if isinstance(audio_files, list):
num_audio = len(audio_files)
for audio in audio_files:
if "new_value_dict_0725" in audio or "new_value_dict_0730" in audio:
num_data_neg_audio += 1
num_data_audio += num_audio
assert num_data_conv == num_data_qs_qudio + num_data_qs_text
# print(f'{json_file_path} conversation number: {num_data_conv/1000}K')
# print(f'{json_file_path} audio question number: {num_data_qs_qudio/1000}K')
# print(f'{json_file_path} text question number: {num_data_qs_text/1000}K')
# print(f'{json_file_path} audio number: {num_data_audio/1000}K')
print(f"{json_file_path} data number: {len(data)/1000}K")
    # print(f'{json_file_path} negative audio question number: {num_data_neg_audio/1000}K')
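# ============================================================================
# Next script (boundary inferred): per-item image-count statistics. Prints a
# histogram of how many image paths each item carries and flags items with
# more than 40 images.
# ============================================================================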
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Put all dataset dicts into one list
datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
# Iterate over each dataset dict
for dataset in datasets:
dur_list = []
keys = list(dataset.keys())
input_file_name = dataset["chat_path"]
    # Read the JSON file
len_list = []
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
print(f"check {input_file_name}")
    # Iterate over each item
for item in tqdm(data):
if "image" in item:
image_path = item["image"]
assert isinstance(image_path[0], str)
if type(image_path) is not list:
assert isinstance(image_path, str)
image_path = [image_path]
count_image_path = len(image_path)
if count_image_path > 40:
print(count_image_path)
print(item)
len_list.append(count_image_path)
distribution = {
"0-5": 0,
"5-10": 0,
"10-16": 0,
"16-20": 0,
"20-25": 0,
"25-30": 0,
"30-35": 0,
"35-40": 0,
">40": 0,
}
for length in len_list:
if length <= 5:
distribution["0-5"] += 1
elif length <= 10:
distribution["5-10"] += 1
elif length <= 16:
distribution["10-16"] += 1
elif length <= 20:
distribution["16-20"] += 1
elif length <= 25:
distribution["20-25"] += 1
elif length <= 30:
distribution["25-30"] += 1
elif length <= 35:
distribution["30-35"] += 1
elif length <= 40:
distribution["35-40"] += 1
else:
distribution[">40"] += 1
print(f"Length distribution of {input_file_name}:")
for key, value in distribution.items():
print(f"{key}: {value}")
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 4500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = [DyChart_iresearch, RCTW2019QA, Lvis_cn_noDesc, VIDEOChatGPT]
datasets = [AnyWord_20to50]
out_file_name = "debug.json"
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
long_json = []
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def process_item(item, conv, roles, tokenizer):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
num_images = (input_ids == IMAGE_TOKEN_INDEX).sum()
item_token_num = input_ids.shape[0] + num_images * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
if isinstance(audio_files, str):
audio_files = [audio_files]
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
if item_token_num > token_thre:
print(item_token_num)
if len(item["image"]) >= 16:
long_json.append(item)
print(len(item["image"]))
return item_token_num
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
len_list = []
with ThreadPoolExecutor() as executor:
futures = [executor.submit(process_item, item, conv, roles, tokenizer) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
len_list.append(future.result())
assert len(len_list) == len(data)
distribution = {
"0-100": 0,
"100-200": 0,
"200-300": 0,
"300-400": 0,
"400-500": 0,
"500-600": 0,
"600-700": 0,
"700-800": 0,
"800-900": 0,
"900-1000": 0,
"1000-1100": 0,
"1100-1200": 0,
"1200-1300": 0,
"1300-1400": 0,
"1400-1500": 0,
"1500-1600": 0,
"1600-1700": 0,
"1700-1800": 0,
"1800-1900": 0,
"1900-2000": 0,
"2000-2500": 0,
"2500-3000": 0,
"3000-3500": 0,
"3500-4000": 0,
"4000-4500": 0,
"4500-5000": 0,
"5000-5500": 0,
"5500-6000": 0,
">6000": 0,
}
for length in len_list:
if length <= 100:
distribution["0-100"] += 1
elif length <= 200:
distribution["100-200"] += 1
elif length <= 300:
distribution["200-300"] += 1
elif length <= 400:
distribution["300-400"] += 1
elif length <= 500:
distribution["400-500"] += 1
elif length <= 600:
distribution["500-600"] += 1
elif length <= 700:
distribution["600-700"] += 1
elif length <= 800:
distribution["700-800"] += 1
elif length <= 900:
distribution["800-900"] += 1
elif length <= 1000:
distribution["900-1000"] += 1
elif length <= 1100:
distribution["1000-1100"] += 1
elif length <= 1200:
distribution["1100-1200"] += 1
elif length <= 1300:
distribution["1200-1300"] += 1
elif length <= 1400:
distribution["1300-1400"] += 1
elif length <= 1500:
distribution["1400-1500"] += 1
elif length <= 1600:
distribution["1500-1600"] += 1
elif length <= 1700:
distribution["1600-1700"] += 1
elif length <= 1800:
distribution["1700-1800"] += 1
elif length <= 1900:
distribution["1800-1900"] += 1
elif length <= 2000:
distribution["1900-2000"] += 1
elif length <= 2500:
distribution["2000-2500"] += 1
elif length <= 3000:
distribution["2500-3000"] += 1
elif length <= 3500:
distribution["3000-3500"] += 1
elif length <= 4000:
distribution["3500-4000"] += 1
elif length <= 4500:
distribution["4000-4500"] += 1
elif length <= 5000:
distribution["4500-5000"] += 1
elif length <= 5500:
distribution["5000-5500"] += 1
elif length <= 6000:
distribution["5500-6000"] += 1
else:
distribution[">6000"] += 1
print(f"Length distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
# with open(out_file_name, 'w', encoding='utf-8') as file:
# json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"处理完成,大于{token_thre}的已保存到{out_file_name}")
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 9500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = NaturalCap0 + OCRCap0 + VideoCap0 + NaturalQA0
# datasets = VideoQA + HumanCentric + NLP
# datasets = [SGInternvid0]
datasets = NaturalCap0
datasets = OCRCap0
datasets = VideoCap0 + NaturalQA0 + [TextSFT0]
out_file_name = "debug.json"
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
long_json = []
def dynamic_preprocess(
image, min_num=2, max_num=12, image_size=448, use_thumbnail=False, img_mean=0
):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
    # round each element of target_aspect_ratio up to an even number
new_target_aspect_ratio = [e if e % 2 == 0 else e + 1 for e in target_aspect_ratio]
blocks_big = int(0.5 * new_target_aspect_ratio[0] * 0.5 * new_target_aspect_ratio[1])
return blocks_big
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def process_item(item, tokenizer):
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
source = item["conversations"]
conv.messages = []
modality = "lang"
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
if "<image>" in sentence["value"]:
modality = "image"
elif "<video>" in sentence["value"]:
modality = "lang"
prompt = conv.get_prompt(modality)
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
if isinstance(image_file, str):
image_file = [image_file]
set_id = item["set"]
if isinstance(set_id, str):
set_id = [set_id]
for k, img_file in enumerate(image_file):
if set_id[k] not in NoPatchSets:
image_directory = FolderDict[set_id[k]]
image = Image.open(
os.path.join(image_directory, img_file.replace("\\", "/"))
).convert("RGB")
num_patches = dynamic_preprocess(image)
else:
num_patches = 1
item_token_num += num_patches * image_token_num
total_duration = 0
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
if isinstance(audio_files, str):
audio_files = [audio_files]
assert isinstance(audio_files, list)
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
if item_token_num > token_thre:
print(f"item_token_num: {item_token_num}")
if len(item["image"]) >= 16:
print(f"num_patches: {num_patches}")
print(f"total_duration: {total_duration}")
long_json.append(item)
print(item)
return item_token_num
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
len_list = []
with ThreadPoolExecutor() as executor:
futures = [executor.submit(process_item, item, tokenizer) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
len_list.append(future.result())
assert len(len_list) == len(data)
distribution = {
"0-100": 0,
"100-200": 0,
"200-300": 0,
"300-400": 0,
"400-500": 0,
"500-600": 0,
"600-700": 0,
"700-800": 0,
"800-900": 0,
"900-1000": 0,
"1000-1500": 0,
"1500-2000": 0,
"2000-2500": 0,
"2500-3000": 0,
"3000-3500": 0,
"3500-4000": 0,
"4000-4500": 0,
"4500-5000": 0,
"5000-5500": 0,
"5500-6000": 0,
"6000-6500": 0,
"6500-7000": 0,
"7000-7500": 0,
"7500-8000": 0,
"8000-8500": 0,
"8500-9000": 0,
"9000-9500": 0,
"9500-10000": 0,
">10000": 0,
}
for length in len_list:
if length <= 100:
distribution["0-100"] += 1
elif length <= 200:
distribution["100-200"] += 1
elif length <= 300:
distribution["200-300"] += 1
elif length <= 400:
distribution["300-400"] += 1
elif length <= 500:
distribution["400-500"] += 1
elif length <= 600:
distribution["500-600"] += 1
elif length <= 700:
distribution["600-700"] += 1
elif length <= 800:
distribution["700-800"] += 1
elif length <= 900:
distribution["800-900"] += 1
elif length <= 1000:
distribution["900-1000"] += 1
elif length <= 1500:
distribution["1000-1500"] += 1
elif length <= 2000:
distribution["1500-2000"] += 1
elif length <= 2500:
distribution["2000-2500"] += 1
elif length <= 3000:
distribution["2500-3000"] += 1
elif length <= 3500:
distribution["3000-3500"] += 1
elif length <= 4000:
distribution["3500-4000"] += 1
elif length <= 4500:
distribution["4000-4500"] += 1
elif length <= 5000:
distribution["4500-5000"] += 1
elif length <= 5500:
distribution["5000-5500"] += 1
elif length <= 6000:
distribution["5500-6000"] += 1
elif length <= 6500:
distribution["6000-6500"] += 1
elif length <= 7000:
distribution["6500-7000"] += 1
elif length <= 7500:
distribution["7000-7500"] += 1
elif length <= 8000:
distribution["7500-8000"] += 1
elif length <= 8500:
distribution["8000-8500"] += 1
elif length <= 9000:
distribution["8500-9000"] += 1
elif length <= 9500:
distribution["9000-9500"] += 1
elif length <= 10000:
distribution["9500-10000"] += 1
else:
distribution[">10000"] += 1
print(f"Length distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
# with open(out_file_name, 'w', encoding='utf-8') as file:
# json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"处理完成,大于{token_thre}的已保存到{out_file_name}")