Commit 112bf76b authored by chenzk (v1.0)
# Mixtral
CUDA_VISIBLE_DEVICES=7 python mixtral_inference.py
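The JSON below appears to be a sample annotation entry in the format the data-check and concatenation scripts in this commit consume: a record with `set`, `id`, and `conversations` fields plus optional `image` and `audio` file references.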
[
{
"set": "sharegpt4",
"id": "000000000164",
"conversations": [
{
"from": "human",
"value": "<image>\ninput_wavs/promp0.wav\n"
},
{
"from": "gpt",
"value": "This is a well-organized kitchen with a clean, modern aesthetic. The kitchen features a white countertop against a white wall, creating a bright and airy atmosphere. "
}
],
"image": "coco/images/train2017/000000000164.jpg",
"audio": [
"audio0.wav"
]
}
]
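# ============================================================================
# Next script (boundary inferred from the concatenated commit view): audio
# integrity check. Verifies that every audio file referenced by the selected
# datasets exists under AudioFolder/audio and that its duration lies within
# [dur_thre2, dur_thre1] seconds; offending names go to lost_file_name.txt.
# ============================================================================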
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
dur_thre1 = 30
dur_thre2 = 0.5
# Put all dataset dicts into one list
# datasets = NLP + HumanCentric + VideoQA + NaturalQA +VideoCap + OCRCap + NaturalCap
# datasets = NaturalCap + VideoCap + OCRCap + NaturalQA + VideoQA + HumanCentric + [TextSFT]
datasets = NaturalCap + VideoCap
datasets = OCRCap + NaturalQA
datasets = VideoQA + HumanCentric + [TextSFT]
datasets = [TextSFT]
# Initialize a list to store the names of missing files
lost_files = []
lock = threading.Lock()
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def check_audio(audio_file_name, audio_directory):
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
if not os.path.exists(audio_file_path):
print(f"{audio_file_path} lost!!!!!!!!")
return audio_file_name
else:
try:
duration = get_wav_duration(audio_file_path)
if duration > dur_thre1 or duration < dur_thre2:
print(f"{audio_file_path} duration {duration}, too long!!!!!!!")
return audio_file_name
except Exception as e:
print(f"{audio_file_path} is broken!!!!!!!!")
return audio_file_name
return None
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
print(json_file_path)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
audio_files = item.get("audio")
audio_directory = AudioFolder
            # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
            # If audio_files is a list, check each file
if isinstance(audio_files, list):
for audio_file_name in audio_files:
futures.append(executor.submit(check_audio, audio_file_name, audio_directory))
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
with lock:
lost_files.append(result)
# Write the missing file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
from vita.config import FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
# Put all dataset dicts into one list
datasets = [ShareGPT4V]
# Initialize a list to store the names of missing files
lost_files = []
lock = threading.Lock()
def check_image(image_file_name, image_directory):
image_file_path = os.path.join(image_directory, image_file_name)
if not os.path.exists(image_file_path):
return image_file_name
else:
try:
with Image.open(image_file_path) as img:
img.convert("RGB")
except Exception as e:
return image_file_name
return None
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
print(json_file_path)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
if "image" in item:
image_files = item.get("image")
set_id = item["set"]
if type(set_id) is list:
set_id = set_id[0]
image_directory = FolderDict[set_id]
                # If image_files is a string, convert it to a list
if isinstance(image_files, str):
image_files = [image_files]
                # If image_files is a list, check each file
if isinstance(image_files, list):
for image_file_name in image_files:
futures.append(
executor.submit(check_image, image_file_name, image_directory)
)
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
with lock:
lost_files.append(result)
print(f"file lost: {result}")
# Write the missing file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
from vita.config import FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "long_image_file_name.txt"
ratio_thre = 12
# Put all dataset dicts into one list
# datasets = [AnyWord_20to50, RCTW2019, RCTW2019QA, RCTW2017, OpenChart, SCID, K12, TabRECSet, DigChat, iFlyTab]
datasets = [AnyWord_20to50, DyChart_iresearch]
datasets = [RCTW2019, RCTW2019QA, RCTW2017]
datasets = [OpenChart, SCID]
datasets = [K12]
# datasets = [TabRECSet, DigChat, iFlyTab]
# Initialize a list to store the names of offending files
lost_files = []
lock = threading.Lock()
def check_image(image_file_name, image_directory):
image_file_path = os.path.join(image_directory, image_file_name)
if not os.path.exists(image_file_path):
print(f"{image_file_path} not exist!!!!!!!!!!")
return image_file_name
else:
try:
with Image.open(image_file_path) as img:
img.convert("RGB")
size_ratio = img.size[0] / img.size[1]
if size_ratio < 1 / ratio_thre or size_ratio > ratio_thre:
print(f"{image_file_path} ratio is too big!!!!!!!!!!!!!!")
return image_file_name
except Exception as e:
print(f"{image_file_path} is broken!!!!!!!!!!!!")
return image_file_name
return None
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
image_files = item.get("image")
set_id = item["set"]
image_directory = FolderDict[set_id]
            # If image_files is a string, convert it to a list
if isinstance(image_files, str):
image_files = [image_files]
            # If image_files is a list, check each file
if isinstance(image_files, list):
for image_file_name in image_files:
futures.append(executor.submit(check_image, image_file_name, image_directory))
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
with lock:
lost_files.append(result)
# Write the offending file names to the output file
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Put all dataset dicts into one list
datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = [Webvid]
# Iterate over each dataset dict
for dataset in datasets:
dur_list = []
keys = list(dataset.keys())
input_file_name = dataset["chat_path"]
    # Read the JSON file
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
print(f"check {input_file_name}")
    # Iterate over each item
for item in tqdm(data):
        # check that the item has a set id
        assert "set" in item, f"{input_file_name} does not have set_id: {item} !!!!!!!!!!"
        # check that the item is not empty
        assert len(item) > 0, f"{input_file_name} has an empty item!!!!!!!!!!"
        # check that no key has an empty value
for key in item.keys():
if type(item[key]) is not int and key != "id":
assert (
len(item[key]) > 0
), f"{input_file_name}, item {item} have null key!!!!!!!!!!{key}"
        # check for empty values in item['conversations']
for conv in item["conversations"]:
text = conv["value"]
if len(text) == 0:
print(f"{input_file_name}, item {item} has null speaking!!!")
        # check that the numbers of image/video paths, set_ids, and placeholders are consistent
count_image_ph = 0
count_video_ph = 0
count_audio_ph = 0
count_image_path = 0
count_video_path = 0
count_audio_path = 0
text_all = ""
for conv in item["conversations"]:
text = conv["value"]
text_all += text
count_image_ph = text_all.count("<image>")
count_video_ph = text_all.count("<video>")
count_audio_ph = text_all.count("<audio>")
if "image" in item:
image_path = item["image"]
assert isinstance(image_path[0], str)
if type(image_path) is not list:
assert isinstance(image_path, str)
image_path = [image_path]
count_image_path = len(image_path)
if "video" in item:
video_path = item["video"]
assert isinstance(video_path[0], str)
if type(video_path) is not list:
assert isinstance(video_path, str)
video_path = [video_path]
count_video_path = len(video_path)
if "audio" in item:
audio_path = item["audio"]
assert isinstance(audio_path[0], str)
if type(audio_path) is not list:
assert isinstance(audio_path, str)
audio_path = [audio_path]
count_audio_path = len(audio_path)
# assert count_image_path == count_image_ph, f"{input_file_name}, item {item} image place holder number NOT equal image file number"
# assert count_video_path == count_video_ph, f"{input_file_name}, item {item} video place holder number NOT equal video file number"
# assert count_audio_path == count_audio_ph, f"{input_file_name}, item {item} audio place holder number NOT equal audio file number"
if count_image_path != count_image_ph:
print(
f"{input_file_name}, item {item} image place holder number NOT equal image file number"
)
if count_video_path != count_video_ph:
print(
f"{input_file_name}, item {item} video place holder number NOT equal video file number"
)
if count_audio_path != count_audio_ph:
print(
f"{input_file_name}, item {item} audio place holder number NOT equal audio file number"
)
set_id = item["set"]
if type(set_id) is not list:
set_id = [set_id]
if "image" in item or "video" in item:
if set_id[0] != "sqa":
assert (
len(set_id) == count_image_path + count_video_path
), f"{input_file_name}, item {item} set_id numer Not correct"
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu
from vita.config import FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
# Put all dataset dicts into one list
# datasets = [Webvid, K400]
# datasets = [VIDEOChatGPT, K700Split, VC2Internvid]
# datasets = [EgoGesture, Literature, CopyWrite, MovingFashion]
# datasets = [NoHarm]
datasets = [SGInternvid0]
# Initialize a list to store the names of missing files
lost_files = []
# Iterate over each dataset dict
for dataset in datasets:
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
def check_video_file(item):
video_file_name = item.get("video")
if video_file_name:
video_directory = FolderDict[item["set"]]
video_file_path = os.path.join(video_directory, video_file_name)
if not os.path.exists(video_file_path):
print(f"file lost: {video_file_path}")
return video_file_name
else:
sample_pos = [0, 10]
try:
vreader = VideoReader(video_file_path, ctx=cpu(0))
patch_images = [
Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()
]
except Exception as e:
print(f"file broken: {video_file_path}")
return video_file_name
return None
    # Process items in parallel with a ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(check_video_file, item) for item in data]
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
result = future.result()
if result:
lost_files.append(result)
# Write the missing file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
for file_name in lost_files:
f.write(file_name + "\n")
print(f"检查完成,共有 {len(lost_files)} 个文件丢失或无法读取,结果已保存到 {output_file_path}")
import json
import math
import os
import random
import torch
import transformers
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
concat_size = 4500
datasets = [ShareGPT4V]
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
for dataset in datasets:
input_file_name = dataset["chat_path"]
base_name, ext = os.path.splitext(input_file_name)
suffix = f"-concat{concat_size}"
out_file_name = f"{base_name}{suffix}{ext}"
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
random.shuffle(data)
# data = data[:100]
    # Iterate over each item
len_list = []
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
len_list = []
# Apply prompt templates
for item in tqdm(data):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
# import pdb; pdb.set_trace()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
num_images = (input_ids == IMAGE_TOKEN_INDEX).sum()
item_token_num = input_ids.shape[0] + num_images * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
            # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
            # If audio_files is a list, process each file
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
len_list.append(item_token_num)
assert len(len_list) == len(data)
def concat_item(items):
temp_set_id = []
temp_conversations = []
temp_ids = []
temp_images = []
temp_audios = []
for item in items:
temp_set_id.append(item["set"])
temp_conversations.extend(item["conversations"])
if "id" in item:
temp_ids.append(item["id"])
if "image" in item:
temp_images.append(item["image"])
if "audio" in item:
audio = item["audio"]
if type(audio) is not list:
audio = [audio]
temp_audios += audio
if len(temp_images) > 0:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"image": temp_images,
"conversations": temp_conversations,
}
else:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"conversations": temp_conversations,
}
if len(temp_audios) > 0:
merged_item["audio"] = temp_audios
return merged_item
merged_data = []
i = 0
while i < len(data):
len_token = len_list[i]
k = 1
while True:
if sum(len_list[i : i + k]) > concat_size:
if k > 1:
k -= 1
break
if i + k == len(data):
break
k += 1
merged_item = concat_item(data[i : i + k])
merged_data.append(merged_item)
# print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
i = i + k
with open(out_file_name, "w", encoding="utf-8") as f:
json.dump(merged_data, f, ensure_ascii=False, indent=4)
print(f"save {out_file_name}")
import json
import math
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
concat_size = 6000
datasets = [ShareGPT4V0]
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
def dynamic_preprocess(
image, min_num=2, max_num=12, image_size=448, use_thumbnail=False, img_mean=0
):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
    # round each element of target_aspect_ratio up to an even number
new_target_aspect_ratio = [e if e % 2 == 0 else e + 1 for e in target_aspect_ratio]
blocks_big = int(0.5 * new_target_aspect_ratio[0] * 0.5 * new_target_aspect_ratio[1])
return blocks_big
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def concat_item(items):
temp_set_id = []
temp_conversations = []
temp_ids = []
temp_images = []
temp_audios = []
for item in items:
temp_set_id.append(item["set"])
temp_conversations.extend(item["conversations"])
if "id" in item:
temp_ids.append(item["id"])
if "image" in item:
temp_images.append(item["image"])
if "audio" in item:
audio = item["audio"]
if type(audio) is not list:
audio = [audio]
temp_audios += audio
if len(temp_images) > 0:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"image": temp_images,
"conversations": temp_conversations,
}
else:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"conversations": temp_conversations,
}
if len(temp_audios) > 0:
merged_item["audio"] = temp_audios
return merged_item
def compute_item_token_num(item):
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
source = item["conversations"]
conv.messages = []
modality = "lang"
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
if "<image>" in sentence["value"]:
modality = "image"
prompt = conv.get_prompt(modality)
# import pdb; pdb.set_trace()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
set_id = item["set"]
image_directory = FolderDict[set_id]
image = Image.open(os.path.join(image_directory, image_file.replace("\\", "/"))).convert(
"RGB"
)
num_patches = dynamic_preprocess(image)
item_token_num = item_token_num + num_patches * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
        # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
        # If audio_files is a list, process each file
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
item["token_len"] = item_token_num
for dataset in datasets:
input_file_name = dataset["chat_path"]
base_name, ext = os.path.splitext(input_file_name)
suffix = f"-FrameConcat{concat_size}"
out_file_name = f"{base_name}{suffix}{ext}"
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
random.shuffle(data)
# data = data[:100]
# for item in tqdm(data):
# compute_item_token_num(item)
with ThreadPoolExecutor() as executor:
futures = [executor.submit(compute_item_token_num, item) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
future.result()
merged_data = []
i = 0
while i < len(data):
len_token = data[i]["token_len"]
k = 1
while True:
if sum([item["token_len"] for item in data[i : i + k]]) > concat_size:
if k > 1:
k -= 1
break
if i + k == len(data):
break
k += 1
merged_item = concat_item(data[i : i + k])
merged_data.append(merged_item)
# print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
i = i + k
with open(out_file_name, "w", encoding="utf-8") as f:
json.dump(merged_data, f, ensure_ascii=False, indent=4)
print(f"save {out_file_name}")
import json
import math
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
concat_size = 6000
datasets = [ShareGPT4V]
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
if use_thumbnail and blocks != 1:
blocks += 1
return blocks
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def concat_item(items):
temp_set_id = []
temp_conversations = []
temp_ids = []
temp_images = []
temp_audios = []
for item in items:
temp_set_id.append(item["set"])
temp_conversations.extend(item["conversations"])
if "id" in item:
temp_ids.append(item["id"])
if "image" in item:
temp_images.append(item["image"])
if "audio" in item:
audio = item["audio"]
if type(audio) is not list:
audio = [audio]
temp_audios += audio
if len(temp_images) > 0:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"image": temp_images,
"conversations": temp_conversations,
}
else:
merged_item = {
"set": temp_set_id,
"id": temp_ids,
"conversations": temp_conversations,
}
if len(temp_audios) > 0:
merged_item["audio"] = temp_audios
return merged_item
def compute_item_token_num(item):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
# import pdb; pdb.set_trace()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
set_id = item["set"]
image_directory = FolderDict[set_id]
image = Image.open(os.path.join(image_directory, image_file.replace("\\", "/"))).convert(
"RGB"
)
num_patches = dynamic_preprocess(image)
item_token_num = item_token_num + num_patches * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
        # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
        # If audio_files is a list, process each file
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
item["token_len"] = item_token_num
for dataset in datasets:
input_file_name = dataset["chat_path"]
base_name, ext = os.path.splitext(input_file_name)
suffix = f"-PatchConcat{concat_size}"
out_file_name = f"{base_name}{suffix}{ext}"
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
random.shuffle(data)
# data = data[:100]
# for item in tqdm(data):
# compute_item_token_num(item)
with ThreadPoolExecutor() as executor:
futures = [executor.submit(compute_item_token_num, item) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
future.result()
merged_data = []
i = 0
while i < len(data):
len_token = data[i]["token_len"]
k = 1
while True:
if sum([item["token_len"] for item in data[i : i + k]]) > concat_size:
if k > 1:
k -= 1
break
if i + k == len(data):
break
k += 1
merged_item = concat_item(data[i : i + k])
merged_data.append(merged_item)
# print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
i = i + k
with open(out_file_name, "w", encoding="utf-8") as f:
json.dump(merged_data, f, ensure_ascii=False, indent=4)
print(f"save {out_file_name}")
import json
import os
from vita.constants import GLOBAL_WEIGHTS_PATH
# Define file paths
lost_file_path = "lost_file_name.txt"
json_list = [""]
for json_file_path in json_list:
output_json_file_path = json_file_path
    # Read the names of missing files
with open(lost_file_path, "r", encoding="utf-8") as f:
lost_files = set(line.strip() for line in f)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Filter the data, dropping items that reference missing files
filtered_data = []
for item in data:
audio_OK = True
if "audio" in item:
assert type(item["audio"]) is list
for audio_filename in item["audio"]:
if audio_filename in lost_files:
audio_OK = False
if audio_OK:
filtered_data.append(item)
    # Write the updated data to a new JSON file
with open(output_json_file_path, "w", encoding="utf-8") as f:
json.dump(filtered_data, f, ensure_ascii=False, indent=4)
print(f"更新完成,共删除了 {len(data) - len(filtered_data)} 条数据,结果已保存到 {output_json_file_path}")
import json
from vita.constants import GLOBAL_WEIGHTS_PATH
# Define file paths
# lost_file_path = 'lost_file_name.txt'
lost_file_path = "long_image_file_name.txt"
json_list = [""]
for json_file_path in json_list:
output_json_file_path = json_file_path
    # Read the names of missing files
with open(lost_file_path, "r", encoding="utf-8") as f:
lost_files = set(line.strip() for line in f)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Filter the data, dropping items that reference missing files
filtered_data = []
for item in data:
image_OK = True
if "image" in item:
image_file = item["image"]
if type(image_file) is str:
image_file = [image_file]
assert type(image_file) is list
for image_filename in image_file:
if image_filename in lost_files:
image_OK = False
break
if image_OK:
filtered_data.append(item)
    # Write the updated data to a new JSON file
with open(output_json_file_path, "w", encoding="utf-8") as f:
json.dump(filtered_data, f, ensure_ascii=False, indent=4)
print(f"更新完成,共删除了 {len(data) - len(filtered_data)} 条数据,结果已保存到 {output_json_file_path}")
import json
from vita.constants import GLOBAL_WEIGHTS_PATH
# Define file paths
lost_file_path = "lost_file_name.txt"
json_list = []
for json_file_path in json_list:
output_json_file_path = json_file_path
with open(lost_file_path, "r") as file:
lost_files = set(file.read().splitlines())
# Load the JSON data
with open(json_file_path, "r") as file:
data = json.load(file)
    # Filter the data, dropping items that reference missing files
filtered_data = []
for item in data:
video_OK = True
if "video" in item:
video_filename = item["video"]
if video_filename in lost_files:
video_OK = False
if video_OK:
filtered_data.append(item)
# Save the filtered data back to a new JSON file
with open(output_json_file_path, "w", encoding="utf-8") as file:
json.dump(filtered_data, file, indent=2, ensure_ascii=False)
    print(
        f"Removed {len(data)-len(filtered_data)} items from the JSON data; saved to {output_json_file_path}"
    )
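# ============================================================================
# Next script (boundary inferred): audio duration statistics. Loads every
# referenced audio file, reports clips longer than 200 s, and prints a
# per-dataset histogram of clip durations.
# ============================================================================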
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Define file paths
output_file_path = "lost_file_name.txt"
# Put all dataset dicts into one list
# datasets = NLP+HumanCentric+VideoQA+NaturalQA
datasets = VideoCap + OCRCap + NaturalCap
# Initialize a list to store the names of missing files
lock = threading.Lock()
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def check_audio(audio_file_name, audio_directory):
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
if duration > 200:
print(audio_file_path, duration)
return duration
# Iterate over each dataset dict
for dataset in datasets:
dur_list = []
keys = list(dataset.keys())
json_file_path = dataset["chat_path"]
print(json_file_path)
    # Read the JSON file
with open(json_file_path, "r", encoding="utf-8") as f:
data = json.load(f)
    # Iterate over each item, showing a progress bar with tqdm
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for item in data:
audio_files = item.get("audio")
audio_directory = AudioFolder
            # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
            # If audio_files is a list, check each file
if isinstance(audio_files, list):
for audio_file_name in audio_files:
futures.append(executor.submit(check_audio, audio_file_name, audio_directory))
for future in tqdm(
as_completed(futures), total=len(futures), desc="Processing", unit="file"
):
duration = future.result()
dur_list.append(duration)
    # Initialize the duration-bucket counters
distribution = {
"0-1": 0,
"1-5": 0,
"5-10": 0,
"10-15": 0,
"15-20": 0,
"20-25": 0,
"25-30": 0,
"30-60": 0,
"60-200": 0,
">200": 0,
}
    # Count how many durations fall in each bucket
for length in dur_list:
if length <= 1:
distribution["0-1"] += 1
elif length <= 5:
distribution["1-5"] += 1
elif length <= 10:
distribution["5-10"] += 1
elif length <= 15:
distribution["10-15"] += 1
elif length <= 20:
distribution["15-20"] += 1
elif length <= 25:
distribution["20-25"] += 1
elif length <= 30:
distribution["25-30"] += 1
elif length <= 60:
distribution["30-60"] += 1
elif length <= 200:
distribution["60-200"] += 1
else:
distribution[">200"] += 1
    # Print the distribution
print(f"duration distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 4500
# datasets = NaturalCap + VideoCap + OCRCap + NaturalQA + VideoQA + HumanCentric + NLP
datasets = (
NaturalCap0
+ OCRCap0
+ VideoCap0
+ NaturalQA0
+ VideoQA0
+ [EgoGesture0, Literature0, CopyWrite0, MovingFashion0]
)
num_data_neg_audio = 0
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
num_data_audio = 0
num_data_conv = 0
num_data_qs_qudio = 0
num_data_qs_text = 0
for item in data:
conversations = item["conversations"]
assert len(conversations) % 2 == 0
num_conv = len(conversations) // 2
num_data_conv += num_conv
num_qs_audio = 0
num_qs_text = 0
for conv in conversations:
if conv["from"] == "human":
qs = conv["value"]
if "<audio>" in qs:
num_qs_audio += 1
else:
num_qs_text += 1
num_data_qs_qudio += num_qs_audio
num_data_qs_text += num_qs_text
num_audio = 0
audio_files = item.get("audio")
audio_directory = AudioFolder
        # If audio_files is a string, convert it to a list
if isinstance(audio_files, str):
audio_files = [audio_files]
        # If audio_files is a list, process each file
if isinstance(audio_files, list):
num_audio = len(audio_files)
for audio in audio_files:
if "new_value_dict_0725" in audio or "new_value_dict_0730" in audio:
num_data_neg_audio += 1
num_data_audio += num_audio
assert num_data_conv == num_data_qs_qudio + num_data_qs_text
# print(f'{json_file_path} conversation number: {num_data_conv/1000}K')
# print(f'{json_file_path} audio question number: {num_data_qs_qudio/1000}K')
# print(f'{json_file_path} text question number: {num_data_qs_text/1000}K')
# print(f'{json_file_path} audio number: {num_data_audio/1000}K')
print(f"{json_file_path} data number: {len(data)/1000}K")
    # print(f'{json_file_path} negative audio question number: {num_data_neg_audio/1000}K')
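# ============================================================================
# Next script (boundary inferred): per-item image-count statistics. Prints a
# histogram of how many image paths each item carries and flags items with
# more than 40 images.
# ============================================================================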
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
# Put all dataset dicts into one list
datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
# Iterate over each dataset dict
for dataset in datasets:
dur_list = []
keys = list(dataset.keys())
input_file_name = dataset["chat_path"]
    # Read the JSON file
len_list = []
with open(input_file_name, "r", encoding="utf-8") as file:
data = json.load(file)
print(f"check {input_file_name}")
    # Iterate over each item
for item in tqdm(data):
if "image" in item:
image_path = item["image"]
assert isinstance(image_path[0], str)
if type(image_path) is not list:
assert isinstance(image_path, str)
image_path = [image_path]
count_image_path = len(image_path)
if count_image_path > 40:
print(count_image_path)
print(item)
len_list.append(count_image_path)
distribution = {
"0-5": 0,
"5-10": 0,
"10-16": 0,
"16-20": 0,
"20-25": 0,
"25-30": 0,
"30-35": 0,
"35-40": 0,
">40": 0,
}
for length in len_list:
if length <= 5:
distribution["0-5"] += 1
elif length <= 10:
distribution["5-10"] += 1
elif length <= 16:
distribution["10-16"] += 1
elif length <= 20:
distribution["16-20"] += 1
elif length <= 25:
distribution["20-25"] += 1
elif length <= 30:
distribution["25-30"] += 1
elif length <= 35:
distribution["30-35"] += 1
elif length <= 40:
distribution["35-40"] += 1
else:
distribution[">40"] += 1
print(f"Length distribution of {input_file_name}:")
for key, value in distribution.items():
print(f"{key}: {value}")
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 4500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = [DyChart_iresearch, RCTW2019QA, Lvis_cn_noDesc, VIDEOChatGPT]
datasets = [AnyWord_20to50]
out_file_name = "debug.json"
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
long_json = []
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def process_item(item, conv, roles, tokenizer):
source = item["conversations"]
conv.messages = []
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
num_images = (input_ids == IMAGE_TOKEN_INDEX).sum()
item_token_num = input_ids.shape[0] + num_images * image_token_num
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
if isinstance(audio_files, str):
audio_files = [audio_files]
assert isinstance(audio_files, list)
total_duration = 0
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
if item_token_num > token_thre:
print(item_token_num)
if len(item["image"]) >= 16:
long_json.append(item)
print(len(item["image"]))
return item_token_num
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
len_list = []
with ThreadPoolExecutor() as executor:
futures = [executor.submit(process_item, item, conv, roles, tokenizer) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
len_list.append(future.result())
assert len(len_list) == len(data)
distribution = {
"0-100": 0,
"100-200": 0,
"200-300": 0,
"300-400": 0,
"400-500": 0,
"500-600": 0,
"600-700": 0,
"700-800": 0,
"800-900": 0,
"900-1000": 0,
"1000-1100": 0,
"1100-1200": 0,
"1200-1300": 0,
"1300-1400": 0,
"1400-1500": 0,
"1500-1600": 0,
"1600-1700": 0,
"1700-1800": 0,
"1800-1900": 0,
"1900-2000": 0,
"2000-2500": 0,
"2500-3000": 0,
"3000-3500": 0,
"3500-4000": 0,
"4000-4500": 0,
"4500-5000": 0,
"5000-5500": 0,
"5500-6000": 0,
">6000": 0,
}
for length in len_list:
if length <= 100:
distribution["0-100"] += 1
elif length <= 200:
distribution["100-200"] += 1
elif length <= 300:
distribution["200-300"] += 1
elif length <= 400:
distribution["300-400"] += 1
elif length <= 500:
distribution["400-500"] += 1
elif length <= 600:
distribution["500-600"] += 1
elif length <= 700:
distribution["600-700"] += 1
elif length <= 800:
distribution["700-800"] += 1
elif length <= 900:
distribution["800-900"] += 1
elif length <= 1000:
distribution["900-1000"] += 1
elif length <= 1100:
distribution["1000-1100"] += 1
elif length <= 1200:
distribution["1100-1200"] += 1
elif length <= 1300:
distribution["1200-1300"] += 1
elif length <= 1400:
distribution["1300-1400"] += 1
elif length <= 1500:
distribution["1400-1500"] += 1
elif length <= 1600:
distribution["1500-1600"] += 1
elif length <= 1700:
distribution["1600-1700"] += 1
elif length <= 1800:
distribution["1700-1800"] += 1
elif length <= 1900:
distribution["1800-1900"] += 1
elif length <= 2000:
distribution["1900-2000"] += 1
elif length <= 2500:
distribution["2000-2500"] += 1
elif length <= 3000:
distribution["2500-3000"] += 1
elif length <= 3500:
distribution["3000-3500"] += 1
elif length <= 4000:
distribution["3500-4000"] += 1
elif length <= 4500:
distribution["4000-4500"] += 1
elif length <= 5000:
distribution["4500-5000"] += 1
elif length <= 5500:
distribution["5000-5500"] += 1
elif length <= 6000:
distribution["5500-6000"] += 1
else:
distribution[">6000"] += 1
print(f"Length distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
# with open(out_file_name, 'w', encoding='utf-8') as file:
# json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"处理完成,大于{token_thre}的已保存到{out_file_name}")
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import transformers
from PIL import Image
from tqdm import tqdm
import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token
image_token_num = 256
token_thre = 9500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = NaturalCap0 + OCRCap0 + VideoCap0 + NaturalQA0
# datasets = VideoQA + HumanCentric + NLP
# datasets = [SGInternvid0]
datasets = NaturalCap0
datasets = OCRCap0
datasets = VideoCap0 + NaturalQA0 + [TextSFT0]
out_file_name = "debug.json"
parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
cache_dir=None,
model_max_length=8192,
padding_side="right",
use_fast=True,
)
long_json = []
def dynamic_preprocess(
image, min_num=2, max_num=12, image_size=448, use_thumbnail=False, img_mean=0
):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
    # round each element of target_aspect_ratio up to an even number
new_target_aspect_ratio = [e if e % 2 == 0 else e + 1 for e in target_aspect_ratio]
blocks_big = int(0.5 * new_target_aspect_ratio[0] * 0.5 * new_target_aspect_ratio[1])
return blocks_big
def get_wav_duration(file_path):
waveform, sample_rate = torchaudio.load(file_path)
duration = waveform.size(1) / sample_rate
return duration
def process_item(item, tokenizer):
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
source = item["conversations"]
conv.messages = []
modality = "lang"
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
assert role == conv.roles[j % 2], f"{source}"
conv.append_message(role, sentence["value"])
if "<image>" in sentence["value"]:
modality = "image"
elif "<video>" in sentence["value"]:
modality = "lang"
prompt = conv.get_prompt(modality)
input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
item_token_num = input_ids.shape[0]
if "image" in item:
image_file = item["image"]
if isinstance(image_file, str):
image_file = [image_file]
set_id = item["set"]
if isinstance(set_id, str):
set_id = [set_id]
for k, img_file in enumerate(image_file):
if set_id[k] not in NoPatchSets:
image_directory = FolderDict[set_id[k]]
image = Image.open(
os.path.join(image_directory, img_file.replace("\\", "/"))
).convert("RGB")
num_patches = dynamic_preprocess(image)
else:
num_patches = 1
item_token_num += num_patches * image_token_num
total_duration = 0
if "audio" in item:
audio_files = item["audio"]
audio_directory = AudioFolder
if isinstance(audio_files, str):
audio_files = [audio_files]
assert isinstance(audio_files, list)
for audio_file_name in audio_files:
audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
duration = get_wav_duration(audio_file_path)
duration = (
math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
)
total_duration += duration
item_token_num += math.ceil(total_duration * 12.5)
if item_token_num > token_thre:
print(f"item_token_num: {item_token_num}")
if len(item["image"]) >= 16:
print(f"num_patches: {num_patches}")
print(f"total_duration: {total_duration}")
long_json.append(item)
print(item)
return item_token_num
for dataset in datasets:
json_file_path = dataset["chat_path"]
with open(json_file_path, "r", encoding="utf-8") as file:
data = json.load(file)
len_list = []
with ThreadPoolExecutor() as executor:
futures = [executor.submit(process_item, item, tokenizer) for item in data]
for future in tqdm(as_completed(futures), total=len(futures)):
len_list.append(future.result())
assert len(len_list) == len(data)
distribution = {
"0-100": 0,
"100-200": 0,
"200-300": 0,
"300-400": 0,
"400-500": 0,
"500-600": 0,
"600-700": 0,
"700-800": 0,
"800-900": 0,
"900-1000": 0,
"1000-1500": 0,
"1500-2000": 0,
"2000-2500": 0,
"2500-3000": 0,
"3000-3500": 0,
"3500-4000": 0,
"4000-4500": 0,
"4500-5000": 0,
"5000-5500": 0,
"5500-6000": 0,
"6000-6500": 0,
"6500-7000": 0,
"7000-7500": 0,
"7500-8000": 0,
"8000-8500": 0,
"8500-9000": 0,
"9000-9500": 0,
"9500-10000": 0,
">10000": 0,
}
for length in len_list:
if length <= 100:
distribution["0-100"] += 1
elif length <= 200:
distribution["100-200"] += 1
elif length <= 300:
distribution["200-300"] += 1
elif length <= 400:
distribution["300-400"] += 1
elif length <= 500:
distribution["400-500"] += 1
elif length <= 600:
distribution["500-600"] += 1
elif length <= 700:
distribution["600-700"] += 1
elif length <= 800:
distribution["700-800"] += 1
elif length <= 900:
distribution["800-900"] += 1
elif length <= 1000:
distribution["900-1000"] += 1
elif length <= 1500:
distribution["1000-1500"] += 1
elif length <= 2000:
distribution["1500-2000"] += 1
elif length <= 2500:
distribution["2000-2500"] += 1
elif length <= 3000:
distribution["2500-3000"] += 1
elif length <= 3500:
distribution["3000-3500"] += 1
elif length <= 4000:
distribution["3500-4000"] += 1
elif length <= 4500:
distribution["4000-4500"] += 1
elif length <= 5000:
distribution["4500-5000"] += 1
elif length <= 5500:
distribution["5000-5500"] += 1
elif length <= 6000:
distribution["5500-6000"] += 1
elif length <= 6500:
distribution["6000-6500"] += 1
elif length <= 7000:
distribution["6500-7000"] += 1
elif length <= 7500:
distribution["7000-7500"] += 1
elif length <= 8000:
distribution["7500-8000"] += 1
elif length <= 8500:
distribution["8000-8500"] += 1
elif length <= 9000:
distribution["8500-9000"] += 1
elif length <= 9500:
distribution["9000-9500"] += 1
elif length <= 10000:
distribution["9500-10000"] += 1
else:
distribution[">10000"] += 1
print(f"Length distribution of {json_file_path}:")
for key, value in distribution.items():
print(f"{key}: {value}")
# with open(out_file_name, 'w', encoding='utf-8') as file:
# json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"处理完成,大于{token_thre}的已保存到{out_file_name}")