ModelZoo / VITA_pytorch
Commit 112bf76b, authored Oct 31, 2024 by chenzk
Commit message: v1.0

The commit contains 171 changes in total; this page shows 20 changed files, with 1825 additions and 0 deletions (+1825, -0).
Changed files on this page:

- coco/train2017/000000000164.jpg (+0, -0)
- command.sh (+2, -0)
- custom/coco/images/train2017/000000000164.jpg (+0, -0)
- custom/custom.json (+20, -0)
- data_tools/check_audio_lost.py (+92, -0)
- data_tools/check_image_lost.py (+80, -0)
- data_tools/check_image_space_ratio.py (+87, -0)
- data_tools/check_json.py (+115, -0)
- data_tools/check_video_lost.py (+69, -0)
- data_tools/concat_data.py (+148, -0)
- data_tools/concat_data_frameCat.py (+191, -0)
- data_tools/concat_data_patch.py (+189, -0)
- data_tools/rm_lost_audio_in_json.py (+38, -0)
- data_tools/rm_lost_image_in_json.py (+40, -0)
- data_tools/rm_lost_video_in_json.py (+36, -0)
- data_tools/statistics_audio_duration.py (+111, -0)
- data_tools/statistics_data_num.py (+79, -0)
- data_tools/statistics_image_num.py (+78, -0)
- data_tools/statistics_token_num.py (+196, -0)
- data_tools/statistics_token_num_frameCat.py (+254, -0)
coco/train2017/000000000164.jpg (new file, mode 100644): binary image, 157 KB
command.sh (new file, mode 100644)

```bash
# Mixtral
CUDA_VISIBLE_DEVICES=7 python mixtral_inference.py
```
custom/coco/images/train2017/000000000164.jpg (new file, mode 100644): binary image, 157 KB
custom/custom.json (new file, mode 100644)

```json
[
    {
        "set": "sharegpt4",
        "id": "000000000164",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\ninput_wavs/promp0.wav\n"
            },
            {
                "from": "gpt",
                "value": "This is a well-organized kitchen with a clean, modern aesthetic. The kitchen features a white countertop against a white wall, creating a bright and airy atmosphere. "
            }
        ],
        "image": "coco/images/train2017/000000000164.jpg",
        "audio": [
            "audio0.wav"
        ]
    }
]
```
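As a quick sanity check on entries like the sample above, here is a minimal sketch (not part of this commit) that mirrors the placeholder-count checks in data_tools/check_json.py below: it loads custom/custom.json and compares the number of `<image>`/`<audio>` placeholders in the conversation text against the number of attached files. The path and keys are taken from the sample entry.

```python
# Minimal sketch (not part of this commit): compare placeholder counts with
# attached file counts in custom/custom.json, as data_tools/check_json.py does.
import json

with open("custom/custom.json", "r", encoding="utf-8") as f:
    items = json.load(f)

for item in items:
    text_all = "".join(conv["value"] for conv in item["conversations"])
    # "image" may be a single path string; "audio" is a list in the sample.
    images = item.get("image", [])
    if isinstance(images, str):
        images = [images]
    audios = item.get("audio", [])
    if isinstance(audios, str):
        audios = [audios]
    assert text_all.count("<image>") == len(images), item["id"]
    # The sample's human turn names a wav path but has no <audio> placeholder,
    # so this check would flag it.
    if text_all.count("<audio>") != len(audios):
        print(f'{item["id"]}: audio placeholder/file count mismatch')
```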
data_tools/check_audio_lost.py (new file, mode 100644)

```python
# Flag dataset audio files that are missing, unreadable, or whose duration
# falls outside [dur_thre2, dur_thre1] seconds.
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *

# Define the output file path
output_file_path = "lost_file_name.txt"
dur_thre1 = 30
dur_thre2 = 0.5

# Put all dataset dicts into one list
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
# datasets = NaturalCap + VideoCap + OCRCap + NaturalQA + VideoQA + HumanCentric + [TextSFT]
datasets = NaturalCap + VideoCap
datasets = OCRCap + NaturalQA
datasets = VideoQA + HumanCentric + [TextSFT]
datasets = [TextSFT]

# Initialize a list to store the names of lost files
lost_files = []
lock = threading.Lock()


def get_wav_duration(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    duration = waveform.size(1) / sample_rate
    return duration


def check_audio(audio_file_name, audio_directory):
    audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
    if not os.path.exists(audio_file_path):
        print(f"{audio_file_path} lost!!!!!!!!")
        return audio_file_name
    else:
        try:
            duration = get_wav_duration(audio_file_path)
            if duration > dur_thre1 or duration < dur_thre2:
                print(f"{audio_file_path} duration {duration}, out of range!!!!!!!")
                return audio_file_name
        except Exception as e:
            print(f"{audio_file_path} is broken!!!!!!!!")
            return audio_file_name
    return None


# Iterate over each dataset dict
for dataset in datasets:
    keys = list(dataset.keys())
    json_file_path = dataset["chat_path"]
    print(json_file_path)

    # Read the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Check each item, showing progress with tqdm
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for item in data:
            audio_files = item.get("audio")
            audio_directory = AudioFolder
            # If audio_files is a string, wrap it in a list
            if isinstance(audio_files, str):
                audio_files = [audio_files]
            # If audio_files is a list, check each file
            if isinstance(audio_files, list):
                for audio_file_name in audio_files:
                    futures.append(executor.submit(check_audio, audio_file_name, audio_directory))

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="file"):
            result = future.result()
            if result:
                with lock:
                    lost_files.append(result)

# Write the lost file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
    for file_name in lost_files:
        f.write(file_name + "\n")

print(f"Check complete: {len(lost_files)} files are missing or unreadable; results saved to {output_file_path}")
```
data_tools/check_image_lost.py (new file, mode 100644)

```python
# Flag dataset image files that are missing or cannot be decoded.
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

from vita.config import FolderDict
from vita.config.dataset_config import *

# Define the output file path
output_file_path = "lost_file_name.txt"

# Put all dataset dicts into one list
datasets = [ShareGPT4V]

# Initialize a list to store the names of lost files
lost_files = []
lock = threading.Lock()


def check_image(image_file_name, image_directory):
    image_file_path = os.path.join(image_directory, image_file_name)
    if not os.path.exists(image_file_path):
        return image_file_name
    else:
        try:
            with Image.open(image_file_path) as img:
                img.convert("RGB")
        except Exception as e:
            return image_file_name
    return None


# Iterate over each dataset dict
for dataset in datasets:
    keys = list(dataset.keys())
    json_file_path = dataset["chat_path"]
    print(json_file_path)

    # Read the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Check each item, showing progress with tqdm
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for item in data:
            if "image" in item:
                image_files = item.get("image")
                set_id = item["set"]
                if type(set_id) is list:
                    set_id = set_id[0]
                image_directory = FolderDict[set_id]
                # If image_files is a string, wrap it in a list
                if isinstance(image_files, str):
                    image_files = [image_files]
                # If image_files is a list, check each file
                if isinstance(image_files, list):
                    for image_file_name in image_files:
                        futures.append(
                            executor.submit(check_image, image_file_name, image_directory)
                        )

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="file"):
            result = future.result()
            if result:
                with lock:
                    lost_files.append(result)
                print(f"file lost: {result}")

# Write the lost file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
    for file_name in lost_files:
        f.write(file_name + "\n")

print(f"Check complete: {len(lost_files)} files are missing or unreadable; results saved to {output_file_path}")
```
data_tools/check_image_space_ratio.py (new file, mode 100644)

```python
# Flag dataset images that are missing, broken, or whose width/height ratio
# exceeds ratio_thre (or falls below its inverse).
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

from vita.config import FolderDict
from vita.config.dataset_config import *

# Define the output file path
output_file_path = "long_image_file_name.txt"
ratio_thre = 12

# Put all dataset dicts into one list
# datasets = [AnyWord_20to50, RCTW2019, RCTW2019QA, RCTW2017, OpenChart, SCID, K12, TabRECSet, DigChat, iFlyTab]
datasets = [AnyWord_20to50, DyChart_iresearch]
datasets = [RCTW2019, RCTW2019QA, RCTW2017]
datasets = [OpenChart, SCID]
datasets = [K12]
# datasets = [TabRECSet, DigChat, iFlyTab]

# Initialize a list to store the names of flagged files
lost_files = []
lock = threading.Lock()


def check_image(image_file_name, image_directory):
    image_file_path = os.path.join(image_directory, image_file_name)
    if not os.path.exists(image_file_path):
        print(f"{image_file_path} not exist!!!!!!!!!!")
        return image_file_name
    else:
        try:
            with Image.open(image_file_path) as img:
                img.convert("RGB")
                size_ratio = img.size[0] / img.size[1]
                if size_ratio < 1 / ratio_thre or size_ratio > ratio_thre:
                    print(f"{image_file_path} ratio is too big!!!!!!!!!!!!!!")
                    return image_file_name
        except Exception as e:
            print(f"{image_file_path} is broken!!!!!!!!!!!!")
            return image_file_name
    return None


# Iterate over each dataset dict
for dataset in datasets:
    keys = list(dataset.keys())
    json_file_path = dataset["chat_path"]

    # Read the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Check each item, showing progress with tqdm
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for item in data:
            image_files = item.get("image")
            set_id = item["set"]
            image_directory = FolderDict[set_id]
            # If image_files is a string, wrap it in a list
            if isinstance(image_files, str):
                image_files = [image_files]
            # If image_files is a list, check each file
            if isinstance(image_files, list):
                for image_file_name in image_files:
                    futures.append(executor.submit(check_image, image_file_name, image_directory))

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="file"):
            result = future.result()
            if result:
                with lock:
                    lost_files.append(result)

# Write the flagged file names to long_image_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
    for file_name in lost_files:
        f.write(file_name + "\n")

print(f"Check complete: {len(lost_files)} files are missing or unreadable; results saved to {output_file_path}")
```
data_tools/check_json.py (new file, mode 100644)

```python
# Sanity-check annotation JSON files: every item has a set_id, no empty items
# or values, and the counts of <image>/<video>/<audio> placeholders match the
# counts of attached file paths and set_ids.
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *

# Put all dataset dicts into one list
datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = [Webvid]

# Iterate over each dataset dict
for dataset in datasets:
    dur_list = []
    keys = list(dataset.keys())
    input_file_name = dataset["chat_path"]

    # Read the JSON file
    with open(input_file_name, "r", encoding="utf-8") as file:
        data = json.load(file)
    print(f"check {input_file_name}")

    # Check each item
    for item in tqdm(data):
        # Does the item have a set_id?
        assert "set" in item, f"{input_file_name} does not have set_id: {item}!!!!!!!!!!"
        # Is the item empty?
        assert len(item) > 0, f"{input_file_name} has a null item!!!!!!!!!!"
        # Does any key have an empty value?
        for key in item.keys():
            if type(item[key]) is not int and key != "id":
                assert (
                    len(item[key]) > 0
                ), f"{input_file_name}, item {item} has null key!!!!!!!!!! {key}"
        # Is any turn in item['conversations'] empty?
        for conv in item["conversations"]:
            text = conv["value"]
            if len(text) == 0:
                print(f"{input_file_name}, item {item} has null speaking!!!")

        # Do the image/video/audio path counts, set_id count, and placeholder
        # counts agree?
        count_image_ph = 0
        count_video_ph = 0
        count_audio_ph = 0
        count_image_path = 0
        count_video_path = 0
        count_audio_path = 0
        text_all = ""
        for conv in item["conversations"]:
            text = conv["value"]
            text_all += text
        count_image_ph = text_all.count("<image>")
        count_video_ph = text_all.count("<video>")
        count_audio_ph = text_all.count("<audio>")

        if "image" in item:
            image_path = item["image"]
            assert isinstance(image_path[0], str)
            if type(image_path) is not list:
                assert isinstance(image_path, str)
                image_path = [image_path]
            count_image_path = len(image_path)
        if "video" in item:
            video_path = item["video"]
            assert isinstance(video_path[0], str)
            if type(video_path) is not list:
                assert isinstance(video_path, str)
                video_path = [video_path]
            count_video_path = len(video_path)
        if "audio" in item:
            audio_path = item["audio"]
            assert isinstance(audio_path[0], str)
            if type(audio_path) is not list:
                assert isinstance(audio_path, str)
                audio_path = [audio_path]
            count_audio_path = len(audio_path)

        # assert count_image_path == count_image_ph, f"{input_file_name}, item {item} image place holder number NOT equal image file number"
        # assert count_video_path == count_video_ph, f"{input_file_name}, item {item} video place holder number NOT equal video file number"
        # assert count_audio_path == count_audio_ph, f"{input_file_name}, item {item} audio place holder number NOT equal audio file number"
        if count_image_path != count_image_ph:
            print(f"{input_file_name}, item {item} image place holder number NOT equal image file number")
        if count_video_path != count_video_ph:
            print(f"{input_file_name}, item {item} video place holder number NOT equal video file number")
        if count_audio_path != count_audio_ph:
            print(f"{input_file_name}, item {item} audio place holder number NOT equal audio file number")

        set_id = item["set"]
        if type(set_id) is not list:
            set_id = [set_id]
        if "image" in item or "video" in item:
            if set_id[0] != "sqa":
                assert (
                    len(set_id) == count_image_path + count_video_path
                ), f"{input_file_name}, item {item} set_id number not correct"
```
data_tools/check_video_lost.py (new file, mode 100644)

```python
# Flag dataset video files that are missing or cannot be decoded with decord.
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

from decord import VideoReader, cpu
from vita.config import FolderDict
from vita.config.dataset_config import *

# Define the output file path
output_file_path = "lost_file_name.txt"

# Put all dataset dicts into one list
# datasets = [Webvid, K400]
# datasets = [VIDEOChatGPT, K700Split, VC2Internvid]
# datasets = [EgoGesture, Literature, CopyWrite, MovingFashion]
# datasets = [NoHarm]
datasets = [SGInternvid0]

# Initialize a list to store the names of lost files
lost_files = []

# Iterate over each dataset dict
for dataset in datasets:
    keys = list(dataset.keys())
    json_file_path = dataset["chat_path"]

    # Read the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    def check_video_file(item):
        video_file_name = item.get("video")
        if video_file_name:
            video_directory = FolderDict[item["set"]]
            video_file_path = os.path.join(video_directory, video_file_name)
            if not os.path.exists(video_file_path):
                print(f"file lost: {video_file_path}")
                return video_file_name
            else:
                sample_pos = [0, 10]
                try:
                    vreader = VideoReader(video_file_path, ctx=cpu(0))
                    patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
                except Exception as e:
                    print(f"file broken: {video_file_path}")
                    return video_file_name
        return None

    # Process the data in parallel with a ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(check_video_file, item) for item in data]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="file"):
            result = future.result()
            if result:
                lost_files.append(result)

# Write the lost file names to lost_file_name.txt
with open(output_file_path, "w", encoding="utf-8") as f:
    for file_name in lost_files:
        f.write(file_name + "\n")

print(f"Check complete: {len(lost_files)} files are missing or unreadable; results saved to {output_file_path}")
```
data_tools/concat_data.py (new file, mode 100644)

```python
# Greedily pack consecutive samples into merged items whose estimated token
# length stays under concat_size.
import json
import math
import os
import random

import torch
import transformers
from tqdm import tqdm

import torchaudio
from vita import conversation as conversation_lib
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token

image_token_num = 256
concat_size = 4500
datasets = [ShareGPT4V]

parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
    cache_dir=None,
    model_max_length=8192,
    padding_side="right",
    use_fast=True,
)


def get_wav_duration(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    duration = waveform.size(1) / sample_rate
    return duration


for dataset in datasets:
    input_file_name = dataset["chat_path"]
    base_name, ext = os.path.splitext(input_file_name)
    suffix = f"-concat{concat_size}"
    out_file_name = f"{base_name}{suffix}{ext}"

    with open(input_file_name, "r", encoding="utf-8") as file:
        data = json.load(file)
    random.shuffle(data)
    # data = data[:100]

    # Iterate over each item
    len_list = []
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
    len_list = []

    # Apply prompt templates
    for item in tqdm(data):
        source = item["conversations"]
        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], f"{source}"
            conv.append_message(role, sentence["value"])
        prompt = conv.get_prompt()
        # import pdb; pdb.set_trace()
        input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
        num_images = (input_ids == IMAGE_TOKEN_INDEX).sum()
        item_token_num = input_ids.shape[0] + num_images * image_token_num

        if "audio" in item:
            audio_files = item["audio"]
            audio_directory = AudioFolder
            # If audio_files is a string, wrap it in a list
            if isinstance(audio_files, str):
                audio_files = [audio_files]
            # If audio_files is a list, process each file
            assert isinstance(audio_files, list)
            total_duration = 0
            for audio_file_name in audio_files:
                audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
                duration = get_wav_duration(audio_file_path)
                # Round each clip up to an even number of seconds
                duration = (
                    math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
                )
                total_duration += duration
            item_token_num += math.ceil(total_duration * 12.5)
        len_list.append(item_token_num)
    assert len(len_list) == len(data)

    def concat_item(items):
        temp_set_id = []
        temp_conversations = []
        temp_ids = []
        temp_images = []
        temp_audios = []
        for item in items:
            temp_set_id.append(item["set"])
            temp_conversations.extend(item["conversations"])
            if "id" in item:
                temp_ids.append(item["id"])
            if "image" in item:
                temp_images.append(item["image"])
            if "audio" in item:
                audio = item["audio"]
                if type(audio) is not list:
                    audio = [audio]
                temp_audios += audio
        if len(temp_images) > 0:
            merged_item = {
                "set": temp_set_id,
                "id": temp_ids,
                "image": temp_images,
                "conversations": temp_conversations,
            }
        else:
            merged_item = {
                "set": temp_set_id,
                "id": temp_ids,
                "conversations": temp_conversations,
            }
        if len(temp_audios) > 0:
            merged_item["audio"] = temp_audios
        return merged_item

    merged_data = []
    i = 0
    while i < len(data):
        len_token = len_list[i]
        k = 1
        while True:
            if sum(len_list[i : i + k]) > concat_size:
                if k > 1:
                    k -= 1
                break
            if i + k == len(data):
                break
            k += 1
        merged_item = concat_item(data[i : i + k])
        merged_data.append(merged_item)
        # print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
        i = i + k

    with open(out_file_name, "w", encoding="utf-8") as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=4)
    print(f"save {out_file_name}")
```
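The packing loop above is a simple greedy algorithm: starting at index i, it grows a window of k consecutive samples until adding one more would push the summed token length past concat_size, and a window always keeps at least one sample even if that sample alone exceeds the budget. A self-contained toy run of the same loop, with made-up lengths, illustrates the grouping:

```python
# Toy illustration of the greedy packing loop used above (made-up lengths).
len_list = [1200, 900, 2000, 300, 4800, 100, 700]
concat_size = 4500

groups = []
i = 0
while i < len(len_list):
    k = 1
    while True:
        if sum(len_list[i : i + k]) > concat_size:
            if k > 1:
                k -= 1
            break
        if i + k == len(len_list):
            break
        k += 1
    groups.append(len_list[i : i + k])
    i = i + k

print(groups)
# [[1200, 900, 2000, 300], [4800], [100, 700]]
```

Note that the oversized 4800-token sample still forms its own group; the scripts rely on the model_max_length of 8192 to absorb such outliers.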
data_tools/concat_data_frameCat.py (new file, mode 100644)

```python
# Frame-concatenation variant of concat_data.py: image patches are counted
# with an even-sized tiling grid and token lengths are computed in parallel.
import json
import math
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
import transformers
from PIL import Image
from tqdm import tqdm

import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token

image_token_num = 256
concat_size = 6000
datasets = [ShareGPT4V0]

parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
    cache_dir=None,
    model_max_length=8192,
    padding_side="right",
    use_fast=True,
)


def dynamic_preprocess(image, min_num=2, max_num=12, image_size=448, use_thumbnail=False, img_mean=0):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # expand target_aspect_ratio to even for each size
    new_target_aspect_ratio = [e if e % 2 == 0 else e + 1 for e in target_aspect_ratio]
    blocks_big = int(0.5 * new_target_aspect_ratio[0] * 0.5 * new_target_aspect_ratio[1])

    return blocks_big


def get_wav_duration(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    duration = waveform.size(1) / sample_rate
    return duration


def concat_item(items):
    temp_set_id = []
    temp_conversations = []
    temp_ids = []
    temp_images = []
    temp_audios = []
    for item in items:
        temp_set_id.append(item["set"])
        temp_conversations.extend(item["conversations"])
        if "id" in item:
            temp_ids.append(item["id"])
        if "image" in item:
            temp_images.append(item["image"])
        if "audio" in item:
            audio = item["audio"]
            if type(audio) is not list:
                audio = [audio]
            temp_audios += audio
    if len(temp_images) > 0:
        merged_item = {
            "set": temp_set_id,
            "id": temp_ids,
            "image": temp_images,
            "conversations": temp_conversations,
        }
    else:
        merged_item = {
            "set": temp_set_id,
            "id": temp_ids,
            "conversations": temp_conversations,
        }
    if len(temp_audios) > 0:
        merged_item["audio"] = temp_audios
    return merged_item


def compute_item_token_num(item):
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
    source = item["conversations"]
    conv.messages = []
    modality = "lang"
    for j, sentence in enumerate(source):
        role = roles[sentence["from"]]
        assert role == conv.roles[j % 2], f"{source}"
        conv.append_message(role, sentence["value"])
        if "<image>" in sentence["value"]:
            modality = "image"
    prompt = conv.get_prompt(modality)
    # import pdb; pdb.set_trace()
    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    item_token_num = input_ids.shape[0]

    if "image" in item:
        image_file = item["image"]
        set_id = item["set"]
        image_directory = FolderDict[set_id]
        image = Image.open(os.path.join(image_directory, image_file.replace("\\", "/"))).convert("RGB")
        num_patches = dynamic_preprocess(image)
        item_token_num = item_token_num + num_patches * image_token_num

    if "audio" in item:
        audio_files = item["audio"]
        audio_directory = AudioFolder
        # If audio_files is a string, wrap it in a list
        if isinstance(audio_files, str):
            audio_files = [audio_files]
        # If audio_files is a list, process each file
        assert isinstance(audio_files, list)
        total_duration = 0
        for audio_file_name in audio_files:
            audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
            duration = get_wav_duration(audio_file_path)
            duration = (
                math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
            )
            total_duration += duration
        item_token_num += math.ceil(total_duration * 12.5)

    item["token_len"] = item_token_num


for dataset in datasets:
    input_file_name = dataset["chat_path"]
    base_name, ext = os.path.splitext(input_file_name)
    suffix = f"-FrameConcat{concat_size}"
    out_file_name = f"{base_name}{suffix}{ext}"

    with open(input_file_name, "r", encoding="utf-8") as file:
        data = json.load(file)
    random.shuffle(data)
    # data = data[:100]

    # for item in tqdm(data):
    #     compute_item_token_num(item)
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(compute_item_token_num, item) for item in data]
        for future in tqdm(as_completed(futures), total=len(futures)):
            future.result()

    merged_data = []
    i = 0
    while i < len(data):
        len_token = data[i]["token_len"]
        k = 1
        while True:
            if sum([item["token_len"] for item in data[i : i + k]]) > concat_size:
                if k > 1:
                    k -= 1
                break
            if i + k == len(data):
                break
            k += 1
        merged_item = concat_item(data[i : i + k])
        merged_data.append(merged_item)
        # print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
        i = i + k

    with open(out_file_name, "w", encoding="utf-8") as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=4)
    print(f"save {out_file_name}")
```
data_tools/concat_data_patch.py (new file, mode 100644)

```python
# Patch variant of concat_data.py: image patches are counted with dynamic
# aspect-ratio tiling plus an optional thumbnail block.
import json
import math
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
import transformers
from PIL import Image
from tqdm import tqdm

import torchaudio
from vita import conversation as conversation_lib
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token

image_token_num = 256
concat_size = 6000
datasets = [ShareGPT4V]

parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
    cache_dir=None,
    model_max_length=8192,
    padding_side="right",
    use_fast=True,
)

# Note: conv and roles are module-level state shared by compute_item_token_num
conv = conversation_lib.default_conversation.copy()
roles = {"human": conv.roles[0], "gpt": conv.roles[1]}


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
    if use_thumbnail and blocks != 1:
        blocks += 1
    return blocks


def get_wav_duration(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    duration = waveform.size(1) / sample_rate
    return duration


def concat_item(items):
    temp_set_id = []
    temp_conversations = []
    temp_ids = []
    temp_images = []
    temp_audios = []
    for item in items:
        temp_set_id.append(item["set"])
        temp_conversations.extend(item["conversations"])
        if "id" in item:
            temp_ids.append(item["id"])
        if "image" in item:
            temp_images.append(item["image"])
        if "audio" in item:
            audio = item["audio"]
            if type(audio) is not list:
                audio = [audio]
            temp_audios += audio
    if len(temp_images) > 0:
        merged_item = {
            "set": temp_set_id,
            "id": temp_ids,
            "image": temp_images,
            "conversations": temp_conversations,
        }
    else:
        merged_item = {
            "set": temp_set_id,
            "id": temp_ids,
            "conversations": temp_conversations,
        }
    if len(temp_audios) > 0:
        merged_item["audio"] = temp_audios
    return merged_item


def compute_item_token_num(item):
    source = item["conversations"]
    conv.messages = []
    for j, sentence in enumerate(source):
        role = roles[sentence["from"]]
        assert role == conv.roles[j % 2], f"{source}"
        conv.append_message(role, sentence["value"])
    prompt = conv.get_prompt()
    # import pdb; pdb.set_trace()
    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    item_token_num = input_ids.shape[0]

    if "image" in item:
        image_file = item["image"]
        set_id = item["set"]
        image_directory = FolderDict[set_id]
        image = Image.open(os.path.join(image_directory, image_file.replace("\\", "/"))).convert("RGB")
        num_patches = dynamic_preprocess(image)
        item_token_num = item_token_num + num_patches * image_token_num

    if "audio" in item:
        audio_files = item["audio"]
        audio_directory = AudioFolder
        # If audio_files is a string, wrap it in a list
        if isinstance(audio_files, str):
            audio_files = [audio_files]
        # If audio_files is a list, process each file
        assert isinstance(audio_files, list)
        total_duration = 0
        for audio_file_name in audio_files:
            audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
            duration = get_wav_duration(audio_file_path)
            duration = (
                math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
            )
            total_duration += duration
        item_token_num += math.ceil(total_duration * 12.5)

    item["token_len"] = item_token_num


for dataset in datasets:
    input_file_name = dataset["chat_path"]
    base_name, ext = os.path.splitext(input_file_name)
    suffix = f"-PatchConcat{concat_size}"
    out_file_name = f"{base_name}{suffix}{ext}"

    with open(input_file_name, "r", encoding="utf-8") as file:
        data = json.load(file)
    random.shuffle(data)
    # data = data[:100]

    # for item in tqdm(data):
    #     compute_item_token_num(item)
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(compute_item_token_num, item) for item in data]
        for future in tqdm(as_completed(futures), total=len(futures)):
            future.result()

    merged_data = []
    i = 0
    while i < len(data):
        len_token = data[i]["token_len"]
        k = 1
        while True:
            if sum([item["token_len"] for item in data[i : i + k]]) > concat_size:
                if k > 1:
                    k -= 1
                break
            if i + k == len(data):
                break
            k += 1
        merged_item = concat_item(data[i : i + k])
        merged_data.append(merged_item)
        # print(f"i: {i}, k: {k}; len of merged item: {sum(len_list[i:i+k])}")
        i = i + k

    with open(out_file_name, "w", encoding="utf-8") as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=4)
    print(f"save {out_file_name}")
```
data_tools/rm_lost_audio_in_json.py (new file, mode 100644)

```python
# Remove items whose audio files appear in lost_file_name.txt from the given
# annotation JSONs (rewritten in place).
import json
import os

from vita.constants import GLOBAL_WEIGHTS_PATH

# Define file paths
lost_file_path = "lost_file_name.txt"
json_list = [""]  # fill in the annotation JSON paths to clean

for json_file_path in json_list:
    output_json_file_path = json_file_path

    # Read the names of the lost files
    with open(lost_file_path, "r", encoding="utf-8") as f:
        lost_files = set(line.strip() for line in f)

    # Read the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Filter the data, dropping items whose audio files are lost
    filtered_data = []
    for item in data:
        audio_OK = True
        if "audio" in item:
            assert type(item["audio"]) is list
            for audio_filename in item["audio"]:
                if audio_filename in lost_files:
                    audio_OK = False
        if audio_OK:
            filtered_data.append(item)

    # Write the updated data back to the JSON file
    with open(output_json_file_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=4)

    print(f"Update complete: removed {len(data) - len(filtered_data)} items; results saved to {output_json_file_path}")
```
data_tools/rm_lost_image_in_json.py (new file, mode 100644)

```python
# Remove items whose image files appear in the lost/flagged file list from the
# given annotation JSONs (rewritten in place).
import json

from vita.constants import GLOBAL_WEIGHTS_PATH

# Define file paths
# lost_file_path = 'lost_file_name.txt'
lost_file_path = "long_image_file_name.txt"
json_list = [""]  # fill in the annotation JSON paths to clean

for json_file_path in json_list:
    output_json_file_path = json_file_path

    # Read the names of the lost files
    with open(lost_file_path, "r", encoding="utf-8") as f:
        lost_files = set(line.strip() for line in f)

    # Read the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Filter the data, dropping items whose image files are lost
    filtered_data = []
    for item in data:
        image_OK = True
        if "image" in item:
            image_file = item["image"]
            if type(image_file) is str:
                image_file = [image_file]
            assert type(image_file) is list
            for image_filename in image_file:
                if image_filename in lost_files:
                    image_OK = False
                    break
        if image_OK:
            filtered_data.append(item)

    # Write the updated data back to the JSON file
    with open(output_json_file_path, "w", encoding="utf-8") as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=4)

    print(f"Update complete: removed {len(data) - len(filtered_data)} items; results saved to {output_json_file_path}")
```
data_tools/rm_lost_video_in_json.py (new file, mode 100644)

```python
# Remove items whose video files appear in lost_file_name.txt from the given
# annotation JSONs (rewritten in place).
import json

from vita.constants import GLOBAL_WEIGHTS_PATH

# Define file paths
lost_file_path = "lost_file_name.txt"
json_list = []

for json_file_path in json_list:
    output_json_file_path = json_file_path

    with open(lost_file_path, "r") as file:
        lost_files = set(file.read().splitlines())

    # Load the JSON data
    with open(json_file_path, "r") as file:
        data = json.load(file)

    # Filter the data, dropping items whose video files are lost
    filtered_data = []
    for item in data:
        video_OK = True
        if "video" in item:
            video_filename = item["video"]
            if video_filename in lost_files:
                video_OK = False
        if video_OK:
            filtered_data.append(item)

    # Save the filtered data back to a new JSON file
    with open(output_json_file_path, "w", encoding="utf-8") as file:
        json.dump(filtered_data, file, indent=2, ensure_ascii=False)

    print(f"Removed {len(data) - len(filtered_data)} items from the JSON data; saved to {output_json_file_path}")
```
data_tools/statistics_audio_duration.py (new file, mode 100644)

```python
# Print a histogram of audio clip durations for each dataset.
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *

# Define the output file path
output_file_path = "lost_file_name.txt"

# Put all dataset dicts into one list
# datasets = NLP + HumanCentric + VideoQA + NaturalQA
datasets = VideoCap + OCRCap + NaturalCap

# Initialize a lock for shared state
lock = threading.Lock()


def get_wav_duration(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    duration = waveform.size(1) / sample_rate
    return duration


def check_audio(audio_file_name, audio_directory):
    audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
    duration = get_wav_duration(audio_file_path)
    if duration > 200:
        print(audio_file_path, duration)
    return duration


# Iterate over each dataset dict
for dataset in datasets:
    dur_list = []
    keys = list(dataset.keys())
    json_file_path = dataset["chat_path"]
    print(json_file_path)

    # Read the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Process each item, showing progress with tqdm
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for item in data:
            audio_files = item.get("audio")
            audio_directory = AudioFolder
            # If audio_files is a string, wrap it in a list
            if isinstance(audio_files, str):
                audio_files = [audio_files]
            # If audio_files is a list, process each file
            if isinstance(audio_files, list):
                for audio_file_name in audio_files:
                    futures.append(executor.submit(check_audio, audio_file_name, audio_directory))

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="file"):
            duration = future.result()
            dur_list.append(duration)

    # Initialize the histogram bucket counters (seconds)
    distribution = {
        "0-1": 0,
        "1-5": 0,
        "5-10": 0,
        "10-15": 0,
        "15-20": 0,
        "20-25": 0,
        "25-30": 0,
        "30-60": 0,
        "60-200": 0,
        ">200": 0,
    }

    # Count the durations falling in each bucket
    for length in dur_list:
        if length <= 1:
            distribution["0-1"] += 1
        elif length <= 5:
            distribution["1-5"] += 1
        elif length <= 10:
            distribution["5-10"] += 1
        elif length <= 15:
            distribution["10-15"] += 1
        elif length <= 20:
            distribution["15-20"] += 1
        elif length <= 25:
            distribution["20-25"] += 1
        elif length <= 30:
            distribution["25-30"] += 1
        elif length <= 60:
            distribution["30-60"] += 1
        elif length <= 200:
            distribution["60-200"] += 1
        else:
            distribution[">200"] += 1

    # Print the distribution
    print(f"duration distribution of {json_file_path}:")
    for key, value in distribution.items():
        print(f"{key}: {value}")
```
data_tools/statistics_data_num.py (new file, mode 100644)

```python
# Count items, conversation turns, and audio/text questions per dataset.
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
import transformers
from tqdm import tqdm

import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token

image_token_num = 256
token_thre = 4500
# datasets = NaturalCap + VideoCap + OCRCap + NaturalQA + VideoQA + HumanCentric + NLP
datasets = (
    NaturalCap0
    + OCRCap0
    + VideoCap0
    + NaturalQA0
    + VideoQA0
    + [EgoGesture0, Literature0, CopyWrite0, MovingFashion0]
)

num_data_neg_audio = 0
for dataset in datasets:
    json_file_path = dataset["chat_path"]
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    num_data_audio = 0
    num_data_conv = 0
    num_data_qs_audio = 0
    num_data_qs_text = 0
    for item in data:
        conversations = item["conversations"]
        assert len(conversations) % 2 == 0
        num_conv = len(conversations) // 2
        num_data_conv += num_conv

        num_qs_audio = 0
        num_qs_text = 0
        for conv in conversations:
            if conv["from"] == "human":
                qs = conv["value"]
                if "<audio>" in qs:
                    num_qs_audio += 1
                else:
                    num_qs_text += 1
        num_data_qs_audio += num_qs_audio
        num_data_qs_text += num_qs_text

        num_audio = 0
        audio_files = item.get("audio")
        audio_directory = AudioFolder
        # If audio_files is a string, wrap it in a list
        if isinstance(audio_files, str):
            audio_files = [audio_files]
        # If audio_files is a list, process each file
        if isinstance(audio_files, list):
            num_audio = len(audio_files)
            for audio in audio_files:
                if "new_value_dict_0725" in audio or "new_value_dict_0730" in audio:
                    num_data_neg_audio += 1
        num_data_audio += num_audio

    assert num_data_conv == num_data_qs_audio + num_data_qs_text
    # print(f'{json_file_path} conversation number: {num_data_conv/1000}K')
    # print(f'{json_file_path} audio question number: {num_data_qs_audio/1000}K')
    # print(f'{json_file_path} text question number: {num_data_qs_text/1000}K')
    # print(f'{json_file_path} audio number: {num_data_audio/1000}K')
    print(f"{json_file_path} data number: {len(data)/1000}K")
    # print(f'{json_file_path} negative audio question number: {num_data_neg_audio/1000}K')
```
data_tools/statistics_image_num.py (new file, mode 100644)

```python
# Print a histogram of the number of images per item for each dataset.
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from PIL import Image
from tqdm import tqdm

import torchaudio
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *

# Put all dataset dicts into one list
datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap

# Iterate over each dataset dict
for dataset in datasets:
    dur_list = []
    keys = list(dataset.keys())
    input_file_name = dataset["chat_path"]

    # Read the JSON file
    len_list = []
    with open(input_file_name, "r", encoding="utf-8") as file:
        data = json.load(file)
    print(f"check {input_file_name}")

    # Iterate over each item
    for item in tqdm(data):
        if "image" in item:
            image_path = item["image"]
            assert isinstance(image_path[0], str)
            if type(image_path) is not list:
                assert isinstance(image_path, str)
                image_path = [image_path]
            count_image_path = len(image_path)
            if count_image_path > 40:
                print(count_image_path)
                print(item)
            len_list.append(count_image_path)

    distribution = {
        "0-5": 0,
        "5-10": 0,
        "10-16": 0,
        "16-20": 0,
        "20-25": 0,
        "25-30": 0,
        "30-35": 0,
        "35-40": 0,
        ">40": 0,
    }
    for length in len_list:
        if length <= 5:
            distribution["0-5"] += 1
        elif length <= 10:
            distribution["5-10"] += 1
        elif length <= 16:
            distribution["10-16"] += 1
        elif length <= 20:
            distribution["16-20"] += 1
        elif length <= 25:
            distribution["20-25"] += 1
        elif length <= 30:
            distribution["25-30"] += 1
        elif length <= 35:
            distribution["30-35"] += 1
        elif length <= 40:
            distribution["35-40"] += 1
        else:
            distribution[">40"] += 1

    print(f"Length distribution of {input_file_name}:")
    for key, value in distribution.items():
        print(f"{key}: {value}")
```
data_tools/statistics_token_num.py (new file, mode 100644)

```python
# Estimate per-item token lengths (text + image + audio) and print their
# distribution for each dataset.
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
import transformers
from tqdm import tqdm

import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token

image_token_num = 256
token_thre = 4500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = [DyChart_iresearch, RCTW2019QA, Lvis_cn_noDesc, VIDEOChatGPT]
datasets = [AnyWord_20to50]
out_file_name = "debug.json"

parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
    cache_dir=None,
    model_max_length=8192,
    padding_side="right",
    use_fast=True,
)
long_json = []


def get_wav_duration(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    duration = waveform.size(1) / sample_rate
    return duration


def process_item(item, conv, roles, tokenizer):
    source = item["conversations"]
    conv.messages = []
    for j, sentence in enumerate(source):
        role = roles[sentence["from"]]
        assert role == conv.roles[j % 2], f"{source}"
        conv.append_message(role, sentence["value"])
    prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    num_images = (input_ids == IMAGE_TOKEN_INDEX).sum()
    item_token_num = input_ids.shape[0] + num_images * image_token_num

    if "audio" in item:
        audio_files = item["audio"]
        audio_directory = AudioFolder
        if isinstance(audio_files, str):
            audio_files = [audio_files]
        assert isinstance(audio_files, list)
        total_duration = 0
        for audio_file_name in audio_files:
            audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
            duration = get_wav_duration(audio_file_path)
            duration = (
                math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
            )
            total_duration += duration
        item_token_num += math.ceil(total_duration * 12.5)

    if item_token_num > token_thre:
        print(item_token_num)
        if len(item["image"]) >= 16:
            long_json.append(item)
            print(len(item["image"]))
    return item_token_num


for dataset in datasets:
    json_file_path = dataset["chat_path"]
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

    len_list = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_item, item, conv, roles, tokenizer) for item in data]
        for future in tqdm(as_completed(futures), total=len(futures)):
            len_list.append(future.result())
    assert len(len_list) == len(data)

    distribution = {
        "0-100": 0,
        "100-200": 0,
        "200-300": 0,
        "300-400": 0,
        "400-500": 0,
        "500-600": 0,
        "600-700": 0,
        "700-800": 0,
        "800-900": 0,
        "900-1000": 0,
        "1000-1100": 0,
        "1100-1200": 0,
        "1200-1300": 0,
        "1300-1400": 0,
        "1400-1500": 0,
        "1500-1600": 0,
        "1600-1700": 0,
        "1700-1800": 0,
        "1800-1900": 0,
        "1900-2000": 0,
        "2000-2500": 0,
        "2500-3000": 0,
        "3000-3500": 0,
        "3500-4000": 0,
        "4000-4500": 0,
        "4500-5000": 0,
        "5000-5500": 0,
        "5500-6000": 0,
        ">6000": 0,
    }
    for length in len_list:
        if length <= 100:
            distribution["0-100"] += 1
        elif length <= 200:
            distribution["100-200"] += 1
        elif length <= 300:
            distribution["200-300"] += 1
        elif length <= 400:
            distribution["300-400"] += 1
        elif length <= 500:
            distribution["400-500"] += 1
        elif length <= 600:
            distribution["500-600"] += 1
        elif length <= 700:
            distribution["600-700"] += 1
        elif length <= 800:
            distribution["700-800"] += 1
        elif length <= 900:
            distribution["800-900"] += 1
        elif length <= 1000:
            distribution["900-1000"] += 1
        elif length <= 1100:
            distribution["1000-1100"] += 1
        elif length <= 1200:
            distribution["1100-1200"] += 1
        elif length <= 1300:
            distribution["1200-1300"] += 1
        elif length <= 1400:
            distribution["1300-1400"] += 1
        elif length <= 1500:
            distribution["1400-1500"] += 1
        elif length <= 1600:
            distribution["1500-1600"] += 1
        elif length <= 1700:
            distribution["1600-1700"] += 1
        elif length <= 1800:
            distribution["1700-1800"] += 1
        elif length <= 1900:
            distribution["1800-1900"] += 1
        elif length <= 2000:
            distribution["1900-2000"] += 1
        elif length <= 2500:
            distribution["2000-2500"] += 1
        elif length <= 3000:
            distribution["2500-3000"] += 1
        elif length <= 3500:
            distribution["3000-3500"] += 1
        elif length <= 4000:
            distribution["3500-4000"] += 1
        elif length <= 4500:
            distribution["4000-4500"] += 1
        elif length <= 5000:
            distribution["4500-5000"] += 1
        elif length <= 5500:
            distribution["5000-5500"] += 1
        elif length <= 6000:
            distribution["5500-6000"] += 1
        else:
            distribution[">6000"] += 1

    print(f"Length distribution of {json_file_path}:")
    for key, value in distribution.items():
        print(f"{key}: {value}")

# with open(out_file_name, 'w', encoding='utf-8') as file:
#     json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"Done; items longer than {token_thre} tokens saved to {out_file_name}")
```
data_tools/statistics_token_num_frameCat.py (new file, mode 100644)

```python
# Frame-concatenation variant of statistics_token_num.py: estimates per-item
# token lengths with even-sized tiling for image patches and prints their
# distribution for each dataset.
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
import transformers
from PIL import Image
from tqdm import tqdm

import torchaudio
from vita import conversation as conversation_lib
from vita.config import *
from vita.config import AudioFolder, FolderDict
from vita.config.dataset_config import *
from vita.constants import AUDIO_TOKEN_INDEX, GLOBAL_WEIGHTS_PATH, IGNORE_INDEX, IMAGE_TOKEN_INDEX
from vita.util.data_utils_video_audio import DataArguments, LazySupervisedDataset
from vita.util.data_utils_video_audio_neg_patch import find_closest_aspect_ratio
from vita.util.mm_utils import tokenizer_image_audio_token, tokenizer_image_token

image_token_num = 256
token_thre = 9500
# datasets = NLP + HumanCentric + VideoQA + NaturalQA + VideoCap + OCRCap + NaturalCap
datasets = NaturalCap0 + OCRCap0 + VideoCap0 + NaturalQA0
# datasets = VideoQA + HumanCentric + NLP
# datasets = [SGInternvid0]
datasets = NaturalCap0
datasets = OCRCap0
datasets = VideoCap0 + NaturalQA0 + [TextSFT0]
out_file_name = "debug.json"

parser = transformers.HfArgumentParser((DataArguments))
tokenizer = transformers.AutoTokenizer.from_pretrained(
    f"{GLOBAL_WEIGHTS_PATH}/Mixtral-8x7B_New/mg2hg",
    cache_dir=None,
    model_max_length=8192,
    padding_side="right",
    use_fast=True,
)
long_json = []


def dynamic_preprocess(image, min_num=2, max_num=12, image_size=448, use_thumbnail=False, img_mean=0):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # expand target_aspect_ratio to even for each size
    new_target_aspect_ratio = [e if e % 2 == 0 else e + 1 for e in target_aspect_ratio]
    blocks_big = int(0.5 * new_target_aspect_ratio[0] * 0.5 * new_target_aspect_ratio[1])

    return blocks_big


def get_wav_duration(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    duration = waveform.size(1) / sample_rate
    return duration


def process_item(item, tokenizer):
    conv = conversation_lib.default_conversation.copy()
    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
    source = item["conversations"]
    conv.messages = []
    modality = "lang"
    for j, sentence in enumerate(source):
        role = roles[sentence["from"]]
        assert role == conv.roles[j % 2], f"{source}"
        conv.append_message(role, sentence["value"])
        if "<image>" in sentence["value"]:
            modality = "image"
        elif "<video>" in sentence["value"]:
            modality = "lang"
    prompt = conv.get_prompt(modality)
    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    item_token_num = input_ids.shape[0]

    if "image" in item:
        image_file = item["image"]
        if isinstance(image_file, str):
            image_file = [image_file]
        set_id = item["set"]
        if isinstance(set_id, str):
            set_id = [set_id]
        for k, img_file in enumerate(image_file):
            if set_id[k] not in NoPatchSets:
                image_directory = FolderDict[set_id[k]]
                image = Image.open(
                    os.path.join(image_directory, img_file.replace("\\", "/"))
                ).convert("RGB")
                num_patches = dynamic_preprocess(image)
            else:
                num_patches = 1
            item_token_num += num_patches * image_token_num

    total_duration = 0
    if "audio" in item:
        audio_files = item["audio"]
        audio_directory = AudioFolder
        if isinstance(audio_files, str):
            audio_files = [audio_files]
        assert isinstance(audio_files, list)
        for audio_file_name in audio_files:
            audio_file_path = os.path.join(audio_directory, "audio", audio_file_name)
            duration = get_wav_duration(audio_file_path)
            duration = (
                math.ceil(duration) if math.ceil(duration) % 2 == 0 else math.ceil(duration) + 1
            )
            total_duration += duration
        item_token_num += math.ceil(total_duration * 12.5)

    if item_token_num > token_thre:
        print(f"item_token_num: {item_token_num}")
        if len(item["image"]) >= 16:
            print(f"num_patches: {num_patches}")
            print(f"total_duration: {total_duration}")
            long_json.append(item)
            print(item)
    return item_token_num


for dataset in datasets:
    json_file_path = dataset["chat_path"]
    with open(json_file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    len_list = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_item, item, tokenizer) for item in data]
        for future in tqdm(as_completed(futures), total=len(futures)):
            len_list.append(future.result())
    assert len(len_list) == len(data)

    distribution = {
        "0-100": 0,
        "100-200": 0,
        "200-300": 0,
        "300-400": 0,
        "400-500": 0,
        "500-600": 0,
        "600-700": 0,
        "700-800": 0,
        "800-900": 0,
        "900-1000": 0,
        "1000-1500": 0,
        "1500-2000": 0,
        "2000-2500": 0,
        "2500-3000": 0,
        "3000-3500": 0,
        "3500-4000": 0,
        "4000-4500": 0,
        "4500-5000": 0,
        "5000-5500": 0,
        "5500-6000": 0,
        "6000-6500": 0,
        "6500-7000": 0,
        "7000-7500": 0,
        "7500-8000": 0,
        "8000-8500": 0,
        "8500-9000": 0,
        "9000-9500": 0,
        "9500-10000": 0,
        ">10000": 0,
    }
    for length in len_list:
        if length <= 100:
            distribution["0-100"] += 1
        elif length <= 200:
            distribution["100-200"] += 1
        elif length <= 300:
            distribution["200-300"] += 1
        elif length <= 400:
            distribution["300-400"] += 1
        elif length <= 500:
            distribution["400-500"] += 1
        elif length <= 600:
            distribution["500-600"] += 1
        elif length <= 700:
            distribution["600-700"] += 1
        elif length <= 800:
            distribution["700-800"] += 1
        elif length <= 900:
            distribution["800-900"] += 1
        elif length <= 1000:
            distribution["900-1000"] += 1
        elif length <= 1500:
            distribution["1000-1500"] += 1
        elif length <= 2000:
            distribution["1500-2000"] += 1
        elif length <= 2500:
            distribution["2000-2500"] += 1
        elif length <= 3000:
            distribution["2500-3000"] += 1
        elif length <= 3500:
            distribution["3000-3500"] += 1
        elif length <= 4000:
            distribution["3500-4000"] += 1
        elif length <= 4500:
            distribution["4000-4500"] += 1
        elif length <= 5000:
            distribution["4500-5000"] += 1
        elif length <= 5500:
            distribution["5000-5500"] += 1
        elif length <= 6000:
            distribution["5500-6000"] += 1
        elif length <= 6500:
            distribution["6000-6500"] += 1
        elif length <= 7000:
            distribution["6500-7000"] += 1
        elif length <= 7500:
            distribution["7000-7500"] += 1
        elif length <= 8000:
            distribution["7500-8000"] += 1
        elif length <= 8500:
            distribution["8000-8500"] += 1
        elif length <= 9000:
            distribution["8500-9000"] += 1
        elif length <= 9500:
            distribution["9000-9500"] += 1
        elif length <= 10000:
            distribution["9500-10000"] += 1
        else:
            distribution[">10000"] += 1

    print(f"Length distribution of {json_file_path}:")
    for key, value in distribution.items():
        print(f"{key}: {value}")

# with open(out_file_name, 'w', encoding='utf-8') as file:
#     json.dump(long_json*10, file, ensure_ascii=False, indent=4)
# print(f"Done; items longer than {token_thre} tokens saved to {out_file_name}")
```
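All of the concat and statistics scripts above share the same per-item token estimate: the tokenized text length, plus 256 tokens per image patch, plus audio at 12.5 tokens per second with each clip's duration rounded up to an even number of seconds. A minimal standalone sketch of that formula (the constants come from these scripts; the tokenized text length is taken as an input rather than recomputed with the tokenizer):

```python
# Minimal sketch of the per-item token estimate shared by the concat and
# statistics scripts above (constants taken from those scripts).
import math

IMAGE_TOKEN_NUM = 256     # tokens per image patch (image_token_num above)
AUDIO_TOKENS_PER_SEC = 12.5

def estimate_item_tokens(text_token_len, num_patches, audio_durations_sec):
    """Estimate an item's token length from its text, image, and audio parts."""
    total = text_token_len + num_patches * IMAGE_TOKEN_NUM
    total_duration = 0
    for dur in audio_durations_sec:
        dur = math.ceil(dur)
        if dur % 2 != 0:  # round each clip up to an even number of seconds
            dur += 1
        total_duration += dur
    return total + math.ceil(total_duration * AUDIO_TOKENS_PER_SEC)

# Example: 120 text tokens, 5 image patches, one 3.2-second audio clip
print(estimate_item_tokens(120, 5, [3.2]))  # 120 + 1280 + ceil(4 * 12.5) = 1450
```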