Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
ModelZoo
SenseNova-SI
Commits
876a36a4
Commit
876a36a4
authored
May 27, 2026
by
raojy
Browse files
first
parent
eda2afb8
Changes
175
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1179 additions
and
0 deletions
+1179
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/convert_to_int8.py
...training/intern_vl/internvl_chat/tools/convert_to_int8.py
+17
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/extract_mlp.py
...ain/training/intern_vl/internvl_chat/tools/extract_mlp.py
+19
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/extract_video_frames.py
...ing/intern_vl/internvl_chat/tools/extract_video_frames.py
+126
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/extract_vit.py
...ain/training/intern_vl/internvl_chat/tools/extract_vit.py
+16
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/images_stitching.py
...raining/intern_vl/internvl_chat/tools/images_stitching.py
+95
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/internvl_custom2hf.py
...ining/intern_vl/internvl_chat/tools/internvl_custom2hf.py
+206
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/internvl_hf2custom.py
...ining/intern_vl/internvl_chat/tools/internvl_hf2custom.py
+216
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/json2jsonl.py
...main/training/intern_vl/internvl_chat/tools/json2jsonl.py
+20
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/jsonl2jsonl.py
...ain/training/intern_vl/internvl_chat/tools/jsonl2jsonl.py
+22
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/merge_lora.py
...main/training/intern_vl/internvl_chat/tools/merge_lora.py
+32
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/replace_llm.py
...ain/training/intern_vl/internvl_chat/tools/replace_llm.py
+29
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/resize_pos_embed.py
...raining/intern_vl/internvl_chat/tools/resize_pos_embed.py
+27
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage1_config.json
.../training/intern_vl/internvl_chat/zero_stage1_config.json
+41
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage2_config.json
.../training/intern_vl/internvl_chat/zero_stage2_config.json
+41
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config.json
.../training/intern_vl/internvl_chat/zero_stage3_config.json
+44
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_100b.json
...ning/intern_vl/internvl_chat/zero_stage3_config_100b.json
+44
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_100b_1e7_offload.json
...vl/internvl_chat/zero_stage3_config_100b_1e7_offload.json
+52
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_100b_1e8.json
.../intern_vl/internvl_chat/zero_stage3_config_100b_1e8.json
+44
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_34b.json
...ining/intern_vl/internvl_chat/zero_stage3_config_34b.json
+44
-0
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_70b.json
...ining/intern_vl/internvl_chat/zero_stage3_config_70b.json
+44
-0
No files found.
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/convert_to_int8.py
0 → 100644
View file @
876a36a4
import
torch
from
transformers
import
AutoModel
,
AutoTokenizer
# Load InternVL-Chat-V1-5 with 8-bit loading enabled and export the model
# together with its tokenizer to a release folder.
path = "OpenGVLab/InternVL-Chat-V1-5"

load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    load_in_8bit=True,
)
model = AutoModel.from_pretrained(path, **load_kwargs)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

# Model first, tokenizer second, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("release/InternVL-Chat-V1-5-Int8")

print("finished")
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/extract_mlp.py
0 → 100644
View file @
876a36a4
import
argparse
import
os.path
import
torch
from
internvl.model.internvl_chat
import
InternVLChatModel
# Extract the ``mlp1`` projector of an InternVL chat checkpoint and store its
# weights as ``<output_path>/mlp_projector.pth``.
#
# The parser gets its own name instead of rebinding ``argparse`` (which hid
# the module), and the positional arguments no longer carry ``default=""``:
# argparse ignores defaults on required positionals.
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="Path to the InternVL chat checkpoint")
parser.add_argument("output_path", type=str, help="Directory receiving mlp_projector.pth")
args = parser.parse_args()

model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
model = model.mlp1.to(torch.bfloat16)

ckpt = model.state_dict()
if args.output_path:
    # torch.save does not create missing directories.
    os.makedirs(args.output_path, exist_ok=True)
output_path = os.path.join(args.output_path, "mlp_projector.pth")
torch.save(ckpt, output_path)
print("finished")
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/extract_video_frames.py
0 → 100644
View file @
876a36a4
import
concurrent.futures
import
json
import
os
import
av
import
numpy
as
np
import
torch
from
decord
import
VideoReader
,
cpu
from
PIL
import
Image
from
tqdm.auto
import
tqdm
# Number of frames requested from get_index() for every video.
num_segments = 1

# Root directories of the videos, one per evaluation dimension.
dimension10_dir = "./videos/20bn-something-something-v2"  # evaluation dimension 10
dimension11_dir = "./videos/EPIC-KITCHENS"  # evaluation dimension 11
dimension12_dir = "./videos/BreakfastII_15fps_qvga_sync"  # evaluation dimension 12
def transform_video(buffer):
    """Convert a batch of decoded frames into a list of PIL images.

    Args:
        buffer: Frame batch exposing either ``.numpy()`` or ``.asnumpy()``.
            The converted array is iterated along its first axis.

    Returns:
        list: One image per frame, in the original order.

    Raises:
        TypeError: If ``buffer`` offers neither conversion method.
    """
    try:
        frames = buffer.numpy()
    except AttributeError:
        try:
            frames = buffer.asnumpy()
        except AttributeError as err:
            # The previous version only printed this message, replaced the
            # buffer with None and then died on ``len(None)``; fail here
            # with the real cause instead.
            raise TypeError(
                "Both buffer.numpy() and buffer.asnumpy() failed."
            ) from err
    return [Image.fromarray(frame) for frame in frames]
def get_index(num_frames, num_segments):
    """Return frame offsets that sample a clip uniformly.

    Args:
        num_frames: Number of frames available in the clip.
        num_segments: Number of frames to sample.

    Returns:
        numpy.ndarray: Integer offsets. When more samples are requested than
        frames exist, every frame offset ``0 .. num_frames - 1`` is returned.
    """
    if num_segments > num_frames:
        # ``arange`` keeps an integer dtype even for an empty result; the
        # old ``np.array([])`` produced float64, which cannot index frames.
        return np.arange(num_frames, dtype=np.int64)

    # Uniform sampling: step by ``seg_size`` starting half a segment in.
    seg_size = float(num_frames - 1) / num_segments
    start = int(seg_size / 2)
    steps = np.round(seg_size * np.arange(num_segments)).astype(np.int64)
    return start + steps
def fetch_images(qa_item):
    """Decode the sampled frame(s) for one video question.

    Args:
        qa_item: Question record. Reads ``question_type_id`` (10, 11 or 12),
            ``data_id`` and, for types 11 and 12, ``segment`` (a pair of
            start/end values).

    Returns:
        list: PIL images produced by ``transform_video``.

    Raises:
        ValueError: If ``question_type_id`` is not 10, 11 or 12.
    """
    question_type = qa_item["question_type_id"]
    use_pyav = False
    segment = None
    if question_type == 10:
        data_path = os.path.join(dimension10_dir, qa_item["data_id"])
    elif question_type == 11:
        # Only the file name of ``data_id`` is used for dimension 11.
        data_path = os.path.join(dimension11_dir, qa_item["data_id"].split("/")[-1])
        segment = qa_item["segment"]
    elif question_type == 12:
        data_path = os.path.join(dimension12_dir, qa_item["data_id"])
        segment = qa_item["segment"]
        use_pyav = True
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``data_path``.
        raise ValueError(f"Unsupported question_type_id: {question_type!r}")

    if use_pyav:
        # using pyav for decoding videos in evaluation dimension 12
        start, end = segment[0], segment[1]
        # The context manager closes the container once frames are copied out.
        with av.open(data_path) as reader:
            frames = [
                torch.from_numpy(f.to_rgb().to_ndarray())
                for f in reader.decode(video=0)
            ]
        video_len = len(frames)
        # Here the segment bounds are used directly as frame positions.
        start_frame = start
        end_frame = min(end, video_len)
        offset = get_index(end_frame - start_frame, num_segments)
        frame_indices = offset + start_frame
        buffer = torch.stack([frames[idx] for idx in frame_indices])
    else:
        # using decord for decoding videos in evaluation dimension 10-11
        vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
        video_len = len(vr)
        if segment is not None:
            # obtain start and end frame for the video segment in
            # evaluation dimension 11 (bounds are scaled by the average fps
            # and clamped to the valid frame range)
            fps = vr.get_avg_fps()
            start, end = segment[0], segment[1]
            start_frame = int(min(max(start * fps, 0), video_len - 1))
            end_frame = int(min(max(end * fps, 0), video_len - 1))
            tot_frames = int(end_frame - start_frame)
            offset = get_index(tot_frames, num_segments)
            frame_indices = offset + start_frame
        else:
            # sample frames of the video in evaluation dimension 10
            frame_indices = get_index(video_len - 1, num_segments)
        vr.seek(0)
        buffer = vr.get_batch(frame_indices)
    return transform_video(buffer)
def fetch_images_parallel(qa_item):
    """Worker wrapper: return the question together with its decoded images."""
    images = fetch_images(qa_item)
    return qa_item, images
if __name__ == "__main__":
    # Extract one PNG per SEED-Bench video question into ``video_img_dir``.
    with open("SEED-Bench.json", encoding="utf-8") as ann_file:
        data = json.load(ann_file)

    video_img_dir = "SEED-Bench-video-image"
    # Image.save does not create the target directory.
    os.makedirs(video_img_dir, exist_ok=True)

    video_data = [x for x in data["questions"] if x["data_type"] == "video"]

    # NOTE: the previous version also did ``open(output, "w")`` with an
    # undefined ``output`` (NameError) and never wrote to that file; it and
    # the unused id->name mapping were removed.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_images = {
            executor.submit(fetch_images_parallel, qa_item): qa_item
            for qa_item in video_data
        }
        for future in tqdm(
            concurrent.futures.as_completed(future_to_images),
            total=len(future_to_images),
        ):
            qa_item = future_to_images[future]
            try:
                qa_item, images = future.result()
            except Exception as exc:
                # Best effort: report the failing item and keep going.
                print(f"{qa_item} generated an exception: {exc}")
                continue
            if not images:
                # No frame decoded; skip instead of aborting on images[0].
                print(f"{qa_item} generated an exception: no frames decoded")
                continue
            img_file = f"{qa_item['question_type_id']}_{qa_item['question_id']}.png"
            images[0].save(os.path.join(video_img_dir, img_file))
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/extract_vit.py
0 → 100644
View file @
876a36a4
import
argparse
import
torch
from
internvl.model.internvl_chat
import
InternVLChatModel
# Extract the vision encoder (``vision_model``) of an InternVL chat
# checkpoint and save it as a standalone model.
#
# The parser gets its own name instead of rebinding ``argparse`` (which hid
# the module), and the positional arguments no longer carry ``default=""``:
# argparse ignores defaults on required positionals.
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="Path to the InternVL chat checkpoint")
parser.add_argument("output_path", type=str, help="Directory receiving the vision model")
args = parser.parse_args()

model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
model = model.vision_model.to(torch.bfloat16)
model.save_pretrained(args.output_path)
print("finished")
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/images_stitching.py
0 → 100644
View file @
876a36a4
import
argparse
import
json
import
os
from
PIL
import
Image
,
ImageDraw
,
ImageFont
from
tqdm
import
tqdm
# Font handed to ImageDraw.text() when labelling tiles: DejaVu Sans Bold, size 50.
FOOT = ImageFont.truetype(
    font="/usr/share/fonts/dejavu/DejaVuSans-Bold.ttf",
    size=50,
)
def custom_image(img_paths, save_path, image_size=448, data_root=None):
    """Stitch the six camera views of each key frame into one 3x2 image.

    Args:
        img_paths: Mapping ``image_id -> {camera_name: relative_image_path}``.
            Every entry must provide all six camera names listed below.
        save_path: Directory receiving ``<image_id>.jpg``.
        image_size: Tile height in pixels; tiles are twice as wide as tall.
        data_root: Directory the relative image paths are resolved against.
            When omitted, the module-level ``args.data_root`` set by the CLI
            entry point is used, as before.

    Raises:
        KeyError: If a frame lacks one of the six camera views.
    """
    if data_root is None:
        # Backward compatibility with callers that rely on the CLI namespace.
        data_root = args.data_root

    captions = [
        "CAM_FRONT_LEFT",
        "CAM_FRONT",
        "CAM_FRONT_RIGHT",
        "CAM_BACK_LEFT",
        "CAM_BACK",
        "CAM_BACK_RIGHT",
    ]
    width = image_size * 2
    height = image_size

    for image_id, image_files in tqdm(img_paths.items()):
        tiles = {}
        for caption, image_file in image_files.items():
            # The replacement must stay relative: os.path.join() drops
            # ``data_root`` entirely when the second component starts with
            # "/", which the old "/nuscenes/samples/" replacement caused.
            relative = image_file.replace("../nuscenes/samples/", "nuscenes/samples/")
            image_path = os.path.join(data_root, relative)
            # Close the source file as soon as the pixels are copied.
            with Image.open(image_path) as raw:
                img = raw.convert("RGB")
            img = img.resize((width, height))
            # Stamp the camera name in the top-left corner of the tile.
            ImageDraw.Draw(img).text((0, 0), caption, fill=(255, 0, 255), font=FOOT)
            tiles[caption] = img

        result_img = Image.new("RGB", (width * 3, height * 2))
        for i, caption in enumerate(captions):
            row, col = divmod(i, 3)
            result_img.paste(tiles[caption], (col * width, row * height))
        result_img.save(os.path.join(save_path, image_id + ".jpg"))
def get_images(ann_file):
    """Collect the per-frame camera image paths from an annotation file.

    Args:
        ann_file: Path to a JSON file shaped like
            ``{scene_id: {"key_frames": {frame_id: {"image_paths": ...}}}}``.

    Returns:
        dict: ``"<scene_id>_<frame_id>" -> image_paths``. If two frames
        produce the same id, the first one wins and the id is printed.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(ann_file, "r", encoding="utf-8") as f:
        train_file = json.load(f)

    images = {}
    for scene_id, scene in train_file.items():
        for frame_id, frame in scene["key_frames"].items():
            image_id = scene_id + "_" + frame_id
            if image_id in images:
                # Report the clashing id and keep the first occurrence.
                print(image_id)
                continue
            images[image_id] = frame["image_paths"]
    return images
if __name__ == "__main__":
    # CLI: image root directory and the annotation file to read.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--data-root",
        type=str,
        default="InternVL-Domain-Adaptation-Data/images/drivelm",
    )
    cli.add_argument("--ann-file", type=str, default="path/to/v1_1_val_nus_q_only.json")
    # ``args`` stays a module-level name because custom_image() reads it.
    args = cli.parse_args()

    images = get_images(args.ann_file)

    # Stitched images go to a "stitch" folder inside the data root.
    save_path = os.path.join(args.data_root, "stitch")
    os.makedirs(save_path, exist_ok=True)
    custom_image(img_paths=images, save_path=save_path)
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/internvl_custom2hf.py
0 → 100644
View file @
876a36a4
import
argparse
import
json
import
os
from
collections
import
OrderedDict
from
copy
import
deepcopy
import
torch
from
safetensors
import
safe_open
from
transformers
import
(
AutoConfig
,
AutoModel
,
AutoModelForImageTextToText
,
AutoTokenizer
,
)
def compute_l2_distance(model1, model2):
    """Report how far apart the weights of two models are.

    Only keys present in both state dicts are compared. Entries whose shapes
    differ are reported and skipped.

    Args:
        model1: Object exposing ``state_dict()``.
        model2: Object exposing ``state_dict()``.

    Returns:
        float: Sum over the shared keys of the L2 norm of the per-tensor
        difference (a sum of norms, not the norm of the full difference).
    """
    state_dict1 = model1.state_dict()
    state_dict2 = model2.state_dict()

    total_l2 = 0.0
    total_params = 0

    # Sorted so the log order and the float accumulation are reproducible.
    for key in sorted(state_dict1.keys() & state_dict2.keys()):
        t1, t2 = state_dict1[key], state_dict2[key]
        # Compare shapes before paying for the float/CPU copies.
        if t1.shape != t2.shape:
            print(f"⚠️ Shape mismatch at key: {key}, skipping.")
            continue
        diff = t1.float().cpu() - t2.float().cpu()
        total_l2 += torch.norm(diff, p=2).item()
        total_params += diff.numel()

    print(f"\n✅ Total L2 distance: {total_l2:.6f}")
    print(
        f"✅ Average per-parameter L2: {total_l2 / total_params:.8f}"
        if total_params > 0
        else "⚠️ No matching parameters."
    )
    return total_l2
def convert_keys_to_hf(custom_state_dict):
    """Rename custom InternVL checkpoint keys to the HF-style layout.

    Args:
        custom_state_dict: Mapping of custom parameter names to tensors.

    Returns:
        OrderedDict: Renamed entries in input order, followed by the
        ``q_proj`` / ``k_proj`` / ``v_proj`` tensors obtained by splitting
        every fused ``attn.qkv`` weight or bias into three equal parts
        along dim 0.

    Raises:
        ValueError: If a fused QKV tensor's first dimension is not a
            multiple of 3 (it could not be split evenly).
    """
    new_state_dict = OrderedDict()
    qkv_split_buffer = {}

    for key, value in custom_state_dict.items():
        # === 1. mlp1.* -> multi_modal_projector
        if key.startswith("mlp1.0."):
            new_key = "model." + key.replace("mlp1.0.", "multi_modal_projector.layer_norm.")
        elif key.startswith("mlp1.1."):
            new_key = "model." + key.replace("mlp1.1.", "multi_modal_projector.linear_1.")
        elif key.startswith("mlp1.3."):
            new_key = "model." + key.replace("mlp1.3.", "multi_modal_projector.linear_2.")

        # === 2. embeddings ===
        elif key == "vision_model.embeddings.class_embedding":
            new_key = "model.vision_tower.embeddings.cls_token"
        elif key.startswith("vision_model.embeddings.patch_embedding"):
            new_key = "model." + key.replace(
                "vision_model.embeddings.patch_embedding",
                "vision_tower.embeddings.patch_embeddings.projection",
            )
        elif key == "vision_model.embeddings.position_embedding":
            new_key = "model.vision_tower.embeddings.position_embeddings"

        # === 3. encoder ===
        elif key.startswith("vision_model.encoder.layers."):
            parts = key.split(".")
            layer_id = parts[3]
            suffix = ".".join(parts[4:])
            leaf = parts[-1]
            base = f"model.vision_tower.encoder.layer.{layer_id}."

            if suffix.startswith("attn.qkv.weight"):
                # Fused QKV is split after the main loop.
                qkv_split_buffer[(layer_id, "weight")] = value
                continue
            elif suffix.startswith("attn.qkv.bias"):
                qkv_split_buffer[(layer_id, "bias")] = value
                continue
            elif suffix.startswith("attn.proj."):
                new_key = base + "attention.projection_layer." + leaf
            elif suffix.startswith("norm1."):
                new_key = base + "layernorm_before." + leaf
            elif suffix.startswith("norm2."):
                new_key = base + "layernorm_after." + leaf
            elif suffix == "ls1":
                new_key = base + "lambda_1"
            elif suffix == "ls2":
                new_key = base + "lambda_2"
            else:
                new_key = base + suffix

        # === 4. language_model.model. -> language_model.
        elif key in (
            "language_model.lm_head.weight",
            "language_model.model.lm_head.weight",
        ):
            new_key = "lm_head.weight"
        elif key.startswith("language_model.model."):
            new_key = "model." + key.replace("language_model.model.", "language_model.")

        # === 5. already has model. prefix or default
        elif key.startswith("model."):
            new_key = key
        else:
            new_key = "model." + key

        new_state_dict[new_key] = value

    # === 6. Split QKV ===
    for (layer_id, typ), tensor in qkv_split_buffer.items():
        rows = tensor.shape[0]
        if rows % 3 != 0:
            # An uneven split would silently hand the remainder to V.
            raise ValueError(
                f"Fused QKV {typ} of vision layer {layer_id} has first "
                f"dimension {rows}, which is not divisible by 3."
            )
        d = rows // 3
        base = f"model.vision_tower.encoder.layer.{layer_id}.attention."
        pieces = (
            ("q_proj", tensor[:d]),
            ("k_proj", tensor[d : 2 * d]),
            ("v_proj", tensor[2 * d :]),
        )
        for proj, piece in pieces:
            new_state_dict[f"{base}{proj}.{typ}"] = piece

    return new_state_dict
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert custom safetensors weights and compare with HuggingFace model."
    )
    parser.add_argument(
        "--custom_path",
        type=str,
        required=True,
        help="Path to original safetensors checkpoint folder",
    )
    parser.add_argument(
        "--hf_path",
        type=str,
        required=True,
        help="Path to pretrained HuggingFace model",
    )
    parser.add_argument(
        "--save_path", type=str, required=True, help="Path to save the converted model"
    )
    args = parser.parse_args()

    mllm_custom_path = args.custom_path
    mllm_hf_path = args.hf_path
    mllm_save_path = args.save_path

    # Build an uninitialised HF-style model from the config at --hf_path.
    config = AutoConfig.from_pretrained(mllm_hf_path, trust_remote_code=True)
    # Use the GPU when present, but do not require one: the comparison below
    # moves every tensor to the CPU anyway.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForImageTextToText.from_config(config, trust_remote_code=True).to(device)

    # Gather the *custom* safetensors shards found under --custom_path.
    checkpoint_paths = sorted(
        os.path.join(mllm_custom_path, f)
        for f in os.listdir(mllm_custom_path)
        if f.endswith(".safetensors")
    )
    print(f"\n🔍 Found checkpoint files: {checkpoint_paths}")
    if not checkpoint_paths:
        # Otherwise a randomly initialised model would be saved silently.
        raise FileNotFoundError(f"No .safetensors files found in {mllm_custom_path!r}")

    custom_state_dict = {}
    for checkpoint_path in checkpoint_paths:
        with safe_open(checkpoint_path, framework="pt") as f:
            for k in f.keys():
                custom_state_dict[k] = f.get_tensor(k)

    # Convert key naming style
    model_state_dict = convert_keys_to_hf(custom_state_dict)

    # Load weights into model; strict=False so mismatches are reported below.
    missing_keys, unexpected_keys = model.load_state_dict(model_state_dict, strict=False)
    print(f"\n❌ Missing keys: {missing_keys}")
    print(f"⚠️ Unexpected keys: {unexpected_keys}")

    # Load the pretrained HF model for comparison
    model_compare = AutoModelForImageTextToText.from_pretrained(
        mllm_hf_path, trust_remote_code=True
    )
    compute_l2_distance(model, model_compare)

    # Save the converted model together with the HF tokenizer
    model.save_pretrained(mllm_save_path)
    tokenizer = AutoTokenizer.from_pretrained(mllm_hf_path, trust_remote_code=True)
    tokenizer.save_pretrained(mllm_save_path)
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/internvl_hf2custom.py
0 → 100644
View file @
876a36a4
import
argparse
import
json
import
os
from
copy
import
deepcopy
import
torch
from
safetensors
import
safe_open
from
transformers
import
AutoConfig
,
AutoModel
,
AutoTokenizer
def compute_l2_distance(model1, model2):
    """Report how far apart the weights of two models are.

    Only keys present in both state dicts are compared. Entries whose shapes
    differ are reported and skipped.

    Args:
        model1: Object exposing ``state_dict()``.
        model2: Object exposing ``state_dict()``.

    Returns:
        float: Sum over the shared keys of the L2 norm of the per-tensor
        difference (a sum of norms, not the norm of the full difference).
    """
    state_dict1 = model1.state_dict()
    state_dict2 = model2.state_dict()

    total_l2 = 0.0
    total_params = 0

    # Sorted so the log order and the float accumulation are reproducible.
    for key in sorted(state_dict1.keys() & state_dict2.keys()):
        t1, t2 = state_dict1[key], state_dict2[key]
        # Compare shapes before paying for the float/CPU copies.
        if t1.shape != t2.shape:
            print(f"⚠️ Shape mismatch at key: {key}, skipping.")
            continue
        diff = t1.float().cpu() - t2.float().cpu()
        total_l2 += torch.norm(diff, p=2).item()
        total_params += diff.numel()

    print(f"\n✅ Total L2 distance: {total_l2:.6f}")
    print(
        f"✅ Average per-parameter L2: {total_l2 / total_params:.8f}"
        if total_params > 0
        else "⚠️ No matching parameters."
    )
    return total_l2
def convert_keys_back(hf_state_dict):
    """Rename HF-style InternVL keys back to the custom checkpoint layout.

    Args:
        hf_state_dict: Mapping of HF parameter names (without a ``model.``
            prefix) to tensors.

    Returns:
        dict: Renamed entries in input order, followed by one fused
        ``attn.qkv`` weight and/or bias per vision layer, built by
        concatenating q, k and v along dim 0. Keys matching no rule are
        copied unchanged.
    """
    projections = ("q_proj", "k_proj", "v_proj")
    qkv_suffixes = {
        f"attention.{proj}.{kind}"
        for proj in projections
        for kind in ("weight", "bias")
    }

    new_state_dict = {}
    # layer_id -> {"weight": {proj: tensor}, "bias": {proj: tensor}}
    qkv_buffer = {}

    for key, value in hf_state_dict.items():
        # === 1. multi_modal_projector -> mlp1.*
        if key.startswith("multi_modal_projector.layer_norm."):
            new_key = key.replace("multi_modal_projector.layer_norm.", "mlp1.0.")
        elif key.startswith("multi_modal_projector.linear_1."):
            new_key = key.replace("multi_modal_projector.linear_1.", "mlp1.1.")
        elif key.startswith("multi_modal_projector.linear_2."):
            new_key = key.replace("multi_modal_projector.linear_2.", "mlp1.3.")

        # === 2. embeddings ===
        elif key == "vision_tower.embeddings.cls_token":
            new_key = "vision_model.embeddings.class_embedding"
        elif key.startswith("vision_tower.embeddings.patch_embeddings.projection."):
            new_key = key.replace(
                "vision_tower.embeddings.patch_embeddings.projection",
                "vision_model.embeddings.patch_embedding",
            )
        elif key == "vision_tower.embeddings.position_embeddings":
            new_key = "vision_model.embeddings.position_embedding"

        # === 3. encoder.layer.X -> encoder.layers.X
        elif key.startswith("vision_tower.encoder.layer."):
            parts = key.split(".")
            layer_id = parts[3]
            suffix = ".".join(parts[4:])
            leaf = parts[-1]
            base = f"vision_model.encoder.layers.{layer_id}."

            if suffix in qkv_suffixes:
                # suffix is exactly "attention.<proj>.<kind>"; park the
                # tensor and concatenate once all keys have been seen.
                _, proj, kind = suffix.split(".")
                layer_parts = qkv_buffer.setdefault(layer_id, {"weight": {}, "bias": {}})
                layer_parts[kind][proj] = value
                continue
            elif suffix.startswith("attention.projection_layer."):
                new_key = base + "attn.proj." + leaf
            elif suffix.startswith("layernorm_before."):
                new_key = base + "norm1." + leaf
            elif suffix.startswith("layernorm_after."):
                new_key = base + "norm2." + leaf
            elif suffix == "lambda_1":
                new_key = base + "ls1"
            elif suffix == "lambda_2":
                new_key = base + "ls2"
            else:
                new_key = base + suffix
        else:
            new_key = key

        new_state_dict[new_key] = value

    # === 4. Concatenate QKV weights and biases ===
    for layer_id, qkv_parts in qkv_buffer.items():
        base = f"vision_model.encoder.layers.{layer_id}.attn.qkv"
        for kind in ("weight", "bias"):
            found = qkv_parts[kind]
            if all(proj in found for proj in projections):
                new_state_dict[f"{base}.{kind}"] = torch.cat(
                    [found[proj] for proj in projections], dim=0
                )
            elif found:
                # These tensors used to vanish without a trace; the fused
                # entry still cannot be built, so at least say so.
                print(
                    f"⚠️ Incomplete QKV {kind} for vision layer {layer_id}: "
                    f"found {sorted(found)}, skipping."
                )

    return new_state_dict
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Convert HF model weights to original custom key format and compare."
    )
    cli.add_argument(
        "--custom_path",
        type=str,
        required=True,
        help="Path to custom model config and tokenizer",
    )
    cli.add_argument(
        "--hf_path",
        type=str,
        required=True,
        help="Path to HF-formatted safetensor weights",
    )
    cli.add_argument("--save_path", type=str, required=True, help="Path to save converted model")
    args = cli.parse_args()

    mllm_custom_path, mllm_hf_path, mllm_save_path = (
        args.custom_path,
        args.hf_path,
        args.save_path,
    )

    # Instantiate the custom architecture from its config (no weights yet).
    config = AutoConfig.from_pretrained(mllm_custom_path, trust_remote_code=True)
    model = AutoModel.from_config(config, trust_remote_code=True)

    # Collect every safetensors shard stored under the HF path.
    checkpoint_paths = []
    for file_name in os.listdir(mllm_hf_path):
        if file_name.endswith(".safetensors"):
            checkpoint_paths.append(os.path.join(mllm_hf_path, file_name))
    print(f"\n🔍 Found checkpoint files: {checkpoint_paths}")

    # Merge all shards into a single name -> tensor mapping.
    model_state_dict_hf = {}
    for checkpoint_path in checkpoint_paths:
        with safe_open(checkpoint_path, framework="pt") as shard:
            for tensor_name in shard.keys():
                model_state_dict_hf[tensor_name] = shard.get_tensor(tensor_name)

    # Rename HF keys to the custom layout, then load them non-strictly so
    # that any mismatch is printed rather than raised.
    model_state_dict = convert_keys_back(model_state_dict_hf)
    load_report = model.load_state_dict(model_state_dict, strict=False)
    missing_keys, unexpected_keys = load_report
    print(f"\n❌ Missing keys: {missing_keys}")
    print(f"⚠️ Unexpected keys: {unexpected_keys}")

    # Compare against the model loaded directly from the custom path.
    model_compare = AutoModel.from_pretrained(mllm_custom_path, trust_remote_code=True)
    compute_l2_distance(model, model_compare)

    # Write the converted model and the custom tokenizer side by side.
    model.save_pretrained(mllm_save_path)
    tokenizer = AutoTokenizer.from_pretrained(mllm_custom_path, trust_remote_code=True)
    tokenizer.save_pretrained(mllm_save_path)
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/json2jsonl.py
0 → 100644
View file @
876a36a4
import
argparse
import
json
def convert_json_to_jsonl(path):
    """Rewrite a JSON list of chat records as a JSON Lines file.

    Each record loses a leading ``system`` turn (if any) and receives an
    ``id`` equal to its position in the list.

    Args:
        path: Input file; must end with ``.json``.

    Returns:
        str: Path of the written file (same name, ``.jsonl`` suffix).

    Raises:
        ValueError: If ``path`` does not end with ``.json``.
    """
    if not path.endswith(".json"):
        # An ``assert`` would be stripped under ``python -O``.
        raise ValueError(f"Expected a .json file, got: {path!r}")
    # Swap only the suffix; str.replace() would also rewrite ".json"
    # occurring in a parent directory name.
    output_path = path[: -len(".json")] + ".jsonl"

    with open(path, encoding="utf-8") as reader:
        data = json.load(reader)

    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding.
    with open(output_path, "w", encoding="utf-8") as writer:
        for idx, item in enumerate(data):
            conversations = item["conversations"]
            if conversations and conversations[0]["from"] == "system":
                item["conversations"] = conversations[1:]
            item["id"] = idx
            writer.write(json.dumps(item, ensure_ascii=False) + "\n")
    return output_path


if __name__ == "__main__":
    # Own name for the parser: rebinding ``argparse`` hid the module.
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str)
    args = parser.parse_args()
    convert_json_to_jsonl(args.path)
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/jsonl2jsonl.py
0 → 100644
View file @
876a36a4
import
argparse
import
json
import
os
def renumber_jsonl(path):
    """Copy a JSON Lines chat file, renumbering ids and dropping system turns.

    Each record receives an ``id`` equal to its position and loses a leading
    ``system`` turn (if any). Blank lines in the input are ignored.

    Args:
        path: Input file; must end with ``.jsonl``.

    Returns:
        str: Path of the written file (``<name>_new.jsonl``).

    Raises:
        ValueError: If ``path`` does not end with ``.jsonl``.
    """
    if not path.endswith(".jsonl"):
        # An ``assert`` would be stripped under ``python -O``.
        raise ValueError(f"Expected a .jsonl file, got: {path!r}")
    # Swap only the suffix; str.replace() would also rewrite ".jsonl"
    # occurring in a parent directory name.
    output_path = path[: -len(".jsonl")] + "_new.jsonl"

    # Parse everything first so a malformed line aborts before any output
    # file is created.
    with open(path, encoding="utf-8") as reader:
        data = [json.loads(line) for line in reader if line.strip()]

    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding.
    with open(output_path, "w", encoding="utf-8") as writer:
        for idx, item in enumerate(data):
            item["id"] = idx
            conversations = item["conversations"]
            if conversations and conversations[0]["from"] == "system":
                item["conversations"] = conversations[1:]
            writer.write(json.dumps(item, ensure_ascii=False) + "\n")
    return output_path


if __name__ == "__main__":
    # Own name for the parser: rebinding ``argparse`` hid the module.
    parser = argparse.ArgumentParser()
    parser.add_argument("path", type=str)
    args = parser.parse_args()
    renumber_jsonl(args.path)
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/merge_lora.py
0 → 100644
View file @
876a36a4
import
argparse
import
torch
from
internvl.model.internvl_chat
import
InternVLChatModel
from
transformers
import
AutoTokenizer
# Fold LoRA adapters of an InternVL chat checkpoint into the base weights and
# save the merged model together with its tokenizer.
#
# The parser gets its own name instead of rebinding ``argparse``, which hid
# the module for the rest of the script.
parser = argparse.ArgumentParser()
parser.add_argument("input_path", type=str, help="Path to the input model")
parser.add_argument("output_path", type=str, help="Path to the output model")
args = parser.parse_args()

print("Loading model...")
model = InternVLChatModel.from_pretrained(
    args.input_path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16
).eval()
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(args.input_path, trust_remote_code=True)

if model.config.use_backbone_lora:
    # Merge the vision adapters, swap the wrapper for the model it holds in
    # ``.model``, and clear the flag so the saved config matches.
    # NOTE(review): assumes merge_and_unload() merges in place - confirm.
    model.vision_model.merge_and_unload()
    model.vision_model = model.vision_model.model
    model.config.use_backbone_lora = 0
if model.config.use_llm_lora:
    # Same procedure for the language model.
    model.language_model.merge_and_unload()
    model.language_model = model.language_model.model
    model.config.use_llm_lora = 0

print("Saving model...")
model.save_pretrained(args.output_path)
print("Saving tokenizer...")
tokenizer.save_pretrained(args.output_path)
print("Done!")
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/replace_llm.py
0 → 100644
View file @
876a36a4
import
argparse
import
torch
from
internvl.model.internvl_chat
import
InternVLChatModel
from
transformers
import
AutoModel
,
AutoTokenizer
# Swap the language model inside an InternVL chat checkpoint for another LLM
# and save the result next to the original as ``<model_path>_replace_llm``.
#
# The parser gets its own name instead of rebinding ``argparse`` (which hid
# the module), and the positional arguments no longer carry ``default=""``:
# argparse ignores defaults on required positionals.
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="Path to the InternVL chat checkpoint")
parser.add_argument("llm_path", type=str, help="Path to the replacement language model")
args = parser.parse_args()

# Drop trailing slashes so the suffix below extends the directory name.
# rstrip() also copes with an empty string, where ``model_path[-1]`` raised
# IndexError, and with several trailing slashes.
args.model_path = args.model_path.rstrip("/")

model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
llm = AutoModel.from_pretrained(
    args.llm_path, trust_remote_code=True, torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(args.llm_path, trust_remote_code=True)

# Replace the module and keep the saved config in sync with it.
model.language_model = llm
model.config.llm_config = llm.config
model.to(torch.bfloat16)

output_path = args.model_path + "_replace_llm"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
print("finished")
SenseNova-SI-main/training/intern_vl/internvl_chat/tools/resize_pos_embed.py
0 → 100644
View file @
876a36a4
import
argparse
import
torch
from
internvl.model.internvl_chat
import
InternVLChatModel
from
transformers
import
AutoTokenizer
# Resize the vision position embeddings of an InternVL chat checkpoint to a
# new input resolution and save the updated model and tokenizer.
#
# The parser gets its own name instead of rebinding ``argparse``, which hid
# the module for the rest of the script.
parser = argparse.ArgumentParser()
parser.add_argument("model_path", type=str, help="Path to the InternVL chat checkpoint")
parser.add_argument("output_path", type=str, help="Directory receiving the resized model")
# nargs="?" makes the argument optional so the default of 448 can apply;
# as a plain positional it was required and the default was dead.
parser.add_argument("force_image_size", type=int, nargs="?", default=448)
args = parser.parse_args()

model = InternVLChatModel.from_pretrained(args.model_path, torch_dtype=torch.bfloat16)
model.vision_model.resize_pos_embeddings(
    old_size=model.config.vision_config.image_size,
    new_size=args.force_image_size,
    # NOTE(review): patch size is hard-coded; confirm it matches the
    # checkpoint's vision config.
    patch_size=14,
)
# Keep both config fields in sync with the new resolution.
model.config.vision_config.image_size = args.force_image_size
model.config.force_image_size = args.force_image_size

model.save_pretrained(args.output_path)

# trust_remote_code=True matches how the sibling tools load tokenizers.
tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
tokenizer.save_pretrained(args.output_path)
print("finished")
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage1_config.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
1
,
"allgather_partitions"
:
true
,
"allgather_bucket_size"
:
1e9
,
"overlap_comm"
:
true
,
"reduce_scatter"
:
true
,
"reduce_bucket_size"
:
1e9
,
"contiguous_gradients"
:
true
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
true
}
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage2_config.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
2
,
"allgather_partitions"
:
true
,
"allgather_bucket_size"
:
1e8
,
"overlap_comm"
:
true
,
"reduce_scatter"
:
true
,
"reduce_bucket_size"
:
1e8
,
"contiguous_gradients"
:
true
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
false
}
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
3
,
"overlap_comm"
:
true
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e9
,
"reduce_bucket_size"
:
1e9
,
"stage3_prefetch_bucket_size"
:
1e9
,
"stage3_param_persistence_threshold"
:
1e7
,
"stage3_max_live_parameters"
:
1e9
,
"stage3_max_reuse_distance"
:
1e9
,
"stage3_gather_16bit_weights_on_model_save"
:
true
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
true
}
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_100b.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
3
,
"overlap_comm"
:
true
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e9
,
"reduce_bucket_size"
:
1e9
,
"stage3_prefetch_bucket_size"
:
1e9
,
"stage3_param_persistence_threshold"
:
1e4
,
"stage3_max_live_parameters"
:
1e9
,
"stage3_max_reuse_distance"
:
1e9
,
"stage3_gather_16bit_weights_on_model_save"
:
true
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
true
}
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_100b_1e7_offload.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
3
,
"overlap_comm"
:
false
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e7
,
"reduce_bucket_size"
:
1e7
,
"stage3_prefetch_bucket_size"
:
1e7
,
"stage3_param_persistence_threshold"
:
1e4
,
"stage3_max_live_parameters"
:
1e8
,
"stage3_max_reuse_distance"
:
1e8
,
"stage3_gather_16bit_weights_on_model_save"
:
true
,
"offload_param"
:
{
"device"
:
"cpu"
,
"pin_memory"
:
true
},
"offload_optimizer"
:
{
"device"
:
"cpu"
,
"pin_memory"
:
true
}
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
true
}
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_100b_1e8.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
3
,
"overlap_comm"
:
true
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e8
,
"reduce_bucket_size"
:
1e8
,
"stage3_prefetch_bucket_size"
:
1e8
,
"stage3_param_persistence_threshold"
:
1e4
,
"stage3_max_live_parameters"
:
1e9
,
"stage3_max_reuse_distance"
:
1e9
,
"stage3_gather_16bit_weights_on_model_save"
:
true
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
true
}
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_34b.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
3
,
"overlap_comm"
:
true
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e9
,
"reduce_bucket_size"
:
1e9
,
"stage3_prefetch_bucket_size"
:
1e9
,
"stage3_param_persistence_threshold"
:
1e5
,
"stage3_max_live_parameters"
:
1e9
,
"stage3_max_reuse_distance"
:
1e9
,
"stage3_gather_16bit_weights_on_model_save"
:
true
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
true
}
SenseNova-SI-main/training/intern_vl/internvl_chat/zero_stage3_config_70b.json
0 → 100644
View file @
876a36a4
{
"zero_optimization"
:
{
"stage"
:
3
,
"overlap_comm"
:
true
,
"contiguous_gradients"
:
true
,
"sub_group_size"
:
1e9
,
"reduce_bucket_size"
:
1e9
,
"stage3_prefetch_bucket_size"
:
1e9
,
"stage3_param_persistence_threshold"
:
1e5
,
"stage3_max_live_parameters"
:
1e9
,
"stage3_max_reuse_distance"
:
1e9
,
"stage3_gather_16bit_weights_on_model_save"
:
true
},
"fp16"
:
{
"enabled"
:
"auto"
,
"auto_cast"
:
true
,
"loss_scale"
:
0
,
"initial_scale_power"
:
32
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
},
"bf16"
:
{
"enabled"
:
"auto"
},
"optimizer"
:
{
"type"
:
"AdamW"
,
"params"
:
{
"lr"
:
"auto"
,
"betas"
:
[
0.9
,
0.999
],
"eps"
:
1e-8
,
"weight_decay"
:
"auto"
}
},
"gradient_accumulation_steps"
:
"auto"
,
"gradient_clipping"
:
"auto"
,
"steps_per_print"
:
2000
,
"train_batch_size"
:
"auto"
,
"train_micro_batch_size_per_gpu"
:
"auto"
,
"wall_clock_breakdown"
:
true
}
Prev
1
…
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment