Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
TS-MODELS-OPT
training
Autonomous-Driving-models
Commits
5ed76316
Commit
5ed76316
authored
Apr 08, 2026
by
雍大凯
Browse files
models add
parent
b2379236
Changes
290
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1931 additions
and
0 deletions
+1931
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/api_example/test_toolcall.py
...2.5-vl/llama-factory/scripts/api_example/test_toolcall.py
+77
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_baichuan2.py
...l/llama-factory/scripts/convert_ckpt/llamafy_baichuan2.py
+112
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_qwen.py
...2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_qwen.py
+165
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/tiny_llama4.py
...n2.5-vl/llama-factory/scripts/convert_ckpt/tiny_llama4.py
+39
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/eval_bleu_rouge.py
...r-hub/qwen2.5-vl/llama-factory/scripts/eval_bleu_rouge.py
+79
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/llama_pro.py
docker-hub/qwen2.5-vl/llama-factory/scripts/llama_pro.py
+129
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/loftq_init.py
docker-hub/qwen2.5-vl/llama-factory/scripts/loftq_init.py
+88
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/pissa_init.py
docker-hub/qwen2.5-vl/llama-factory/scripts/pissa_init.py
+86
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/qwen_omni_merge.py
...r-hub/qwen2.5-vl/llama-factory/scripts/qwen_omni_merge.py
+136
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_flops.py
.../qwen2.5-vl/llama-factory/scripts/stat_utils/cal_flops.py
+49
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_lr.py
...hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_lr.py
+98
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_mfu.py
...ub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_mfu.py
+161
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_ppl.py
...ub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_ppl.py
+134
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/length_cdf.py
...qwen2.5-vl/llama-factory/scripts/stat_utils/length_cdf.py
+69
-0
docker-hub/qwen2.5-vl/llama-factory/scripts/vllm_infer.py
docker-hub/qwen2.5-vl/llama-factory/scripts/vllm_infer.py
+199
-0
docker-hub/qwen2.5-vl/llama-factory/setup.py
docker-hub/qwen2.5-vl/llama-factory/setup.py
+113
-0
docker-hub/qwen2.5-vl/llama-factory/src/api.py
docker-hub/qwen2.5-vl/llama-factory/src/api.py
+33
-0
docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/__init__.py
...hub/qwen2.5-vl/llama-factory/src/llamafactory/__init__.py
+31
-0
docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/__init__.py
...qwen2.5-vl/llama-factory/src/llamafactory/api/__init__.py
+0
-0
docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/app.py
...-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/app.py
+133
-0
No files found.
docker-hub/qwen2.5-vl/llama-factory/scripts/api_example/test_toolcall.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
os
from
openai
import
OpenAI
from
transformers.utils.versions
import
require_version
require_version
(
"openai>=1.5.0"
,
"To fix: pip install openai>=1.5.0"
)
def calculate_gpa(grades: list[str], hours: list[int]) -> float:
    """Return the credit-hour-weighted grade point average, rounded to two decimals.

    Args:
        grades: Letter grades; only "A", "B" and "C" are recognised.
        hours: Credit hours, paired positionally with ``grades``.

    Returns:
        The weighted average, or ``0.0`` when the total number of credit hours
        is zero (for example when both lists are empty).

    Raises:
        KeyError: If a grade other than "A", "B" or "C" is supplied.
    """
    grade_to_score = {"A": 4, "B": 3, "C": 2}
    total_score, total_hour = 0, 0
    for grade, hour in zip(grades, hours):
        total_score += grade_to_score[grade] * hour
        total_hour += hour

    # Without this guard an empty (or all-zero-hour) request divides by zero.
    if total_hour == 0:
        return 0.0

    return round(total_score / total_hour, 2)
def main():
    """Run one tool-calling round trip against a local OpenAI-compatible endpoint.

    The API key and port come from the ``API_KEY`` and ``API_PORT`` environment
    variables (defaults ``"0"`` and ``8000``). The model is asked a GPA question,
    the function call it returns is executed locally through ``calculate_gpa``,
    and the result is sent back so the model can phrase the final answer.

    Raises:
        ValueError: If the first response carries no tool calls.
    """
    api_key = os.getenv("API_KEY", "0")
    api_port = os.getenv("API_PORT", 8000)
    client = OpenAI(api_key=f"{api_key}", base_url=f"http://localhost:{api_port}/v1")

    # JSON schema describing the arguments accepted by `calculate_gpa`.
    gpa_parameters = {
        "type": "object",
        "properties": {
            "grades": {"type": "array", "items": {"type": "string"}, "description": "The grades"},
            "hours": {"type": "array", "items": {"type": "integer"}, "description": "The credit hours"},
        },
        "required": ["grades", "hours"],
    }
    gpa_function = {
        "name": "calculate_gpa",
        "description": "Calculate the Grade Point Average (GPA) based on grades and credit hours",
        "parameters": gpa_parameters,
    }
    tools = [{"type": "function", "function": gpa_function}]
    tool_map = {"calculate_gpa": calculate_gpa}

    messages = [{"role": "user", "content": "My grades are A, A, B, and C. The credit hours are 3, 4, 3, and 2."}]

    # First turn: the model is expected to answer with a function call.
    first_reply = client.chat.completions.create(messages=messages, model="test", tools=tools).choices[0].message
    if first_reply.tool_calls is None:
        raise ValueError("Cannot retrieve function call from the response.")

    messages.append(first_reply)
    tool_call = first_reply.tool_calls[0].function
    print(tool_call)
    # Function(arguments='{"grades": ["A", "A", "B", "C"], "hours": [3, 4, 3, 2]}', name='calculate_gpa')

    # Execute the requested function locally and hand its result back as a tool message.
    name = tool_call.name
    arguments = json.loads(tool_call.arguments)
    tool_result = tool_map[name](**arguments)
    tool_message = {"role": "tool", "content": json.dumps({"gpa": tool_result}, ensure_ascii=False)}
    messages.append(tool_message)

    # Second turn: the model turns the tool result into a natural-language answer.
    final_reply = client.chat.completions.create(messages=messages, model="test", tools=tools)
    print(final_reply.choices[0].message.content)
    # Based on the grades and credit hours you provided, your Grade Point Average (GPA) is 3.42.
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_baichuan2.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
os
from
collections
import
OrderedDict
from
typing
import
Any
import
fire
import
torch
from
huggingface_hub
import
split_torch_state_dict_into_shards
from
safetensors.torch
import
save_file
from
tqdm
import
tqdm
from
transformers.modeling_utils
import
SAFE_WEIGHTS_INDEX_NAME
,
SAFE_WEIGHTS_NAME
,
WEIGHTS_INDEX_NAME
,
WEIGHTS_NAME
# File name of the model config that `save_config` reads from the input directory
# and writes to the output directory.
CONFIG_NAME = "config.json"
def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool):
    """Load Baichuan2 ``.bin`` shards, rename them to the LLaMA layout and save them sharded.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.bin`` weight files.
        output_dir: Directory the converted shards and, when sharded, the index file are written to.
        shard_size: Maximum shard size forwarded to ``split_torch_state_dict_into_shards``.
        save_safetensors: Write safetensors files when true, ``torch.save`` files otherwise.
    """
    source_weights: dict[str, torch.Tensor] = OrderedDict()
    for entry in tqdm(os.listdir(input_dir), desc="Load weights"):
        entry_path = os.path.join(input_dir, entry)
        if not (os.path.isfile(entry_path) and entry.endswith(".bin")):
            continue

        source_weights.update(torch.load(entry_path, map_location="cpu", weights_only=True))

    converted: dict[str, torch.Tensor] = OrderedDict()
    for name, tensor in tqdm(source_weights.items(), desc="Convert format"):
        if "W_pack" in name:
            # The packed tensor is cut into thirds along dim 0: query, key, then value rows.
            width = tensor.size(0) // 3
            row_ranges = (
                ("q_proj", slice(None, width)),
                ("k_proj", slice(width, 2 * width)),
                ("v_proj", slice(2 * width, None)),
            )
            for proj, rows in row_ranges:
                converted[name.replace("W_pack", proj)] = tensor[rows, :]
        elif "lm_head" in name:
            # The output head is stored normalized.
            converted[name] = torch.nn.functional.normalize(tensor)
        else:
            converted[name] = tensor

    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
    # Insert a "{suffix}" placeholder in front of the extension so shards get distinct names.
    pattern = weights_name
    for extension in (".bin", ".safetensors"):
        pattern = pattern.replace(extension, "{suffix}" + extension)

    plan = split_torch_state_dict_into_shards(converted, filename_pattern=pattern, max_shard_size=shard_size)
    for shard_name, tensor_names in tqdm(plan.filename_to_tensors.items(), desc="Save weights"):
        shard_path = os.path.join(output_dir, shard_name)
        shard = {tensor_name: converted[tensor_name].contiguous() for tensor_name in tensor_names}
        if save_safetensors:
            save_file(shard, shard_path, metadata={"format": "pt"})
        else:
            torch.save(shard, shard_path)

    if not plan.is_sharded:
        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")
        return

    # Several shards were written: also emit the index mapping each tensor to its file.
    index = {
        "metadata": plan.metadata,
        "weight_map": plan.tensor_to_filename,
    }
    index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
    with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as index_file:
        json.dump(index, index_file, indent=2, sort_keys=True)

    print(f"Model weights saved in {output_dir}.")
def save_config(input_dir: str, output_dir: str):
    """Rewrite the source ``config.json`` so the checkpoint is declared as a LLaMA model.

    Args:
        input_dir: Directory containing the original config file.
        output_dir: Directory that receives the rewritten config file.
    """
    source_path = os.path.join(input_dir, CONFIG_NAME)
    with open(source_path, encoding="utf-8") as source_file:
        config = json.load(source_file)

    config["architectures"] = ["LlamaForCausalLM"]
    # Drop the entries that point at the original custom model/tokenizer classes, if present.
    for stale_key in ("auto_map", "tokenizer_class"):
        config.pop(stale_key, None)

    config["model_type"] = "llama"

    target_path = os.path.join(output_dir, CONFIG_NAME)
    with open(target_path, "w", encoding="utf-8") as target_file:
        json.dump(config, target_file, indent=2)

    print(f"Model config saved in {os.path.join(output_dir, CONFIG_NAME)}")
def llamafy_baichuan2(
    input_dir: str,
    output_dir: str,
    shard_size: str = "2GB",
    save_safetensors: bool = True,
):
    r"""Convert the Baichuan2-7B model in the same format as LLaMA2-7B.

    Usage: python llamafy_baichuan2.py --input_dir input --output_dir output
    Converted model: https://huggingface.co/hiyouga/Baichuan2-7B-Base-LLaMAfied

    Args:
        input_dir: Directory holding the original weights and config.
        output_dir: Directory to create for the converted model; it must not exist yet.
        shard_size: Maximum size of each saved weight shard.
        save_safetensors: Save safetensors files when true, ``.bin`` files otherwise.

    Raises:
        FileExistsError: If ``output_dir`` already exists.
    """
    try:
        os.makedirs(output_dir, exist_ok=False)
    except FileExistsError as e:
        # Re-raise a real exception: `raise print(...)` would raise a TypeError because print() returns None.
        # Other OS errors (e.g. permissions) propagate unchanged instead of being mislabelled.
        raise FileExistsError(f"Output dir already exists: {output_dir}") from e

    save_weight(input_dir, output_dir, shard_size, save_safetensors)
    save_config(input_dir, output_dir)
# Expose `llamafy_baichuan2` as a command-line interface via python-fire when run as a script.
if __name__ == "__main__":
    fire.Fire(llamafy_baichuan2)
docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/llamafy_qwen.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
os
from
collections
import
OrderedDict
from
typing
import
Any
import
fire
import
torch
from
huggingface_hub
import
split_torch_state_dict_into_shards
from
safetensors
import
safe_open
from
safetensors.torch
import
save_file
from
tqdm
import
tqdm
from
transformers.modeling_utils
import
SAFE_WEIGHTS_INDEX_NAME
,
SAFE_WEIGHTS_NAME
,
WEIGHTS_INDEX_NAME
,
WEIGHTS_NAME
from
transformers.utils
import
check_min_version
# Import-time guard: any failure of the transformers version check is turned into a
# ValueError carrying an upgrade hint.
try:
    check_min_version("4.34.0")
except Exception:
    raise ValueError("Please upgrade `transformers` to 4.34.0")
# File name of the model config that `save_config` reads from the input directory
# and writes to the output directory.
CONFIG_NAME = "config.json"
def save_weight(input_dir: str, output_dir: str, shard_size: str, save_safetensors: bool) -> str:
    """Load Qwen ``.safetensors`` shards, rename them to the LLaMA layout and save them sharded.

    Args:
        input_dir: Directory scanned (non-recursively) for ``.safetensors`` weight files.
        output_dir: Directory the converted shards and, when sharded, the index file are written to.
        shard_size: Maximum shard size forwarded to ``split_torch_state_dict_into_shards``.
        save_safetensors: Write safetensors files when true, ``torch.save`` files otherwise.

    Returns:
        The dtype of the first loaded tensor as a string without the ``torch.`` prefix.

    Raises:
        KeyError: If a tensor name matches none of the known Qwen patterns.
    """
    source_weights: dict[str, torch.Tensor] = OrderedDict()
    for entry in tqdm(os.listdir(input_dir), desc="Load weights"):
        entry_path = os.path.join(input_dir, entry)
        if not (os.path.isfile(entry_path) and entry.endswith(".safetensors")):
            continue

        with safe_open(entry_path, framework="pt", device="cpu") as reader:
            for tensor_name in reader.keys():
                source_weights[tensor_name] = reader.get_tensor(tensor_name)

    # One-to-one substring renames, tried in this order after the attention cases.
    simple_renames = (
        ("ln_1", "input_layernorm"),
        ("ln_2", "post_attention_layernorm"),
        ("mlp.w1", "mlp.up_proj"),
        ("mlp.w2", "mlp.gate_proj"),
        ("mlp.c_proj", "mlp.down_proj"),
    )

    converted: dict[str, torch.Tensor] = OrderedDict()
    torch_dtype = None
    for name, tensor in tqdm(source_weights.items(), desc="Convert format"):
        if torch_dtype is None:
            torch_dtype = tensor.dtype

        if "wte" in name:
            converted["model.embed_tokens.weight"] = tensor
            continue

        if "ln_f" in name:
            converted["model.norm.weight"] = tensor
            continue

        name = name.replace("transformer.h", "model.layers")
        if "attn.c_attn" in name:
            # The fused tensor is cut into thirds along dim 0: query, key, then value rows.
            width = tensor.size(0) // 3
            row_ranges = (
                ("self_attn.q_proj", slice(None, width)),
                ("self_attn.k_proj", slice(width, 2 * width)),
                ("self_attn.v_proj", slice(2 * width, None)),
            )
            for proj, rows in row_ranges:
                converted[name.replace("attn.c_attn", proj)] = tensor[rows, ...]
            continue

        if "attn.c_proj" in name:
            converted[name.replace("attn.c_proj", "self_attn.o_proj")] = tensor
            # Also emit an all-zero output-projection bias next to the weight.
            bias_name = name.replace("attn.c_proj.weight", "self_attn.o_proj.bias")
            converted[bias_name] = torch.zeros_like(tensor[:, 0]).squeeze()
            continue

        for old_part, new_part in simple_renames:
            if old_part in name:
                converted[name.replace(old_part, new_part)] = tensor
                break
        else:
            if "lm_head" not in name:
                raise KeyError(f"Unable to process key {name}")

            converted[name] = tensor

    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
    # Insert a "{suffix}" placeholder in front of the extension so shards get distinct names.
    pattern = weights_name
    for extension in (".bin", ".safetensors"):
        pattern = pattern.replace(extension, "{suffix}" + extension)

    plan = split_torch_state_dict_into_shards(converted, filename_pattern=pattern, max_shard_size=shard_size)
    for shard_name, tensor_names in tqdm(plan.filename_to_tensors.items(), desc="Save weights"):
        shard_path = os.path.join(output_dir, shard_name)
        shard = {tensor_name: converted[tensor_name].contiguous() for tensor_name in tensor_names}
        if save_safetensors:
            save_file(shard, shard_path, metadata={"format": "pt"})
        else:
            torch.save(shard, shard_path)

    if plan.is_sharded:
        # Several shards were written: also emit the index mapping each tensor to its file.
        index = {
            "metadata": plan.metadata,
            "weight_map": plan.tensor_to_filename,
        }
        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as index_file:
            json.dump(index, index_file, indent=2, sort_keys=True)

        print(f"Model weights saved in {output_dir}.")
    else:
        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")

    dtype_text = str(torch_dtype)
    return dtype_text.replace("torch.", "")
def save_config(input_dir: str, output_dir: str, torch_dtype: str):
    """Build a LLaMA-style ``config.json`` from the Qwen config and write it out.

    Args:
        input_dir: Directory containing the original Qwen config file.
        output_dir: Directory that receives the generated config file.
        torch_dtype: Value stored under the ``torch_dtype`` key.

    Raises:
        KeyError: If the Qwen config lacks one of the fields read below.
    """
    with open(os.path.join(input_dir, CONFIG_NAME), encoding="utf-8") as source_file:
        qwen = json.load(source_file)

    # Entries are listed in the order they should appear in the written JSON.
    llama_config = OrderedDict(
        [
            ("architectures", ["LlamaForCausalLM"]),
            ("hidden_act", "silu"),
            ("hidden_size", qwen["hidden_size"]),
            ("initializer_range", qwen["initializer_range"]),
            ("intermediate_size", qwen["intermediate_size"] // 2),
            ("max_position_embeddings", qwen["max_position_embeddings"]),
            ("model_type", "llama"),
            ("num_attention_heads", qwen["num_attention_heads"]),
            ("num_hidden_layers", qwen["num_hidden_layers"]),
            ("num_key_value_heads", qwen["hidden_size"] // qwen["kv_channels"]),
            ("pretraining_tp", 1),
            ("rms_norm_eps", qwen["layer_norm_epsilon"]),
            ("rope_scaling", None),
            ("tie_word_embeddings", qwen["tie_word_embeddings"]),
            ("torch_dtype", torch_dtype),
            ("transformers_version", "4.34.0"),
            ("use_cache", True),
            ("vocab_size", qwen["vocab_size"]),
            ("attention_bias", True),
        ]
    )

    target_path = os.path.join(output_dir, CONFIG_NAME)
    with open(target_path, "w", encoding="utf-8") as target_file:
        json.dump(llama_config, target_file, indent=2)

    print(f"Model config saved in {os.path.join(output_dir, CONFIG_NAME)}")
def llamafy_qwen(
    input_dir: str,
    output_dir: str,
    shard_size: str = "2GB",
    save_safetensors: bool = False,
):
    r"""Convert the Qwen models in the same format as LLaMA2.

    Usage: python llamafy_qwen.py --input_dir input --output_dir output
    Converted model: https://huggingface.co/hiyouga/Qwen-14B-Chat-LLaMAfied

    Args:
        input_dir: Directory holding the original weights and config.
        output_dir: Directory to create for the converted model; it must not exist yet.
        shard_size: Maximum size of each saved weight shard.
        save_safetensors: Save safetensors files when true, ``.bin`` files otherwise.

    Raises:
        FileExistsError: If ``output_dir`` already exists.
    """
    try:
        os.makedirs(output_dir, exist_ok=False)
    except FileExistsError as e:
        # Re-raise a real exception: `raise print(...)` would raise a TypeError because print() returns None.
        # Other OS errors (e.g. permissions) propagate unchanged instead of being mislabelled.
        raise FileExistsError(f"Output dir already exists: {output_dir}") from e

    torch_dtype = save_weight(input_dir, output_dir, shard_size, save_safetensors)
    save_config(input_dir, output_dir, torch_dtype)
# Expose `llamafy_qwen` as a command-line interface via python-fire when run as a script.
if __name__ == "__main__":
    fire.Fire(llamafy_qwen)
docker-hub/qwen2.5-vl/llama-factory/scripts/convert_ckpt/tiny_llama4.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
transformers
import
Llama4Config
,
Llama4ForConditionalGeneration
,
Llama4TextConfig
,
Llama4VisionConfig
if __name__ == "__main__":
    # Build a Llama4 vision+text model from small hand-picked sizes and save it to ./tiny-llama4.
    text_hidden_size = 512
    text_num_heads = 8

    vision_kwargs = {
        "hidden_size": 1408,
        "image_size": 336,
        "intermediate_size": 5632,
        "num_attention_heads": 16,
        "num_hidden_layers": 4,
        "vision_output_dim": 4096,
    }
    text_kwargs = {
        "hidden_size": text_hidden_size,
        "intermediate_size": 1024,
        "intermediate_size_mlp": 1024,
        "num_hidden_layers": 4,
        "num_attention_heads": text_num_heads,
        "num_key_value_heads": 2,
        "head_dim": text_hidden_size // text_num_heads,
        "num_local_experts": 2,
    }

    config = Llama4Config(
        vision_config=Llama4VisionConfig(**vision_kwargs),
        text_config=Llama4TextConfig(**text_kwargs),
    )
    Llama4ForConditionalGeneration._from_config(config).save_pretrained("tiny-llama4")
docker-hub/qwen2.5-vl/llama-factory/scripts/eval_bleu_rouge.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
logging
import
time
import
fire
from
datasets
import
load_dataset
# Import the metric dependencies and set jieba up once at import time.
# If any of them is missing, print an install hint and re-raise the ImportError.
try:
    import jieba  # type: ignore
    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu  # type: ignore
    from rouge_chinese import Rouge  # type: ignore

    jieba.setLogLevel(logging.CRITICAL)  # pass CRITICAL as jieba's log level
    jieba.initialize()
except ImportError:
    print("Please install llamafactory with `pip install -e .[metrics]`.")
    raise
def compute_metrics(sample):
    """Score one prediction against its label with ROUGE-1/2/L and BLEU.

    Args:
        sample: Mapping with a ``"predict"`` and a ``"label"`` entry (presumably
            strings; they are passed to ``jieba.cut`` and ``list``).

    Returns:
        A dict with the keys of the ROUGE result plus ``"bleu-4"``, each value
        scaled by 100 and rounded to four decimals.
    """
    predicted_tokens = list(jieba.cut(sample["predict"]))
    label_tokens = list(jieba.cut(sample["label"]))

    # BLEU is computed on `list(...)` of the raw fields, not on the jieba tokens.
    bleu_score = sentence_bleu(
        [list(sample["label"])],
        list(sample["predict"]),
        smoothing_function=SmoothingFunction().method3,
    )

    if not " ".join(predicted_tokens).split() or not " ".join(label_tokens).split():
        # Either side has no non-whitespace token: report zero ROUGE instead of calling Rouge.
        rouge_result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}}
    else:
        scorer = Rouge()
        rouge_result = scorer.get_scores(" ".join(predicted_tokens), " ".join(label_tokens))[0]

    metric_result = {metric: round(parts["f"] * 100, 4) for metric, parts in rouge_result.items()}
    metric_result["bleu-4"] = round(bleu_score * 100, 4)
    return metric_result
def main(filename: str):
    """Average the per-sample metrics of a JSON prediction file and save them.

    Args:
        filename: Path handed to ``load_dataset("json", ...)``; every record is
            scored with ``compute_metrics``.

    The averages are printed per metric and written to ``predictions_score.json``
    in the current working directory.
    """
    start_time = time.time()
    predictions = load_dataset("json", data_files=filename, split="train")
    scored = predictions.map(compute_metrics, num_proc=8, remove_columns=predictions.column_names)
    score_dict = scored.to_dict()

    average_score = {}
    for task in sorted(score_dict):
        scores = score_dict[task]
        mean_score = sum(scores) / len(scores)
        print(f"{task}: {mean_score:.4f}")
        average_score[task] = mean_score

    with open("predictions_score.json", "w", encoding="utf-8") as score_file:
        json.dump(average_score, score_file, indent=4)

    print(f"\nDone in {time.time() - start_time:.3f}s.\nScore file saved to predictions_score.json")
# Expose `main` as a command-line interface via python-fire when run as a script.
if __name__ == "__main__":
    fire.Fire(main)
docker-hub/qwen2.5-vl/llama-factory/scripts/llama_pro.py
0 → 100644
View file @
5ed76316
# Copyright 2025 Tencent Inc. and the LlamaFactory team.
#
# This code is inspired by the Tencent's LLaMA-Pro library.
# https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
os
from
collections
import
OrderedDict
from
typing
import
TYPE_CHECKING
import
fire
import
torch
from
huggingface_hub
import
split_torch_state_dict_into_shards
from
safetensors.torch
import
save_file
from
tqdm
import
tqdm
from
transformers
import
AutoConfig
,
AutoModelForCausalLM
,
AutoTokenizer
,
PreTrainedModel
from
transformers.modeling_utils
import
SAFE_WEIGHTS_INDEX_NAME
,
SAFE_WEIGHTS_NAME
,
WEIGHTS_INDEX_NAME
,
WEIGHTS_NAME
if
TYPE_CHECKING
:
from
transformers
import
PretrainedConfig
def change_name(name: str, old_index: int, new_index: int) -> str:
    """Return ``name`` with every ``.<old_index>.`` segment replaced by ``.<new_index>.``.

    The dots on both sides are part of the match, so index 3 does not touch ``.13.``.
    """
    old_segment = "." + format(old_index, "d") + "."
    new_segment = "." + format(new_index, "d") + "."
    return new_segment.join(name.split(old_segment))
def block_expansion(
    model_name_or_path: str,
    output_dir: str,
    num_expand: int,
    shard_size: str = "5GB",
    save_safetensors: bool = True,
):
    r"""Perform block expansion for LLaMA, Mistral, Qwen2 or Yi models.

    After every ``num_layers // num_expand`` original layers one extra layer is
    inserted. It clones the preceding layer's tensors, except that tensors whose
    name contains ``down_proj`` or ``o_proj`` are zero-filled.

    Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8

    Args:
        model_name_or_path: Model identifier or path passed to the ``from_pretrained`` loaders.
        output_dir: Directory that receives the config, tokenizer and expanded weights.
        num_expand: Number of layers to add; must divide the original layer count.
        shard_size: Maximum shard size forwarded to ``split_torch_state_dict_into_shards``.
        save_safetensors: Write safetensors files when true, ``torch.save`` files otherwise.

    Raises:
        ValueError: If the layer count is not divisible by ``num_expand``.
    """
    config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
    num_layers = config.num_hidden_layers
    if num_layers % num_expand != 0:
        raise ValueError(f"`num_layers` {num_layers} should be divisible by `num_expand` {num_expand}.")

    total_layers = num_layers + num_expand
    config.num_hidden_layers = total_layers
    config.save_pretrained(output_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    tokenizer.save_pretrained(output_dir)

    print(f"Expanding model of {num_layers} layers to {total_layers} layers.")
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path, torch_dtype="auto", device_map="cpu", trust_remote_code=True, low_cpu_mem_usage=True
    )
    assert isinstance(model, PreTrainedModel)  # type hint
    if save_safetensors and getattr(model.config, "tie_word_embeddings", False):
        del model.lm_head  # safetensors does not allow shared weights

    interval = num_layers // num_expand
    source_weights = model.state_dict()
    expanded: dict[str, torch.Tensor] = OrderedDict()
    next_index = 0
    for layer_idx in range(num_layers):
        marker = f".{layer_idx:d}."
        layer_items = [(key, tensor) for key, tensor in source_weights.items() if marker in key]

        # Carry the original layer over under its new (shifted) index.
        for key, tensor in layer_items:
            expanded[change_name(key, layer_idx, next_index)] = tensor

        print(f"Add layer {next_index} copied from layer {layer_idx}.")
        next_index += 1

        if (layer_idx + 1) % interval != 0:
            continue

        # End of a group: append one extra layer derived from the layer just copied.
        for key, tensor in layer_items:
            zero_filled = "down_proj" in key or "o_proj" in key
            new_tensor = torch.zeros_like(tensor) if zero_filled else torch.clone(tensor)
            expanded[change_name(key, layer_idx, next_index)] = new_tensor

        print(f"Add layer {next_index} expanded from layer {layer_idx}.")
        next_index += 1

    # Everything not written by the loop above is kept as is.
    for key, tensor in source_weights.items():
        expanded.setdefault(key, tensor)

    weights_name = SAFE_WEIGHTS_NAME if save_safetensors else WEIGHTS_NAME
    # Insert a "{suffix}" placeholder in front of the extension so shards get distinct names.
    pattern = weights_name
    for extension in (".bin", ".safetensors"):
        pattern = pattern.replace(extension, "{suffix}" + extension)

    plan = split_torch_state_dict_into_shards(expanded, filename_pattern=pattern, max_shard_size=shard_size)
    for shard_name, tensor_names in tqdm(plan.filename_to_tensors.items(), desc="Save weights"):
        shard_path = os.path.join(output_dir, shard_name)
        shard = {tensor_name: expanded[tensor_name].contiguous() for tensor_name in tensor_names}
        if save_safetensors:
            save_file(shard, shard_path, metadata={"format": "pt"})
        else:
            torch.save(shard, shard_path)

    if plan.is_sharded:
        # Several shards were written: also emit the index mapping each tensor to its file.
        index = {
            "metadata": plan.metadata,
            "weight_map": plan.tensor_to_filename,
        }
        index_name = SAFE_WEIGHTS_INDEX_NAME if save_safetensors else WEIGHTS_INDEX_NAME
        with open(os.path.join(output_dir, index_name), "w", encoding="utf-8") as index_file:
            json.dump(index, index_file, indent=2, sort_keys=True)

        print(f"Model weights saved in {output_dir}.")
    else:
        print(f"Model weights saved in {os.path.join(output_dir, weights_name)}.")

    usage_hint = (
        "- Fine-tune this model with:",
        f"model_name_or_path: {output_dir}",
        "finetuning_type: freeze",
        f"freeze_trainable_layers: {num_expand}",
        "use_llama_pro: true",
    )
    for hint_line in usage_hint:
        print(hint_line)
# Expose `block_expansion` as a command-line interface via python-fire when run as a script.
if __name__ == "__main__":
    fire.Fire(block_expansion)
docker-hub/qwen2.5-vl/llama-factory/scripts/loftq_init.py
0 → 100644
View file @
5ed76316
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
#
# This code is based on the HuggingFace's PEFT library.
# https://github.com/huggingface/peft/blob/v0.10.0/examples/loftq_finetuning/quantize_save_load.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
from
typing
import
TYPE_CHECKING
import
fire
from
peft
import
LoftQConfig
,
LoraConfig
,
TaskType
,
get_peft_model
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
if
TYPE_CHECKING
:
from
transformers
import
PreTrainedModel
def quantize_loftq(
    model_name_or_path: str,
    output_dir: str,
    loftq_bits: int = 4,
    loftq_iter: int = 4,
    lora_alpha: int = None,
    lora_rank: int = 16,
    lora_dropout: float = 0,
    lora_target: tuple = ("q_proj", "v_proj"),
    save_safetensors: bool = True,
):
    r"""Initialize LoRA weights with LoRA-fine-tuning-aware Quantization (LoftQ).

    Usage: python loftq_init.py --model_name_or_path path_to_model --output_dir output_dir

    Args:
        model_name_or_path: Model identifier or path passed to the ``from_pretrained`` loaders.
        output_dir: Directory for the base model and tokenizer; the adapter goes to ``<output_dir>/loftq_init``.
        loftq_bits: Forwarded to ``LoftQConfig(loftq_bits=...)``.
        loftq_iter: Forwarded to ``LoftQConfig(loftq_iter=...)``.
        lora_alpha: LoRA alpha; ``lora_rank * 2`` is used when it is None.
        lora_rank: LoRA rank (``r``).
        lora_dropout: LoRA dropout.
        lora_target: Target module names, either a sequence or one comma-separated string.
        save_safetensors: Passed as ``safe_serialization`` when saving adapter and base model.
    """
    if isinstance(lora_target, str):
        # A single comma-separated string is split into individual module names.
        lora_target = [part.strip() for part in lora_target.split(",")]

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto")

    loftq_config = LoftQConfig(loftq_bits=loftq_bits, loftq_iter=loftq_iter)
    if lora_alpha is not None:
        effective_alpha = lora_alpha
    else:
        effective_alpha = lora_rank * 2

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=True,
        r=lora_rank,
        lora_alpha=effective_alpha,
        lora_dropout=lora_dropout,
        target_modules=lora_target,
        init_lora_weights="loftq",
        loftq_config=loftq_config,
    )

    # Init LoftQ model
    print("Initializing LoftQ weights, it may be take several minutes, wait patiently.")
    peft_model = get_peft_model(model, lora_config)
    loftq_dir = os.path.join(output_dir, "loftq_init")

    # Save LoftQ model
    adapter_config = peft_model.peft_config["default"]
    adapter_config.base_model_name_or_path = os.path.abspath(output_dir)
    adapter_config.init_lora_weights = True  # don't apply loftq again
    peft_model.save_pretrained(loftq_dir, safe_serialization=save_safetensors)
    print(f"Adapter weights saved in {loftq_dir}")

    # Save base model
    base_model = peft_model.unload()
    base_model.save_pretrained(output_dir, safe_serialization=save_safetensors)
    tokenizer.save_pretrained(output_dir)
    print(f"Model weights saved in {output_dir}")

    usage_hint = (
        "- Fine-tune this model with:",
        f"model_name_or_path: {output_dir}",
        f"adapter_name_or_path: {loftq_dir}",
        "finetuning_type: lora",
        f"quantization_bit: {loftq_bits}",
    )
    for hint_line in usage_hint:
        print(hint_line)
# Expose `quantize_loftq` as a command-line interface via python-fire when run as a script.
if __name__ == "__main__":
    fire.Fire(quantize_loftq)
docker-hub/qwen2.5-vl/llama-factory/scripts/pissa_init.py
0 → 100644
View file @
5ed76316
# Copyright 2025 HuggingFace Inc. and the LlamaFactory team.
#
# This code is based on the HuggingFace's PEFT library.
# https://github.com/huggingface/peft/blob/v0.11.0/examples/pissa_finetuning/preprocess.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
from
typing
import
TYPE_CHECKING
import
fire
from
peft
import
LoraConfig
,
TaskType
,
get_peft_model
from
transformers
import
AutoModelForCausalLM
,
AutoTokenizer
if
TYPE_CHECKING
:
from
transformers
import
PreTrainedModel
def quantize_pissa(
    model_name_or_path: str,
    output_dir: str,
    pissa_iter: int = 16,
    lora_alpha: int = None,
    lora_rank: int = 16,
    lora_dropout: float = 0,
    lora_target: tuple = ("q_proj", "v_proj"),
    save_safetensors: bool = True,
):
    r"""Create a PiSSA-initialized LoRA adapter and save it next to its base model.

    The adapter is written to ``<output_dir>/pissa_init``; the model returned by
    ``peft_model.unload()`` and the tokenizer are written to ``output_dir``.
    Finally the settings needed to fine-tune from these files are printed.

    Args:
        model_name_or_path: Model to load (tokenizer and causal LM).
        output_dir: Destination directory for the base model and tokenizer.
        pissa_iter: ``-1`` selects ``"pissa"``; any other value selects
            ``"pissa_niter_<pissa_iter>"`` as ``init_lora_weights``.
        lora_alpha: LoRA alpha; ``None`` means ``lora_rank * 2``.
        lora_rank: LoRA rank ``r``.
        lora_dropout: LoRA dropout.
        lora_target: Target module names, or a comma-separated string of them.
        save_safetensors: Forwarded as ``safe_serialization`` when saving.

    Usage: python pissa_init.py --model_name_or_path path_to_model --output_dir output_dir
    """
    # A comma-separated string (e.g. typed on the command line) becomes a list of names.
    if isinstance(lora_target, str):
        lora_target = [part.strip() for part in lora_target.split(",")]

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype="auto")

    if lora_alpha is None:
        effective_alpha = lora_rank * 2
    else:
        effective_alpha = lora_alpha

    if pissa_iter == -1:
        init_method = "pissa"
    else:
        init_method = f"pissa_niter_{pissa_iter}"

    # Init PiSSA model
    peft_model = get_peft_model(
        model,
        LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=lora_rank,
            lora_alpha=effective_alpha,
            lora_dropout=lora_dropout,
            target_modules=lora_target,
            init_lora_weights=init_method,
        ),
    )
    adapter_dir = os.path.join(output_dir, "pissa_init")

    # Save PiSSA model
    adapter_config = peft_model.peft_config["default"]
    adapter_config.base_model_name_or_path = os.path.abspath(output_dir)
    adapter_config.init_lora_weights = True  # don't apply pissa again
    peft_model.save_pretrained(adapter_dir, safe_serialization=save_safetensors)
    print(f"Adapter weights saved in {adapter_dir}")

    # Save base model
    base_model: "PreTrainedModel" = peft_model.unload()
    base_model.save_pretrained(output_dir, safe_serialization=save_safetensors)
    tokenizer.save_pretrained(output_dir)
    print(f"Model weights saved in {output_dir}")

    # Print the settings to use when fine-tuning from the saved files.
    hints = (
        "- Fine-tune this model with:",
        f"model_name_or_path: {output_dir}",
        f"adapter_name_or_path: {adapter_dir}",
        "finetuning_type: lora",
        "pissa_init: false",
        "pissa_convert: true",
        "- and optionally with:",
        "quantization_bit: 4",
    )
    for hint in hints:
        print(hint)
# Script entry point: hand quantize_pissa to python-fire so its parameters become command-line flags.
if __name__ == "__main__":
    fire.Fire(quantize_pissa)
docker-hub/qwen2.5-vl/llama-factory/scripts/qwen_omni_merge.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Why do we need this script for qwen_omni?

The qwen_omni model consists of two parts:
1. [Thinker]: [audio_encoder, vision_encoder, LLM backbone], which our repository supports for post-training.
2. [Talker]: [audio_decoder, wave_model], which cannot be post-trained without a specific tokenizer.
During post-training only the [Thinker] part is trained; the [Talker] part is dropped.
So, to get the complete model, we need to merge the [Talker] part back with the [Thinker] part.
LoRA mode: [Thinker + LoRA weights] + [Original Talker] -> [Omni model]
Full mode: [Thinker] + [Original Talker] -> [Omni model]
The processor is saved from the trained model rather than from the original model.
"""
import
os
import
shutil
import
fire
from
peft
import
PeftModel
from
transformers
import
(
AutoProcessor
,
Qwen2_5OmniForConditionalGeneration
,
# type: ignore
Qwen2_5OmniThinkerForConditionalGeneration
,
)
def merge_lora(
    base_model_path: str,
    lora_checkpoint_path: str,
    extra_file: str = "spk_dict.pt",
    submodule_name: str = "thinker",
    save_path: str = "./merged_model_checkpoint",
):
    """Merge LoRA weights into one submodule of the original model and save the result.

    The full model is loaded on CPU, the LoRA checkpoint is applied to the chosen
    submodule and merged into it, and the submodule is put back on the model. The
    model and the processor (loaded from the LoRA checkpoint directory) are saved
    to ``save_path``; ``extra_file`` is copied over from the base model directory
    when it exists there.

    Args:
        base_model_path (str): Path to the original model directory.
        lora_checkpoint_path (str): Path to the directory containing LoRA weights.
        extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
        submodule_name (str): Name of the submodule to merge (default: "thinker").
        save_path (str): Directory where the merged model and configurations will be saved.

    Raises:
        AttributeError: If the loaded model has no attribute named ``submodule_name``.
    """
    # Step 1: load the complete original model.
    full_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
        base_model_path, torch_dtype="auto", device_map="cpu"
    )
    print("Successfully loaded the original model.")

    # Step 2: pick out the submodule that receives the LoRA weights (e.g. model.thinker).
    if not hasattr(full_model, submodule_name):
        raise AttributeError(f"The model does not have a submodule named '{submodule_name}'.")

    target_module = getattr(full_model, submodule_name)
    print(f"Successfully extracted submodule: {submodule_name}.")

    # Step 3: attach the LoRA weights to that submodule and load the processor.
    peft_wrapped = PeftModel.from_pretrained(target_module, lora_checkpoint_path)
    processor = AutoProcessor.from_pretrained(lora_checkpoint_path)
    print("LoRA weights and processor loaded successfully.")

    # Step 4: merge the LoRA weights into the submodule and drop the LoRA modules.
    merged_module = peft_wrapped.merge_and_unload()
    print("LoRA weights merged successfully.")

    # Step 5: put the merged submodule back on the full model.
    setattr(full_model, submodule_name, merged_module)

    # Step 6: save the full model and the processor.
    full_model.save_pretrained(save_path)
    processor.save_pretrained(save_path)
    print(f"Merged model and tokenizer saved to {save_path}.")

    # Step 7: carry the extra file over from the base model directory if it is there.
    extra_src = os.path.join(base_model_path, extra_file)
    extra_dst = os.path.join(save_path, extra_file)
    if not os.path.exists(extra_src):
        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
        return

    shutil.copy(extra_src, extra_dst)
    print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
def save_full_model(
    saved_thinker_path: str,
    base_model_path: str,
    save_path: str = "./merged_model_checkpoint",
    extra_file: str = "spk_dict.pt",
):
    """Put a saved thinker module into the original model and save the complete model.

    Both the thinker and the original model are loaded on CPU. The processor is
    loaded from ``saved_thinker_path`` and saved alongside the model; ``extra_file``
    is copied over from the base model directory when it exists there.

    Args:
        saved_thinker_path (str): Path to the saved thinker weights.
        base_model_path (str): Directory path of the original model.
        save_path (str): Directory where the merged model and configurations will be saved.
        extra_file (str): Name of the extra file to be copied (default: "spk_dict.pt").
    """
    load_kwargs = {"torch_dtype": "auto", "device_map": "cpu"}

    # Step 1: load the trained thinker, then the original model, and swap the thinker in.
    trained_thinker = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(saved_thinker_path, **load_kwargs)
    omni_model = Qwen2_5OmniForConditionalGeneration.from_pretrained(base_model_path, **load_kwargs)
    omni_model.thinker = trained_thinker

    # Step 2: save the complete model together with the processor of the trained thinker.
    processor = AutoProcessor.from_pretrained(saved_thinker_path)
    omni_model.save_pretrained(save_path)
    processor.save_pretrained(save_path)
    print(f"Merged model and processor saved to {save_path}.")

    # Step 3: carry the extra file over from the base model directory if it is there.
    extra_src = os.path.join(base_model_path, extra_file)
    extra_dst = os.path.join(save_path, extra_file)
    if not os.path.exists(extra_src):
        print(f"File '{extra_file}' not found in {base_model_path}, skipping copy.")
        return

    shutil.copy(extra_src, extra_dst)
    print(f"File '{extra_file}' copied from {base_model_path} to {save_path}.")
# Script entry point: python-fire exposes two sub-commands, "save_full" and "merge_lora",
# mapped to save_full_model and merge_lora respectively.
if __name__ == "__main__":
    fire.Fire({"save_full": save_full_model, "merge_lora": merge_lora})
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_flops.py
0 → 100644
View file @
5ed76316
# Copyright 2025 Microsoft Corporation and the LlamaFactory team.
#
# This code is inspired by the Microsoft's DeepSpeed library.
# https://www.deepspeed.ai/tutorials/flops-profiler/
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
fire
import
torch
from
deepspeed.accelerator
import
get_accelerator
# type: ignore
from
deepspeed.profiling.flops_profiler
import
get_model_profile
# type: ignore
from
llamafactory.chat
import
ChatModel
def calculate_flops(
    model_name_or_path: str,
    batch_size: int = 1,
    seq_length: int = 512,
    flash_attn: str = "auto",
):
    r"""Profile a pre-trained model with the DeepSpeed flops profiler and print the totals.

    A ``ChatModel`` is built with the "empty" template, a dummy batch of ones with
    shape ``(batch_size, seq_length)`` is used as both ``input_ids`` and ``labels``,
    and the FLOPs, MACs and parameter count reported by ``get_model_profile`` are printed.

    Usage: python cal_flops.py --model_name_or_path path_to_model --batch_size 1 --seq_length 512
    """
    with get_accelerator().device(0):
        chat_model = ChatModel(
            {"model_name_or_path": model_name_or_path, "template": "empty", "flash_attn": flash_attn}
        )
        model = chat_model.engine.model

        # Dummy token ids placed on the same device as the model.
        dummy_ids = torch.ones((batch_size, seq_length), dtype=torch.long, device=model.device)
        profile_inputs = {"input_ids": dummy_ids, "labels": dummy_ids.clone()}

        flops, macs, params = get_model_profile(model, kwargs=profile_inputs, print_profile=True, detailed=True)
        for label, value in (("FLOPs:", flops), ("MACs:", macs), ("Params:", params)):
            print(label, value)
# Script entry point: hand calculate_flops to python-fire so its parameters become command-line flags.
if __name__ == "__main__":
    fire.Fire(calculate_flops)
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_lr.py
0 → 100644
View file @
5ed76316
# Copyright 2025 imoneoi and the LlamaFactory team.
#
# This code is inspired by the imoneoi's OpenChat library.
# https://github.com/imoneoi/openchat/blob/3.6.0/ochat/training_deepspeed/train.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
math
from
typing
import
Literal
import
fire
import
torch
from
torch.utils.data
import
DataLoader
from
tqdm
import
tqdm
from
transformers
import
DataCollatorForLanguageModeling
from
llamafactory.data
import
MultiModalDataCollatorForSeq2Seq
,
get_dataset
,
get_template_and_fix_tokenizer
from
llamafactory.extras.constants
import
IGNORE_INDEX
from
llamafactory.hparams
import
get_train_args
from
llamafactory.model
import
load_tokenizer
# Reference learning rate; calculate_lr scales it by sqrt(token_batch_size / BASE_BS).
BASE_LR = 3e-4  # 1.5e-4 for 30B-70B models
# Reference batch size, measured in tokens, that BASE_LR is paired with.
BASE_BS = 4_000_000  # from llama paper
def calculate_lr(
    model_name_or_path: str,
    batch_size: int,  # total batch size, namely (batch size * gradient accumulation * world size)
    stage: Literal["pt", "sft"] = "sft",
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 2048,  # i.e. maximum input length during training
    is_mistral_or_gemma: bool = False,  # mistral and gemma models opt for a smaller learning rate,
    packing: bool = False,
):
    r"""Estimate a learning rate for 7B/13B models by square-root scaling of LLaMA's settings.

    The dataset is tokenized and batched, the share of label positions that are not
    ``IGNORE_INDEX`` is measured, and the effective token batch size
    ``cutoff_len * batch_size * valid_ratio`` is plugged into
    ``BASE_LR * sqrt(token_batch_size / BASE_BS)``. The result is divided by 6 when
    ``is_mistral_or_gemma`` is set, and printed.

    Raises:
        NotImplementedError: If ``stage`` is neither "pt" nor "sft" (raised after the
            dataset has been loaded).

    Usage:
    python cal_lr.py --model_name_or_path path_to_model --dataset alpaca_en_demo --cutoff_len 1024 --batch_size 16
    """
    run_args = {
        "stage": stage,
        "model_name_or_path": model_name_or_path,
        "dataset": dataset,
        "dataset_dir": dataset_dir,
        "template": template,
        "cutoff_len": cutoff_len,
        "packing": packing,
        "preprocessing_num_workers": 16,
        "output_dir": "dummy_dir",
        "overwrite_cache": True,
        "do_train": True,
    }
    model_args, data_args, training_args, _, _ = get_train_args(run_args)
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
    dataset_module = get_dataset(template_obj, model_args, data_args, training_args, stage, **tokenizer_module)
    trainset = dataset_module["train_dataset"]

    # Pick the collator that matches the training stage.
    if stage == "pt":
        collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif stage == "sft":
        collate_fn = MultiModalDataCollatorForSeq2Seq(
            template=template_obj, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX
        )
    else:
        raise NotImplementedError(f"Stage does not supported: {stage}.")

    loader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=collate_fn, pin_memory=True)

    # Count label positions that contribute to the loss versus all label positions.
    valid_tokens = 0
    total_tokens = 0
    for batch in tqdm(loader, desc="Collecting valid tokens"):
        labels = batch["labels"]
        valid_tokens += torch.sum(labels != IGNORE_INDEX).item()
        total_tokens += torch.numel(labels)

    valid_ratio = valid_tokens / total_tokens
    token_batch_size = cutoff_len * batch_size * valid_ratio
    lr = BASE_LR * math.sqrt(token_batch_size / BASE_BS)  # lr ~ sqrt(batch_size)
    if is_mistral_or_gemma:
        lr = lr / 6.0

    print(
        f"Optimal learning rate is {lr:.2e} for valid ratio% {valid_ratio * 100:.2f} "
        f"and effective token batch size {token_batch_size:.2f}"
    )
# Script entry point: hand calculate_lr to python-fire so its parameters become command-line flags.
if __name__ == "__main__":
    fire.Fire(calculate_lr)
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_mfu.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
import
os
import
fire
import
torch
import
torch.distributed
as
dist
from
transformers
import
AutoConfig
from
llamafactory.train.tuner
import
run_exp
# FLOPs counted per multiply-accumulate of a matrix multiplication: one add plus one multiply.
BASE = 2  # gemm (add + mul)
def compute_model_flops(
    model_name_or_path: str,
    total_batch_size: int,
    seq_length: int,
    include_backward: bool = True,
    include_recompute: bool = False,
    include_flashattn: bool = False,
) -> int:
    r"""Calculate the FLOPs of model per forward/backward pass.

    The estimate is built from the model config only: MLP projections, attention
    projections, scaled-dot-product attention and the embedding term.

    Args:
        model_name_or_path: Model whose config is loaded with ``AutoConfig``.
        total_batch_size: Number of sequences processed per step across all devices.
        seq_length: Number of tokens per sequence.
        include_backward: Add 2 to both the non-embedding and embedding coefficients.
        include_recompute: Add 1 to the non-embedding coefficient
            (presumably the extra forward pass of gradient checkpointing).
        include_flashattn: Add the attention (sdpa) FLOPs one more time to the total.

    Returns:
        The estimated number of FLOPs for one step.

    Raises:
        ValueError: If the config lacks ``hidden_size``, ``vocab_size``,
            ``intermediate_size``, ``num_attention_heads`` or ``num_hidden_layers``.
    """
    config = AutoConfig.from_pretrained(model_name_or_path)

    # Fail early with a readable message instead of a TypeError deep inside the arithmetic.
    required_fields = ("hidden_size", "vocab_size", "intermediate_size", "num_attention_heads", "num_hidden_layers")
    dims = {name: getattr(config, name, None) for name in required_fields}
    missing = [name for name, value in dims.items() if value is None]
    if missing:
        raise ValueError(
            f"Cannot compute FLOPs for {model_name_or_path}: config is missing {', '.join(missing)}."
        )

    hidden_size = dims["hidden_size"]
    vocab_size = dims["vocab_size"]
    intermediate_size = dims["intermediate_size"]
    num_attention_heads = dims["num_attention_heads"]
    num_hidden_layers = dims["num_hidden_layers"]

    # Configs without grouped-query attention may not define num_key_value_heads;
    # plain multi-head attention has as many key/value heads as query heads.
    num_key_value_heads = getattr(config, "num_key_value_heads", None)
    if num_key_value_heads is None:
        num_key_value_heads = num_attention_heads

    tie_word_embeddings = getattr(config, "tie_word_embeddings", False)

    # mlp module
    mlp_flops_per_token = 3 * BASE * hidden_size * intermediate_size  # up, gate, down
    mlp_flops = total_batch_size * seq_length * num_hidden_layers * mlp_flops_per_token

    # attn projector module
    q_flops_per_token = BASE * hidden_size * hidden_size
    o_flops_per_token = BASE * hidden_size * hidden_size
    # k and v projections shrink with the ratio of key/value heads to query heads
    k_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
    v_flops_per_token = BASE * hidden_size * hidden_size * num_key_value_heads // num_attention_heads
    attn_proj_flops_per_token = q_flops_per_token + o_flops_per_token + k_flops_per_token + v_flops_per_token
    attn_proj_flops = total_batch_size * seq_length * num_hidden_layers * attn_proj_flops_per_token

    # attn sdpa module
    sdpa_flops_per_layer = 2 * BASE * hidden_size * seq_length * seq_length  # (q * k^T) * v
    sdpa_flops = total_batch_size * num_hidden_layers * sdpa_flops_per_layer

    # embedding module
    embedding_flops_per_token = hidden_size * vocab_size
    embedding_flops = total_batch_size * seq_length * embedding_flops_per_token
    if tie_word_embeddings is False:
        # untied input/output embeddings are counted twice
        embedding_flops *= 2

    non_embedding_flops = mlp_flops + attn_proj_flops + sdpa_flops
    non_embedding_coeff, embedding_coeff = 1, 1
    if include_backward:
        non_embedding_coeff += 2
        embedding_coeff += 2

    if include_recompute:
        non_embedding_coeff += 1

    total_flops = non_embedding_coeff * non_embedding_flops + embedding_coeff * embedding_flops

    if include_flashattn:
        total_flops += sdpa_flops

    return total_flops
def compute_device_flops(world_size: int) -> float:
    r"""Return the peak FLOPs per second of ``world_size`` devices of the current CUDA type.

    The per-device figure is looked up by substring match on the name reported by
    ``torch.cuda.get_device_name()``.

    Raises:
        NotImplementedError: If the device name matches none of the known entries.
    """
    # (name fragments, peak TFLOPs per device); checked in this order.
    peak_tflops_table = (
        (("H100", "H800"), 989),
        (("A100", "A800"), 312),
        (("V100",), 125),
        (("4090",), 98),
    )
    device_name = torch.cuda.get_device_name()
    for fragments, peak_tflops in peak_tflops_table:
        if any(fragment in device_name for fragment in fragments):
            return peak_tflops * 1e12 * world_size

    raise NotImplementedError(f"Device not supported: {device_name}.")
def calculate_mfu(
    model_name_or_path: str,
    batch_size: int = 1,
    seq_length: int = 1024,
    num_steps: int = 100,
    finetuning_type: str = "lora",
    flash_attn: str = "auto",
    deepspeed_stage: int = 0,
    disable_gc: bool = False,
    liger_kernel: bool = False,
    unsloth_gc: bool = False,
) -> float:
    r"""Run a short pre-training job and print the model FLOPs utilization (MFU).

    A "pt" run on the "c4_demo" dataset is launched through ``run_exp`` with output
    directory ``saves/test_mfu``. Afterwards the process whose ``LOCAL_RANK`` is 0
    reads ``all_results.json`` from that directory and prints
    ``train_steps_per_second * model FLOPs / device FLOPs`` as a percentage.

    NOTE(review): the signature is annotated ``-> float`` but the function only
    prints the value and returns ``None``.

    Usage: python cal_mfu.py --model_name_or_path path_to_model --batch_size 1 --seq_length 1024
    """
    output_dir = os.path.join("saves", "test_mfu")
    args = {
        "model_name_or_path": model_name_or_path,
        "flash_attn": flash_attn,
        "disable_gradient_checkpointing": disable_gc,
        "enable_liger_kernel": liger_kernel,
        "use_unsloth_gc": unsloth_gc,
        "stage": "pt",
        "do_train": True,
        "finetuning_type": finetuning_type,
        "dataset": "c4_demo",
        "cutoff_len": seq_length,
        "output_dir": output_dir,
        "logging_strategy": "no",
        "save_strategy": "no",
        "save_only_model": True,
        "overwrite_output_dir": True,
        "per_device_train_batch_size": batch_size,
        "max_steps": num_steps,
        "bf16": True,
    }
    # Only stages 2 and 3 select a DeepSpeed config file.
    if deepspeed_stage in (2, 3):
        args["deepspeed"] = f"examples/deepspeed/ds_z{deepspeed_stage}_config.json"

    run_exp(args)

    world_size = 1
    if dist.is_initialized():
        dist.barrier()
        world_size = dist.get_world_size()

    # Only the local rank 0 process reads the results and reports.
    if int(os.getenv("LOCAL_RANK", "0")) != 0:
        return

    with open(os.path.join(output_dir, "all_results.json"), encoding="utf-8") as f:
        result = json.load(f)

    total_batch_size = batch_size * world_size
    achieved_flops = result["train_steps_per_second"] * compute_model_flops(
        model_name_or_path, total_batch_size, seq_length
    )
    mfu_value = achieved_flops / compute_device_flops(world_size)
    print(f"MFU: {mfu_value * 100:.2f}%")
# Script entry point: hand calculate_mfu to python-fire so its parameters become command-line flags.
if __name__ == "__main__":
    fire.Fire(calculate_mfu)
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/cal_ppl.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
from
dataclasses
import
dataclass
from
typing
import
Any
,
Literal
,
Optional
import
fire
import
torch
from
torch.utils.data
import
DataLoader
from
tqdm
import
tqdm
from
transformers
import
DataCollatorForLanguageModeling
from
llamafactory.data
import
MultiModalDataCollatorForSeq2Seq
,
get_dataset
,
get_template_and_fix_tokenizer
from
llamafactory.extras.constants
import
IGNORE_INDEX
from
llamafactory.hparams
import
get_train_args
from
llamafactory.model
import
load_model
,
load_tokenizer
@dataclass
class PairwiseDataCollatorWithPadding(MultiModalDataCollatorForSeq2Seq):
    r"""Collator for pairwise data that keeps only the "chosen" side of each example."""

    # When True, the chosen input ids double as labels; otherwise "chosen_labels" is used.
    train_on_prompt: bool = False

    def __call__(self, features: list[dict[str, Any]]) -> dict[str, torch.Tensor]:
        r"""Re-key the chosen fields of every feature and delegate to the parent collator."""
        label_key = "chosen_input_ids" if self.train_on_prompt else "chosen_labels"
        chosen_features = [
            {
                "input_ids": feature["chosen_input_ids"],
                "attention_mask": feature["chosen_attention_mask"],
                "labels": feature[label_key],
                "images": feature["images"],
                "videos": feature["videos"],
                "audios": feature["audios"],
            }
            for feature in features
        ]
        return super().__call__(chosen_features)
def calculate_ppl(
    model_name_or_path: str,
    save_name: str = "ppl.json",
    batch_size: int = 4,
    stage: Literal["pt", "sft", "rm"] = "sft",
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 2048,
    max_samples: Optional[int] = None,
    train_on_prompt: bool = False,
):
    r"""Compute per-sample perplexities of a pre-trained model on a dataset.

    Every batch is run through the model without gradients. The per-token
    cross-entropy is averaged over the positions whose shifted label differs from
    ``IGNORE_INDEX`` and exponentiated. All per-sample values are written to
    ``save_name`` as JSON and their average is printed.

    Raises:
        NotImplementedError: If ``stage`` is not one of "pt", "sft" or "rm" (raised
            after the dataset and model have been loaded).

    Usage: export CUDA_VISIBLE_DEVICES=0
    python cal_ppl.py --model_name_or_path path_to_model --dataset alpaca_en_demo --save_name ppl.json
    """
    run_args = {
        "stage": stage,
        "model_name_or_path": model_name_or_path,
        "dataset": dataset,
        "dataset_dir": dataset_dir,
        "template": template,
        "cutoff_len": cutoff_len,
        "max_samples": max_samples,
        "train_on_prompt": train_on_prompt,
        "preprocessing_num_workers": 16,
        "output_dir": "dummy_dir",
        "overwrite_cache": True,
        "do_train": True,
    }
    model_args, data_args, training_args, finetuning_args, _ = get_train_args(run_args)
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
    dataset_module = get_dataset(template_obj, model_args, data_args, training_args, stage, **tokenizer_module)
    trainset = dataset_module["train_dataset"]
    model = load_model(tokenizer, model_args, finetuning_args, is_trainable=False)

    # Pick the collator that matches the stage.
    if stage == "pt":
        collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    elif stage == "sft":
        collate_fn = MultiModalDataCollatorForSeq2Seq(
            template=template_obj, tokenizer=tokenizer, label_pad_token_id=IGNORE_INDEX
        )
    elif stage == "rm":
        collate_fn = PairwiseDataCollatorWithPadding(
            template=template_obj,
            tokenizer=tokenizer,
            label_pad_token_id=IGNORE_INDEX,
            train_on_prompt=train_on_prompt,
        )
    else:
        raise NotImplementedError(f"Stage does not supported: {stage}.")

    loader = DataLoader(trainset, batch_size, shuffle=False, collate_fn=collate_fn, pin_memory=True)
    token_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
    total_ppl = 0
    perplexities = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Computing perplexities"):
            batch = batch.to(model.device)
            logits: torch.Tensor = model(**batch)["logits"]

            # Position t predicts token t + 1: drop the last logit and the first label.
            shift_logits = logits[..., :-1, :]
            shift_labels: torch.Tensor = batch["labels"][..., 1:]
            loss_mask = shift_labels != IGNORE_INDEX

            num_rows = shift_labels.size(0)
            num_cols = shift_labels.size(1)
            token_losses: torch.Tensor = token_loss_fn(
                shift_logits.contiguous().view(num_rows * num_cols, -1),
                shift_labels.contiguous().view(-1),
            )
            token_losses = token_losses.contiguous().view(shift_logits.size(0), -1)

            # Mean loss over the unmasked positions of each sample, then exponentiate.
            sentence_losses = (token_losses * loss_mask).sum(-1) / loss_mask.sum(-1)
            sentence_ppls = sentence_losses.exp()
            total_ppl += sentence_ppls.sum().item()
            perplexities.extend(sentence_ppls.tolist())

    with open(save_name, "w", encoding="utf-8") as f:
        json.dump(perplexities, f, indent=2)

    average_ppl = total_ppl / len(perplexities)
    print(f"Average perplexity is {average_ppl:.2f}")
    print(f"Perplexities have been saved at {save_name}.")
# Script entry point: hand calculate_ppl to python-fire so its parameters become command-line flags.
if __name__ == "__main__":
    fire.Fire(calculate_ppl)
docker-hub/qwen2.5-vl/llama-factory/scripts/stat_utils/length_cdf.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
collections
import
defaultdict
import
fire
from
tqdm
import
tqdm
from
llamafactory.data
import
get_dataset
,
get_template_and_fix_tokenizer
from
llamafactory.hparams
import
get_train_args
from
llamafactory.model
import
load_tokenizer
def length_cdf(
    model_name_or_path: str,
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    interval: int = 1000,
):
    r"""Print the cumulative distribution of tokenized input lengths in a dataset.

    The dataset is prepared for the "sft" stage with a cutoff length of 1,000,000.
    Sample lengths are grouped into buckets of width ``interval`` and, bucket by
    bucket in ascending order, the cumulative sample count and percentage are printed.

    Usage: export CUDA_VISIBLE_DEVICES=0
    python length_cdf.py --model_name_or_path path_to_model --dataset alpaca_en_demo --template default
    """
    run_args = {
        "stage": "sft",
        "model_name_or_path": model_name_or_path,
        "dataset": dataset,
        "dataset_dir": dataset_dir,
        "template": template,
        "cutoff_len": 1_000_000,
        "preprocessing_num_workers": 16,
        "output_dir": "dummy_dir",
        "overwrite_cache": True,
        "do_train": True,
    }
    model_args, data_args, training_args, _, _ = get_train_args(run_args)
    tokenizer_module = load_tokenizer(model_args)
    template_obj = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args)
    dataset_module = get_dataset(template_obj, model_args, data_args, training_args, "sft", **tokenizer_module)
    trainset = dataset_module["train_dataset"]
    total_num = len(trainset)

    # Map each bucket's lower bound to the number of samples that fall into it.
    bucket_counts = defaultdict(int)
    for input_ids in tqdm(trainset["input_ids"], desc="Collecting lengths"):
        bucket_start = len(input_ids) // interval * interval
        bucket_counts[bucket_start] += 1

    count_accu, prob_accu = 0, 0
    for bucket_start, count in sorted(bucket_counts.items()):
        count_accu += count
        prob_accu += count / total_num * 100
        print(f"{count_accu:d} ({prob_accu:.2f}%) samples have length < {bucket_start + interval}.")
# Script entry point: hand length_cdf to python-fire so its parameters become command-line flags.
if __name__ == "__main__":
    fire.Fire(length_cdf)
docker-hub/qwen2.5-vl/llama-factory/scripts/vllm_infer.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
gc
import
json
from
typing
import
Optional
import
fire
from
tqdm
import
tqdm
from
transformers
import
Seq2SeqTrainingArguments
from
llamafactory.data
import
get_dataset
,
get_template_and_fix_tokenizer
from
llamafactory.extras.constants
import
IGNORE_INDEX
from
llamafactory.extras.misc
import
get_device_count
from
llamafactory.extras.packages
import
is_vllm_available
from
llamafactory.hparams
import
get_infer_args
from
llamafactory.model
import
load_tokenizer
if
is_vllm_available
():
from
vllm
import
LLM
,
SamplingParams
from
vllm.lora.request
import
LoRARequest
def vllm_infer(
    model_name_or_path: str,
    adapter_name_or_path: str = None,
    dataset: str = "alpaca_en_demo",
    dataset_dir: str = "data",
    template: str = "default",
    cutoff_len: int = 2048,
    max_samples: Optional[int] = None,
    vllm_config: str = "{}",
    save_name: str = "generated_predictions.jsonl",
    temperature: float = 0.95,
    top_p: float = 0.7,
    top_k: int = 50,
    max_new_tokens: int = 1024,
    repetition_penalty: float = 1.0,
    skip_special_tokens: bool = True,
    default_system: Optional[str] = None,
    enable_thinking: bool = True,
    seed: Optional[int] = None,
    pipeline_parallel_size: int = 1,
    image_max_pixels: int = 768 * 768,
    image_min_pixels: int = 32 * 32,
    video_fps: float = 2.0,
    video_maxlen: int = 128,
    batch_size: int = 1024,
):
    r"""Generate predictions for a whole dataset with the vLLM engine and save them as JSON lines.

    The dataset is processed in slices of ``batch_size`` samples. For every sample the
    decoded prompt, the generated text and the decoded label are collected, and all
    records are written to ``save_name`` once generation has finished.

    Raises:
        ValueError: If ``pipeline_parallel_size`` exceeds the number of devices.

    Usage: python vllm_infer.py --model_name_or_path meta-llama/Llama-2-7b-hf --template llama --dataset alpaca_en_demo
    """
    if pipeline_parallel_size > get_device_count():
        raise ValueError("Pipeline parallel size should be smaller than the number of gpus.")

    infer_args = {
        "model_name_or_path": model_name_or_path,
        "adapter_name_or_path": adapter_name_or_path,
        "dataset": dataset,
        "dataset_dir": dataset_dir,
        "template": template,
        "cutoff_len": cutoff_len,
        "max_samples": max_samples,
        "preprocessing_num_workers": 16,
        "default_system": default_system,
        "enable_thinking": enable_thinking,
        "vllm_config": vllm_config,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }
    model_args, data_args, _, generating_args = get_infer_args(infer_args)

    training_args = Seq2SeqTrainingArguments(output_dir="dummy_dir")
    tokenizer_module = load_tokenizer(model_args)
    tokenizer = tokenizer_module["tokenizer"]
    template_obj = get_template_and_fix_tokenizer(tokenizer, data_args)
    mm_plugin = template_obj.mm_plugin
    mm_plugin.expand_mm_tokens = False  # for vllm generate

    engine_args = {
        "model": model_args.model_name_or_path,
        "trust_remote_code": True,
        "dtype": model_args.infer_dtype,
        "max_model_len": cutoff_len + max_new_tokens,
        "tensor_parallel_size": (get_device_count() // pipeline_parallel_size) or 1,
        "pipeline_parallel_size": pipeline_parallel_size,
        "disable_log_stats": True,
        "enable_lora": model_args.adapter_name_or_path is not None,
    }
    # Any plugin other than the base one gets per-prompt multimodal limits.
    if mm_plugin.__class__.__name__ != "BasePlugin":
        engine_args["limit_mm_per_prompt"] = {"image": 4, "video": 2, "audio": 2}

    # Engine options supplied through vllm_config override the defaults above.
    if isinstance(model_args.vllm_config, dict):
        engine_args.update(model_args.vllm_config)

    llm = LLM(**engine_args)

    # load datasets
    dataset_module = get_dataset(template_obj, model_args, data_args, training_args, "ppo", **tokenizer_module)
    train_dataset = dataset_module["train_dataset"]

    sampling_params = SamplingParams(
        repetition_penalty=generating_args.repetition_penalty or 1.0,  # repetition_penalty must > 0
        temperature=generating_args.temperature,
        top_p=generating_args.top_p or 1.0,  # top_p must > 0
        top_k=generating_args.top_k or -1,  # top_k must > 0
        stop_token_ids=template_obj.get_stop_token_ids(tokenizer),
        max_tokens=generating_args.max_new_tokens,
        skip_special_tokens=skip_special_tokens,
        seed=seed,
    )
    lora_request = (
        LoRARequest("default", 1, model_args.adapter_name_or_path[0])
        if model_args.adapter_name_or_path is not None
        else None
    )

    def _build_mm_data(samples, idx):
        # Only the first modality present is used, checked in the order image, video, audio.
        images = samples["images"][idx]
        if images is not None:
            regularized = mm_plugin._regularize_images(
                images, image_max_pixels=image_max_pixels, image_min_pixels=image_min_pixels
            )
            return {"image": regularized["images"]}

        videos = samples["videos"][idx]
        if videos is not None:
            regularized = mm_plugin._regularize_videos(
                videos,
                image_max_pixels=image_max_pixels,
                image_min_pixels=image_min_pixels,
                video_fps=video_fps,
                video_maxlen=video_maxlen,
            )
            return {"video": regularized["videos"]}

        audios = samples["audios"][idx]
        if audios is not None:
            audio_data = mm_plugin._regularize_audios(
                audios,
                sampling_rate=16000,
            )
            return {"audio": zip(audio_data["audios"], audio_data["sampling_rates"])}

        return None

    # Store all results in these lists
    all_prompts, all_preds, all_labels = [], [], []
    num_samples_total = len(train_dataset)

    # Add batch process to avoid the issue of too many files opened
    for start in tqdm(range(0, len(train_dataset), batch_size), desc="Processing batched inference"):
        vllm_inputs, prompts, labels = [], [], []
        batch = train_dataset[start : min(start + batch_size, num_samples_total)]

        for idx, input_ids in enumerate(batch["input_ids"]):
            vllm_inputs.append({"prompt_token_ids": input_ids, "multi_modal_data": _build_mm_data(batch, idx)})
            prompts.append(tokenizer.decode(input_ids, skip_special_tokens=skip_special_tokens))
            # Drop ignored label positions before decoding the label.
            label_ids = [token_id for token_id in batch["labels"][idx] if token_id != IGNORE_INDEX]
            labels.append(tokenizer.decode(label_ids, skip_special_tokens=skip_special_tokens))

        results = llm.generate(vllm_inputs, sampling_params, lora_request=lora_request)

        # Accumulate results
        all_prompts.extend(prompts)
        all_preds.extend(result.outputs[0].text for result in results)
        all_labels.extend(labels)
        gc.collect()

    # Write all results at once outside the loop
    with open(save_name, "w", encoding="utf-8") as f:
        for prompt_text, pred_text, label_text in zip(all_prompts, all_preds, all_labels):
            record = {"prompt": prompt_text, "predict": pred_text, "label": label_text}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    banner = "*" * 70
    print(banner)
    print(f"{len(all_prompts)} total generated results have been saved at {save_name}.")
    print(banner)
# Script entry point: hand `vllm_infer` to python-fire, which exposes it as a command-line interface.
if __name__ == "__main__":
    fire.Fire(vllm_infer)
docker-hub/qwen2.5-vl/llama-factory/setup.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
re
from
setuptools
import
find_packages
,
setup
def get_version() -> str:
    """Return the package version declared in ``src/llamafactory/extras/env.py``.

    The file is read relative to the current working directory and searched for
    a ``VERSION = "..."`` style assignment.

    Returns:
        The quoted value assigned to ``VERSION``.

    Raises:
        ValueError: If the file does not contain exactly one matching assignment.
    """
    env_path = os.path.join("src", "llamafactory", "extras", "env.py")
    with open(env_path, encoding="utf-8") as env_file:
        source_text = env_file.read()

    version_regex = r"{}\W*=\W*\"([^\"]+)\"".format("VERSION")
    matches = re.findall(version_regex, source_text)
    # Tuple-unpacking enforces that there is exactly one match.
    (version,) = matches
    return version
def get_requires() -> list[str]:
    """Return the requirement specifiers listed in ``requirements.txt``.

    The file is read relative to the current working directory. Each line is
    stripped of surrounding whitespace *before* filtering, so blank lines and
    comment lines (including indented ones) are skipped instead of being
    returned as empty or ``#``-prefixed entries.

    Returns:
        The non-empty, non-comment lines of the file, in file order.
    """
    with open("requirements.txt", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line and not line.startswith("#")]
def get_console_scripts() -> list[str]:
    """Return the ``console_scripts`` entry points to install.

    ``llamafactory-cli`` is always included. The short alias ``lmf`` is added
    when the ``ENABLE_SHORT_CONSOLE`` environment variable (default ``"1"``)
    is, case-insensitively, one of ``true``, ``y`` or ``1``.
    """
    short_flag = os.getenv("ENABLE_SHORT_CONSOLE", "1").lower()
    scripts = ["llamafactory-cli = llamafactory.cli:main"]
    if short_flag not in ["true", "y", "1"]:
        return scripts

    return scripts + ["lmf = llamafactory.cli:main"]
# Optional dependency groups, keyed by extra name. The mapping is passed to
# `setup()` as `extras_require`, so each key names an installable extra and
# each value lists the requirement specifiers that extra pulls in.
extra_require = {
    "torch": ["torch>=2.0.0", "torchvision>=0.15.0"],
    # Exact pins (==) rather than ranges for this group.
    "torch-npu": ["torch-npu==2.5.1", "torchvision==0.20.1", "decorator"],
    "metrics": ["nltk", "jieba", "rouge-chinese"],
    "deepspeed": ["deepspeed>=0.10.0,<=0.16.9"],
    "liger-kernel": ["liger-kernel>=0.5.5"],
    "bitsandbytes": ["bitsandbytes>=0.39.0"],
    "hqq": ["hqq"],
    "eetq": ["eetq"],
    "gptq": ["optimum>=1.24.0", "gptqmodel>=2.0.0"],
    "aqlm": ["aqlm[gpu]>=1.1.0"],
    "vllm": ["vllm>=0.4.3,<=0.9.1"],
    # NOTE(review): this extra pins transformers to one exact version — confirm it
    # is compatible with the range declared in requirements.txt.
    "sglang": ["sglang[srt]>=0.4.5", "transformers==4.51.1"],
    "galore": ["galore-torch"],
    "apollo": ["apollo-torch"],
    "badam": ["badam>=1.2.1"],
    "adam-mini": ["adam-mini"],
    "minicpm_v": [
        "soundfile",
        "torchvision",
        "torchaudio",
        "vector_quantize_pytorch",
        "vocos",
        "msgpack",
        "referencing",
        "jsonschema_specifications",
    ],
    "openmind": ["openmind"],
    "swanlab": ["swanlab"],
    # Development tooling.
    "dev": ["pre-commit", "ruff", "pytest", "build"],
}
def main():
    """Configure and run the setuptools build for the ``llamafactory`` package.

    The version, install requirements, and console scripts come from
    ``get_version()``, ``get_requires()`` and ``get_console_scripts()``.
    Optional extras come from the module-level ``extra_require`` mapping.
    ``README.md`` is read relative to the current working directory and used
    as the long description.
    """
    # Read the README inside a context manager so the file handle is closed
    # deterministically instead of being left for the garbage collector.
    with open("README.md", encoding="utf-8") as readme_file:
        long_description = readme_file.read()

    setup(
        name="llamafactory",
        version=get_version(),
        author="hiyouga",
        author_email="hiyouga@buaa.edu.cn",
        description="Unified Efficient Fine-Tuning of 100+ LLMs",
        long_description=long_description,
        long_description_content_type="text/markdown",
        keywords=["AI", "LLM", "GPT", "ChatGPT", "Llama", "Transformer", "DeepSeek", "Pytorch"],
        license="Apache 2.0 License",
        url="https://github.com/hiyouga/LLaMA-Factory",
        # Packages live under src/ (src-layout).
        package_dir={"": "src"},
        packages=find_packages("src"),
        python_requires=">=3.9.0",
        install_requires=get_requires(),
        extras_require=extra_require,
        entry_points={"console_scripts": get_console_scripts()},
        classifiers=[
            "Development Status :: 4 - Beta",
            "Intended Audience :: Developers",
            "Intended Audience :: Education",
            "Intended Audience :: Science/Research",
            "License :: OSI Approved :: Apache Software License",
            "Operating System :: OS Independent",
            "Programming Language :: Python :: 3",
            "Programming Language :: Python :: 3.9",
            "Programming Language :: Python :: 3.10",
            "Programming Language :: Python :: 3.11",
            "Programming Language :: Python :: 3.12",
            "Topic :: Scientific/Engineering :: Artificial Intelligence",
        ],
    )
# Run the setuptools configuration only when this file is executed as a script.
if __name__ == "__main__":
    main()
docker-hub/qwen2.5-vl/llama-factory/src/api.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
uvicorn
from
llamafactory.api.app
import
create_app
from
llamafactory.chat
import
ChatModel
def main():
    """Build the chat model and API app, then serve it with uvicorn.

    The bind address comes from the ``API_HOST`` environment variable
    (default ``"0.0.0.0"``) and the port from ``API_PORT`` (default ``"8000"``,
    converted to ``int``).
    """
    app = create_app(ChatModel())

    api_port = int(os.getenv("API_PORT", "8000"))
    api_host = os.getenv("API_HOST", "0.0.0.0")

    docs_hint = f"Visit http://localhost:{api_port}/docs for API document."
    print(docs_hint)
    uvicorn.run(app, host=api_host, port=api_port)
# Start the API server only when this file is executed as a script.
if __name__ == "__main__":
    main()
docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/__init__.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Efficient fine-tuning of large language models.

Level:
  api, webui > chat, eval, train > data, model > hparams > extras

Disable version checking: DISABLE_VERSION_CHECK=1
Enable VRAM recording: RECORD_VRAM=1
Force using torchrun: FORCE_TORCHRUN=1
Set logging verbosity: LLAMAFACTORY_VERBOSITY=WARN
Use modelscope: USE_MODELSCOPE_HUB=1
Use openmind: USE_OPENMIND_HUB=1
"""

from .extras.env import VERSION


# Re-export the version constant from `extras/env.py` as the package-level `__version__`.
__version__ = VERSION
docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/__init__.py
0 → 100644
View file @
5ed76316
docker-hub/qwen2.5-vl/llama-factory/src/llamafactory/api/app.py
0 → 100644
View file @
5ed76316
# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
os
from
contextlib
import
asynccontextmanager
from
functools
import
partial
from
typing
import
Annotated
,
Optional
from
..chat
import
ChatModel
from
..extras.constants
import
EngineName
from
..extras.misc
import
torch_gc
from
..extras.packages
import
is_fastapi_available
,
is_starlette_available
,
is_uvicorn_available
from
.chat
import
(
create_chat_completion_response
,
create_score_evaluation_response
,
create_stream_chat_completion_response
,
)
from
.protocol
import
(
ChatCompletionRequest
,
ChatCompletionResponse
,
ModelCard
,
ModelList
,
ScoreEvaluationRequest
,
ScoreEvaluationResponse
,
)
if
is_fastapi_available
():
from
fastapi
import
Depends
,
FastAPI
,
HTTPException
,
status
from
fastapi.middleware.cors
import
CORSMiddleware
from
fastapi.security.http
import
HTTPAuthorizationCredentials
,
HTTPBearer
if
is_starlette_available
():
from
sse_starlette
import
EventSourceResponse
if
is_uvicorn_available
():
import
uvicorn
async def sweeper() -> None:
    """Call ``torch_gc()`` in an endless loop, sleeping 300 seconds between calls.

    The coroutine never returns on its own; it only stops when it is cancelled.
    """
    interval_seconds = 300
    while True:
        torch_gc()
        await asyncio.sleep(interval_seconds)
@asynccontextmanager
async def lifespan(app: "FastAPI", chat_model: "ChatModel"):
    """Manage background memory collection for the lifetime of the app.

    On startup, when the chat model runs on the HF engine, a background
    ``sweeper()`` task is started. On shutdown the task is cancelled and
    ``torch_gc()`` is called one final time.

    Args:
        app: The FastAPI application (unused; required by the lifespan protocol).
        chat_model: The chat model whose ``engine.name`` selects whether the
            sweeper is started.
    """
    sweeper_task = None
    if chat_model.engine.name == EngineName.HF:
        # Keep a strong reference: the event loop only holds weak references to
        # tasks, so an unreferenced task may be garbage-collected mid-execution.
        sweeper_task = asyncio.create_task(sweeper())

    try:
        yield
    finally:
        # Stop the endless sweeper so no pending task is left behind at shutdown,
        # and collect memory even if the application body raised.
        if sweeper_task is not None:
            sweeper_task.cancel()

        torch_gc()
def create_app(chat_model: "ChatModel") -> "FastAPI":
    """Build the FastAPI application that serves ``chat_model``.

    Routes registered:
        * ``GET  /v1/models`` - lists one model card; its id comes from the
          ``API_MODEL_NAME`` environment variable (default ``"gpt-3.5-turbo"``).
        * ``POST /v1/chat/completions`` - chat completion, streamed as
          server-sent events when ``request.stream`` is set; returns 405 when
          the engine cannot generate.
        * ``POST /v1/score/evaluation`` - score evaluation; returns 405 when
          the engine *can* generate.

    Environment variables read:
        * ``FASTAPI_ROOT_PATH`` - root path passed to FastAPI (default ``""``).
        * ``API_KEY`` - when set and non-empty, every route requires a matching
          bearer token and answers 401 otherwise.

    Args:
        chat_model: The chat model handed to the response builders and to the
            lifespan handler.

    Returns:
        The configured FastAPI application.
    """
    # Local import: only needed for the constant-time API key comparison below.
    import hmac

    root_path = os.getenv("FASTAPI_ROOT_PATH", "")
    app = FastAPI(lifespan=partial(lifespan, chat_model=chat_model), root_path=root_path)
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    api_key = os.getenv("API_KEY")
    # auto_error=False lets a missing Authorization header reach verify_api_key as None
    # instead of being rejected by the security scheme itself.
    security = HTTPBearer(auto_error=False)

    async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]):
        # No key configured (unset or empty): authentication is disabled.
        if not api_key:
            return

        # Compare in constant time so response timing does not leak how much of
        # the supplied token matches. Bytes are used because compare_digest only
        # accepts ASCII-only str operands.
        if auth is None or not hmac.compare_digest(
            auth.credentials.encode("utf-8"), api_key.encode("utf-8")
        ):
            raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.")

    @app.get(
        "/v1/models",
        response_model=ModelList,
        status_code=status.HTTP_200_OK,
        dependencies=[Depends(verify_api_key)],
    )
    async def list_models():
        model_card = ModelCard(id=os.getenv("API_MODEL_NAME", "gpt-3.5-turbo"))
        return ModelList(data=[model_card])

    @app.post(
        "/v1/chat/completions",
        response_model=ChatCompletionResponse,
        status_code=status.HTTP_200_OK,
        dependencies=[Depends(verify_api_key)],
    )
    async def create_chat_completion(request: ChatCompletionRequest):
        if not chat_model.engine.can_generate:
            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")

        if request.stream:
            generate = create_stream_chat_completion_response(request, chat_model)
            return EventSourceResponse(generate, media_type="text/event-stream", sep="\n")
        else:
            return await create_chat_completion_response(request, chat_model)

    @app.post(
        "/v1/score/evaluation",
        response_model=ScoreEvaluationResponse,
        status_code=status.HTTP_200_OK,
        dependencies=[Depends(verify_api_key)],
    )
    async def create_score_evaluation(request: ScoreEvaluationRequest):
        # Scoring is only offered by engines that cannot generate text.
        if chat_model.engine.can_generate:
            raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed")

        return await create_score_evaluation_response(request, chat_model)

    return app
def run_api() -> None:
    """Create the chat model and its API app, then serve the app with uvicorn.

    The host is read from ``API_HOST`` (default ``"0.0.0.0"``) and the port
    from ``API_PORT`` (default ``"8000"``, converted to ``int``).
    """
    app = create_app(ChatModel())

    api_port = int(os.getenv("API_PORT", "8000"))
    api_host = os.getenv("API_HOST", "0.0.0.0")

    docs_hint = f"Visit http://localhost:{api_port}/docs for API document."
    print(docs_hint)
    uvicorn.run(app, host=api_host, port=api_port)
Prev
1
…
3
4
5
6
7
8
9
10
11
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment