ModelZoo / DISC-FinLLM_pytorch / Commits / afe180a6

Commit afe180a6 ("Initial commit"), authored May 21, 2024 by wanglch.
Pipeline #1006: canceled with stages.
Changes: 258 / Pipelines: 1
Showing 20 changed files with 2448 additions and 0 deletions (+2448, -0).
Changed files shown below:

  LLaMA-Factory/src/llmtuner/data/utils.py                      +68   -0
  LLaMA-Factory/src/llmtuner/eval/__init__.py                    +4   -0
  LLaMA-Factory/src/llmtuner/eval/evaluator.py                 +123   -0
  LLaMA-Factory/src/llmtuner/eval/template.py                   +67   -0
  LLaMA-Factory/src/llmtuner/extras/__init__.py                  +0   -0
  LLaMA-Factory/src/llmtuner/extras/callbacks.py               +153   -0
  LLaMA-Factory/src/llmtuner/extras/constants.py               +866   -0
  LLaMA-Factory/src/llmtuner/extras/logging.py                  +48   -0
  LLaMA-Factory/src/llmtuner/extras/misc.py                    +198   -0
  LLaMA-Factory/src/llmtuner/extras/packages.py                 +53   -0
  LLaMA-Factory/src/llmtuner/extras/patches/__init__.py          +0   -0
  LLaMA-Factory/src/llmtuner/extras/patches/llama_patch.py     +197   -0
  LLaMA-Factory/src/llmtuner/extras/patches/mixtral_patch.py    +38   -0
  LLaMA-Factory/src/llmtuner/extras/ploting.py                  +56   -0
  LLaMA-Factory/src/llmtuner/hparams/__init__.py                +18   -0
  LLaMA-Factory/src/llmtuner/hparams/data_args.py               +98   -0
  LLaMA-Factory/src/llmtuner/hparams/evaluation_args.py         +48   -0
  LLaMA-Factory/src/llmtuner/hparams/finetuning_args.py        +215   -0
  LLaMA-Factory/src/llmtuner/hparams/generating_args.py         +56   -0
  LLaMA-Factory/src/llmtuner/hparams/model_args.py             +142   -0
LLaMA-Factory/src/llmtuner/data/utils.py  (new file, mode 100644)

import hashlib
from enum import Enum, unique
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

from ..extras.logging import get_logger

if TYPE_CHECKING:
    from datasets import Dataset, IterableDataset
    from transformers import TrainingArguments

    from llmtuner.hparams import DataArguments


logger = get_logger(__name__)


@unique
class Role(str, Enum):
    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"
    FUNCTION = "function"
    OBSERVATION = "observation"


def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None:
    if file_sha1 is None:
        logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.")
        return

    if len(data_files) != 1:
        logger.warning("Checksum failed: too many files.")
        return

    with open(data_files[0], "rb") as f:
        sha1 = hashlib.sha1(f.read()).hexdigest()
        if sha1 != file_sha1:
            logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0]))


def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]:
    max_target_len = int(max_len * (target_len / (source_len + target_len)))
    max_target_len = max(max_target_len, reserved_label_len)
    max_source_len = max_len - max_target_len
    return max_source_len, max_target_len


def split_dataset(
    dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", training_args: "TrainingArguments"
) -> Dict[str, "Dataset"]:
    if training_args.do_train:
        if data_args.val_size > 1e-6:  # Split the dataset
            if data_args.streaming:
                val_set = dataset.take(int(data_args.val_size))
                train_set = dataset.skip(int(data_args.val_size))
                dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
                return {"train_dataset": train_set, "eval_dataset": val_set}
            else:
                val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
                dataset = dataset.train_test_split(test_size=val_size, seed=training_args.seed)
                return {"train_dataset": dataset["train"], "eval_dataset": dataset["test"]}
        else:
            if data_args.streaming:
                dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
            return {"train_dataset": dataset}
    else:  # do_eval or do_predict
        return {"eval_dataset": dataset}
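
A minimal usage sketch of the helpers above (not part of the commit; the import path follows the file layout, and the numbers are only illustrative):

# Hypothetical usage sketch for llmtuner/data/utils.py.
from llmtuner.data.utils import Role, infer_max_len

# Budget a 4096-token context between a 3000-token prompt and a 1000-token answer,
# keeping at least 16 tokens reserved for labels.
max_source_len, max_target_len = infer_max_len(source_len=3000, target_len=1000, max_len=4096, reserved_label_len=16)
print(max_source_len, max_target_len)  # 3072 1024

# Role is a str-valued Enum, so it compares and serializes as a plain string in message dicts.
message = {"role": Role.USER, "content": "Hello"}
assert message["role"] == "user"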
LLaMA-Factory/src/llmtuner/eval/__init__.py  (new file, mode 100644)

from .evaluator import Evaluator

__all__ = ["Evaluator"]
LLaMA-Factory/src/llmtuner/eval/evaluator.py  (new file, mode 100644)

# Inspired by: https://github.com/hendrycks/test/blob/master/evaluate_flan.py

import inspect
import json
import os
from typing import Any, Dict, List, Optional

import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm, trange
from transformers.utils import cached_file

from ..data import get_template_and_fix_tokenizer
from ..extras.constants import CHOICES, SUBJECTS
from ..hparams import get_eval_args
from ..model import dispatch_model, load_model_and_tokenizer
from .template import get_eval_template


class Evaluator:
    def __init__(self, args: Optional[Dict[str, Any]] = None) -> None:
        self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
        self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args)
        self.tokenizer.padding_side = "right"  # avoid overflow issue in batched inference for llama2
        self.model = dispatch_model(self.model)
        self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
        self.eval_template = get_eval_template(self.eval_args.lang)
        self.choice_inputs = [
            self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES
        ]

    @torch.inference_mode()
    def batch_inference(self, batch_input: Dict[str, torch.Tensor]) -> List[str]:
        logits = self.model(**batch_input).logits
        lengths = torch.sum(batch_input["attention_mask"], dim=-1)
        word_probs = torch.stack([logits[i, lengths[i] - 1] for i in range(len(lengths))], dim=0)
        choice_probs = torch.nn.functional.softmax(word_probs[:, self.choice_inputs], dim=-1).detach()
        return [chr(ord("A") + offset.item()) for offset in torch.argmax(choice_probs, dim=-1)]

    def eval(self) -> None:
        mapping = cached_file(
            path_or_repo_id=os.path.join(self.eval_args.task_dir, self.eval_args.task),
            filename="mapping.json",
            cache_dir=self.model_args.cache_dir,
            token=self.model_args.hf_hub_token,
        )

        with open(mapping, "r", encoding="utf-8") as f:
            categorys: Dict[str, Dict[str, str]] = json.load(f)

        category_corrects = {subj: np.array([], dtype="bool") for subj in SUBJECTS}
        pbar = tqdm(categorys.keys(), desc="Processing subjects", position=0)
        results = {}
        for subject in pbar:
            if "trust_remote_code" in inspect.signature(load_dataset).parameters:  # for datasets==2.16.0
                kwargs = {"trust_remote_code": True}
            else:
                kwargs = {}

            dataset = load_dataset(
                path=os.path.join(self.eval_args.task_dir, self.eval_args.task),
                name=subject,
                cache_dir=self.model_args.cache_dir,
                download_mode=self.eval_args.download_mode,
                token=self.model_args.hf_hub_token,
                **kwargs,
            )
            pbar.set_postfix_str(categorys[subject]["name"])
            inputs, outputs, labels = [], [], []
            for i in trange(len(dataset[self.data_args.split]), desc="Formatting batches", position=1, leave=False):
                support_set = (
                    dataset["train"].shuffle().select(range(min(self.eval_args.n_shot, len(dataset["train"]))))
                )
                messages = self.eval_template.format_example(
                    target_data=dataset[self.data_args.split][i],
                    support_set=support_set,
                    subject_name=categorys[subject]["name"],
                )

                input_ids, _ = self.template.encode_oneturn(tokenizer=self.tokenizer, messages=messages)
                inputs.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
                labels.append(messages[-1]["content"])

            for i in trange(
                0, len(inputs), self.eval_args.batch_size, desc="Predicting batches", position=1, leave=False
            ):
                batch_input = self.tokenizer.pad(
                    inputs[i : i + self.eval_args.batch_size], return_attention_mask=True, return_tensors="pt"
                ).to(self.model.device)
                preds = self.batch_inference(batch_input)
                outputs += preds

            corrects = np.array(outputs) == np.array(labels)
            category_name = categorys[subject]["category"]
            category_corrects[category_name] = np.concatenate([category_corrects[category_name], corrects], axis=0)
            category_corrects["Average"] = np.concatenate([category_corrects["Average"], corrects], axis=0)
            results[subject] = {str(i): outputs[i] for i in range(len(outputs))}

        pbar.close()
        self._save_results(category_corrects, results)

    def _save_results(self, category_corrects: Dict[str, np.ndarray], results: Dict[str, Dict[int, str]]) -> None:
        score_info = "\n".join(
            [
                "{:>15}: {:.2f}".format(category_name, 100 * np.mean(category_correct))
                for category_name, category_correct in category_corrects.items()
                if len(category_correct)
            ]
        )
        print(score_info)
        if self.eval_args.save_dir is not None:
            os.makedirs(self.eval_args.save_dir, exist_ok=False)
            with open(os.path.join(self.eval_args.save_dir, "results.json"), "w", encoding="utf-8", newline="\n") as f:
                json.dump(results, f, indent=2)

            with open(os.path.join(self.eval_args.save_dir, "results.log"), "w", encoding="utf-8", newline="\n") as f:
                f.write(score_info)


if __name__ == "__main__":
    evaluator = Evaluator()
    evaluator.eval()
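
A hedged sketch of invoking the evaluator programmatically; the individual argument names are resolved by get_eval_args (defined elsewhere in the package), so the keys below are assumptions for illustration only:

# Hypothetical invocation sketch (argument names assumed; they are parsed by get_eval_args).
from llmtuner.eval import Evaluator

if __name__ == "__main__":
    evaluator = Evaluator(
        args={
            "model_name_or_path": "path/to/model",  # assumed model argument name
            "template": "qwen",                     # chat template consumed by encode_oneturn
            "task": "ceval",                        # subfolder under task_dir containing mapping.json
            "task_dir": "evaluation",               # assumed location of evaluation tasks
            "lang": "zh",                           # selects the "zh" EvalTemplate registered in template.py
            "n_shot": 5,                            # few-shot examples drawn from the train split
            "batch_size": 4,
            "save_dir": "eval_results",             # where results.json and results.log are written
        }
    )
    evaluator.eval()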
LLaMA-Factory/src/llmtuner/eval/template.py  (new file, mode 100644)

from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Tuple

from ..data import Role
from ..extras.constants import CHOICES

if TYPE_CHECKING:
    from datasets import Dataset


@dataclass
class EvalTemplate:
    system: str
    choice: str
    answer: str
    prefix: str

    def parse_example(self, example: Dict[str, str]) -> Tuple[str, str]:
        candidates = [self.choice.format(choice=ch, content=example[ch]) for ch in CHOICES if ch in example]
        return "".join([example["question"]] + candidates + [self.answer]), example["answer"]

    def format_example(
        self, target_data: Dict[str, str], support_set: "Dataset", subject_name: str
    ) -> List[Dict[str, str]]:
        messages = []
        for k in range(len(support_set)):
            prompt, response = self.parse_example(support_set[k])
            messages.append({"role": Role.USER, "content": prompt})
            messages.append({"role": Role.ASSISTANT, "content": response})

        prompt, response = self.parse_example(target_data)
        messages.append({"role": Role.USER, "content": prompt})
        messages.append({"role": Role.ASSISTANT, "content": response})
        messages[0]["content"] = self.system.format(subject=subject_name) + messages[0]["content"]
        return messages


eval_templates: Dict[str, "EvalTemplate"] = {}


def register_eval_template(name: str, system: str, choice: str, answer: str, prefix: str) -> None:
    eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer, prefix=prefix)


def get_eval_template(name: str) -> "EvalTemplate":
    eval_template = eval_templates.get(name, None)
    assert eval_template is not None, "Template {} does not exist.".format(name)
    return eval_template


register_eval_template(
    name="en",
    system="The following are multiple choice questions (with answers) about {subject}.\n\n",
    choice="\n{choice}. {content}",
    answer="\nAnswer: ",
    prefix=" ",
)


register_eval_template(
    name="zh",
    system="以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n",
    choice="\n{choice}. {content}",
    answer="\n答案:",
    prefix="\n",
)
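
A small sketch of what the registered "en" template produces for one multiple-choice item; the example data is made up:

# Illustrative use of the "en" EvalTemplate (example content is invented).
from llmtuner.eval.template import get_eval_template

template = get_eval_template("en")
example = {
    "question": "Which planet is known as the Red Planet?",
    "A": "Venus", "B": "Mars", "C": "Jupiter", "D": "Saturn",
    "answer": "B",
}
prompt, answer = template.parse_example(example)
# prompt == "Which planet is known as the Red Planet?\nA. Venus\nB. Mars\nC. Jupiter\nD. Saturn\nAnswer: "
# answer == "B"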
LLaMA-Factory/src/llmtuner/extras/__init__.py  (new file, mode 100644; empty)
LLaMA-Factory/src/llmtuner/extras/callbacks.py  (new file, mode 100644)

import json
import os
import time
from datetime import timedelta
from typing import TYPE_CHECKING

from transformers import TrainerCallback
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length

from .constants import LOG_FILE_NAME
from .logging import get_logger
from .misc import fix_valuehead_checkpoint

if TYPE_CHECKING:
    from transformers import TrainerControl, TrainerState, TrainingArguments


logger = get_logger(__name__)


class FixValueHeadModelCallback(TrainerCallback):
    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        r"""
        Event called after a checkpoint save.
        """
        if args.should_save:
            fix_valuehead_checkpoint(
                model=kwargs.pop("model"),
                output_dir=os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step)),
                safe_serialization=args.save_safetensors,
            )


class LogCallback(TrainerCallback):
    def __init__(self, runner=None):
        self.runner = runner
        self.in_training = False
        self.start_time = time.time()
        self.cur_steps = 0
        self.max_steps = 0
        self.elapsed_time = ""
        self.remaining_time = ""

    def timing(self):
        cur_time = time.time()
        elapsed_time = cur_time - self.start_time
        avg_time_per_step = elapsed_time / self.cur_steps if self.cur_steps != 0 else 0
        remaining_time = (self.max_steps - self.cur_steps) * avg_time_per_step
        self.elapsed_time = str(timedelta(seconds=int(elapsed_time)))
        self.remaining_time = str(timedelta(seconds=int(remaining_time)))

    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        r"""
        Event called at the beginning of training.
        """
        if state.is_local_process_zero:
            self.in_training = True
            self.start_time = time.time()
            self.max_steps = state.max_steps
            if os.path.exists(os.path.join(args.output_dir, LOG_FILE_NAME)) and args.overwrite_output_dir:
                logger.warning("Previous log file in this folder will be deleted.")
                os.remove(os.path.join(args.output_dir, LOG_FILE_NAME))

    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        r"""
        Event called at the end of training.
        """
        if state.is_local_process_zero:
            self.in_training = False
            self.cur_steps = 0
            self.max_steps = 0

    def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        r"""
        Event called at the end of a substep during gradient accumulation.
        """
        if state.is_local_process_zero and self.runner is not None and self.runner.aborted:
            control.should_epoch_stop = True
            control.should_training_stop = True

    def on_step_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        r"""
        Event called at the end of a training step.
        """
        if state.is_local_process_zero:
            self.cur_steps = state.global_step
            self.timing()
            if self.runner is not None and self.runner.aborted:
                control.should_epoch_stop = True
                control.should_training_stop = True

    def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        r"""
        Event called after an evaluation phase.
        """
        if state.is_local_process_zero and not self.in_training:
            self.cur_steps = 0
            self.max_steps = 0

    def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", *other, **kwargs):
        r"""
        Event called after a successful prediction.
        """
        if state.is_local_process_zero and not self.in_training:
            self.cur_steps = 0
            self.max_steps = 0

    def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None:
        r"""
        Event called after logging the last logs.
        """
        if not state.is_local_process_zero:
            return

        logs = dict(
            current_steps=self.cur_steps,
            total_steps=self.max_steps,
            loss=state.log_history[-1].get("loss", None),
            eval_loss=state.log_history[-1].get("eval_loss", None),
            predict_loss=state.log_history[-1].get("predict_loss", None),
            reward=state.log_history[-1].get("reward", None),
            learning_rate=state.log_history[-1].get("learning_rate", None),
            epoch=state.log_history[-1].get("epoch", None),
            percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
            elapsed_time=self.elapsed_time,
            remaining_time=self.remaining_time,
        )
        if self.runner is not None:
            logger.info(
                "{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}}}".format(
                    logs["loss"] or 0, logs["learning_rate"] or 0, logs["epoch"] or 0
                )
            )

        os.makedirs(args.output_dir, exist_ok=True)
        with open(os.path.join(args.output_dir, "trainer_log.jsonl"), "a", encoding="utf-8") as f:
            f.write(json.dumps(logs) + "\n")

    def on_prediction_step(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        r"""
        Event called after a prediction step.
        """
        eval_dataloader = kwargs.pop("eval_dataloader", None)
        if state.is_local_process_zero and has_length(eval_dataloader) and not self.in_training:
            if self.max_steps == 0:
                self.max_steps = len(eval_dataloader)
            self.cur_steps += 1
            self.timing()
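
A hedged sketch of plugging LogCallback into a Hugging Face Trainer; the model and dataset placeholders are assumptions prepared elsewhere, shown only to indicate where the callback attaches:

# Hypothetical wiring of LogCallback into a transformers Trainer (model/dataset are placeholders).
from transformers import Trainer, TrainingArguments
from llmtuner.extras.callbacks import LogCallback

training_args = TrainingArguments(output_dir="outputs", logging_steps=10)
trainer = Trainer(
    model=model,                  # placeholder: any PreTrainedModel prepared elsewhere
    args=training_args,
    train_dataset=train_dataset,  # placeholder: tokenized dataset prepared elsewhere
    callbacks=[LogCallback()],    # appends one JSON line per log event to outputs/trainer_log.jsonl
)
trainer.train()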
LLaMA-Factory/src/llmtuner/extras/constants.py  (new file, mode 100644)

from collections import OrderedDict, defaultdict
from enum import Enum
from typing import Dict, Optional


CHOICES = ["A", "B", "C", "D"]

DATA_CONFIG = "dataset_info.json"

DEFAULT_MODULE = defaultdict(str)

DEFAULT_TEMPLATE = defaultdict(str)

FILEEXT2TYPE = {
    "arrow": "arrow",
    "csv": "csv",
    "json": "json",
    "jsonl": "json",
    "parquet": "parquet",
    "txt": "text",
}

IGNORE_INDEX = -100

LAYERNORM_NAMES = {"norm", "ln"}

LOG_FILE_NAME = "trainer_log.jsonl"

METHODS = ["full", "freeze", "lora"]

PEFT_METHODS = ["lora"]

SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"]

SUPPORTED_MODELS = OrderedDict()

TRAINING_STAGES = {
    "Supervised Fine-Tuning": "sft",
    "Reward Modeling": "rm",
    "PPO": "ppo",
    "DPO": "dpo",
    "Pre-Training": "pt",
}

V_HEAD_WEIGHTS_NAME = "value_head.bin"

V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors"


class DownloadSource(str, Enum):
    DEFAULT = "hf"
    MODELSCOPE = "ms"


def register_model_group(
    models: Dict[str, Dict[DownloadSource, str]],
    module: Optional[str] = None,
    template: Optional[str] = None,
) -> None:
    prefix = None
    for name, path in models.items():
        if prefix is None:
            prefix = name.split("-")[0]
        else:
            assert prefix == name.split("-")[0], "prefix should be identical."
        SUPPORTED_MODELS[name] = path
    if module is not None:
        DEFAULT_MODULE[prefix] = module
    if template is not None:
        DEFAULT_TEMPLATE[prefix] = template


register_model_group(
    models={
        "Baichuan-7B-Base": {DownloadSource.DEFAULT: "baichuan-inc/Baichuan-7B", DownloadSource.MODELSCOPE: "baichuan-inc/baichuan-7B"},
        "Baichuan-13B-Base": {DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Base", DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Base"},
        "Baichuan-13B-Chat": {DownloadSource.DEFAULT: "baichuan-inc/Baichuan-13B-Chat", DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Chat"},
    },
    module="W_pack",
    template="baichuan",
)

register_model_group(
    models={
        "Baichuan2-7B-Base": {DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Base", DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Base"},
        "Baichuan2-13B-Base": {DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Base", DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Base"},
        "Baichuan2-7B-Chat": {DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-7B-Chat", DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-7B-Chat"},
        "Baichuan2-13B-Chat": {DownloadSource.DEFAULT: "baichuan-inc/Baichuan2-13B-Chat", DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Chat"},
    },
    module="W_pack",
    template="baichuan2",
)

register_model_group(
    models={
        "BLOOM-560M": {DownloadSource.DEFAULT: "bigscience/bloom-560m", DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-560m"},
        "BLOOM-3B": {DownloadSource.DEFAULT: "bigscience/bloom-3b", DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-3b"},
        "BLOOM-7B1": {DownloadSource.DEFAULT: "bigscience/bloom-7b1", DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-7b1"},
    },
    module="query_key_value",
)

register_model_group(
    models={
        "BLOOMZ-560M": {DownloadSource.DEFAULT: "bigscience/bloomz-560m", DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-560m"},
        "BLOOMZ-3B": {DownloadSource.DEFAULT: "bigscience/bloomz-3b", DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-3b"},
        "BLOOMZ-7B1-mt": {DownloadSource.DEFAULT: "bigscience/bloomz-7b1-mt", DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-7b1-mt"},
    },
    module="query_key_value",
)

register_model_group(
    models={
        "BlueLM-7B-Base": {DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Base", DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Base"},
        "BlueLM-7B-Chat": {DownloadSource.DEFAULT: "vivo-ai/BlueLM-7B-Chat", DownloadSource.MODELSCOPE: "vivo-ai/BlueLM-7B-Chat"},
    },
    template="bluelm",
)

register_model_group(
    models={
        "ChatGLM2-6B-Chat": {DownloadSource.DEFAULT: "THUDM/chatglm2-6b", DownloadSource.MODELSCOPE: "ZhipuAI/chatglm2-6b"}
    },
    module="query_key_value",
    template="chatglm2",
)

register_model_group(
    models={
        "ChatGLM3-6B-Base": {DownloadSource.DEFAULT: "THUDM/chatglm3-6b-base", DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b-base"},
        "ChatGLM3-6B-Chat": {DownloadSource.DEFAULT: "THUDM/chatglm3-6b", DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b"},
    },
    module="query_key_value",
    template="chatglm3",
)

register_model_group(
    models={
        "ChineseLLaMA2-1.3B": {DownloadSource.DEFAULT: "hfl/chinese-llama-2-1.3b", DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-1.3b"},
        "ChineseLLaMA2-7B": {DownloadSource.DEFAULT: "hfl/chinese-llama-2-7b", DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-7b"},
        "ChineseLLaMA2-13B": {DownloadSource.DEFAULT: "hfl/chinese-llama-2-13b", DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-llama-2-13b"},
        "ChineseLLaMA2-1.3B-Chat": {DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-1.3b", DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-1.3b"},
        "ChineseLLaMA2-7B-Chat": {DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-7b", DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-7b"},
        "ChineseLLaMA2-13B-Chat": {DownloadSource.DEFAULT: "hfl/chinese-alpaca-2-13b", DownloadSource.MODELSCOPE: "AI-ModelScope/chinese-alpaca-2-13b"},
    },
    template="llama2_zh",
)

register_model_group(
    models={
        "DeepSeek-LLM-7B-Base": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-base", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-base"},
        "DeepSeek-LLM-67B-Base": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-base", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-base"},
        "DeepSeek-LLM-7B-Chat": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-7b-chat", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-7b-chat"},
        "DeepSeek-LLM-67B-Chat": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-llm-67b-chat", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-llm-67b-chat"},
        "DeepSeek-Math-7B-Base": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-base"},
        "DeepSeek-Math-7B-Chat": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-instruct"},
        "DeepSeek-MoE-16B-Base": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base"},
        "DeepSeek-MoE-16B-Chat": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat"},
    },
    template="deepseek",
)

register_model_group(
    models={
        "DeepSeekCoder-6.7B-Base": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-base", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-base"},
        "DeepSeekCoder-7B-Base": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-base-v1.5"},
        "DeepSeekCoder-33B-Base": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-base", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-base"},
        "DeepSeekCoder-6.7B-Chat": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-6.7b-instruct", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-6.7b-instruct"},
        "DeepSeekCoder-7B-Chat": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-7b-instruct-v1.5"},
        "DeepSeekCoder-33B-Chat": {DownloadSource.DEFAULT: "deepseek-ai/deepseek-coder-33b-instruct", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-coder-33b-instruct"},
    },
    template="deepseekcoder",
)

register_model_group(
    models={
        "Falcon-7B": {DownloadSource.DEFAULT: "tiiuae/falcon-7b", DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b"},
        "Falcon-40B": {DownloadSource.DEFAULT: "tiiuae/falcon-40b", DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b"},
        "Falcon-180B": {DownloadSource.DEFAULT: "tiiuae/falcon-180b", DownloadSource.MODELSCOPE: "modelscope/falcon-180B"},
        "Falcon-7B-Chat": {DownloadSource.DEFAULT: "tiiuae/falcon-7b-instruct", DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b-instruct"},
        "Falcon-40B-Chat": {DownloadSource.DEFAULT: "tiiuae/falcon-40b-instruct", DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b-instruct"},
        "Falcon-180B-Chat": {DownloadSource.DEFAULT: "tiiuae/falcon-180b-chat", DownloadSource.MODELSCOPE: "modelscope/falcon-180B-chat"},
    },
    module="query_key_value",
    template="falcon",
)

register_model_group(
    models={
        "InternLM-7B": {DownloadSource.DEFAULT: "internlm/internlm-7b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-7b"},
        "InternLM-20B": {DownloadSource.DEFAULT: "internlm/internlm-20b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-20b"},
        "InternLM-7B-Chat": {DownloadSource.DEFAULT: "internlm/internlm-chat-7b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-7b"},
        "InternLM-20B-Chat": {DownloadSource.DEFAULT: "internlm/internlm-chat-20b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm-chat-20b"},
    },
    template="intern",
)

register_model_group(
    models={
        "InternLM2-7B": {DownloadSource.DEFAULT: "internlm/internlm2-7b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-7b"},
        "InternLM2-20B": {DownloadSource.DEFAULT: "internlm/internlm2-20b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-20b"},
        "InternLM2-7B-Chat": {DownloadSource.DEFAULT: "internlm/internlm2-chat-7b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-7b"},
        "InternLM2-20B-Chat": {DownloadSource.DEFAULT: "internlm/internlm2-chat-20b", DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-20b"},
    },
    module="wqkv",
    template="intern2",
)

register_model_group(
    models={
        "LingoWhale-8B": {DownloadSource.DEFAULT: "deeplang-ai/LingoWhale-8B", DownloadSource.MODELSCOPE: "DeepLang/LingoWhale-8B"}
    },
    module="qkv_proj",
)

register_model_group(
    models={
        "LLaMA-7B": {DownloadSource.DEFAULT: "huggyllama/llama-7b", DownloadSource.MODELSCOPE: "skyline2006/llama-7b"},
        "LLaMA-13B": {DownloadSource.DEFAULT: "huggyllama/llama-13b", DownloadSource.MODELSCOPE: "skyline2006/llama-13b"},
        "LLaMA-30B": {DownloadSource.DEFAULT: "huggyllama/llama-30b", DownloadSource.MODELSCOPE: "skyline2006/llama-30b"},
        "LLaMA-65B": {DownloadSource.DEFAULT: "huggyllama/llama-65b", DownloadSource.MODELSCOPE: "skyline2006/llama-65b"},
    }
)

register_model_group(
    models={
        "LLaMA2-7B": {DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-hf", DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-ms"},
        "LLaMA2-13B": {DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-hf", DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-ms"},
        "LLaMA2-70B": {DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-hf", DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-ms"},
        "LLaMA2-7B-Chat": {DownloadSource.DEFAULT: "meta-llama/Llama-2-7b-chat-hf", DownloadSource.MODELSCOPE: "modelscope/Llama-2-7b-chat-ms"},
        "LLaMA2-13B-Chat": {DownloadSource.DEFAULT: "meta-llama/Llama-2-13b-chat-hf", DownloadSource.MODELSCOPE: "modelscope/Llama-2-13b-chat-ms"},
        "LLaMA2-70B-Chat": {DownloadSource.DEFAULT: "meta-llama/Llama-2-70b-chat-hf", DownloadSource.MODELSCOPE: "modelscope/Llama-2-70b-chat-ms"},
    },
    template="llama2",
)

register_model_group(
    models={
        "Mistral-7B": {DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1"},
        "Mistral-7B-Chat": {DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1"},
        "Mistral-7B-v0.2-Chat": {DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.2", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.2"},
    },
    template="mistral",
)

register_model_group(
    models={
        "Mixtral-8x7B": {DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-v0.1"},
        "Mixtral-8x7B-Chat": {DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-Instruct-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-Instruct-v0.1"},
    },
    template="mistral",
)

register_model_group(
    models={
        "OpenChat3.5-7B-Chat": {DownloadSource.DEFAULT: "openchat/openchat-3.5-0106", DownloadSource.MODELSCOPE: "myxiongmodel/openchat_3.5"}
    },
    template="openchat",
)

register_model_group(
    models={
        "Orion-14B-Base": {DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Base", DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Base"},
        "Orion-14B-Chat": {DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat", DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat"},
        "Orion-14B-Long-Chat": {DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-LongChat", DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-LongChat"},
        "Orion-14B-RAG-Chat": {DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat-RAG", DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat-RAG"},
        "Orion-14B-Plugin-Chat": {DownloadSource.DEFAULT: "OrionStarAI/Orion-14B-Chat-Plugin", DownloadSource.MODELSCOPE: "OrionStarAI/Orion-14B-Chat-Plugin"},
    },
    template="orion",
)

register_model_group(
    models={
        "Phi-1.5-1.3B": {DownloadSource.DEFAULT: "microsoft/phi-1_5", DownloadSource.MODELSCOPE: "allspace/PHI_1-5"},
        "Phi-2-2.7B": {DownloadSource.DEFAULT: "microsoft/phi-2", DownloadSource.MODELSCOPE: "AI-ModelScope/phi-2"},
    }
)

register_model_group(
    models={
        "Qwen-1.8B": {DownloadSource.DEFAULT: "Qwen/Qwen-1_8B", DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B"},
        "Qwen-7B": {DownloadSource.DEFAULT: "Qwen/Qwen-7B", DownloadSource.MODELSCOPE: "qwen/Qwen-7B"},
        "Qwen-14B": {DownloadSource.DEFAULT: "Qwen/Qwen-14B", DownloadSource.MODELSCOPE: "qwen/Qwen-14B"},
        "Qwen-72B": {DownloadSource.DEFAULT: "Qwen/Qwen-72B", DownloadSource.MODELSCOPE: "qwen/Qwen-72B"},
        "Qwen-1.8B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat"},
        "Qwen-7B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat"},
        "Qwen-14B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat"},
        "Qwen-72B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat"},
        "Qwen-1.8B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat-Int8"},
        "Qwen-1.8B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-1_8B-Chat-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen-1_8B-Chat-Int4"},
        "Qwen-7B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat-Int8"},
        "Qwen-7B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-7B-Chat-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen-7B-Chat-Int4"},
        "Qwen-14B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat-Int8"},
        "Qwen-14B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-14B-Chat-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen-14B-Chat-Int4"},
        "Qwen-72B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat-Int8"},
        "Qwen-72B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen-72B-Chat-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat-Int4"},
    },
    module="c_attn",
    template="qwen",
)

register_model_group(
    models={
        "Qwen1.5-0.5B": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B"},
        "Qwen1.5-1.8B": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B"},
        "Qwen1.5-4B": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B"},
        "Qwen1.5-7B": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B"},
        "Qwen1.5-14B": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B"},
        "Qwen1.5-72B": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B"},
        "Qwen1.5-0.5B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat"},
        "Qwen1.5-1.8B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B-Chat"},
        "Qwen1.5-4B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B-Chat"},
        "Qwen1.5-7B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B-Chat"},
        "Qwen1.5-14B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat"},
        "Qwen1.5-72B-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat"},
        "Qwen1.5-0.5B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8"},
        "Qwen1.5-0.5B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4"},
        "Qwen1.5-1.8B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B-Chat-GPTQ-Int8"},
        "Qwen1.5-1.8B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4"},
        "Qwen1.5-4B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B-Chat-GPTQ-Int8"},
        "Qwen1.5-4B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-4B-Chat-GPTQ-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-4B-Chat-GPTQ-Int4"},
        "Qwen1.5-7B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B-Chat-GPTQ-Int8"},
        "Qwen1.5-7B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-7B-Chat-GPTQ-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-7B-Chat-GPTQ-Int4"},
        "Qwen1.5-14B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-GPTQ-Int8"},
        "Qwen1.5-14B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-GPTQ-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-GPTQ-Int4"},
        "Qwen1.5-72B-int8-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-GPTQ-Int8"},
        "Qwen1.5-72B-int4-Chat": {DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int4", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-GPTQ-Int4"},
    },
    template="qwen",
)

register_model_group(
    models={
        "SOLAR-10.7B": {DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-v1.0"},
        "SOLAR-10.7B-Chat": {DownloadSource.DEFAULT: "upstage/SOLAR-10.7B-Instruct-v1.0", DownloadSource.MODELSCOPE: "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0"},
    },
    template="solar",
)

register_model_group(
    models={
        "Skywork-13B-Base": {DownloadSource.DEFAULT: "Skywork/Skywork-13B-base", DownloadSource.MODELSCOPE: "skywork/Skywork-13B-base"}
    }
)

register_model_group(
    models={
        "Vicuna1.5-7B-Chat": {DownloadSource.DEFAULT: "lmsys/vicuna-7b-v1.5", DownloadSource.MODELSCOPE: "Xorbits/vicuna-7b-v1.5"},
        "Vicuna1.5-13B-Chat": {DownloadSource.DEFAULT: "lmsys/vicuna-13b-v1.5", DownloadSource.MODELSCOPE: "Xorbits/vicuna-13b-v1.5"},
    },
    template="vicuna",
)

register_model_group(
    models={
        "XuanYuan-70B": {DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B"},
        "XuanYuan-70B-Chat": {DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat"},
        "XuanYuan-70B-int8-Chat": {DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit"},
        "XuanYuan-70B-int4-Chat": {DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit"},
    },
    template="xuanyuan",
)

register_model_group(
    models={
        "XVERSE-7B": {DownloadSource.DEFAULT: "xverse/XVERSE-7B", DownloadSource.MODELSCOPE: "xverse/XVERSE-7B"},
        "XVERSE-13B": {DownloadSource.DEFAULT: "xverse/XVERSE-13B", DownloadSource.MODELSCOPE: "xverse/XVERSE-13B"},
        "XVERSE-65B": {DownloadSource.DEFAULT: "xverse/XVERSE-65B", DownloadSource.MODELSCOPE: "xverse/XVERSE-65B"},
        "XVERSE-65B-2": {DownloadSource.DEFAULT: "xverse/XVERSE-65B-2", DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-2"},
        "XVERSE-7B-Chat": {DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat", DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat"},
        "XVERSE-13B-Chat": {DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat", DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat"},
        "XVERSE-65B-Chat": {DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat", DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat"},
    },
    template="xverse",
)

register_model_group(
    models={
        "Yayi-7B": {DownloadSource.DEFAULT: "wenge-research/yayi-7b-llama2", DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-7b-llama2"},
        "Yayi-13B": {DownloadSource.DEFAULT: "wenge-research/yayi-13b-llama2", DownloadSource.MODELSCOPE: "AI-ModelScope/yayi-13b-llama2"},
    },
    template="yayi",
)

register_model_group(
    models={
        "Yi-6B": {DownloadSource.DEFAULT: "01-ai/Yi-6B", DownloadSource.MODELSCOPE: "01ai/Yi-6B"},
        "Yi-34B": {DownloadSource.DEFAULT: "01-ai/Yi-34B", DownloadSource.MODELSCOPE: "01ai/Yi-34B"},
        "Yi-6B-Chat": {DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat", DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat"},
        "Yi-34B-Chat": {DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat", DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat"},
        "Yi-6B-int8-Chat": {DownloadSource.DEFAULT: "01-ai/Yi-6B-Chat-8bits", DownloadSource.MODELSCOPE: "01ai/Yi-6B-Chat-8bits"},
        "Yi-34B-int8-Chat": {DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-8bits", DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-8bits"},
    },
    template="yi",
)

register_model_group(
    models={
        "Yuan2-2B-Chat": {DownloadSource.DEFAULT: "IEITYuan/Yuan2-2B-hf", DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-2B-hf"},
        "Yuan2-51B-Chat": {DownloadSource.DEFAULT: "IEITYuan/Yuan2-51B-hf", DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-51B-hf"},
        "Yuan2-102B-Chat": {DownloadSource.DEFAULT: "IEITYuan/Yuan2-102B-hf", DownloadSource.MODELSCOPE: "YuanLLM/Yuan2.0-102B-hf"},
    },
    template="yuan",
)

register_model_group(
    models={
        "Zephyr-7B-Alpha-Chat": {DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-alpha", DownloadSource.MODELSCOPE: "AI-ModelScope/zephyr-7b-alpha"},
        "Zephyr-7B-Beta-Chat": {DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-beta", DownloadSource.MODELSCOPE: "modelscope/zephyr-7b-beta"},
    },
    template="zephyr",
)
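
A short sketch of how the registry is consumed once the calls above have run; the values printed follow directly from the register_model_group calls in this file:

# Querying the registry populated above.
from llmtuner.extras.constants import DEFAULT_MODULE, DEFAULT_TEMPLATE, SUPPORTED_MODELS, DownloadSource

print(SUPPORTED_MODELS["Qwen-7B-Chat"][DownloadSource.DEFAULT])  # "Qwen/Qwen-7B-Chat"
print(DEFAULT_MODULE["Qwen"])    # "c_attn" (the prefix is the model name up to the first "-")
print(DEFAULT_TEMPLATE["Qwen"])  # "qwen"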
LLaMA-Factory/src/llmtuner/extras/logging.py  (new file, mode 100644)

import logging
import sys


class LoggerHandler(logging.Handler):
    r"""
    Logger handler used in Web UI.
    """

    def __init__(self):
        super().__init__()
        self.log = ""

    def reset(self):
        self.log = ""

    def emit(self, record):
        if record.name == "httpx":
            return

        log_entry = self.format(record)
        self.log += log_entry
        self.log += "\n\n"


def get_logger(name: str) -> logging.Logger:
    r"""
    Gets a standard logger with a stream handler to stdout.
    """
    formatter = logging.Formatter(
        fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S"
    )
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)

    return logger


def reset_logging() -> None:
    r"""
    Removes basic config of root logger. (unused in script)
    """
    root = logging.getLogger()
    list(map(root.removeHandler, root.handlers))
    list(map(root.removeFilter, root.filters))
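
A minimal usage sketch of the logging helper above:

# Minimal usage of get_logger (not part of the commit).
from llmtuner.extras.logging import get_logger

logger = get_logger(__name__)
logger.info("loaded %d examples", 1024)  # printed to stdout with the timestamped format defined above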
LLaMA-Factory/src/llmtuner/extras/misc.py  (new file, mode 100644)

import gc
import os
from typing import TYPE_CHECKING, Dict, Tuple

import torch
from peft import PeftModel
from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList, PreTrainedModel
from transformers.utils import (
    SAFE_WEIGHTS_NAME,
    WEIGHTS_NAME,
    is_torch_bf16_gpu_available,
    is_torch_cuda_available,
    is_torch_mps_available,
    is_torch_npu_available,
    is_torch_xpu_available,
)

from .constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
from .logging import get_logger


_is_fp16_available = is_torch_npu_available() or is_torch_cuda_available()
try:
    _is_bf16_available = is_torch_bf16_gpu_available()
except Exception:
    _is_bf16_available = False


if TYPE_CHECKING:
    from trl import AutoModelForCausalLMWithValueHead

    from llmtuner.hparams import ModelArguments


logger = get_logger(__name__)


class AverageMeter:
    r"""
    Computes and stores the average and current value.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def count_parameters(model: torch.nn.Module) -> Tuple[int, int]:
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.
    """
    trainable_params, all_param = 0, 0
    for param in model.parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by 2
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def fix_valuehead_checkpoint(
    model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool
) -> None:
    r"""
    The model is already unwrapped.

    There are three cases:
    1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...}
    2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...}
    3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...}

    We assume `stage3_gather_16bit_weights_on_model_save=true`.
    """
    if not isinstance(model.pretrained_model, (PreTrainedModel, PeftModel)):
        return

    if safe_serialization:
        from safetensors import safe_open
        from safetensors.torch import save_file

        path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME)
        with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f:
            state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()}
    else:
        path_to_checkpoint = os.path.join(output_dir, WEIGHTS_NAME)
        state_dict: Dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu")

    decoder_state_dict = {}
    v_head_state_dict = {}
    for name, param in state_dict.items():
        if name.startswith("v_head."):
            v_head_state_dict[name] = param
        else:
            decoder_state_dict[name.replace("pretrained_model.", "")] = param

    os.remove(path_to_checkpoint)
    model.pretrained_model.save_pretrained(
        output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization
    )

    if safe_serialization:
        save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"})
    else:
        torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME))

    logger.info("Value head model saved at: {}".format(output_dir))


def get_current_device() -> torch.device:
    r"""
    Gets the current available device.
    """
    if is_torch_xpu_available():
        device = "xpu:{}".format(os.environ.get("LOCAL_RANK", "0"))
    elif is_torch_npu_available():
        device = "npu:{}".format(os.environ.get("LOCAL_RANK", "0"))
    elif is_torch_mps_available():
        device = "mps:{}".format(os.environ.get("LOCAL_RANK", "0"))
    elif is_torch_cuda_available():
        device = "cuda:{}".format(os.environ.get("LOCAL_RANK", "0"))
    else:
        device = "cpu"

    return torch.device(device)


def get_device_count() -> int:
    return torch.cuda.device_count()


def get_logits_processor() -> "LogitsProcessorList":
    r"""
    Gets logits processor that removes NaN and Inf logits.
    """
    logits_processor = LogitsProcessorList()
    logits_processor.append(InfNanRemoveLogitsProcessor())
    return logits_processor


def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype:
    r"""
    Infers the optimal dtype according to the model_dtype and device compatibility.
    """
    if _is_bf16_available and model_dtype == torch.bfloat16:
        return torch.bfloat16
    elif _is_fp16_available:
        return torch.float16
    else:
        return torch.float32


def torch_gc() -> None:
    r"""
    Collects GPU memory.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


def try_download_model_from_ms(model_args: "ModelArguments") -> None:
    if not use_modelscope() or os.path.exists(model_args.model_name_or_path):
        return

    try:
        from modelscope import snapshot_download

        revision = "master" if model_args.model_revision == "main" else model_args.model_revision
        model_args.model_name_or_path = snapshot_download(
            model_args.model_name_or_path, revision=revision, cache_dir=model_args.cache_dir
        )
    except ImportError:
        raise ImportError("Please install modelscope via `pip install modelscope -U`")


def use_modelscope() -> bool:
    return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0")))
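
A small sketch exercising two of the helpers above on a toy module; the counts follow from the layer shapes (not part of the commit):

# Toy check of count_parameters and infer_optim_dtype.
import torch
from llmtuner.extras.misc import count_parameters, infer_optim_dtype

layer = torch.nn.Linear(4, 2)             # 4*2 weights + 2 biases = 10 parameters
trainable, total = count_parameters(layer)
print(trainable, total)                    # 10 10

layer.weight.requires_grad_(False)         # freeze the weight matrix, keep the bias trainable
print(count_parameters(layer))             # (2, 10)

print(infer_optim_dtype(torch.bfloat16))   # bf16 if the device supports it, else fp16, else fp32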
LLaMA-Factory/src/llmtuner/extras/packages.py  (new file, mode 100644)

import importlib.metadata
import importlib.util


def _is_package_available(name: str) -> bool:
    return importlib.util.find_spec(name) is not None


def _get_package_version(name: str) -> str:
    try:
        return importlib.metadata.version(name)
    except Exception:
        return "0.0.0"


def is_fastapi_availble():
    return _is_package_available("fastapi")


def is_flash_attn2_available():
    return _is_package_available("flash_attn") and _get_package_version("flash_attn").startswith("2")


def is_jieba_available():
    return _is_package_available("jieba")


def is_matplotlib_available():
    return _is_package_available("matplotlib")


def is_nltk_available():
    return _is_package_available("nltk")


def is_requests_available():
    return _is_package_available("requests")


def is_rouge_available():
    return _is_package_available("rouge_chinese")


def is_starlette_available():
    return _is_package_available("sse_starlette")


def is_unsloth_available():
    return _is_package_available("unsloth")


def is_uvicorn_available():
    return _is_package_available("uvicorn")
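
A brief sketch of how these probes are typically used to guard optional dependencies:

# Guarding optional imports with the probes above (sketch only).
from llmtuner.extras.packages import is_flash_attn2_available, is_matplotlib_available

if is_flash_attn2_available():
    print("flash_attn >= 2 detected; FlashAttention-2 kernels can be enabled.")

if not is_matplotlib_available():
    print("matplotlib missing; loss-curve plotting will be skipped.")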
LLaMA-Factory/src/llmtuner/extras/patches/__init__.py  (new file, mode 100644; empty)
LLaMA-Factory/src/llmtuner/extras/patches/llama_patch.py
0 → 100644
View file @
afe180a6
import
math
from
typing
import
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
from
transformers.models.llama.modeling_llama
import
(
Cache
,
LlamaAttention
,
LlamaFlashAttention2
,
apply_rotary_pos_emb
,
repeat_kv
,
)
from
transformers.utils
import
logging
logger
=
logging
.
get_logger
(
__name__
)
# Modified from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
def
llama_torch_attn_forward
(
self
:
"LlamaAttention"
,
hidden_states
:
torch
.
Tensor
,
attention_mask
:
Optional
[
torch
.
Tensor
]
=
None
,
position_ids
:
Optional
[
torch
.
LongTensor
]
=
None
,
past_key_value
:
Optional
[
"Cache"
]
=
None
,
output_attentions
:
bool
=
False
,
**
kwargs
,
)
->
Tuple
[
torch
.
Tensor
,
Optional
[
torch
.
Tensor
],
Optional
[
Tuple
[
torch
.
Tensor
]]]:
bsz
,
q_len
,
_
=
hidden_states
.
size
()
query_states
=
self
.
q_proj
(
hidden_states
)
key_states
=
self
.
k_proj
(
hidden_states
)
value_states
=
self
.
v_proj
(
hidden_states
)
query_states
=
query_states
.
view
(
bsz
,
q_len
,
self
.
num_heads
,
self
.
head_dim
).
transpose
(
1
,
2
)
key_states
=
key_states
.
view
(
bsz
,
q_len
,
self
.
num_key_value_heads
,
self
.
head_dim
).
transpose
(
1
,
2
)
value_states
=
value_states
.
view
(
bsz
,
q_len
,
self
.
num_key_value_heads
,
self
.
head_dim
).
transpose
(
1
,
2
)
kv_seq_len
=
key_states
.
shape
[
-
2
]
if
past_key_value
is
not
None
:
kv_seq_len
+=
past_key_value
.
get_usable_length
(
kv_seq_len
,
self
.
layer_idx
)
cos
,
sin
=
self
.
rotary_emb
(
value_states
,
seq_len
=
kv_seq_len
)
query_states
,
key_states
=
apply_rotary_pos_emb
(
query_states
,
key_states
,
cos
,
sin
,
position_ids
)
if
past_key_value
is
not
None
:
cache_kwargs
=
{
"sin"
:
sin
,
"cos"
:
cos
}
# Specific to RoPE models
key_states
,
value_states
=
past_key_value
.
update
(
key_states
,
value_states
,
self
.
layer_idx
,
cache_kwargs
)
key_states
=
repeat_kv
(
key_states
,
self
.
num_key_value_groups
)
value_states
=
repeat_kv
(
value_states
,
self
.
num_key_value_groups
)
if
getattr
(
self
.
config
,
"group_size_ratio"
,
None
)
and
self
.
training
:
# shift
groupsz
=
int
(
q_len
*
getattr
(
self
.
config
,
"group_size_ratio"
))
assert
q_len
%
groupsz
==
0
,
"q_len {} should be divisible by group size {}."
.
format
(
q_len
,
groupsz
)
num_groups
=
q_len
//
groupsz
def
shift
(
state
:
torch
.
Tensor
)
->
torch
.
Tensor
:
state
=
state
.
transpose
(
1
,
2
)
# output: (bsz, seq_len, n_heads, head_dim)
state
=
torch
.
cat
(
(
state
[:,
:,
:
self
.
num_heads
//
2
],
state
[:,
:,
self
.
num_heads
//
2
:].
roll
(
-
groupsz
//
2
,
dims
=
1
)),
dim
=
2
,
)
return
state
.
reshape
(
bsz
*
num_groups
,
groupsz
,
self
.
num_heads
,
self
.
head_dim
).
transpose
(
1
,
2
)
query_states
,
key_states
,
value_states
=
shift
(
query_states
),
shift
(
key_states
),
shift
(
value_states
)
if
attention_mask
is
not
None
:
attention_mask
=
attention_mask
[:,
:,
:
groupsz
,
:
groupsz
].
repeat
(
num_groups
,
1
,
1
,
1
)
attn_weights
=
torch
.
matmul
(
query_states
,
key_states
.
transpose
(
2
,
3
))
/
math
.
sqrt
(
self
.
head_dim
)
if
attention_mask
is
not
None
:
attn_weights
=
attn_weights
+
attention_mask
# upcast attention to fp32
attn_weights
=
nn
.
functional
.
softmax
(
attn_weights
,
dim
=-
1
,
dtype
=
torch
.
float32
).
to
(
query_states
.
dtype
)
attn_weights
=
nn
.
functional
.
dropout
(
attn_weights
,
p
=
self
.
attention_dropout
,
training
=
self
.
training
)
attn_output
=
torch
.
matmul
(
attn_weights
,
value_states
)
# (bsz, :, seq_len, :) or (bsz*n_group, :, groupsz, :)
attn_output
=
attn_output
.
transpose
(
1
,
2
).
contiguous
()
if
getattr
(
self
.
config
,
"group_size_ratio"
,
None
)
and
self
.
training
:
# shift back
attn_output
.
reshape
(
bsz
,
q_len
,
self
.
num_heads
,
self
.
head_dim
)
attn_output
=
torch
.
cat
(
(
attn_output
[:,
:,
:
self
.
num_heads
//
2
],
attn_output
[:,
:,
self
.
num_heads
//
2
:].
roll
(
groupsz
//
2
,
dims
=
1
),
)
)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value


# Modified from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
def llama_flash_attn_forward(
    self: "LlamaFlashAttention2",
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # LlamaFlashAttention2 attention does not support output_attentions
    output_attentions = False

    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim)
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    query_states = query_states.transpose(1, 2)  # (bsz, seq_len, n_heads, head_dim)
    key_states = key_states.transpose(1, 2)  # (bsz, seq_len, n_heads, head_dim)
    value_states = value_states.transpose(1, 2)  # (bsz, seq_len, n_heads, head_dim)

    dropout_rate = self.attention_dropout if self.training else 0.0

    input_dtype = query_states.dtype
    if input_dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        elif hasattr(self.config, "_pre_quantization_dtype"):
            target_dtype = self.config._pre_quantization_dtype
        else:
            target_dtype = self.q_proj.weight.dtype

        logger.warning_once("The input hidden states seems to be silently casted in float32.")
        query_states = query_states.to(target_dtype)
        key_states = key_states.to(target_dtype)
        value_states = value_states.to(target_dtype)

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
        assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz)
        num_groups = q_len // groupsz

        def shift(state: torch.Tensor) -> torch.Tensor:
            state = torch.cat(
                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
                dim=2,
            )
            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim)

        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
        if attention_mask is not None:
            attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)

    attn_output: torch.Tensor = self._flash_attention_forward(
        query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
    )

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
        # undo the shift: restore (bsz, q_len, num_heads, head_dim), roll the second half of
        # the heads forward again, and concatenate along the head dimension
        attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
        attn_output = torch.cat(
            (
                attn_output[:, :, : self.num_heads // 2],
                attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
            ),
            dim=2,
        )
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
    attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value


def apply_llama_patch() -> None:
    LlamaAttention.forward = llama_torch_attn_forward
    LlamaFlashAttention2.forward = llama_flash_attn_forward
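
For orientation, the sketch below shows how this patch is typically wired up: the patch is applied once, then `group_size_ratio` is set on the model config so the shifted (S^2-Attn) path is taken during training. The checkpoint path and the 0.25 ratio are illustrative assumptions, not values taken from this file.

# Hypothetical usage sketch (not part of llama_patch.py).
from transformers import AutoConfig, AutoModelForCausalLM

apply_llama_patch()  # LlamaAttention / LlamaFlashAttention2 now use the forwards defined above

config = AutoConfig.from_pretrained("path/to/llama2-base")  # assumed local Llama-2 checkpoint
setattr(config, "group_size_ratio", 0.25)  # assumed ratio: each sequence is split into 4 groups while training
model = AutoModelForCausalLM.from_config(config)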
LLaMA-Factory/src/llmtuner/extras/patches/mixtral_patch.py (new file, mode 100644)
import torch
import torch.nn.functional as F
from transformers.models.mixtral.modeling_mixtral import MixtralBLockSparseTop2MLP, MixtralSparseMoeBlock


def mlp_forward(self: "MixtralBLockSparseTop2MLP", hidden_states: torch.Tensor) -> torch.Tensor:
    current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
    current_hidden_states = self.w2(current_hidden_states)
    return current_hidden_states


# Modified from: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py
def moe_forward(self: "MixtralSparseMoeBlock", hidden_states: torch.Tensor) -> torch.Tensor:
    batch_size, sequence_length, hidden_dim = hidden_states.shape
    hidden_states = hidden_states.view(-1, hidden_dim)
    # router_logits: (batch * sequence_length, n_experts)
    router_logits = self.gate(hidden_states)

    routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
    topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
    topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
    # we cast back to the input dtype
    topk_weight = topk_weight.to(hidden_states.dtype)

    hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
    y = torch.empty_like(hidden_states)
    flat_topk_idx = topk_idx.view(-1)
    for i in range(self.num_experts):
        expert = self.experts[i]
        y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])

    y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
    final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
    return final_hidden_states, router_logits


def patch_mixtral_replace_moe_impl() -> None:
    MixtralBLockSparseTop2MLP.forward = mlp_forward
    MixtralSparseMoeBlock.forward = moe_forward
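
To make the router arithmetic in `moe_forward` concrete, here is a small standalone illustration of the softmax-then-renormalize step; the logits and the top-k of 2 are made-up values.

import torch
import torch.nn.functional as F

# One token routed over four experts with top_k = 2.
router_logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
topk_weight, topk_idx = torch.topk(routing_weights, 2, dim=-1, sorted=False)
topk_weight /= topk_weight.sum(dim=-1, keepdim=True)  # the two kept experts now sum to 1
print(topk_idx, topk_weight)  # experts 0 and 1 (order not guaranteed), weights of roughly 0.73 and 0.27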
LLaMA-Factory/src/llmtuner/extras/ploting.py (new file, mode 100644)
import json
import math
import os
from typing import List, Optional

from transformers.trainer import TRAINER_STATE_NAME

from .logging import get_logger
from .packages import is_matplotlib_available


if is_matplotlib_available():
    import matplotlib.pyplot as plt


logger = get_logger(__name__)


def smooth(scalars: List[float]) -> List[float]:
    r"""
    EMA implementation according to TensorBoard.
    """
    last = scalars[0]
    smoothed = list()
    weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5)  # a sigmoid function
    for next_val in scalars:
        smoothed_val = last * weight + (1 - weight) * next_val
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed


def plot_loss(save_dictionary: os.PathLike, keys: Optional[List[str]] = ["loss"]) -> None:
    with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), "r", encoding="utf-8") as f:
        data = json.load(f)

    for key in keys:
        steps, metrics = [], []
        for i in range(len(data["log_history"])):
            if key in data["log_history"][i]:
                steps.append(data["log_history"][i]["step"])
                metrics.append(data["log_history"][i][key])

        if len(metrics) == 0:
            logger.warning(f"No metric {key} to plot.")
            continue

        plt.figure()
        plt.plot(steps, metrics, alpha=0.4, label="original")
        plt.plot(steps, smooth(metrics), label="smoothed")
        plt.title("training {} of {}".format(key, save_dictionary))
        plt.xlabel("step")
        plt.ylabel(key)
        plt.legend()
        plt.savefig(os.path.join(save_dictionary, "training_{}.png".format(key)), format="png", dpi=100)
        print("Figure saved:", os.path.join(save_dictionary, "training_{}.png".format(key)))
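
A minimal usage sketch for `plot_loss`; the output directory is a hypothetical path that must already contain the `trainer_state.json` written by a finished training run, and each key is only plotted if it appears in the log history.

# Hypothetical usage (directory name is an assumption).
plot_loss("output/disc-finllm-sft", keys=["loss", "eval_loss"])
# -> writes output/disc-finllm-sft/training_loss.png (and training_eval_loss.png if logged)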
LLaMA-Factory/src/llmtuner/hparams/__init__.py (new file, mode 100644)
from .data_args import DataArguments
from .evaluation_args import EvaluationArguments
from .finetuning_args import FinetuningArguments
from .generating_args import GeneratingArguments
from .model_args import ModelArguments
from .parser import get_eval_args, get_infer_args, get_train_args


__all__ = [
    "DataArguments",
    "EvaluationArguments",
    "FinetuningArguments",
    "GeneratingArguments",
    "ModelArguments",
    "get_eval_args",
    "get_infer_args",
    "get_train_args",
]
LLaMA-Factory/src/llmtuner/hparams/data_args.py (new file, mode 100644)
from dataclasses import dataclass, field
from typing import Literal, Optional


@dataclass
class DataArguments:
    r"""
    Arguments pertaining to what data we are going to input our model for training and evaluation.
    """

    template: Optional[str] = field(
        default=None,
        metadata={"help": "Which template to use for constructing prompts in training and inference."},
    )
    dataset: Optional[str] = field(
        default=None,
        metadata={"help": "The name of provided dataset(s) to use. Use commas to separate multiple datasets."},
    )
    dataset_dir: Optional[str] = field(
        default="data",
        metadata={"help": "Path to the folder containing the datasets."},
    )
    split: Optional[str] = field(
        default="train",
        metadata={"help": "Which dataset split to use for training and evaluation."},
    )
    cutoff_len: Optional[int] = field(
        default=1024,
        metadata={"help": "The cutoff length of the model inputs after tokenization."},
    )
    reserved_label_len: Optional[int] = field(
        default=1,
        metadata={"help": "The minimum cutoff length reserved for label after tokenization."},
    )
    train_on_prompt: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to disable the mask on the prompt or not."},
    )
    streaming: Optional[bool] = field(
        default=False,
        metadata={"help": "Enable dataset streaming."},
    )
    buffer_size: Optional[int] = field(
        default=16384,
        metadata={"help": "Size of the buffer to randomly sample examples from in dataset streaming."},
    )
    mix_strategy: Optional[Literal["concat", "interleave_under", "interleave_over"]] = field(
        default="concat",
        metadata={"help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling)."},
    )
    interleave_probs: Optional[str] = field(
        default=None,
        metadata={"help": "Probabilities to sample data from datasets. Use commas to separate multiple datasets."},
    )
    overwrite_cache: Optional[bool] = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets."},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_samples: Optional[int] = field(
        default=None,
        metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."},
    )
    eval_num_beams: Optional[int] = field(
        default=None,
        metadata={"help": "Number of beams to use for evaluation. This argument will be passed to `model.generate`"},
    )
    ignore_pad_token_for_loss: Optional[bool] = field(
        default=True,
        metadata={
            "help": "Whether or not to ignore the tokens corresponding to padded labels in the loss computation."
        },
    )
    val_size: Optional[float] = field(
        default=0,
        metadata={"help": "Size of the development set, should be an integer or a float in range `[0,1)`."},
    )
    sft_packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Packing the questions and answers in the supervised fine-tuning stage."},
    )
    cache_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to save or load the preprocessed datasets."},
    )

    def __post_init__(self):
        if self.reserved_label_len >= self.cutoff_len:
            raise ValueError("`reserved_label_len` must be smaller than `cutoff_len`.")

        if self.streaming and self.val_size > 1e-6 and self.val_size < 1:
            raise ValueError("Streaming mode should have an integer val size.")

        if self.streaming and self.max_samples is not None:
            raise ValueError("`max_samples` is incompatible with `streaming`.")
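
A short sketch of constructing `DataArguments` directly; the dataset and template names are made up for illustration.

# Hypothetical example; values are illustrative only.
data_args = DataArguments(dataset="disc_fin_sft", template="baichuan", cutoff_len=2048, val_size=0.1)
print(data_args.dataset, data_args.cutoff_len)

# __post_init__ enforces the checks above: e.g. streaming=True together with a
# fractional val_size raises "Streaming mode should have an integer val size."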
LLaMA-Factory/src/llmtuner/hparams/evaluation_args.py (new file, mode 100644)
import os
from dataclasses import dataclass, field
from typing import Literal, Optional

from datasets import DownloadMode


@dataclass
class EvaluationArguments:
    r"""
    Arguments pertaining to specify the evaluation parameters.
    """

    task: str = field(
        metadata={"help": "Name of the evaluation task."},
    )
    task_dir: Optional[str] = field(
        default="evaluation",
        metadata={"help": "Path to the folder containing the evaluation datasets."},
    )
    batch_size: Optional[int] = field(
        default=4,
        metadata={"help": "The batch size per GPU for evaluation."},
    )
    seed: Optional[int] = field(
        default=42,
        metadata={"help": "Random seed to be used with data loaders."},
    )
    lang: Optional[Literal["en", "zh"]] = field(
        default="en",
        metadata={"help": "Language used at evaluation."},
    )
    n_shot: Optional[int] = field(
        default=5,
        metadata={"help": "Number of examplars for few-shot learning."},
    )
    save_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Path to save the evaluation results."},
    )
    download_mode: Optional[DownloadMode] = field(
        default=DownloadMode.REUSE_DATASET_IF_EXISTS,
        metadata={"help": "Download mode used for the evaluation datasets."},
    )

    def __post_init__(self):
        if self.save_dir is not None and os.path.exists(self.save_dir):
            raise ValueError("`save_dir` already exists, use another one.")
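
A small sketch of typical values; the task name and save directory are assumptions, and per `__post_init__` the directory must not exist yet.

# Hypothetical example; the task and path are illustrative.
eval_args = EvaluationArguments(task="ceval", lang="zh", n_shot=5, save_dir="output/ceval_results")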
LLaMA-Factory/src/llmtuner/hparams/finetuning_args.py (new file, mode 100644)
import json
from dataclasses import asdict, dataclass, field
from typing import Literal, Optional


@dataclass
class FreezeArguments:
    r"""
    Arguments pertaining to the freeze (partial-parameter) training.
    """

    name_module_trainable: Optional[str] = field(
        default=None,
        metadata={
            "help": """Name of trainable modules for partial-parameter (freeze) fine-tuning. \
                    Use commas to separate multiple modules. \
                    Use "all" to specify all the available modules. \
                    LLaMA choices: ["mlp", "self_attn"], \
                    BLOOM & Falcon & ChatGLM choices: ["mlp", "self_attention"], \
                    Qwen choices: ["mlp", "attn"], \
                    InternLM2 choices: ["feed_forward", "attention"], \
                    Others choices: the same as LLaMA."""
        },
    )
    num_layer_trainable: Optional[int] = field(
        default=3,
        metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."},
    )
    use_llama_pro: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to use llama pro for partial-parameter (freeze) fine-tuning."},
    )


@dataclass
class LoraArguments:
    r"""
    Arguments pertaining to the LoRA training.
    """

    additional_target: Optional[str] = field(
        default=None,
        metadata={
            "help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint."
        },
    )
    lora_alpha: Optional[int] = field(
        default=None,
        metadata={"help": "The scale factor for LoRA fine-tuning (default: lora_rank * 2)."},
    )
    lora_dropout: Optional[float] = field(
        default=0.0,
        metadata={"help": "Dropout rate for the LoRA fine-tuning."},
    )
    lora_rank: Optional[int] = field(
        default=8,
        metadata={"help": "The intrinsic dimension for LoRA fine-tuning."},
    )
    lora_target: Optional[str] = field(
        default=None,
        metadata={
            "help": """Name(s) of target modules to apply LoRA. \
                    Use commas to separate multiple modules. \
                    Use "all" to specify all the available modules. \
                    LLaMA choices: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], \
                    BLOOM & Falcon & ChatGLM choices: ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], \
                    Baichuan choices: ["W_pack", "o_proj", "gate_proj", "up_proj", "down_proj"], \
                    Qwen choices: ["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"], \
                    InternLM2 choices: ["wqkv", "wo", "w1", "w2", "w3"], \
                    Others choices: the same as LLaMA."""
        },
    )
    lora_bf16_mode: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to train lora adapters in bf16 precision."},
    )
    use_rslora: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to use the rank stabilization scaling factor for LoRA layer."},
    )
    create_new_adapter: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to create a new adapter with randomly initialized weight."},
    )


@dataclass
class RLHFArguments:
    r"""
    Arguments pertaining to the PPO and DPO training.
    """

    dpo_beta: Optional[float] = field(
        default=0.1,
        metadata={"help": "The beta parameter for the DPO loss."},
    )
    dpo_loss: Optional[Literal["sigmoid", "hinge", "ipo", "kto"]] = field(
        default="sigmoid",
        metadata={"help": "The type of DPO loss to use."},
    )
    dpo_ftx: Optional[float] = field(
        default=0,
        metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."},
    )
    ppo_buffer_size: Optional[int] = field(
        default=1,
        metadata={"help": "The number of mini-batches to make experience buffer in a PPO optimization step."},
    )
    ppo_epochs: Optional[int] = field(
        default=4,
        metadata={"help": "The number of epochs to perform in a PPO optimization step."},
    )
    ppo_logger: Optional[str] = field(
        default=None,
        metadata={"help": 'Log with either "wandb" or "tensorboard" in PPO training.'},
    )
    ppo_score_norm: Optional[bool] = field(
        default=False,
        metadata={"help": "Use score normalization in PPO training."},
    )
    ppo_target: Optional[float] = field(
        default=6.0,
        metadata={"help": "Target KL value for adaptive KL control in PPO training."},
    )
    ppo_whiten_rewards: Optional[bool] = field(
        default=False,
        metadata={"help": "Whiten the rewards before compute advantages in PPO training."},
    )
    ref_model: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the reference model used for the PPO or DPO training."},
    )
    ref_model_adapters: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the adapters of the reference model."},
    )
    ref_model_quantization_bit: Optional[int] = field(
        default=None,
        metadata={"help": "The number of bits to quantize the reference model."},
    )
    reward_model: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the reward model used for the PPO training."},
    )
    reward_model_adapters: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the adapters of the reward model."},
    )
    reward_model_quantization_bit: Optional[int] = field(
        default=None,
        metadata={"help": "The number of bits to quantize the reward model."},
    )
    reward_model_type: Optional[Literal["lora", "full", "api"]] = field(
        default="lora",
        metadata={"help": "The type of the reward model in PPO training. Lora model only supports lora training."},
    )


@dataclass
class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments):
    r"""
    Arguments pertaining to which techniques we are going to fine-tuning with.
    """

    stage: Optional[Literal["pt", "sft", "rm", "ppo", "dpo"]] = field(
        default="sft",
        metadata={"help": "Which stage will be performed in training."},
    )
    finetuning_type: Optional[Literal["lora", "freeze", "full"]] = field(
        default="lora",
        metadata={"help": "Which fine-tuning method to use."},
    )
    disable_version_checking: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to disable version checking."},
    )
    plot_loss: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to save the training loss curves."},
    )

    def __post_init__(self):
        def split_arg(arg):
            if isinstance(arg, str):
                return [item.strip() for item in arg.split(",")]
            return arg

        self.name_module_trainable = split_arg(self.name_module_trainable)
        self.lora_alpha = self.lora_alpha or self.lora_rank * 2
        self.lora_target = split_arg(self.lora_target)
        self.additional_target = split_arg(self.additional_target)

        assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method."
        assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
        assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."

        if self.stage == "ppo" and self.reward_model is None:
            raise ValueError("Reward model is necessary for PPO training.")

        if self.stage == "ppo" and self.reward_model_type == "lora" and self.finetuning_type != "lora":
            raise ValueError("Freeze/Full PPO training needs `reward_model_type=full`.")

    def save_to_json(self, json_path: str):
        r"""Saves the content of this instance in JSON format inside `json_path`."""
        json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n"
        with open(json_path, "w", encoding="utf-8") as f:
            f.write(json_string)

    @classmethod
    def load_from_json(cls, json_path: str):
        r"""Creates an instance from the content of `json_path`."""
        with open(json_path, "r", encoding="utf-8") as f:
            text = f.read()

        return cls(**json.loads(text))
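
A brief sketch of how these arguments behave end to end: the comma-separated `lora_target` string is split by `__post_init__`, `lora_alpha` defaults to twice the rank, and the instance survives a JSON round trip. The target modules and the file name below are illustrative assumptions.

# Hypothetical example; values are illustrative only.
ft_args = FinetuningArguments(stage="sft", finetuning_type="lora", lora_rank=8, lora_target="q_proj,v_proj")
print(ft_args.lora_target)  # ['q_proj', 'v_proj']
print(ft_args.lora_alpha)   # 16, i.e. lora_rank * 2 when not set explicitly

ft_args.save_to_json("finetuning_args.json")
restored = FinetuningArguments.load_from_json("finetuning_args.json")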
LLaMA-Factory/src/llmtuner/hparams/generating_args.py (new file, mode 100644)
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Optional


@dataclass
class GeneratingArguments:
    r"""
    Arguments pertaining to specify the decoding parameters.
    """

    do_sample: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether or not to use sampling, use greedy decoding otherwise."},
    )
    temperature: Optional[float] = field(
        default=0.95,
        metadata={"help": "The value used to modulate the next token probabilities."},
    )
    top_p: Optional[float] = field(
        default=0.7,
        metadata={
            "help": "The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept."
        },
    )
    top_k: Optional[int] = field(
        default=50,
        metadata={"help": "The number of highest probability vocabulary tokens to keep for top-k filtering."},
    )
    num_beams: Optional[int] = field(
        default=1,
        metadata={"help": "Number of beams for beam search. 1 means no beam search."},
    )
    max_length: Optional[int] = field(
        default=512,
        metadata={"help": "The maximum length the generated tokens can have. It can be overridden by max_new_tokens."},
    )
    max_new_tokens: Optional[int] = field(
        default=512,
        metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."},
    )
    repetition_penalty: Optional[float] = field(
        default=1.0,
        metadata={"help": "The parameter for repetition penalty. 1.0 means no penalty."},
    )
    length_penalty: Optional[float] = field(
        default=1.0,
        metadata={"help": "Exponential penalty to the length that is used with beam-based generation."},
    )

    def to_dict(self) -> Dict[str, Any]:
        args = asdict(self)
        if args.get("max_new_tokens", -1) > 0:
            args.pop("max_length", None)
        else:
            args.pop("max_new_tokens", None)
        return args
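
For reference, `to_dict` keeps only one of the two length controls, which makes the result safe to pass on as decoding keyword arguments; the values below are illustrative.

# Hypothetical example: max_new_tokens wins, so max_length is dropped.
gen_args = GeneratingArguments(temperature=0.3, max_new_tokens=256)
decoding_kwargs = gen_args.to_dict()
assert "max_length" not in decoding_kwargs and decoding_kwargs["max_new_tokens"] == 256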
LLaMA-Factory/src/llmtuner/hparams/model_args.py (new file, mode 100644)
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, Literal, Optional


@dataclass
class ModelArguments:
    r"""
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to the model weight or identifier from huggingface.co/models or modelscope.cn/models."},
    )
    adapter_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the adapter weight or identifier from huggingface.co/models."},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."},
    )
    use_fast_tokenizer: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."},
    )
    resize_vocab: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."},
    )
    split_special_tokens: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not the special tokens should be split during the tokenization process."},
    )
    model_revision: Optional[str] = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    quantization_bit: Optional[int] = field(
        default=None,
        metadata={"help": "The number of bits to quantize the model."},
    )
    quantization_type: Optional[Literal["fp4", "nf4"]] = field(
        default="nf4",
        metadata={"help": "Quantization data type to use in int4 training."},
    )
    double_quantization: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether or not to use double quantization in int4 training."},
    )
    rope_scaling: Optional[Literal["linear", "dynamic"]] = field(
        default=None,
        metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."},
    )
    flash_attn: Optional[bool] = field(
        default=False,
        metadata={"help": "Enable FlashAttention-2 for faster training."},
    )
    shift_attn: Optional[bool] = field(
        default=False,
        metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."},
    )
    use_unsloth: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."},
    )
    disable_gradient_checkpointing: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to disable gradient checkpointing."},
    )
    upcast_layernorm: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to upcast the layernorm weights in fp32."},
    )
    upcast_lmhead_output: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to upcast the output of lm_head in fp32."},
    )
    hf_hub_token: Optional[str] = field(
        default=None,
        metadata={"help": "Auth token to log in with Hugging Face Hub."},
    )
    ms_hub_token: Optional[str] = field(
        default=None,
        metadata={"help": "Auth token to log in with ModelScope Hub."},
    )
    export_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the directory to save the exported model."},
    )
    export_size: Optional[int] = field(
        default=1,
        metadata={"help": "The file shard size (in GB) of the exported model."},
    )
    export_quantization_bit: Optional[int] = field(
        default=None,
        metadata={"help": "The number of bits to quantize the exported model."},
    )
    export_quantization_dataset: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."},
    )
    export_quantization_nsamples: Optional[int] = field(
        default=128,
        metadata={"help": "The number of samples used for quantization."},
    )
    export_quantization_maxlen: Optional[int] = field(
        default=1024,
        metadata={"help": "The maximum length of the model inputs used for quantization."},
    )
    export_legacy_format: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."},
    )
    export_hub_model_id: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the repository if push the model to the Hugging Face hub."},
    )
    print_param_status: Optional[bool] = field(
        default=False,
        metadata={"help": "For debugging purposes, print the status of the parameters in the model."},
    )

    def __post_init__(self):
        self.compute_dtype = None
        self.model_max_length = None

        if self.split_special_tokens and self.use_fast_tokenizer:
            raise ValueError("`split_special_tokens` is only supported for slow tokenizers.")

        if self.adapter_name_or_path is not None:  # support merging multiple lora weights
            self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")]

        assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
        assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization."

        if self.export_quantization_bit is not None and self.export_quantization_dataset is None:
            raise ValueError("Quantization dataset is necessary for exporting.")

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)
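
A final sketch showing how `__post_init__` normalizes the adapter list so several LoRA adapters can be merged in order; DISC-FinLLM builds on Baichuan-13B-Chat, but the adapter paths here are made up.

# Hypothetical example; adapter paths are illustrative.
model_args = ModelArguments(
    model_name_or_path="baichuan-inc/Baichuan-13B-Chat",
    adapter_name_or_path="saves/lora-stage1,saves/lora-stage2",
    quantization_bit=4,
)
print(model_args.adapter_name_or_path)  # ['saves/lora-stage1', 'saves/lora-stage2']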