ModelZoo / qwen2.5-coder_pytorch · Commits · 53b3977b

Commit 53b3977b, authored Jul 11, 2025 by dongchy920
Initial commit
Pipeline #2841 failed with stages in 0 seconds
Changes: 350 · Pipelines: 1
Showing 20 changed files with 2580 additions and 0 deletions (+2580, -0):

LLaMA-Factory/src/llamafactory/model/loader.py (+213, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/__init__.py (+0, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/attention.py (+89, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/checkpointing.py (+169, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/embedding.py (+72, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/liger_kernel.py (+67, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/longlora.py (+373, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/misc.py (+95, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/mod.py (+42, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/moe.py (+82, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/packing.py (+123, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/quantization.py (+204, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/rope.py (+63, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/unsloth.py (+102, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/valuehead.py (+73, -0)
LLaMA-Factory/src/llamafactory/model/model_utils/visual.py (+207, -0)
LLaMA-Factory/src/llamafactory/model/patcher.py (+193, -0)
LLaMA-Factory/src/llamafactory/train/__init__.py (+0, -0)
LLaMA-Factory/src/llamafactory/train/callbacks.py (+395, -0)
LLaMA-Factory/src/llamafactory/train/dpo/__init__.py (+18, -0)

LLaMA-Factory/src/llamafactory/model/loader.py (new file, 0 → 100644)

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead

from ..extras import logging
from ..extras.misc import count_parameters, skip_check_imports, try_download_model_from_other_hub
from .adapter import init_adapter
from .model_utils.liger_kernel import apply_liger_kernel
from .model_utils.misc import register_autoclass
from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model
from .model_utils.unsloth import load_unsloth_pretrained_model
from .model_utils.valuehead import load_valuehead_params
from .patcher import patch_config, patch_model, patch_processor, patch_tokenizer, patch_valuehead_model


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, ProcessorMixin

    from ..hparams import FinetuningArguments, ModelArguments


logger = logging.get_logger(__name__)


class TokenizerModule(TypedDict):
    tokenizer: "PreTrainedTokenizer"
    processor: Optional["ProcessorMixin"]


def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
    r"""
    Gets arguments to load config/tokenizer/model.

    Note: including inplace operation of model_args.
    """
    skip_check_imports()
    model_args.model_name_or_path = try_download_model_from_other_hub(model_args)
    return {
        "trust_remote_code": model_args.trust_remote_code,
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "token": model_args.hf_hub_token,
    }


def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule":
    r"""
    Loads pretrained tokenizer and optionally loads processor.

    Note: including inplace operation of model_args.
    """
    init_kwargs = _get_init_kwargs(model_args)
    config = load_config(model_args)
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            use_fast=model_args.use_fast_tokenizer,
            split_special_tokens=model_args.split_special_tokens,
            padding_side="right",
            **init_kwargs,
        )
    except ValueError:  # try the fast one
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            use_fast=True,
            padding_side="right",
            **init_kwargs,
        )
    except Exception as e:
        raise OSError("Failed to load tokenizer.") from e

    if model_args.new_special_tokens is not None:
        num_added_tokens = tokenizer.add_special_tokens(
            dict(additional_special_tokens=model_args.new_special_tokens),
            replace_additional_special_tokens=False,
        )
        logger.info_rank0("Add {} to special tokens.".format(",".join(model_args.new_special_tokens)))
        if num_added_tokens > 0 and not model_args.resize_vocab:
            model_args.resize_vocab = True
            logger.warning_rank0("New tokens have been added, changed `resize_vocab` to True.")

    patch_tokenizer(tokenizer)
    try:
        processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs)
        patch_processor(processor, config, tokenizer, model_args)
    except Exception as e:
        logger.debug(f"Processor was not found: {e}.")
        processor = None

    # Avoid load tokenizer, see:
    # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/auto/processing_auto.py#L324
    if processor is not None and "Processor" not in processor.__class__.__name__:
        processor = None

    return {"tokenizer": tokenizer, "processor": processor}


def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
    r"""
    Loads model config.
    """
    init_kwargs = _get_init_kwargs(model_args)
    return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs)


def load_model(
    tokenizer: "PreTrainedTokenizer",
    model_args: "ModelArguments",
    finetuning_args: "FinetuningArguments",
    is_trainable: bool = False,
    add_valuehead: bool = False,
) -> "PreTrainedModel":
    r"""
    Loads pretrained model.
    """
    init_kwargs = _get_init_kwargs(model_args)
    config = load_config(model_args)
    patch_config(config, tokenizer, model_args, init_kwargs, is_trainable)
    apply_liger_kernel(config, model_args, is_trainable, require_logits=(finetuning_args.stage not in ["pt", "sft"]))

    model = None
    lazy_load = False
    if model_args.use_unsloth:
        if model_args.adapter_name_or_path is not None:
            lazy_load = True
        elif is_trainable:
            model = load_unsloth_pretrained_model(config, model_args)

    if model is None and not lazy_load:
        init_kwargs["config"] = config
        init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path

        if model_args.mixture_of_depths == "load":
            model = load_mod_pretrained_model(**init_kwargs)
        else:
            if type(config) in AutoModelForVision2Seq._model_mapping.keys():  # assume built-in models
                load_class = AutoModelForVision2Seq
            else:
                load_class = AutoModelForCausalLM

            if model_args.train_from_scratch:
                model = load_class.from_config(config, trust_remote_code=model_args.trust_remote_code)
            else:
                model = load_class.from_pretrained(**init_kwargs)

        if model_args.mixture_of_depths == "convert":
            model = convert_pretrained_model_to_mod(model, config, model_args)

    if not lazy_load:
        patch_model(model, tokenizer, model_args, is_trainable, add_valuehead)
        register_autoclass(config, model, tokenizer)

    model = init_adapter(config, model, model_args, finetuning_args, is_trainable)

    if add_valuehead:
        model = AutoModelForCausalLMWithValueHead.from_pretrained(model)
        patch_valuehead_model(model)

        if model_args.adapter_name_or_path is not None:
            vhead_path = model_args.adapter_name_or_path[-1]
        else:
            vhead_path = model_args.model_name_or_path

        vhead_params = load_valuehead_params(vhead_path, model_args)
        if vhead_params is not None:
            model.load_state_dict(vhead_params, strict=False)
            logger.info_rank0(f"Loaded valuehead from checkpoint: {vhead_path}")

    if not is_trainable:
        model.requires_grad_(False)
        for param in model.parameters():
            if param.data.dtype == torch.float32 and model_args.compute_dtype != torch.float32:
                param.data = param.data.to(model_args.compute_dtype)

        model.eval()
    else:
        model.train()

    trainable_params, all_param = count_parameters(model)
    if is_trainable:
        param_stats = "trainable params: {:,} || all params: {:,} || trainable%: {:.4f}".format(
            trainable_params, all_param, 100 * trainable_params / all_param
        )
    else:
        param_stats = f"all params: {all_param:,}"

    logger.info_rank0(param_stats)

    if model_args.print_param_status:
        for name, param in model.named_parameters():
            print(
                "name: {}, dtype: {}, device: {}, trainable: {}".format(
                    name, param.dtype, param.device, param.requires_grad
                )
            )

    return model
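
A hypothetical usage sketch of the loader above (not part of the commit). It assumes the package is importable and that ModelArguments / FinetuningArguments accept the field names shown, as in upstream LLaMA-Factory; the checkpoint name is only an example.

from llamafactory.hparams import FinetuningArguments, ModelArguments
from llamafactory.model import load_model, load_tokenizer

# Assumed field names; verify against src/llamafactory/hparams in this repository.
model_args = ModelArguments(model_name_or_path="Qwen/Qwen2.5-Coder-1.5B")
finetuning_args = FinetuningArguments(stage="sft")

tokenizer_module = load_tokenizer(model_args)  # returns {"tokenizer": ..., "processor": ...}
model = load_model(
    tokenizer_module["tokenizer"],
    model_args,
    finetuning_args,
    is_trainable=False,  # inference mode: parameters frozen and model.eval() is called
)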

LLaMA-Factory/src/llamafactory/model/model_utils/__init__.py (new empty file, 0 → 100644)


LLaMA-Factory/src/llamafactory/model/model_utils/attention.py (new file, 0 → 100644)

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available
from transformers.utils.versions import require_version

from ...extras import logging


if TYPE_CHECKING:
    from transformers import PretrainedConfig

    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


def configure_attn_implementation(
    config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool
) -> None:
    if getattr(config, "model_type", None) == "gemma2" and is_trainable:
        if model_args.flash_attn == "auto" or model_args.flash_attn == "fa2":
            if is_flash_attn_2_available():
                require_version("transformers>=4.42.4", "To fix: pip install transformers>=4.42.4")
                require_version("flash_attn>=2.6.3", "To fix: pip install flash_attn>=2.6.3")
                if model_args.flash_attn != "fa2":
                    logger.warning_rank0("Gemma-2 should use flash attention 2, change `flash_attn` to fa2.")
                    model_args.flash_attn = "fa2"
            else:
                logger.warning_rank0("FlashAttention-2 is not installed, use eager attention.")
                model_args.flash_attn = "disabled"
        elif model_args.flash_attn == "sdpa":
            logger.warning_rank0(
                "Gemma-2 should use soft-capping attention, while the SDPA attention does not support it."
            )

    if model_args.flash_attn == "auto":
        return

    elif model_args.flash_attn == "disabled":
        requested_attn_implementation = "eager"

    elif model_args.flash_attn == "sdpa":
        if not is_torch_sdpa_available():
            logger.warning_rank0("torch>=2.1.1 is required for SDPA attention.")
            return

        requested_attn_implementation = "sdpa"
    elif model_args.flash_attn == "fa2":
        if not is_flash_attn_2_available():
            logger.warning_rank0("FlashAttention-2 is not installed.")
            return

        requested_attn_implementation = "flash_attention_2"
    else:
        raise NotImplementedError(f"Unknown attention type: {model_args.flash_attn}")

    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
        setattr(config, "attn_implementation", requested_attn_implementation)
    else:
        setattr(config, "_attn_implementation", requested_attn_implementation)


def print_attn_implementation(config: "PretrainedConfig") -> None:
    if getattr(config, "model_type", None) == "internlm2":  # special case for custom models
        attn_implementation = getattr(config, "attn_implementation", None)
    else:
        attn_implementation = getattr(config, "_attn_implementation", None)

    if attn_implementation == "flash_attention_2":
        logger.info_rank0("Using FlashAttention-2 for faster training and inference.")
    elif attn_implementation == "sdpa":
        logger.info_rank0("Using torch SDPA for faster training and inference.")
    else:
        logger.info_rank0("Using vanilla attention implementation.")
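
A minimal sketch (not part of the commit) of what configure_attn_implementation ultimately does: it writes the requested backend onto the config before the model is instantiated. The checkpoint name is only an illustrative choice.

from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-Coder-1.5B")  # example checkpoint
setattr(config, "_attn_implementation", "sdpa")   # what model_args.flash_attn == "sdpa" resolves to
model = AutoModelForCausalLM.from_config(config)  # model is built with torch SDPA attention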

LLaMA-Factory/src/llamafactory/model/model_utils/checkpointing.py (new file, 0 → 100644)

# Copyright 2024 HuggingFace Inc., Daniel Han-Chen & the Unsloth team and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's Transformers and PEFT library,
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/modeling_utils.py
# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/utils/other.py
# and the Unsloth library.
# https://github.com/unslothai/unsloth/blob/July-2024/unsloth/models/_utils.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from functools import WRAPPER_ASSIGNMENTS, partial, wraps
from types import MethodType
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union

import torch

from ...extras import logging
from ...extras.constants import LAYERNORM_NAMES


if TYPE_CHECKING:
    from transformers import PreTrainedModel

    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


def get_unsloth_gradient_checkpointing_func() -> Callable:
    class UnslothGradientCheckpointing(torch.autograd.Function):
        r"""
        Saves VRAM by smartly offloading to RAM.
        """

        @staticmethod
        @torch.cuda.amp.custom_fwd
        def forward(
            ctx: "torch.autograd.Function",
            forward_function: "torch.Module",
            hidden_states: "torch.Tensor",
            *args: Union["torch.Tensor", Any],
        ) -> "torch.Tensor":
            saved_hidden_states = hidden_states.to("cpu", non_blocking=True)
            with torch.no_grad():
                output = forward_function(hidden_states, *args)

            ctx.save_for_backward(saved_hidden_states)
            ctx.forward_function = forward_function
            ctx.args = args
            return output

        @staticmethod
        @torch.cuda.amp.custom_bwd
        def backward(ctx: "torch.autograd.Function", grad_output: "torch.Tensor") -> "torch.Tensor":
            (hidden_states,) = ctx.saved_tensors
            hidden_states = hidden_states.to("cuda", non_blocking=True).detach()
            hidden_states.requires_grad_(True)
            with torch.enable_grad():
                (output,) = ctx.forward_function(hidden_states, *ctx.args)

            torch.autograd.backward(output, grad_output)
            return (None, hidden_states.grad) + (None,) * len(ctx.args)

    return UnslothGradientCheckpointing.apply


def get_custom_gradient_checkpointing_func(gradient_checkpointing_func: Callable) -> Callable:
    r"""
    Only applies gradient checkpointing to trainable layers.
    """

    @wraps(gradient_checkpointing_func, assigned=WRAPPER_ASSIGNMENTS + ("__self__",))
    def custom_gradient_checkpointing_func(func: Callable, *args: Union["torch.Tensor", Any], **kwargs):
        module: "torch.nn.Module" = func.__self__
        if any(param.requires_grad for param in module.parameters()):
            for arg in args:
                if torch.is_tensor(arg) and torch.is_floating_point(arg):
                    arg.requires_grad_(True)

        return gradient_checkpointing_func(func, *args, **kwargs)

    return custom_gradient_checkpointing_func


def _gradient_checkpointing_enable(
    self: "PreTrainedModel",
    gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None,
    use_unsloth_gc: bool = False,
) -> None:
    r"""
    Activates gradient checkpointing for the current model.

    Modification of the original method to enable gradient checkpointing for block-wise optimizer.
    """
    from torch.utils.checkpoint import checkpoint

    if not self.supports_gradient_checkpointing:
        raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")

    if gradient_checkpointing_kwargs is None:
        gradient_checkpointing_kwargs = {"use_reentrant": True}

    if use_unsloth_gc:
        gradient_checkpointing_func = get_unsloth_gradient_checkpointing_func()
    else:
        gradient_checkpointing_func = partial(checkpoint, **gradient_checkpointing_kwargs)

    gradient_checkpointing_func = get_custom_gradient_checkpointing_func(gradient_checkpointing_func)
    if "value" in inspect.signature(self._set_gradient_checkpointing).parameters:  # old GC format
        self.apply(partial(self._set_gradient_checkpointing, value=True))
        self.enable_input_require_grads()
        logger.warning_once("You are using the old GC format, some features (e.g. BAdam) will be invalid.")
    else:  # have already enabled input require gradients
        self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)


def _fp32_forward_post_hook(
    module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
) -> "torch.Tensor":
    return output.to(torch.float32)


def prepare_model_for_training(model: "PreTrainedModel", model_args: "ModelArguments") -> None:
    r"""
    Includes:
        (1) cast the layernorm in fp32
        (2) make output embedding layer require grads
        (3) add the upcasting of the lm_head in fp32
    """
    if model_args.upcast_layernorm:
        logger.info_rank0("Upcasting layernorm weights in float32.")
        for name, param in model.named_parameters():
            if param.ndim == 1 and any(ln_name in name for ln_name in LAYERNORM_NAMES):
                param.data = param.data.to(torch.float32)

    if not model_args.disable_gradient_checkpointing:
        if not getattr(model, "supports_gradient_checkpointing", False):
            logger.warning_rank0("Current model does not support gradient checkpointing.")
        else:
            # use_reentrant=False might increase VRAM usage (have not been empirically verified yet)
            # According to: https://github.com/huggingface/transformers/issues/28339
            gradient_checkpointing_enable = partial(
                _gradient_checkpointing_enable, use_unsloth_gc=model_args.use_unsloth_gc
            )
            model.gradient_checkpointing_enable = MethodType(gradient_checkpointing_enable, model)
            model.gradient_checkpointing_enable(
                gradient_checkpointing_kwargs={"use_reentrant": model_args.use_reentrant_gc}
            )
            setattr(model.config, "use_cache", False)  # turn off when gradient checkpointing is enabled
            logger.info_rank0("Gradient checkpointing enabled.")

    if model_args.upcast_lmhead_output:
        output_layer = model.get_output_embeddings()
        if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32:
            logger.info_rank0("Upcasting lm_head outputs in float32.")
            output_layer.register_forward_hook(_fp32_forward_post_hook)
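
A small sketch (not part of the commit) exercising the trainable-layer-aware checkpointing wrapper defined above on a toy module; it assumes the file is importable under its path in this commit.

from functools import partial

import torch
from torch.utils.checkpoint import checkpoint

from llamafactory.model.model_utils.checkpointing import get_custom_gradient_checkpointing_func

layer = torch.nn.Linear(8, 8)         # a trainable toy "block"
x = torch.randn(2, 8)                 # input without requires_grad, as in a real pipeline

gc_func = get_custom_gradient_checkpointing_func(partial(checkpoint, use_reentrant=True))
out = gc_func(layer.forward, x)       # wrapper enables grads on x because the layer is trainable
out.sum().backward()
print(layer.weight.grad is not None)  # True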

LLaMA-Factory/src/llamafactory/model/model_utils/embedding.py (new file, 0 → 100644)

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from contextlib import nullcontext
from typing import TYPE_CHECKING

import torch
from transformers.integrations import is_deepspeed_zero3_enabled

from ...extras import logging


if TYPE_CHECKING:
    from transformers import PreTrainedModel, PreTrainedTokenizer


logger = logging.get_logger(__name__)


def _noisy_mean_initialization(embed_weight: "torch.Tensor", num_new_tokens: int) -> None:
    embedding_dim = embed_weight.size(1)
    avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
    noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
    noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim)))
    embed_weight[-num_new_tokens:] = avg_weight + noise_weight


def resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None:
    r"""
    Resize token embeddings.
    """
    if is_deepspeed_zero3_enabled():
        import deepspeed  # type: ignore

        params = [model.get_input_embeddings().weight]
        if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings:
            params.append(model.get_output_embeddings().weight)

        context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0)
    else:
        context_maybe_zero3 = nullcontext()

    with context_maybe_zero3:
        current_embedding_size = model.get_input_embeddings().weight.size(0)

    if len(tokenizer) > current_embedding_size:
        if getattr(model, "quantization_method", None):
            raise ValueError("Cannot resize embedding layers of a quantized model.")

        if not isinstance(model.get_output_embeddings(), torch.nn.Linear):
            raise ValueError("Current model does not support resizing embedding layers.")

        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
        with context_maybe_zero3:
            new_embedding_size = model.get_input_embeddings().weight.size(0)
            num_new_tokens = new_embedding_size - current_embedding_size
            _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens)
            _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens)

        logger.info_rank0(f"Resized token embeddings from {current_embedding_size} to {new_embedding_size}.")
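
A standalone illustration (not part of the commit) of the noisy-mean initialization above: new embedding rows start at the mean of the existing rows plus Gaussian noise scaled by 1/sqrt(dim).

import math

import torch

embed = torch.randn(10, 4)           # 8 existing token rows plus 2 rows reserved for new tokens
num_new_tokens = 2
avg_weight = embed[:-num_new_tokens].mean(dim=0, keepdim=True)
noise = torch.empty_like(embed[-num_new_tokens:]).normal_(mean=0, std=1.0 / math.sqrt(embed.size(1)))
embed[-num_new_tokens:] = avg_weight + noise
print(embed[-num_new_tokens:])       # two rows clustered around the mean of the old rows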

LLaMA-Factory/src/llamafactory/model/model_utils/liger_kernel.py (new file, 0 → 100644)

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from typing import TYPE_CHECKING

from ...extras import logging


if TYPE_CHECKING:
    from transformers import PretrainedConfig

    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


def apply_liger_kernel(
    config: "PretrainedConfig",
    model_args: "ModelArguments",
    is_trainable: bool,
    require_logits: bool,
) -> None:
    if not is_trainable or not model_args.enable_liger_kernel:
        return

    model_type = getattr(config, "model_type", None)
    if model_type == "gemma":
        from liger_kernel.transformers import apply_liger_kernel_to_gemma as apply_liger_kernel
    elif model_type == "gemma2":
        from liger_kernel.transformers import apply_liger_kernel_to_gemma2 as apply_liger_kernel
    elif model_type == "llama":
        from liger_kernel.transformers import apply_liger_kernel_to_llama as apply_liger_kernel
    elif model_type == "mistral":
        from liger_kernel.transformers import apply_liger_kernel_to_mistral as apply_liger_kernel
    elif model_type == "mixtral":
        from liger_kernel.transformers import apply_liger_kernel_to_mixtral as apply_liger_kernel
    elif model_type == "phi3":
        from liger_kernel.transformers import apply_liger_kernel_to_phi3 as apply_liger_kernel
    elif model_type == "qwen2":
        from liger_kernel.transformers import apply_liger_kernel_to_qwen2 as apply_liger_kernel
    elif model_type == "qwen2_vl":
        from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl as apply_liger_kernel
    else:
        logger.warning_rank0("Current model does not support liger kernel.")
        return

    if require_logits and "fused_linear_cross_entropy" in inspect.signature(apply_liger_kernel).parameters:
        logger.info_rank0("Current training stage does not support chunked cross entropy.")
        kwargs = {"fused_linear_cross_entropy": False}
    else:
        kwargs = {}

    apply_liger_kernel(**kwargs)
    logger.info_rank0("Liger kernel has been applied to the model.")

LLaMA-Factory/src/llamafactory/model/model_utils/longlora.py (new file, 0 → 100644)

# Copyright 2024 EleutherAI, HuggingFace Inc., Yukang Chen, and the LlamaFactory team.
#
# This code is based on the EleutherAI's GPT-NeoX and the HuggingFace's Transformers libraries.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
# This code is also inspired by the original LongLoRA implementation.
# https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from typing import TYPE_CHECKING, Optional, Tuple

import torch
import torch.nn as nn
import transformers
from transformers.models.llama.modeling_llama import (
    Cache,
    LlamaAttention,
    LlamaFlashAttention2,
    LlamaSdpaAttention,
    apply_rotary_pos_emb,
    repeat_kv,
)
from transformers.utils.versions import require_version

from ...extras import logging
from ...extras.constants import SUPPORTED_CLASS_FOR_S2ATTN
from ...extras.packages import is_transformers_version_greater_than


if TYPE_CHECKING:
    from transformers import PretrainedConfig

    from ...hparams import ModelArguments


transformers_logger = transformers.utils.logging.get_logger(__name__)


# Modified from:
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
def llama_attention_forward(
    self: "LlamaAttention",
    hidden_states: "torch.Tensor",
    attention_mask: Optional["torch.Tensor"] = None,
    position_ids: Optional["torch.LongTensor"] = None,
    past_key_value: Optional["Cache"] = None,
    output_attentions: bool = False,
    cache_position: Optional["torch.LongTensor"] = None,
    position_embeddings: Optional[Tuple["torch.Tensor", "torch.Tensor"]] = None,
    **kwargs,
) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]:
    bsz, q_len, _ = hidden_states.size()

    query_states: "torch.Tensor" = self.q_proj(hidden_states)
    key_states: "torch.Tensor" = self.k_proj(hidden_states)
    value_states: "torch.Tensor" = self.v_proj(hidden_states)

    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    if position_embeddings is None:
        cos, sin = self.rotary_emb(value_states, position_ids)
    else:
        cos, sin = position_embeddings

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
        assert q_len % groupsz == 0, f"q_len {q_len} should be divisible by group size {groupsz}."
        num_groups = q_len // groupsz

        def shift(state: "torch.Tensor") -> "torch.Tensor":
            state = state.transpose(1, 2)  # output: (bsz, seq_len, n_heads, head_dim)
            state = torch.cat(
                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
                dim=2,
            )
            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2)

        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
        if attention_mask is not None:
            attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)

    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
    attn_output = torch.matmul(attn_weights, value_states)  # (bsz, :, seq_len, :) or (bsz * n_group, :, groupsz, :)
    attn_output = attn_output.transpose(1, 2).contiguous()

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
        attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
        attn_output = torch.cat(
            (
                attn_output[:, :, : self.num_heads // 2],
                attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
            ),
            dim=2,
        )

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value


# Modified from:
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
def llama_flash_attention_2_forward(
    self: "LlamaFlashAttention2",
    hidden_states: "torch.Tensor",
    attention_mask: Optional["torch.Tensor"] = None,
    position_ids: Optional["torch.LongTensor"] = None,
    past_key_value: Optional["Cache"] = None,
    output_attentions: bool = False,
    cache_position: Optional["torch.LongTensor"] = None,
    position_embeddings: Optional[Tuple["torch.Tensor", "torch.Tensor"]] = None,
    **kwargs,
) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]:
    # LlamaFlashAttention2 attention does not support output_attentions
    output_attentions = False

    bsz, q_len, _ = hidden_states.size()

    query_states: "torch.Tensor" = self.q_proj(hidden_states)
    key_states: "torch.Tensor" = self.k_proj(hidden_states)
    value_states: "torch.Tensor" = self.v_proj(hidden_states)

    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    if position_embeddings is None:
        cos, sin = self.rotary_emb(value_states, position_ids)
    else:
        cos, sin = position_embeddings

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim)
    query_states = query_states.transpose(1, 2)
    key_states = key_states.transpose(1, 2)
    value_states = value_states.transpose(1, 2)

    dropout_rate = self.attention_dropout if self.training else 0.0

    input_dtype = query_states.dtype
    if input_dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        elif hasattr(self.config, "_pre_quantization_dtype"):
            target_dtype = self.config._pre_quantization_dtype
        else:
            target_dtype = self.q_proj.weight.dtype

        transformers_logger.warning_once("The input hidden states seems to be silently casted in float32.")
        query_states = query_states.to(target_dtype)
        key_states = key_states.to(target_dtype)
        value_states = value_states.to(target_dtype)

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
        assert q_len % groupsz == 0, f"q_len {q_len} should be divisible by group size {groupsz}."
        num_groups = q_len // groupsz

        def shift(state: "torch.Tensor") -> "torch.Tensor":
            state = torch.cat(
                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
                dim=2,
            )
            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim)

        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
        if attention_mask is not None:
            attention_mask = attention_mask[:, :groupsz].repeat(num_groups, 1)

    if is_transformers_version_greater_than("4.43.0"):
        from transformers.modeling_flash_attention_utils import _flash_attention_forward

        attn_output: "torch.Tensor" = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            query_states.size(1),
            dropout=dropout_rate,
            sliding_window=getattr(self, "sliding_window", None),
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
            is_causal=self.is_causal,
        )
    else:
        attn_output: "torch.Tensor" = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, query_states.size(1), dropout=dropout_rate
        )

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
        attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
        attn_output = torch.cat(
            (
                attn_output[:, :, : self.num_heads // 2],
                attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
            ),
            dim=2,
        )

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
    attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value


# Modified from:
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py
def llama_sdpa_attention_forward(
    self: "LlamaSdpaAttention",
    hidden_states: "torch.Tensor",
    attention_mask: Optional["torch.Tensor"] = None,
    position_ids: Optional["torch.LongTensor"] = None,
    past_key_value: Optional["Cache"] = None,
    output_attentions: bool = False,
    cache_position: Optional["torch.LongTensor"] = None,
    position_embeddings: Optional[Tuple["torch.Tensor", "torch.Tensor"]] = None,
    **kwargs,
) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]:
    if output_attentions:
        transformers_logger.warning_once(
            "SDPA does not support `output_attentions=True`. Falling back to the vanilla attention"
        )
        return llama_attention_forward(
            self,
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            cache_position=cache_position,
            **kwargs,
        )

    bsz, q_len, _ = hidden_states.size()

    query_states: "torch.Tensor" = self.q_proj(hidden_states)
    key_states: "torch.Tensor" = self.k_proj(hidden_states)
    value_states: "torch.Tensor" = self.v_proj(hidden_states)

    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    if position_embeddings is None:
        cos, sin = self.rotary_emb(value_states, position_ids)
    else:
        cos, sin = position_embeddings

    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift
        groupsz = int(q_len * getattr(self.config, "group_size_ratio"))
        assert q_len % groupsz == 0, f"q_len {q_len} should be divisible by group size {groupsz}."
        num_groups = q_len // groupsz

        def shift(state: "torch.Tensor") -> "torch.Tensor":
            state = state.transpose(1, 2)  # output: (bsz, seq_len, n_heads, head_dim)
            state = torch.cat(
                (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)),
                dim=2,
            )
            return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2)

        query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states)
        if attention_mask is not None:
            attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1)

    causal_mask = attention_mask
    if attention_mask is not None:
        causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]

    if query_states.device.type == "cuda" and causal_mask is not None:  # avoid pytorch bug
        query_states = query_states.contiguous()
        key_states = key_states.contiguous()
        value_states = value_states.contiguous()

    is_causal = True if causal_mask is None and q_len > 1 else False
    attn_output = torch.nn.functional.scaled_dot_product_attention(
        query_states,
        key_states,
        value_states,
        attn_mask=causal_mask,
        dropout_p=self.attention_dropout if self.training else 0.0,
        is_causal=is_causal,
    )
    attn_output = attn_output.transpose(1, 2).contiguous()

    if getattr(self.config, "group_size_ratio", None) and self.training:  # shift back
        attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
        attn_output = torch.cat(
            (
                attn_output[:, :, : self.num_heads // 2],
                attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1),
            ),
            dim=2,
        )

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)
    return attn_output, None, past_key_value


def _apply_llama_patch() -> None:
    require_version("transformers>=4.41.2,<=4.46.1", "To fix: pip install transformers>=4.41.2,<=4.46.1")
    LlamaAttention.forward = llama_attention_forward
    LlamaFlashAttention2.forward = llama_flash_attention_2_forward
    LlamaSdpaAttention.forward = llama_sdpa_attention_forward


def configure_longlora(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
    if not is_trainable or not model_args.shift_attn:
        return

    logger = logging.get_logger(__name__)
    if getattr(config, "model_type", None) in SUPPORTED_CLASS_FOR_S2ATTN:
        setattr(config, "group_size_ratio", 0.25)
        _apply_llama_patch()
        logger.info_rank0("Using shift short attention with group_size_ratio=1/4.")
    else:
        logger.warning_rank0("Current model does not support shift short attention.")
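
A toy, self-contained sketch (not part of the commit) of the S^2-Attn "shift" used above: half of the heads are rolled by half a group so neighbouring groups overlap, then the sequence is folded into groups along the batch dimension.

import torch

bsz, n_heads, q_len, head_dim = 1, 4, 8, 2
groupsz = int(q_len * 0.25)                  # group_size_ratio = 0.25 -> groups of 2 tokens
num_groups = q_len // groupsz

state = torch.randn(bsz, n_heads, q_len, head_dim)
state = state.transpose(1, 2)                # (bsz, q_len, n_heads, head_dim)
state = torch.cat(
    (state[:, :, : n_heads // 2], state[:, :, n_heads // 2 :].roll(-groupsz // 2, dims=1)), dim=2
)
state = state.reshape(bsz * num_groups, groupsz, n_heads, head_dim).transpose(1, 2)
print(state.shape)                           # torch.Size([4, 4, 2, 2]): attention now runs per group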

LLaMA-Factory/src/llamafactory/model/model_utils/misc.py (new file, 0 → 100644)

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, List

from ...extras import logging


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer


logger = logging.get_logger(__name__)


def find_all_linear_modules(model: "PreTrainedModel", freeze_vision_tower: bool) -> List[str]:
    r"""
    Finds all available modules to apply lora or galore.
    """
    model_type = getattr(model.config, "model_type", None)
    forbidden_modules = {"lm_head"}
    if model_type == "chatglm":
        forbidden_modules.add("output_layer")
    elif model_type == "internlm2":
        forbidden_modules.add("output")
    elif model_type in ["llava", "llava_next", "llava_next_video", "mllama", "paligemma", "video_llava"]:
        forbidden_modules.add("multi_modal_projector")
    elif model_type == "qwen2_vl":
        forbidden_modules.add("merger")

    if freeze_vision_tower:
        if model_type == "mllama":
            forbidden_modules.add("vision_model")
        elif model_type == "qwen2_vl":
            forbidden_modules.add("visual")
        else:
            forbidden_modules.add("vision_tower")

    module_names = set()
    for name, module in model.named_modules():
        if any(forbidden_module in name for forbidden_module in forbidden_modules):
            continue

        if "Linear" in module.__class__.__name__ and "Embedding" not in module.__class__.__name__:
            module_names.add(name.split(".")[-1])

    logger.info_rank0("Found linear modules: {}".format(",".join(module_names)))
    return list(module_names)


def find_expanded_modules(model: "PreTrainedModel", target_modules: List[str], num_layer_trainable: int) -> List[str]:
    r"""
    Finds the modules in the expanded blocks to apply lora.
    """
    num_layers = getattr(model.config, "num_hidden_layers", None)
    if not num_layers:
        raise ValueError("Model was not supported.")

    if num_layers % num_layer_trainable != 0:
        raise ValueError(
            f"`num_layers` {num_layers} should be divisible by `num_layer_trainable` {num_layer_trainable}."
        )

    stride = num_layers // num_layer_trainable
    trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
    trainable_layers = [f".{idx:d}." for idx in trainable_layer_ids]
    module_names = []
    for name, _ in model.named_modules():
        if any(target_module in name for target_module in target_modules) and any(
            trainable_layer in name for trainable_layer in trainable_layers
        ):
            module_names.append(name)

    logger.info_rank0("Apply lora to layers: {}".format(",".join(map(str, trainable_layer_ids))))
    return module_names


def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer"):
    if "AutoConfig" in getattr(config, "auto_map", {}):
        config.__class__.register_for_auto_class()
    if "AutoModelForCausalLM" in getattr(config, "auto_map", {}):
        model.__class__.register_for_auto_class()
    if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}):
        tokenizer.__class__.register_for_auto_class()
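
A standalone sketch (not part of the commit) of the scan performed by find_all_linear_modules above: collect the leaf names of Linear-like modules while skipping forbidden ones such as the LM head.

import torch

model = torch.nn.ModuleDict(
    {
        "q_proj": torch.nn.Linear(8, 8),
        "v_proj": torch.nn.Linear(8, 8),
        "lm_head": torch.nn.Linear(8, 32),
    }
)
forbidden_modules = {"lm_head"}
module_names = set()
for name, module in model.named_modules():
    if any(forbidden in name for forbidden in forbidden_modules):
        continue
    if "Linear" in module.__class__.__name__ and "Embedding" not in module.__class__.__name__:
        module_names.add(name.split(".")[-1])

print(sorted(module_names))  # ['q_proj', 'v_proj']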

LLaMA-Factory/src/llamafactory/model/model_utils/mod.py (new file, 0 → 100644)

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from ...extras.constants import MOD_SUPPORTED_MODELS


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel

    from ...hparams import ModelArguments


def load_mod_pretrained_model(**init_kwargs) -> "PreTrainedModel":
    from MoD import AutoMoDModelForCausalLM

    return AutoMoDModelForCausalLM.from_pretrained(**init_kwargs)


def convert_pretrained_model_to_mod(
    model: "PreTrainedModel", config: "PretrainedConfig", model_args: "ModelArguments"
) -> "PreTrainedModel":
    from MoD import apply_mod_to_hf

    if getattr(config, "model_type", None) not in MOD_SUPPORTED_MODELS:
        raise ValueError("Current model is not supported by mixture-of-depth.")

    model = apply_mod_to_hf(model)
    model = model.to(model_args.compute_dtype)
    return model

LLaMA-Factory/src/llamafactory/model/model_utils/moe.py (new file, 0 → 100644)

# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, Sequence

import torch
from transformers.integrations import is_deepspeed_zero3_enabled
from transformers.utils.versions import require_version


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel

    from ...hparams import ModelArguments


def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: Sequence["torch.nn.Module"]) -> None:
    require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0")
    from deepspeed.utils import set_z3_leaf_modules  # type: ignore

    set_z3_leaf_modules(model, leaf_modules)


def add_z3_leaf_module(model: "PreTrainedModel") -> None:
    r"""
    Sets module as a leaf module to skip partitioning in deepspeed zero3.
    """
    if not is_deepspeed_zero3_enabled():
        return

    model_type = getattr(model.config, "model_type", None)
    if model_type == "dbrx":
        from transformers.models.dbrx.modeling_dbrx import DbrxFFN

        _set_z3_leaf_modules(model, [DbrxFFN])

    if model_type == "jamba":
        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock

        _set_z3_leaf_modules(model, [JambaSparseMoeBlock])

    if model_type == "jetmoe":
        from transformers.models.jetmoe.modeling_jetmoe import JetMoeMoA, JetMoeMoE

        _set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE])

    if model_type == "mixtral":
        from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

        _set_z3_leaf_modules(model, [MixtralSparseMoeBlock])

    if model_type == "qwen2moe":
        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock

        _set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])


def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
    model_type = getattr(config, "model_type", None)
    if model_args.moe_aux_loss_coef is not None:
        if model_type in ["jamba", "mixtral", "qwen2_moe"]:
            setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)
        elif model_type == "deepseek":
            setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef)
        elif model_type == "jetmoe":
            setattr(config, "aux_loss_coef", model_args.moe_aux_loss_coef)

    if model_type in ["dbrx", "jamba", "jetmoe", "mixtral", "qwen2_moe"]:
        setattr(config, "output_router_logits", is_trainable)
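
A brief illustration (not part of the commit) of the config attributes configure_moe touches, using a Mixtral-style config; no weights are loaded.

from transformers import MixtralConfig

config = MixtralConfig()
setattr(config, "router_aux_loss_coef", 0.01)  # what moe_aux_loss_coef=0.01 would set
setattr(config, "output_router_logits", True)  # enabled whenever the model is trainable
print(config.router_aux_loss_coef, config.output_router_logits)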

LLaMA-Factory/src/llamafactory/model/model_utils/packing.py (new file, 0 → 100644)

# Copyright 2024 Musab Gultekin and the LlamaFactory team.
#
# This code is based on the Musab Gultekin's functionary library.
# https://github.com/MeetKai/functionary/blob/main/functionary/train/packing/monkey_patch_packing.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# MIT License
#
# Copyright (c) 2023 Musab Gultekin
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from typing import TYPE_CHECKING, Tuple

import torch
import torch.nn.functional as F
from transformers.utils.versions import require_version

from ...extras import logging
from ...extras.packages import is_transformers_version_greater_than


if is_transformers_version_greater_than("4.43.0"):
    import transformers.modeling_flash_attention_utils


if TYPE_CHECKING:
    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


def get_seqlens_in_batch(attention_mask: "torch.Tensor") -> "torch.Tensor":
    r"""
    Gets the sequnce lengths in the current batch.

    e.g.
    ```python
    # input
    [
        [1, 1, 2, 2, 2, 0],
        [1, 2, 2, 3, 3, 3],
    ]
    # output
    [2, 3, 1, 2, 3]
    ```
    """
    bsz = attention_mask.size(0)
    dtype, device = attention_mask.dtype, attention_mask.device
    max_num = torch.max(attention_mask).item()
    counts: "torch.Tensor" = torch.zeros((bsz, max_num), dtype=dtype, device=device)
    for i in range(max_num):
        counts[:, i] = torch.sum(attention_mask == (i + 1), dim=-1)

    counts = counts.flatten()
    seqlens = counts[counts.nonzero().squeeze(dim=-1)]
    return seqlens


def get_unpad_data(attention_mask: "torch.Tensor") -> Tuple["torch.Tensor", "torch.Tensor", int]:
    r"""
    Prepares the indices and seqlens for flash attn varlen function.

    Returns:
        indices: indices of non-masked tokens from the flattened sequence.
        cu_seqlens: the cumulative sequence lengths in the current batch, always starts from 0.
        max_seqlen_in_batch: the largest seqlen in the current batch.

    e.g.
    ```python
    # input
    [
        [1, 1, 2, 2, 2, 0],
        [1, 2, 2, 3, 3, 3],
    ]
    # output
    [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]
    [0, 2, 5, 6, 8, 11]
    3
    ```
    """
    seqlens_in_batch = get_seqlens_in_batch(attention_mask)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return indices, cu_seqlens, max_seqlen_in_batch


def configure_packing(model_args: "ModelArguments", is_trainable: bool) -> None:
    if not is_trainable or not model_args.block_diag_attn:
        return

    require_version("transformers>=4.43.0,<=4.46.1", "To fix: pip install transformers>=4.43.0,<=4.46.1")
    transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data
    logger.info_rank0("Using block diagonal attention for sequence packing without cross-attention.")
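
A runnable check (not part of the commit) of the packing helpers above, using the example from their docstrings; it assumes the file is importable under its path in this commit.

import torch

from llamafactory.model.model_utils.packing import get_seqlens_in_batch, get_unpad_data

attention_mask = torch.tensor(
    [
        [1, 1, 2, 2, 2, 0],
        [1, 2, 2, 3, 3, 3],
    ]
)
print(get_seqlens_in_batch(attention_mask))  # tensor([2, 3, 1, 2, 3])
indices, cu_seqlens, max_seqlen = get_unpad_data(attention_mask)
print(indices)      # tensor([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11])
print(cu_seqlens)   # tensor([ 0,  2,  5,  6,  8, 11], dtype=torch.int32)
print(max_seqlen)   # 3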

LLaMA-Factory/src/llamafactory/model/model_utils/quantization.py (new file, 0 → 100644)

# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's Transformers and Optimum library.
# https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/utils/quantization_config.py
# https://github.com/huggingface/optimum/blob/v1.20.0/optimum/gptq/data.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from enum import Enum, unique
from typing import TYPE_CHECKING, Any, Dict, List

import torch
from datasets import load_dataset
from transformers import BitsAndBytesConfig, EetqConfig, GPTQConfig, HqqConfig
from transformers.integrations import is_deepspeed_zero3_enabled
from transformers.modeling_utils import is_fsdp_enabled
from transformers.utils.versions import require_version

from ...extras import logging
from ...extras.constants import FILEEXT2TYPE
from ...extras.misc import get_current_device


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedTokenizer

    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


@unique
class QuantizationMethod(str, Enum):
    r"""
    Borrowed from `transformers.utils.quantization_config.QuantizationMethod`.
    """

    BITS_AND_BYTES = "bitsandbytes"
    GPTQ = "gptq"
    AWQ = "awq"
    AQLM = "aqlm"
    QUANTO = "quanto"
    EETQ = "eetq"
    HQQ = "hqq"


def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[Dict[str, Any]]:
    r"""
    Prepares the tokenized dataset to perform AutoGPTQ. Do not use tensor output for JSON serialization.
    """
    if os.path.isfile(model_args.export_quantization_dataset):
        data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None)
        data_files = model_args.export_quantization_dataset
    else:
        data_path = model_args.export_quantization_dataset
        data_files = None

    dataset = load_dataset(
        path=data_path,
        data_files=data_files,
        split="train",
        cache_dir=model_args.cache_dir,
        token=model_args.hf_hub_token,
    )

    samples = []
    maxlen = model_args.export_quantization_maxlen
    for _ in range(model_args.export_quantization_nsamples):
        n_try = 0
        while True:
            if n_try > 100:
                raise ValueError("Cannot find satisfying example, considering decrease `export_quantization_maxlen`.")

            sample_idx = random.randint(0, len(dataset) - 1)
            sample: Dict[str, "torch.Tensor"] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt")
            n_try += 1
            if sample["input_ids"].size(1) > maxlen:
                break  # TODO: fix large maxlen

        word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1)
        input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen]
        attention_mask = sample["attention_mask"][:, word_idx : word_idx + maxlen]
        samples.append({"input_ids": input_ids.tolist(), "attention_mask": attention_mask.tolist()})

    return samples


def configure_quantization(
    config: "PretrainedConfig",
    tokenizer: "PreTrainedTokenizer",
    model_args: "ModelArguments",
    init_kwargs: Dict[str, Any],
) -> None:
    r"""
    Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer)
    """
    if getattr(config, "quantization_config", None):  # ptq
        if model_args.quantization_bit is not None:
            logger.warning_rank0("`quantization_bit` will not affect on the PTQ-quantized models.")

        if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
            raise ValueError("DeepSpeed ZeRO-3 or FSDP is incompatible with PTQ-quantized models.")

        quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None)
        quant_method = quantization_config.get("quant_method", "")

        if quant_method == QuantizationMethod.GPTQ:
            require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
            quantization_config.pop("disable_exllama", None)  # remove deprecated args
            quantization_config["use_exllama"] = False  # disable exllama

        if quant_method == QuantizationMethod.AWQ:
            require_version("autoawq", "To fix: pip install autoawq")

        if quant_method == QuantizationMethod.AQLM:
            require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0")
            quantization_config["bits"] = 2

        quant_bits = quantization_config.get("bits", "?")
        logger.info_rank0(f"Loading {quant_bits}-bit {quant_method.upper()}-quantized model.")

    elif model_args.export_quantization_bit is not None:  # auto-gptq
        if model_args.export_quantization_bit not in [8, 4, 3, 2]:
            raise ValueError("AutoGPTQ only accepts 2/3/4/8-bit quantization.")

        require_version("optimum>=1.17.0", "To fix: pip install optimum>=1.17.0")
        require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0")
        from accelerate.utils import get_max_memory

        if getattr(config, "model_type", None) == "chatglm":
            raise ValueError("ChatGLM model is not supported yet.")

        init_kwargs["quantization_config"] = GPTQConfig(
            bits=model_args.export_quantization_bit,
            dataset=_get_quantization_dataset(tokenizer, model_args),
        )
        init_kwargs["device_map"] = "auto"
        init_kwargs["max_memory"] = get_max_memory()
        logger.info_rank0(f"Quantizing model to {model_args.export_quantization_bit} bit with AutoGPTQ.")

    elif model_args.quantization_bit is not None:  # on-the-fly
        if model_args.quantization_method == QuantizationMethod.BITS_AND_BYTES.value:
            if model_args.quantization_bit == 8:
                require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0")
                init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
            elif model_args.quantization_bit == 4:
                require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0")
                init_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=model_args.compute_dtype,
                    bnb_4bit_use_double_quant=model_args.double_quantization,
                    bnb_4bit_quant_type=model_args.quantization_type,
                    bnb_4bit_quant_storage=model_args.compute_dtype,  # crucial for fsdp+qlora
                )
            else:
                raise ValueError("Bitsandbytes only accepts 4-bit or 8-bit quantization.")

            # Do not assign device map if:
            # 1. deepspeed zero3 or fsdp (train)
            # 2. auto quantization device map (inference)
            if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto":
                if model_args.quantization_bit != 4:
                    raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.")

                require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0")
            else:
                init_kwargs["device_map"] = {"": get_current_device()}  # change auto device map for inference

            logger.info_rank0(f"Quantizing model to {model_args.quantization_bit} bit with bitsandbytes.")
        elif model_args.quantization_method == QuantizationMethod.HQQ.value:
            if model_args.quantization_bit not in [8, 6, 5, 4, 3, 2, 1]:
                raise ValueError("HQQ only accepts 1/2/3/4/5/6/8-bit quantization.")

            if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
                raise ValueError("HQQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.")

            require_version("hqq", "To fix: pip install hqq")
            init_kwargs["quantization_config"] = HqqConfig(
                nbits=model_args.quantization_bit, quant_zero=False, quant_scale=False, axis=0
            )  # use ATEN kernel (axis=0) for performance
            logger.info_rank0(f"Quantizing model to {model_args.quantization_bit} bit with HQQ.")
        elif model_args.quantization_method == QuantizationMethod.EETQ.value:
            if model_args.quantization_bit != 8:
                raise ValueError("EETQ only accepts 8-bit quantization.")

            if is_deepspeed_zero3_enabled() or is_fsdp_enabled():
                raise ValueError("EETQ quantization is incompatible with DeepSpeed ZeRO-3 or FSDP.")

            require_version("eetq", "To fix: pip install eetq")
            init_kwargs["quantization_config"] = EetqConfig()
            logger.info_rank0(f"Quantizing model to {model_args.quantization_bit} bit with EETQ.")
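As a rough illustration of the on-the-fly branch above, this is approximately what `init_kwargs` ends up holding for 4-bit bitsandbytes quantization; the concrete values stand in for the corresponding `ModelArguments` fields and are only assumptions:

```python
import torch
from transformers import BitsAndBytesConfig

init_kwargs = {}
init_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,   # model_args.compute_dtype (assumed)
    bnb_4bit_use_double_quant=True,          # model_args.double_quantization (assumed)
    bnb_4bit_quant_type="nf4",               # model_args.quantization_type (assumed)
    bnb_4bit_quant_storage=torch.bfloat16,   # crucial for fsdp+qlora
)
init_kwargs["device_map"] = {"": 0}          # get_current_device() on a single GPU (assumed)
```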
LLaMA-Factory/src/llamafactory/model/model_utils/rope.py
0 → 100644
View file @
53b3977b
# Copyright 2024 LMSYS and the LlamaFactory team.
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# This code is inspired by the LMSYS's FastChat library.
# https://github.com/lm-sys/FastChat/blob/v0.2.30/fastchat/train/train.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import TYPE_CHECKING

from ...extras import logging


if TYPE_CHECKING:
    from transformers import PretrainedConfig

    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
    if model_args.rope_scaling is None:
        return

    if not hasattr(config, "rope_scaling"):
        logger.warning_rank0("Current model does not support RoPE scaling.")
        return

    if model_args.model_max_length is not None:
        if is_trainable and model_args.rope_scaling == "dynamic":
            logger.warning_rank0(
                "Dynamic NTK scaling may not work well with fine-tuning. "
                "See: https://github.com/huggingface/transformers/pull/24653"
            )

        current_max_length = getattr(config, "max_position_embeddings", None)
        if current_max_length and model_args.model_max_length > current_max_length:
            logger.info_rank0(f"Enlarge max model length from {current_max_length} to {model_args.model_max_length}.")
            setattr(config, "max_position_embeddings", model_args.model_max_length)
            scaling_factor = float(math.ceil(model_args.model_max_length / current_max_length))
        else:
            logger.warning_rank0("Input length is smaller than max length. Consider increase input length.")
            scaling_factor = 1.0
    else:
        scaling_factor = 2.0

    setattr(config, "rope_scaling", {"type": model_args.rope_scaling, "factor": scaling_factor})
    logger.info_rank0(
        f"Using {model_args.rope_scaling} scaling strategy and setting scaling factor to {scaling_factor}"
    )
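A worked example of the scaling-factor arithmetic in `configure_rope`; the lengths below are illustrative rather than taken from any particular model:

```python
import math

current_max_length = 4096   # config.max_position_embeddings (assumed)
model_max_length = 32768    # model_args.model_max_length (assumed)

scaling_factor = float(math.ceil(model_max_length / current_max_length))
rope_scaling = {"type": "linear", "factor": scaling_factor}
print(rope_scaling)  # {'type': 'linear', 'factor': 8.0}
```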
LLaMA-Factory/src/llamafactory/model/model_utils/unsloth.py
0 → 100644
View file @
53b3977b
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Dict, Optional

from ...extras import logging
from ...extras.misc import get_current_device


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel

    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


def _get_unsloth_kwargs(
    config: "PretrainedConfig", model_name_or_path: str, model_args: "ModelArguments"
) -> Dict[str, Any]:
    return {
        "model_name": model_name_or_path,
        "max_seq_length": model_args.model_max_length or 4096,
        "dtype": model_args.compute_dtype,
        "load_in_4bit": model_args.quantization_bit == 4,
        "token": model_args.hf_hub_token,
        "device_map": {"": get_current_device()},
        "rope_scaling": getattr(config, "rope_scaling", None),
        "fix_tokenizer": False,
        "trust_remote_code": model_args.trust_remote_code,
        "use_gradient_checkpointing": "unsloth",
    }


def load_unsloth_pretrained_model(
    config: "PretrainedConfig", model_args: "ModelArguments"
) -> Optional["PreTrainedModel"]:
    r"""
    Optionally loads pretrained model with unsloth. Used in training.
    """
    from unsloth import FastLanguageModel

    unsloth_kwargs = _get_unsloth_kwargs(config, model_args.model_name_or_path, model_args)
    try:
        model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
    except NotImplementedError:
        logger.warning_rank0("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))
        model = None
        model_args.use_unsloth = False

    return model


def get_unsloth_peft_model(
    model: "PreTrainedModel", model_args: "ModelArguments", peft_kwargs: Dict[str, Any]
) -> "PreTrainedModel":
    r"""
    Gets the peft model for the pretrained model with unsloth. Used in training.
    """
    from unsloth import FastLanguageModel

    unsloth_peft_kwargs = {
        "model": model,
        "max_seq_length": model_args.model_max_length,
        "use_gradient_checkpointing": "unsloth",
    }
    return FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)


def load_unsloth_peft_model(
    config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool
) -> "PreTrainedModel":
    r"""
    Loads peft model with unsloth. Used in both training and inference.
    """
    from unsloth import FastLanguageModel

    unsloth_kwargs = _get_unsloth_kwargs(config, model_args.adapter_name_or_path[0], model_args)
    try:
        if not is_trainable:
            unsloth_kwargs["use_gradient_checkpointing"] = False

        model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs)
    except NotImplementedError:
        raise ValueError("Unsloth does not support model type {}.".format(getattr(config, "model_type", None)))

    if not is_trainable:
        FastLanguageModel.for_inference(model)

    return model
LLaMA-Factory/src/llamafactory/model/model_utils/valuehead.py
0 → 100644
View file @
53b3977b
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Dict

import torch
from transformers.utils import cached_file

from ...extras import logging
from ...extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME


if TYPE_CHECKING:
    from transformers import PreTrainedModel

    from ...hparams import ModelArguments


logger = logging.get_logger(__name__)


def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]:
    r"""
    Loads value head parameters from Hugging Face Hub or local disk.

    Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`.
    """
    kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir, "token": model_args.hf_hub_token}
    err_text = ""

    try:
        from safetensors import safe_open

        vhead_file = cached_file(filename=V_HEAD_SAFE_WEIGHTS_NAME, **kwargs)
        with safe_open(vhead_file, framework="pt", device="cpu") as f:
            return {key: f.get_tensor(key) for key in f.keys()}
    except Exception as err:
        err_text = str(err)

    try:
        vhead_file = cached_file(filename=V_HEAD_WEIGHTS_NAME, **kwargs)
        return torch.load(vhead_file, map_location="cpu")
    except Exception as err:
        err_text = str(err)

    logger.info_rank0(f"Provided path ({path_or_repo_id}) does not contain value head weights: {err_text}.")
    logger.info_rank0("Ignore the above message if you are not resuming the training of a value head model.")
    return None


def prepare_valuehead_model(model: "PreTrainedModel") -> None:
    if getattr(model.config, "model_type", None) == "llava":
        setattr(model, "lm_head", model.language_model.get_output_embeddings())
        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])

    if getattr(model.config, "model_type", None) == "chatglm":
        setattr(model, "lm_head", model.transformer.output_layer)
        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])

    if getattr(model.config, "model_type", None) == "internlm2":
        setattr(model, "lm_head", model.output)
        setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"])
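A hedged usage sketch: `load_valuehead_params` returns a plain state dict (or `None`), so a caller can inspect or copy the tensors directly; the checkpoint path and `model_args` below are placeholders, not values from this repository:

```python
vhead_params = load_valuehead_params("path/to/valuehead-checkpoint", model_args)
if vhead_params is not None:
    # per the docstring, the dict holds the value head weight and bias
    print(sorted(vhead_params.keys()))  # ['v_head.summary.bias', 'v_head.summary.weight']
```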
LLaMA-Factory/src/llamafactory/model/model_utils/visual.py
0 → 100644
View file @
53b3977b
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's Transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava/modeling_llava.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Sequence, Set, Tuple, Union

import torch
import transformers
import transformers.models
from transformers.activations import ACT2FN

from ...extras import logging


if TYPE_CHECKING:
    from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel, ProcessorMixin

    from ...hparams import FinetuningArguments, ModelArguments


logger = logging.get_logger(__name__)
transformers_logger = transformers.utils.logging.get_logger(__name__)


class LlavaMultiModalProjectorForYiVL(torch.nn.Module):
    def __init__(self, config: "LlavaConfig") -> None:
        super().__init__()

        self.config = config
        if config is None:
            return

        self.linear_1 = torch.nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
        self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.linear_3 = torch.nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
        self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True)
        self.act = ACT2FN[config.projector_hidden_act]

    def forward(self, image_features: "torch.Tensor") -> "torch.Tensor":
        hidden_states = self.linear_1(image_features)
        hidden_states = self.linear_2(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_3(hidden_states)
        hidden_states = self.linear_4(hidden_states)
        if hidden_states.dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.linear_1.weight.dtype

            transformers_logger.warning_once("The hidden states seems to be silently casted in float32.")
            hidden_states = hidden_states.to(target_dtype)

        return hidden_states


class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL):
    def __init__(self, vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str) -> None:
        super().__init__(config=None)

        self.linear_1 = torch.nn.Linear(vision_hidden_size, text_hidden_size, bias=True)
        self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True)
        self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True)
        self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True)
        self.act = ACT2FN[projector_hidden_act]


def autocast_projector_dtype(model: "PreTrainedModel", model_args: "ModelArguments") -> None:
    r"""
    Casts projector output to half precision for fine-tuning quantized VLMs.
    """

    def _mm_projector_forward_post_hook(
        module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor"
    ) -> "torch.Tensor":
        return output.to(model_args.compute_dtype)

    if getattr(model, "quantization_method", None):
        model_type = getattr(model.config, "model_type", None)
        if model_type in ["llava", "llava_next", "llava_next_video", "mllama", "paligemma", "video_llava"]:
            mm_projector: "torch.nn.Module" = getattr(model, "multi_modal_projector")
        elif model_type == "qwen2_vl":
            mm_projector: "torch.nn.Module" = getattr(getattr(model, "visual"), "merger")
        else:
            return

        logger.info_rank0(f"Casting multimodal projector outputs in {model_args.compute_dtype}.")
        mm_projector.register_forward_hook(_mm_projector_forward_post_hook)


def configure_visual_model(config: "PretrainedConfig") -> None:
    r"""
    Patches VLMs before loading them.
    """
    model_type = getattr(config, "model_type", None)
    if model_type in ["llava", "llava_next", "llava_next_video", "mllama", "paligemma", "video_llava"]:
        # required for ds zero3 and valuehead models
        setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None))

    if getattr(config, "is_yi_vl_derived_model", None):
        logger.info_rank0("Detected Yi-VL model, applying projector patch.")
        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVL


def get_forbidden_modules(config: "PretrainedConfig", finetuning_args: "FinetuningArguments") -> Set[str]:
    r"""
    Freezes vision tower and language model for VLM full/freeze tuning.
    """
    model_type = getattr(config, "model_type", None)
    forbidden_modules = set()
    if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "video_llava"]:
        if finetuning_args.freeze_vision_tower:
            forbidden_modules.add("vision_tower")

        if finetuning_args.train_mm_proj_only:
            forbidden_modules.add("language_model")

    elif model_type == "mllama":
        if finetuning_args.freeze_vision_tower:
            forbidden_modules.add("vision_model")

        if finetuning_args.train_mm_proj_only:
            forbidden_modules.add("language_model")

    elif model_type == "qwen2_vl":
        if finetuning_args.train_mm_proj_only:
            forbidden_modules.update({"visual.patch_embed", "visual.blocks", "model", "lm_head"})
        elif finetuning_args.freeze_vision_tower:
            forbidden_modules.add("visual")

    return forbidden_modules


def get_image_seqlen(config: "PretrainedConfig") -> int:
    r"""
    Computes the number of special tokens per image.
    """
    model_type = getattr(config, "model_type", None)
    if model_type == "llava":
        image_seqlen = (config.vision_config.image_size // config.vision_config.patch_size) ** 2
        if getattr(config, "vision_feature_select_strategy", "default") == "full":  # add [CLS] token
            image_seqlen += 1
    elif model_type == "paligemma":
        image_seqlen = config.vision_config.num_image_tokens
    else:
        image_seqlen = -1

    return image_seqlen


def get_patch_size(config: "PretrainedConfig", processor: "ProcessorMixin") -> int:
    r"""
    Computes the patch size of the vit.
    """
    patch_size = getattr(config.vision_config, "patch_size", getattr(processor, "patch_size", -1))
    return patch_size


def get_vision_feature_select_strategy(config: "PretrainedConfig", processor: "ProcessorMixin") -> int:
    r"""
    Get the vision_feature_select_strategy.
    """
    vision_feature_select_strategy = getattr(
        config, "vision_feature_select_strategy", getattr(processor, "vision_feature_select_strategy", "default")
    )
    return vision_feature_select_strategy


def patch_target_modules(
    config: "PretrainedConfig", finetuning_args: "FinetuningArguments", target_modules: Sequence[str]
) -> Union[str, List[str]]:
    r"""
    Freezes vision tower for VLM LoRA tuning.
    """
    model_type = getattr(config, "model_type", None)
    vit_model_type = getattr(getattr(config, "vision_config", None), "model_type", None)
    if finetuning_args.freeze_vision_tower:
        if model_type in ["llava", "llava_next", "llava_next_video", "paligemma", "video_llava"]:
            return "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules))
        elif model_type == "mllama":
            return "^(?!.*vision_model).*(?:{}).*".format("|".join(target_modules))
        elif model_type == "qwen2_vl":
            return "^(?!.*visual).*(?:{}).*".format("|".join(target_modules))
        else:
            return target_modules
    else:
        if model_type == "qwen2_vl":
            return "^(?!.*patch_embed).*(?:{}).*".format("|".join(target_modules))
        elif vit_model_type == "pixtral":
            return "^(?!.*patch_conv).*(?:{}).*".format("|".join(target_modules))
        else:
            return target_modules
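To make the regex behavior of `patch_target_modules` concrete, a small sketch of the pattern it builds for a LLaVA-style model with the vision tower frozen; the target module names are illustrative assumptions:

```python
target_modules = ["q_proj", "v_proj"]  # assumed LoRA targets
pattern = "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules))
print(pattern)  # ^(?!.*vision_tower).*(?:q_proj|v_proj).*
# matches "model.layers.0.self_attn.q_proj" but excludes any name containing "vision_tower"
```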
LLaMA-Factory/src/llamafactory/model/patcher.py
0 → 100644
View file @
53b3977b
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from types import MethodType
from typing import TYPE_CHECKING, Any, Dict

import torch
from peft import PeftModel
from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
from transformers.integrations import is_deepspeed_zero3_enabled
from transformers.modeling_utils import is_fsdp_enabled

from ..extras import logging
from ..extras.misc import infer_optim_dtype
from .model_utils.attention import configure_attn_implementation, print_attn_implementation
from .model_utils.checkpointing import prepare_model_for_training
from .model_utils.embedding import resize_embedding_layer
from .model_utils.longlora import configure_longlora
from .model_utils.moe import add_z3_leaf_module, configure_moe
from .model_utils.packing import configure_packing
from .model_utils.quantization import configure_quantization
from .model_utils.rope import configure_rope
from .model_utils.valuehead import prepare_valuehead_model
from .model_utils.visual import (
    autocast_projector_dtype,
    configure_visual_model,
    get_image_seqlen,
    get_patch_size,
    get_vision_feature_select_strategy,
)


if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedTokenizer, ProcessorMixin
    from trl import AutoModelForCausalLMWithValueHead

    from ..hparams import ModelArguments


logger = logging.get_logger(__name__)


def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None:
    if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__):
        tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer)


def patch_processor(
    processor: "ProcessorMixin",
    config: "PretrainedConfig",
    tokenizer: "PreTrainedTokenizer",
    model_args: "ModelArguments",
) -> None:
    setattr(processor, "tokenizer", tokenizer)
    setattr(processor, "image_seqlen", get_image_seqlen(config))
    setattr(processor, "image_resolution", model_args.image_resolution)
    setattr(processor, "patch_size", get_patch_size(config, processor))
    setattr(processor, "video_resolution", model_args.video_resolution)
    setattr(processor, "video_fps", model_args.video_fps)
    setattr(processor, "video_maxlen", model_args.video_maxlen)
    setattr(processor, "vision_feature_select_strategy", get_vision_feature_select_strategy(config, processor))


def patch_config(
    config: "PretrainedConfig",
    tokenizer: "PreTrainedTokenizer",
    model_args: "ModelArguments",
    init_kwargs: Dict[str, Any],
    is_trainable: bool,
) -> None:
    if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
        if model_args.infer_dtype != "auto" and not is_trainable:
            model_args.compute_dtype = getattr(torch, model_args.infer_dtype)
        else:
            model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))

    if is_torch_npu_available():
        use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
        torch.npu.set_compile_mode(jit_compile=use_jit_compile)

    configure_attn_implementation(config, model_args, is_trainable)
    configure_rope(config, model_args, is_trainable)
    configure_longlora(config, model_args, is_trainable)
    configure_quantization(config, tokenizer, model_args, init_kwargs)
    configure_moe(config, model_args, is_trainable)
    configure_visual_model(config)
    configure_packing(model_args, is_trainable)

    if model_args.use_cache and not is_trainable:
        setattr(config, "use_cache", True)
        logger.info_rank0("Using KV cache for faster generation.")

    if getattr(config, "model_type", None) == "qwen":
        setattr(config, "use_flash_attn", model_args.flash_attn == "fa2")
        for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
            setattr(config, dtype_name, model_args.compute_dtype == dtype)

    if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn == "fa2":
        setattr(config, "use_cache", False)  # qwen2 does not support use_cache when using flash attn

    if "LlavaLlamaForCausalLM" in getattr(config, "architectures", []):
        raise ValueError("Please download llava models with hf-compatible format: https://huggingface.co/llava-hf")

    # deepspeed zero3 is not compatible with low_cpu_mem_usage
    init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage and (not is_deepspeed_zero3_enabled())

    # cast data type of the model if:
    # 1. not deepspeed zero3 and not fsdp (keep zero3 or fsdp in float32)
    # 2. quantization_bit is not None (qlora)
    if (not is_deepspeed_zero3_enabled() and not is_fsdp_enabled()) or model_args.quantization_bit is not None:
        init_kwargs["torch_dtype"] = model_args.compute_dtype

        if init_kwargs["low_cpu_mem_usage"]:  # device map requires low_cpu_mem_usage=True
            if "device_map" not in init_kwargs and model_args.device_map:
                init_kwargs["device_map"] = model_args.device_map

            if init_kwargs.get("device_map", None) == "auto":
                init_kwargs["offload_folder"] = model_args.offload_folder


def patch_model(
    model: "PreTrainedModel",
    tokenizer: "PreTrainedTokenizer",
    model_args: "ModelArguments",
    is_trainable: bool,
    add_valuehead: bool,
) -> None:
    gen_config = model.generation_config  # check and fix generation config
    if not gen_config.do_sample and (
        (gen_config.temperature is not None and gen_config.temperature != 1.0)
        or (gen_config.top_p is not None and gen_config.top_p != 1.0)
        or (gen_config.typical_p is not None and gen_config.typical_p != 1.0)
    ):
        gen_config.do_sample = True

    if "GenerationMixin" not in str(model.generate.__func__):
        model.generate = MethodType(PreTrainedModel.generate, model)

    if add_valuehead:
        prepare_valuehead_model(model)

    if model_args.resize_vocab:
        resize_embedding_layer(model, tokenizer)

    if is_trainable:
        prepare_model_for_training(model, model_args)
        autocast_projector_dtype(model, model_args)
        add_z3_leaf_module(model)

    if not model_args.use_unsloth:
        print_attn_implementation(model.config)

    try:
        model.add_model_tags(["llama-factory"])
    except Exception:
        logger.warning_rank0("Cannot properly tag the model.")


def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None:
    def tie_weights(self: "AutoModelForCausalLMWithValueHead") -> None:
        if isinstance(self.pretrained_model, PreTrainedModel):
            self.pretrained_model.tie_weights()

    def get_input_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module:
        if isinstance(self.pretrained_model, PreTrainedModel):
            return self.pretrained_model.get_input_embeddings()

    def get_output_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module:
        if isinstance(self.pretrained_model, PreTrainedModel):
            return self.pretrained_model.get_output_embeddings()

    def create_or_update_model_card(self: "AutoModelForCausalLMWithValueHead", output_dir: str) -> None:
        if isinstance(self.pretrained_model, PeftModel):
            self.pretrained_model.create_or_update_model_card(output_dir)

    ignore_modules = [name for name, _ in model.named_parameters() if "pretrained_model" in name]
    setattr(model, "_keys_to_ignore_on_save", ignore_modules)
    setattr(model, "tie_weights", MethodType(tie_weights, model))
    setattr(model, "get_input_embeddings", MethodType(get_input_embeddings, model))
    setattr(model, "get_output_embeddings", MethodType(get_output_embeddings, model))
    setattr(model, "create_or_update_model_card", MethodType(create_or_update_model_card, model))
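A minimal sketch of the generation-config check at the top of `patch_model`: a config that requests greedy decoding but carries non-default sampling knobs is switched to sampling. `GenerationConfig` is the standard transformers class; the values are illustrative:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig(do_sample=False, temperature=0.7, top_p=0.9)
if not gen_config.do_sample and (
    (gen_config.temperature is not None and gen_config.temperature != 1.0)
    or (gen_config.top_p is not None and gen_config.top_p != 1.0)
):
    gen_config.do_sample = True

print(gen_config.do_sample)  # True
```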
LLaMA-Factory/src/llamafactory/train/__init__.py
0 → 100644
View file @
53b3977b
LLaMA-Factory/src/llamafactory/train/callbacks.py
0 → 100644
View file @
53b3977b
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import signal
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import timedelta
from typing import TYPE_CHECKING, Any, Dict, Optional

import torch
import transformers
from peft import PeftModel
from transformers import PreTrainedModel, ProcessorMixin, TrainerCallback
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length
from transformers.utils import (
    SAFE_WEIGHTS_NAME,
    WEIGHTS_NAME,
    is_safetensors_available,
)
from typing_extensions import override

from ..extras import logging
from ..extras.constants import TRAINER_LOG, V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
from ..extras.misc import get_peak_memory


if is_safetensors_available():
    from safetensors import safe_open
    from safetensors.torch import save_file


if TYPE_CHECKING:
    from transformers import TrainerControl, TrainerState, TrainingArguments
    from trl import AutoModelForCausalLMWithValueHead

    from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments


logger = logging.get_logger(__name__)


def fix_valuehead_checkpoint(
    model: "AutoModelForCausalLMWithValueHead", output_dir: str, safe_serialization: bool
) -> None:
    r"""
    The model is already unwrapped.

    There are three cases:
    1. full tuning without ds_zero3: state_dict = {"model.layers.*": ..., "v_head.summary.*": ...}
    2. lora tuning without ds_zero3: state_dict = {"v_head.summary.*": ...}
    3. under deepspeed zero3: state_dict = {"pretrained_model.model.layers.*": ..., "v_head.summary.*": ...}

    We assume `stage3_gather_16bit_weights_on_model_save=true`.
    """
    if not isinstance(model.pretrained_model, (PreTrainedModel, PeftModel)):
        return

    if safe_serialization:
        path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME)
        with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f:
            state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()}
    else:
        path_to_checkpoint = os.path.join(output_dir, WEIGHTS_NAME)
        state_dict: Dict[str, torch.Tensor] = torch.load(path_to_checkpoint, map_location="cpu")

    os.remove(path_to_checkpoint)
    decoder_state_dict, v_head_state_dict = {}, {}
    for name, param in state_dict.items():
        if name.startswith("v_head."):
            v_head_state_dict[name] = param
        else:
            decoder_state_dict[name.replace("pretrained_model.", "", 1)] = param

    model.pretrained_model.save_pretrained(
        output_dir, state_dict=decoder_state_dict or None, safe_serialization=safe_serialization
    )

    if safe_serialization:
        save_file(v_head_state_dict, os.path.join(output_dir, V_HEAD_SAFE_WEIGHTS_NAME), metadata={"format": "pt"})
    else:
        torch.save(v_head_state_dict, os.path.join(output_dir, V_HEAD_WEIGHTS_NAME))

    logger.info_rank0(f"Value head model saved at: {output_dir}")


class FixValueHeadModelCallback(TrainerCallback):
    r"""
    A callback for fixing the checkpoint for valuehead models.
    """

    @override
    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if args.should_save:
            output_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
            fix_valuehead_checkpoint(
                model=kwargs.pop("model"), output_dir=output_dir, safe_serialization=args.save_safetensors
            )


class SaveProcessorCallback(TrainerCallback):
    r"""
    A callback for saving the processor.
    """

    def __init__(self, processor: "ProcessorMixin") -> None:
        self.processor = processor

    @override
    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if args.should_save:
            output_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
            self.processor.save_pretrained(output_dir)

    @override
    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if args.should_save:
            self.processor.save_pretrained(args.output_dir)


class PissaConvertCallback(TrainerCallback):
    r"""
    A callback for converting the PiSSA adapter to a normal one.
    """

    @override
    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if args.should_save:
            model = kwargs.pop("model")
            pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
            logger.info_rank0(f"Initial PiSSA adapter will be saved at: {pissa_init_dir}.")
            if isinstance(model, PeftModel):
                init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights")
                setattr(model.peft_config["default"], "init_lora_weights", True)
                model.save_pretrained(pissa_init_dir, safe_serialization=args.save_safetensors)
                setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights)

    @override
    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if args.should_save:
            model = kwargs.pop("model")
            pissa_init_dir = os.path.join(args.output_dir, "pissa_init")
            pissa_backup_dir = os.path.join(args.output_dir, "pissa_backup")
            pissa_convert_dir = os.path.join(args.output_dir, "pissa_converted")
            logger.info_rank0(f"Converted PiSSA adapter will be saved at: {pissa_convert_dir}.")
            # 1. save a pissa backup with init_lora_weights: True
            # 2. save a converted lora with init_lora_weights: pissa
            # 3. load the pissa backup with init_lora_weights: True
            # 4. delete the initial adapter and change init_lora_weights to pissa
            if isinstance(model, PeftModel):
                init_lora_weights = getattr(model.peft_config["default"], "init_lora_weights")
                setattr(model.peft_config["default"], "init_lora_weights", True)
                model.save_pretrained(pissa_backup_dir, safe_serialization=args.save_safetensors)
                setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights)
                model.save_pretrained(
                    pissa_convert_dir, safe_serialization=args.save_safetensors, convert_pissa_to_lora=pissa_init_dir
                )  # TODO: use `path_initial_model_for_weight_conversion` (peft>=0.12.0)
                model.load_adapter(pissa_backup_dir, "default", is_trainable=True)
                model.set_adapter("default")
                if "pissa_init" in model.peft_config.keys():  # backward compatibility (peft<0.12.0)
                    model.delete_adapter("pissa_init")

                setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights)


class LogCallback(TrainerCallback):
    r"""
    A callback for logging training and evaluation status.
    """

    def __init__(self) -> None:
        # Progress
        self.start_time = 0
        self.cur_steps = 0
        self.max_steps = 0
        self.elapsed_time = ""
        self.remaining_time = ""
        self.thread_pool: Optional["ThreadPoolExecutor"] = None
        # Status
        self.aborted = False
        self.do_train = False
        # Web UI
        self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"]
        if self.webui_mode:
            signal.signal(signal.SIGABRT, self._set_abort)
            self.logger_handler = logging.LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR"))
            logging.add_handler(self.logger_handler)
            transformers.logging.add_handler(self.logger_handler)

    def _set_abort(self, signum, frame) -> None:
        self.aborted = True

    def _reset(self, max_steps: int = 0) -> None:
        self.start_time = time.time()
        self.cur_steps = 0
        self.max_steps = max_steps
        self.elapsed_time = ""
        self.remaining_time = ""

    def _timing(self, cur_steps: int) -> None:
        cur_time = time.time()
        elapsed_time = cur_time - self.start_time
        avg_time_per_step = elapsed_time / cur_steps if cur_steps != 0 else 0
        remaining_time = (self.max_steps - cur_steps) * avg_time_per_step
        self.cur_steps = cur_steps
        self.elapsed_time = str(timedelta(seconds=int(elapsed_time)))
        self.remaining_time = str(timedelta(seconds=int(remaining_time)))

    def _write_log(self, output_dir: str, logs: Dict[str, Any]) -> None:
        with open(os.path.join(output_dir, TRAINER_LOG), "a", encoding="utf-8") as f:
            f.write(json.dumps(logs) + "\n")

    def _create_thread_pool(self, output_dir: str) -> None:
        os.makedirs(output_dir, exist_ok=True)
        self.thread_pool = ThreadPoolExecutor(max_workers=1)

    def _close_thread_pool(self) -> None:
        if self.thread_pool is not None:
            self.thread_pool.shutdown(wait=True)
            self.thread_pool = None

    @override
    def on_init_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if (
            args.should_save
            and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG))
            and args.overwrite_output_dir
        ):
            logger.warning_once("Previous trainer log in this folder will be deleted.")
            os.remove(os.path.join(args.output_dir, TRAINER_LOG))

    @override
    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if args.should_save:
            self.do_train = True
            self._reset(max_steps=state.max_steps)
            self._create_thread_pool(output_dir=args.output_dir)

    @override
    def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        self._close_thread_pool()

    @override
    def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if self.aborted:
            control.should_epoch_stop = True
            control.should_training_stop = True

    @override
    def on_step_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if self.aborted:
            control.should_epoch_stop = True
            control.should_training_stop = True

    @override
    def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if not self.do_train:
            self._close_thread_pool()

    @override
    def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if not self.do_train:
            self._close_thread_pool()

    @override
    def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if not args.should_save:
            return

        self._timing(cur_steps=state.global_step)
        logs = dict(
            current_steps=self.cur_steps,
            total_steps=self.max_steps,
            loss=state.log_history[-1].get("loss"),
            eval_loss=state.log_history[-1].get("eval_loss"),
            predict_loss=state.log_history[-1].get("predict_loss"),
            reward=state.log_history[-1].get("reward"),
            accuracy=state.log_history[-1].get("rewards/accuracies"),
            lr=state.log_history[-1].get("learning_rate"),
            epoch=state.log_history[-1].get("epoch"),
            percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
            elapsed_time=self.elapsed_time,
            remaining_time=self.remaining_time,
        )
        if state.num_input_tokens_seen:
            logs["throughput"] = round(state.num_input_tokens_seen / (time.time() - self.start_time), 2)
            logs["total_tokens"] = state.num_input_tokens_seen

        if os.environ.get("RECORD_VRAM", "0").lower() in ["true", "1"]:
            vram_allocated, vram_reserved = get_peak_memory()
            logs["vram_allocated"] = round(vram_allocated / (1024**3), 2)
            logs["vram_reserved"] = round(vram_reserved / (1024**3), 2)

        logs = {k: v for k, v in logs.items() if v is not None}
        if self.webui_mode and all(key in logs for key in ("loss", "lr", "epoch")):
            log_str = f"'loss': {logs['loss']:.4f}, 'learning_rate': {logs['lr']:2.4e}, 'epoch': {logs['epoch']:.2f}"
            for extra_key in ("reward", "accuracy", "throughput"):
                if logs.get(extra_key):
                    log_str += f", '{extra_key}': {logs[extra_key]:.2f}"

            logger.info_rank0("{" + log_str + "}")

        if self.thread_pool is not None:
            self.thread_pool.submit(self._write_log, args.output_dir, logs)

    @override
    def on_prediction_step(
        self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs
    ):
        if self.do_train:
            return

        if self.aborted:
            sys.exit(0)

        if not args.should_save:
            return

        eval_dataloader = kwargs.pop("eval_dataloader", None)
        if has_length(eval_dataloader):
            if self.max_steps == 0:
                self._reset(max_steps=len(eval_dataloader))
                self._create_thread_pool(output_dir=args.output_dir)

            self._timing(cur_steps=self.cur_steps + 1)
            if self.cur_steps % 5 == 0 and self.thread_pool is not None:
                logs = dict(
                    current_steps=self.cur_steps,
                    total_steps=self.max_steps,
                    percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
                    elapsed_time=self.elapsed_time,
                    remaining_time=self.remaining_time,
                )
                self.thread_pool.submit(self._write_log, args.output_dir, logs)


class ReporterCallback(TrainerCallback):
    r"""
    A callback for reporting training status to external logger.
    """

    def __init__(
        self,
        model_args: "ModelArguments",
        data_args: "DataArguments",
        finetuning_args: "FinetuningArguments",
        generating_args: "GeneratingArguments",
    ) -> None:
        self.model_args = model_args
        self.data_args = data_args
        self.finetuning_args = finetuning_args
        self.generating_args = generating_args
        os.environ["WANDB_PROJECT"] = os.getenv("WANDB_PROJECT", "llamafactory")

    @override
    def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs):
        if not state.is_world_process_zero:
            return

        if "wandb" in args.report_to:
            import wandb

            wandb.config.update(
                {
                    "model_args": self.model_args.to_dict(),
                    "data_args": self.data_args.to_dict(),
                    "finetuning_args": self.finetuning_args.to_dict(),
                    "generating_args": self.generating_args.to_dict(),
                }
            )

        if self.finetuning_args.use_swanlab:
            import swanlab

            swanlab.config.update(
                {
                    "model_args": self.model_args.to_dict(),
                    "data_args": self.data_args.to_dict(),
                    "finetuning_args": self.finetuning_args.to_dict(),
                    "generating_args": self.generating_args.to_dict(),
                }
            )
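For orientation, the records that `LogCallback._write_log` appends to the trainer log are single JSON objects, one per line; a sketch of one such record with purely illustrative values:

```python
import json

log_entry = {
    "current_steps": 10,
    "total_steps": 100,
    "loss": 1.2345,
    "lr": 5e-05,
    "epoch": 0.1,
    "percentage": 10.0,
    "elapsed_time": "0:01:40",
    "remaining_time": "0:15:00",
}
print(json.dumps(log_entry))  # one line appended to the TRAINER_LOG file
```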
LLaMA-Factory/src/llamafactory/train/dpo/__init__.py
0 → 100644
View file @
53b3977b
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .workflow import run_dpo


__all__ = ["run_dpo"]