ModelZoo / Chinese-LLaMA-Alpaca-2 / Commits

Commit 7d06d0f9, authored Jun 26, 2025 by yangzhong
Update files
Parent: 2f320edb
Pipeline #2827 failed with stages in 0 seconds
Showing 4 changed files with 636 additions and 0 deletions (+636 -0)
scripts/training/run_clm_sft_with_peft.py  +513 -0
scripts/training/run_pt.sh                 +58 -0
scripts/training/run_sft.sh                +62 -0
wget-log                                   +3 -0
scripts/training/run_clm_sft_with_peft.py  0 → 100644
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=text-generation
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
from pathlib import Path

import datasets
import torch
from build_dataset import build_instruction_dataset, DataCollatorForSupervisedDataset
import transformers
from transformers import (
    CONFIG_MAPPING,
    AutoConfig,
    BitsAndBytesConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import send_example_telemetry
from transformers.utils.versions import require_version

from peft import LoraConfig, TaskType, get_peft_model, PeftModel, get_peft_model_state_dict
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")


class SavePeftModelCallback(transformers.TrainerCallback):
    def save_model(self, args, state, kwargs):
        if state.best_model_checkpoint is not None:
            checkpoint_folder = os.path.join(state.best_model_checkpoint, "sft_lora_model")
        else:
            checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        peft_model_path = os.path.join(checkpoint_folder, "sft_lora_model")
        kwargs["model"].save_pretrained(peft_model_path)
        kwargs["tokenizer"].save_pretrained(peft_model_path)

    def on_save(self, args, state, control, **kwargs):
        self.save_model(args, state, kwargs)
        return control

    def on_train_end(self, args, state, control, **kwargs):
        peft_model_path = os.path.join(args.output_dir, "sft_lora_model")
        kwargs["model"].save_pretrained(peft_model_path)
        kwargs["tokenizer"].save_pretrained(peft_model_path)


def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
    r"""
    This method wraps the entire protocol for preparing a model before running a training. This includes:
        1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm
        head to fp32

    Args:
        model, (`transformers.PreTrainedModel`):
            The loaded model from `transformers`
    """
    loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)

    for name, param in model.named_parameters():
        # freeze base model's layers
        param.requires_grad = False

    # cast all non INT8/INT4 parameters to fp32
    for param in model.parameters():
        if ((param.dtype == torch.float16) or (param.dtype == torch.bfloat16)) and loaded_in_kbit:
            param.data = param.data.to(torch.float32)

    for name, module in model.named_modules():
        if 'norm' in name:
            module = module.to(torch.float32)

    if loaded_in_kbit and use_gradient_checkpointing:
        # For backward compatibility
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:

            def make_inputs_require_grad(module, _input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        # enable gradient checkpointing for memory efficiency
        model.gradient_checkpointing_enable()

    return model


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch."
            )
        },
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The tokenizer for weights initialization.Don't set if you want to train a model from scratch."
            )
        },
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )

    def __post_init__(self):
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_dir: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[float] = field(
        default=0.05,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )
    data_cache_dir: Optional[str] = field(default=None, metadata={"help": "The datasets processed stored"})

    max_seq_length: Optional[int] = field(default=1024)


@dataclass
class MyTrainingArguments(TrainingArguments):
    trainable: Optional[str] = field(default="q_proj,v_proj")
    lora_rank: Optional[int] = field(default=8)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_alpha: Optional[float] = field(default=32.)
    modules_to_save: Optional[str] = field(default=None)
    peft_path: Optional[str] = field(default=None)
    use_flash_attention_2: Optional[bool] = field(default=False)
    double_quant: Optional[bool] = field(default=True)
    quant_type: Optional[str] = field(default="nf4")
    load_in_kbits: Optional[int] = field(default=16)
    full_finetuning: Optional[bool] = field(default=False)


logger = logging.getLogger(__name__)


def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, MyTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    send_example_telemetry("run_clm", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,  # if training_args.local_rank in [-1, 0] else logging.WARN,
        handlers=[logging.StreamHandler(sys.stdout)],)

    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    # transformers.tokenization_utils.logging.set_verbosity_warning()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.tokenizer_name_or_path:
        tokenizer = LlamaTokenizer.from_pretrained(model_args.tokenizer_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if (len(tokenizer)) != 55296:
        raise ValueError(f"The vocab size of the tokenizer should be 55296, but found {len(tokenizer)}.\n"
                         "Please use Chinese-LLaMA-2 tokenizer.")

    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    eval_dataset = None
    train_dataset = None

    if training_args.do_train:
        with training_args.main_process_first(desc="loading and tokenization"):
            path = Path(data_args.dataset_dir)
            files = [os.path.join(path, file.name) for file in path.glob("*.json")]
            logger.info(f"Training files: {' '.join(files)}")
            train_dataset = build_instruction_dataset(
                data_path=files,
                tokenizer=tokenizer,
                max_seq_length=data_args.max_seq_length,
                data_cache_dir=None,
                preprocessing_num_workers=data_args.preprocessing_num_workers)
        logger.info(f"Num train_samples {len(train_dataset)}")
        logger.info("Training example:")
        logger.info(tokenizer.decode(train_dataset[0]['input_ids']))

    if training_args.do_eval:
        with training_args.main_process_first(desc="loading and tokenization"):
            files = [data_args.validation_file]
            logger.info(f"Evaluation files: {' '.join(files)}")
            eval_dataset = build_instruction_dataset(
                data_path=files,
                tokenizer=tokenizer,
                max_seq_length=data_args.max_seq_length,
                data_cache_dir=None,
                preprocessing_num_workers=data_args.preprocessing_num_workers)
        logger.info(f"Num eval_samples {len(eval_dataset)}")
        logger.info("Evaluation example:")
        logger.info(tokenizer.decode(eval_dataset[0]['input_ids']))

    torch_dtype = (
        model_args.torch_dtype
        if model_args.torch_dtype in ["auto", None]
        else getattr(torch, model_args.torch_dtype)
    )
    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))

    if training_args.load_in_kbits in [4, 8]:
        load_in_4bit = training_args.load_in_kbits == 4
        load_in_8bit = training_args.load_in_kbits == 8
        if training_args.modules_to_save is not None:
            load_in_8bit_skip_modules = training_args.modules_to_save.split(',')
        else:
            load_in_8bit_skip_modules = None
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=training_args.load_in_kbits == 4,
            load_in_8bit=training_args.load_in_kbits == 8,
            llm_int8_threshold=6.0,
            load_in_8bit_skip_modules=load_in_8bit_skip_modules,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=training_args.double_quant,
            bnb_4bit_quant_type=training_args.quant_type  # {'fp4', 'nf4'}
        )
    else:
        load_in_4bit = False
        load_in_8bit = False
        quantization_config = None
    if quantization_config is not None:
        logger.info(f"quantization_config: {quantization_config.to_dict()}")

    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    model = LlamaForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        device_map=device_map,
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
        quantization_config=quantization_config,
        use_flash_attention_2=training_args.use_flash_attention_2
    )
    if training_args.load_in_kbits in [4, 8]:
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
    model.config.use_cache = False

    model_vocab_size = model.get_input_embeddings().weight.shape[0]
    logger.info(f"Model vocab size: {model_vocab_size}")
    logger.info(f"len(tokenizer): {len(tokenizer)}")
    if model_vocab_size != len(tokenizer):
        logger.info(f"Resize model vocab size to {len(tokenizer)}")
        model.resize_token_embeddings(len(tokenizer))

    if not training_args.full_finetuning:
        if training_args.peft_path is not None:
            logger.info("Peft from pre-trained model")
            model = PeftModel.from_pretrained(model, training_args.peft_path, device_map=device_map)
        else:
            logger.info("Init new peft model")
            target_modules = training_args.trainable.split(',')
            modules_to_save = training_args.modules_to_save
            if modules_to_save is not None:
                modules_to_save = modules_to_save.split(',')
            lora_rank = training_args.lora_rank
            lora_dropout = training_args.lora_dropout
            lora_alpha = training_args.lora_alpha
            logger.info(f"target_modules: {target_modules}")
            logger.info(f"lora_rank: {lora_rank}")
            peft_config = LoraConfig(
                task_type=TaskType.CAUSAL_LM,
                target_modules=target_modules,
                inference_mode=False,
                r=lora_rank,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                modules_to_save=modules_to_save)
            model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()
        logger.info(f"model.modules_to_save: {model.modules_to_save}")
        old_state_dict = model.state_dict
        model.state_dict = (
            lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
        ).__get__(model, type(model))

    if not training_args.full_finetuning and training_args.gradient_checkpointing and \
        (not model.modules_to_save or 'embed_tokens' not in model.modules_to_save):
        # enable requires_grad to avoid exception during backward pass when using gradient_checkpoint without tuning embed.
        if hasattr(model.base_model, "enable_input_require_grads"):
            model.base_model.enable_input_require_grads()
        elif hasattr(model.base_model, "get_input_embeddings"):
            def make_inputs_require_grad(_module, _input, _output):
                _output.requires_grad_(True)
            model.base_model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.add_callback(SavePeftModelCallback)

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)

        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        metrics["eval_samples"] = len(eval_dataset)
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)


if __name__ == "__main__":
    main()
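The SavePeftModelCallback above writes only the LoRA adapter and tokenizer to an sft_lora_model directory inside each checkpoint and at the end of training. Below is a minimal sketch of how such an adapter could be loaded back for a quick generation check; the base-model and adapter paths are placeholders, and the prompt is plain text rather than the project's chat template, so treat it as an illustration rather than the repository's official inference path.

# Sketch: load a base Llama-2 model plus the LoRA adapter saved by SavePeftModelCallback.
# Paths below are placeholders; adjust them to your own checkpoint layout.
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel

base_model_path = "path/to/chinese-llama-2-base"    # placeholder
adapter_path = "path/to/output_dir/sft_lora_model"  # directory written by the callback above

tokenizer = LlamaTokenizer.from_pretrained(adapter_path)
model = LlamaForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, device_map="auto")

# The adapter is trained against the extended 55296-token Chinese tokenizer,
# so the embeddings may need resizing before the adapter is attached.
if model.get_input_embeddings().weight.shape[0] != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(model, adapter_path)
model.eval()

inputs = tokenizer("Give three tips for staying healthy.", return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))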
scripts/training/run_pt.sh  0 → 100755
# Read the wiki (https://github.com/ymcui/Chinese-LLaMA-Alpaca-2/wiki/pt_scripts_zh) carefully before running this script
lr=2e-4
lora_rank=64
lora_alpha=128
lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
modules_to_save="embed_tokens,lm_head"
lora_dropout=0.05

pretrained_model="/Chinese-LLaMA-Alpaca-2/pre_model"
chinese_tokenizer_path="/Chinese-LLaMA-Alpaca-2/scripts/tokenizer/tokenizer.model"
dataset_dir="/Chinese-LLaMA-Alpaca-2/dataset"
data_cache="/Chinese-LLaMA-Alpaca-2/tmp"
per_device_train_batch_size=1
gradient_accumulation_steps=8
block_size=512
output_dir="/Chinese-LLaMA-Alpaca-2/output"

deepspeed_config_file="./ds_zero2_no_offload.json"

torchrun --nnodes 1 --nproc_per_node 8 --master_port=25007 run_clm_pt_with_peft.py \
    --deepspeed ${deepspeed_config_file} \
    --model_name_or_path ${pretrained_model} \
    --tokenizer_name_or_path ${chinese_tokenizer_path} \
    --dataset_dir ${dataset_dir} \
    --data_cache_dir ${data_cache} \
    --validation_split_percentage 0.001 \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --do_train \
    --seed $RANDOM \
    --fp16 \
    --num_train_epochs 1 \
    --lr_scheduler_type cosine \
    --learning_rate ${lr} \
    --warmup_ratio 0.05 \
    --weight_decay 0.01 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 3 \
    --save_steps 200 \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --preprocessing_num_workers 8 \
    --block_size ${block_size} \
    --output_dir ${output_dir} \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --lora_rank ${lora_rank} \
    --lora_alpha ${lora_alpha} \
    --trainable ${lora_trainable} \
    --lora_dropout ${lora_dropout} \
    --modules_to_save ${modules_to_save} \
    --torch_dtype float16 \
    --load_in_kbits 16 \
    --save_safetensors False \
    --gradient_checkpointing \
    --ddp_find_unused_parameters False
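Both launch scripts point at a ds_zero2_no_offload.json DeepSpeed configuration that is not shown here. As a rough sketch only, the snippet below generates a plausible ZeRO stage-2, no-offload config using standard DeepSpeed keys and the "auto" values that the Hugging Face Trainer integration fills in from its own arguments; the file actually shipped with the repository may differ.

# Sketch: write a minimal ZeRO stage-2 (no offload) DeepSpeed config.
# Keys and "auto" placeholders follow the standard transformers/DeepSpeed integration;
# the repository's real ds_zero2_no_offload.json may contain different settings.
import json

ds_zero2_no_offload = {
    "zero_optimization": {
        "stage": 2,                      # shard optimizer states and gradients
        "allgather_partitions": True,
        "overlap_comm": True,
        "contiguous_gradients": True,
    },
    "fp16": {"enabled": "auto"},         # "auto" inherits --fp16 from the Trainer
    "bf16": {"enabled": "auto"},
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
}

with open("ds_zero2_no_offload.json", "w") as f:
    json.dump(ds_zero2_no_offload, f, indent=4)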
scripts/training/run_sft.sh  0 → 100644
# Read the wiki (https://github.com/ymcui/Chinese-LLaMA-Alpaca-2/wiki/sft_scripts_zh) carefully before running this script
lr=1e-4
lora_rank=64
lora_alpha=128
lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
modules_to_save="embed_tokens,lm_head"
lora_dropout=0.05

pretrained_model=path/to/hf/llama-2/or/chinese-llama-2/dir/or/model_id
chinese_tokenizer_path=path/to/chinese-llama-2/tokenizer/dir
dataset_dir=path/to/sft/data/dir
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=8
max_seq_length=512
output_dir=output_dir
validation_file=validation_file_name

deepspeed_config_file=ds_zero2_no_offload.json

torchrun --nnodes 1 --nproc_per_node 1 run_clm_sft_with_peft.py \
    --deepspeed ${deepspeed_config_file} \
    --model_name_or_path ${pretrained_model} \
    --tokenizer_name_or_path ${chinese_tokenizer_path} \
    --dataset_dir ${dataset_dir} \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --do_train \
    --do_eval \
    --seed $RANDOM \
    --fp16 \
    --num_train_epochs 1 \
    --lr_scheduler_type cosine \
    --learning_rate ${lr} \
    --warmup_ratio 0.03 \
    --weight_decay 0 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 3 \
    --evaluation_strategy steps \
    --eval_steps 100 \
    --save_steps 200 \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --preprocessing_num_workers 8 \
    --max_seq_length ${max_seq_length} \
    --output_dir ${output_dir} \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    --lora_rank ${lora_rank} \
    --lora_alpha ${lora_alpha} \
    --trainable ${lora_trainable} \
    --lora_dropout ${lora_dropout} \
    --modules_to_save ${modules_to_save} \
    --torch_dtype float16 \
    --validation_file ${validation_file} \
    --load_in_kbits 16 \
    --save_safetensors False \
    --gradient_checkpointing \
    --ddp_find_unused_parameters False
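A common follow-up to the SFT run above is folding the adapter back into the base weights so the result can be served without peft. The upstream Chinese-LLaMA-Alpaca-2 project provides its own merge scripts for this; the snippet below is only a minimal sketch using peft's merge_and_unload, with placeholder paths, and ignores the low-memory, shard-by-shard handling that a real merge of a large model may require.

# Sketch: merge the trained LoRA deltas into the base model and save a standalone copy.
# Assumes enough memory to hold the full model in fp16; paths are placeholders.
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel

base_model_path = "path/to/chinese-llama-2-base"    # placeholder
adapter_path = "path/to/output_dir/sft_lora_model"  # produced by run_sft.sh
merged_path = "path/to/merged_model"                # placeholder

tokenizer = LlamaTokenizer.from_pretrained(adapter_path)
model = LlamaForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16)
if model.get_input_embeddings().weight.shape[0] != len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(model, adapter_path)
model = model.merge_and_unload()  # returns a plain LlamaForCausalLM with LoRA weights merged

model.save_pretrained(merged_path)
tokenizer.save_pretrained(merged_path)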
wget-log  0 → 100644
--2024-12-18 15:54:33-- https://drive.usercontent.google.com/open?id=1EX8eE5YWBxCaohBO8Fh4e2j3b9C2bTVQ
Connecting to 10.16.5.10:3128... connected.
Proxy tunneling failed: Service Unavailable
Unable to establish SSL connection.