Commit 729150b6 authored by yinger_z
Browse files

Merge branch 'k100ai_dtk2404_optim'

parents d8704953 b0b8940e
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
This diff is collapsed.
File mode changed from 100644 to 100755
...@@ -21,14 +21,6 @@ ...@@ -21,14 +21,6 @@
}, },
"zero_optimization": { "zero_optimization": {
"stage": 3, "stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"stage3_gather_16bit_weights_on_model_save": true, "stage3_gather_16bit_weights_on_model_save": true,
"allgather_partitions": true, "allgather_partitions": true,
"allgather_bucket_size": 5e8, "allgather_bucket_size": 5e8,
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
LR=5e-5 LR=5e-5
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export NCCL_MIN_NCHANNELS=20
MASTER_PORT=$(shuf -n 1 -i 10000-65535) MASTER_PORT=$(shuf -n 1 -i 10000-65535)
HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT main.py \ HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT main.py \
--deepspeed deepspeed.json \ --deepspeed deepspeed.json \
--do_train \ --do_train \
...@@ -9,19 +10,17 @@ HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT ma ...@@ -9,19 +10,17 @@ HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT ma
--test_file AdvertiseGen/dev.json \ --test_file AdvertiseGen/dev.json \
--prompt_column content \ --prompt_column content \
--response_column summary \ --response_column summary \
--overwrite_cache \ --model_name_or_path ../model/chatglm-6b \
--model_name_or_path THUDM/chatglm-6b \ --output_dir ./output_ft/nooptim-adgen-chatglm-6b-ft-4c-$LR \
--output_dir ./output_ft/adgen-chatglm-6b-ft-4c-$LR \
--overwrite_output_dir \ --overwrite_output_dir \
--max_source_length 64 \ --max_source_length 512 \
--max_target_length 64 \ --max_target_length 512 \
--per_device_train_batch_size 32 \ --per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \ --per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \ --gradient_accumulation_steps 1 \
--predict_with_generate \ --predict_with_generate \
--max_steps 5000 \ --max_steps 30 \
--logging_steps 10 \ --logging_steps 30 \
--save_steps 1000 \ --save_steps 1000 \
--learning_rate $LR \ --learning_rate $LR \
--fp16 --fp16 \
\ No newline at end of file
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
PRE_SEQ_LEN=128 PRE_SEQ_LEN=1024
LR=5e-3 LR=5e-3
MASTER_PORT=$(shuf -n 1 -i 10000-65535) export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_P2P_LEVEL=5
export NCCL_MIN_NCHANNELS=20
HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --master_port $MASTER_PORT main.py \ MASTER_PORT=$(shuf -n 1 -i 10000-65535)
--deepspeed deepspeed.json \ HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT main.py \
--do_train \ --do_train \
--train_file AdvertiseGen/train.json \ --train_file AdvertiseGen/train.json \
--test_file AdvertiseGen/dev.json \ --test_file AdvertiseGen/dev.json \
--prompt_column content \ --prompt_column content \
--response_column summary \ --response_column summary \
--model_name_or_path THUDM/chatglm-6b \ --model_name_or_path ../model/chatglm-6b \
--output_dir ./output_pt/adgen-chatglm-6b-pt-4c-$LR \ --output_dir ./output_pt/adgen-chatglm-6b-pt-4c-$LR \
--overwrite_output_dir \ --overwrite_output_dir \
--max_source_length 64 \ --max_source_length 512 \
--max_target_length 64 \ --max_target_length 512 \
--per_device_train_batch_size 16 \ --per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \ --per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \ --gradient_accumulation_steps 1 \
--predict_with_generate \ --predict_with_generate \
--max_steps 3000 \ --max_steps 30 \
--logging_steps 10 \ --logging_steps 1 \
--save_steps 1000 \ --save_steps 1000 \
--learning_rate $LR \ --learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN \ --pre_seq_len $PRE_SEQ_LEN \
--fp16 --fp16 \
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment