Merge branch 'k100ai_dtk2404_optim'

729150b6 · yinger_z · d8704953 · b0b8940e · 729150b6 · 729150b6
Commit 729150b6 authored Aug 29, 2024 by yinger_z
20 changed files
--- a/limitations/self-confusion_tencent.jpg
+++ b/limitations/self-confusion_tencent.jpg
--- a/model.properties
+++ b/model.properties
--- a/model/modeling_chatglm.py
+++ b/model/modeling_chatglm.py
--- a/ptuning/arguments.py
+++ b/ptuning/arguments.py
--- a/ptuning/deepspeed.json
+++ b/ptuning/deepspeed.json
@@ -21,14 +21,6 @@
  },
  "zero_optimization": {
    "stage": 3,
-    "offload_optimizer": {
-      "device": "cpu",
-      "pin_memory": true
-    },
-    "offload_param": {
-      "device": "cpu",
-      "pin_memory": true
-    },
    "stage3_gather_16bit_weights_on_model_save": true,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,

--- a/ptuning/deepspeed_zero2.json
+++ b/ptuning/deepspeed_zero2.json
--- a/ptuning/evaluate_ft.sh
+++ b/ptuning/evaluate_ft.sh
--- a/ptuning/evaluate_ptuning.sh
+++ b/ptuning/evaluate_ptuning.sh
--- a/ptuning/ft_train.sh
+++ b/ptuning/ft_train.sh
 LR=5e-5
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_P2P_LEVEL=5
+export NCCL_MIN_NCHANNELS=20
 MASTER_PORT=$(shuf -n 1 -i 10000-65535)
 HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT main.py \
    --deepspeed deepspeed.json \
    --do_train \
@@ -9,19 +10,17 @@ HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT ma
    --test_file AdvertiseGen/dev.json \
    --prompt_column content \
    --response_column summary \
-    --overwrite_cache \
+    --model_name_or_path ../model/chatglm-6b \
-    --model_name_or_path THUDM/chatglm-6b \
+    --output_dir ./output_ft/nooptim-adgen-chatglm-6b-ft-4c-$LR \
-    --output_dir ./output_ft/adgen-chatglm-6b-ft-4c-$LR \
    --overwrite_output_dir \
-    --max_source_length 64 \
+    --max_source_length 512 \
-    --max_target_length 64 \
+    --max_target_length 512 \
-    --per_device_train_batch_size 32 \
+    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --predict_with_generate \
-    --max_steps 5000 \
+    --max_steps 30 \
-    --logging_steps 10 \
+    --logging_steps 30 \
    --save_steps 1000 \
    --learning_rate $LR \
-    --fp16
+    --fp16 \
\ No newline at end of file
--- a/ptuning/lora_train.sh
+++ b/ptuning/lora_train.sh
--- a/ptuning/main.py
+++ b/ptuning/main.py
--- a/ptuning/media/6B_ds_ft_bs32_accum1_4cards_zero3_5e-5.jpg
+++ b/ptuning/media/6B_ds_ft_bs32_accum1_4cards_zero3_5e-5.jpg
--- a/ptuning/media/6B_ds_pt_bs16_accum1_4cards_zero2_5e-3.jpg
+++ b/ptuning/media/6B_ds_pt_bs16_accum1_4cards_zero2_5e-3.jpg
--- a/ptuning/media/GLM.png
+++ b/ptuning/media/GLM.png
--- a/ptuning/media/cli.png
+++ b/ptuning/media/cli.png
--- a/ptuning/media/pretrain.jpeg
+++ b/ptuning/media/pretrain.jpeg
--- a/ptuning/media/transformers.jpg
+++ b/ptuning/media/transformers.jpg
--- a/ptuning/multi_node/run_train.sh
+++ b/ptuning/multi_node/run_train.sh
--- a/ptuning/multi_node/run_train_single.sh
+++ b/ptuning/multi_node/run_train_single.sh
--- a/ptuning/ptuning_train.sh
+++ b/ptuning/ptuning_train.sh
-PRE_SEQ_LEN=128
+PRE_SEQ_LEN=1024
 LR=5e-3
-MASTER_PORT=$(shuf -n 1 -i 10000-65535)
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_P2P_LEVEL=5
+export NCCL_MIN_NCHANNELS=20
-HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --master_port $MASTER_PORT main.py \
+MASTER_PORT=$(shuf -n 1 -i 10000-65535)
-    --deepspeed deepspeed.json \
+HIP_VISIBLE_DEVICES=0,1,2,3 deepspeed --num_gpus=4 --master_port $MASTER_PORT main.py \
    --do_train \
    --train_file AdvertiseGen/train.json \
    --test_file AdvertiseGen/dev.json \
    --prompt_column content \
    --response_column summary \
-    --model_name_or_path THUDM/chatglm-6b \
+    --model_name_or_path ../model/chatglm-6b \
    --output_dir ./output_pt/adgen-chatglm-6b-pt-4c-$LR \
    --overwrite_output_dir \
-    --max_source_length 64 \
+    --max_source_length 512 \
-    --max_target_length 64 \
+    --max_target_length 512 \
-    --per_device_train_batch_size 16 \
+    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --predict_with_generate \
-    --max_steps 3000 \
+    --max_steps 30 \
-    --logging_steps 10 \
+    --logging_steps 1 \
    --save_steps 1000 \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN \
-    --fp16
+    --fp16 \
\ No newline at end of file