Commit 70368616 authored by silencealiang

update model parameters

parent 8551c38e
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
     --global-batch-size 256
     --lr 1e-4
     --train-iters 10
-    --lr-decay-iters 320000
+    --lr-decay-iters 10000
     --lr-decay-style cosine
-    --min-lr 1.0e-5
+    --min-lr 1.0e-6
     --weight-decay 0.1
-    --lr-warmup-iters 500
+    --lr-warmup-iters 2000
     --clip-grad 1.0
     --bf16
     --overlap-param-gather
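The schedule changes hang together: warmup is stretched from 500 to 2000 iterations, the cosine decay horizon drops from 320000 to 10000 iterations, and the LR floor falls to 1.0e-6. A rough sketch of the resulting curve, assuming Megatron-style linear warmup followed by cosine decay down to --min-lr (the lr_at helper is hypothetical, for illustration only):

    lr_at() {
      # LR at iteration t: linear warmup to 1e-4, then cosine decay to 1e-6
      awk -v t="$1" 'BEGIN {
        max = 1e-4; min = 1e-6; warm = 2000; decay = 10000
        if (t <= warm) lr = max * t / warm
        else if (t >= decay) lr = min
        else lr = min + 0.5 * (max - min) * (1 + cos(3.14159265358979 * (t - warm) / (decay - warm)))
        printf "%.3e\n", lr
      }'
    }
    lr_at 2000    # 1.000e-04  (peak LR, end of warmup)
    lr_at 6000    # 5.050e-05  (halfway through the decay window)
    lr_at 10000   # 1.000e-06  (fully decayed to --min-lr)

Note that with --train-iters 10 the run stops long before any of this matters; the schedule values are presumably being staged for longer runs.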
@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 4 5 6 7
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
+    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
     --use-pytorch-profiler
 )
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
     --pipeline-model-parallel-size 1
     --expert-model-parallel-size 8
     --expert-tensor-parallel-size 1
+    --context-parallel-size 1
     --use-distributed-optimizer
     --sequence-parallel
 )
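The layout itself is unchanged here; the new line just pins the context-parallel dimension explicitly to its default of 1, matching the cp1 suffix in the profile directory name above. As a quick sanity check of what this layout implies on one 8-GPU node, assuming the usual WORLD_SIZE = TP x PP x CP x DP decomposition:

    WORLD_SIZE=8; TP=2; PP=1; CP=1
    echo "DP = $(( WORLD_SIZE / (TP * PP * CP) ))"   # DP = 4
    # --expert-model-parallel-size 8 with --expert-tensor-parallel-size 1
    # shards the MoE experts 8 ways across the node's GPUs.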
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
     #--load $CHECKPOINT_PATH \
     --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
     --no-load-optim \
-    --no-load-rng
+    --no-load-rng \
+    --no-save-optim
 )
 if [ -n "${WANDB_API_KEY}" ]; then
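Two things change here: the trailing backslash added to --no-load-rng keeps the continuation chain valid once a line follows it, and the new --no-save-optim skips writing optimizer state into checkpoints. For a 10-iteration test run that is a reasonable trade, since the distributed-optimizer state typically dominates checkpoint size and these checkpoints are never resumed from.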
@@ -175,43 +177,34 @@ fi
 case ${LOCAL_RANK} in
     [0])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=0 --membind=0 ${APP}
+        numactl --cpunodebind=0 --membind=0 ${APP}
         ;;
     [1])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=1 --membind=1 ${APP}
+        numactl --cpunodebind=1 --membind=1 ${APP}
         ;;
     [2])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=2 --membind=2 ${APP}
+        numactl --cpunodebind=2 --membind=2 ${APP}
         ;;
     [3])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=3 --membind=3 ${APP}
+        numactl --cpunodebind=3 --membind=3 ${APP}
         ;;
     [4])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=4 --membind=4 ${APP}
+        numactl --cpunodebind=4 --membind=4 ${APP}
         ;;
     [5])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=5 --membind=5 ${APP}
+        numactl --cpunodebind=5 --membind=5 ${APP}
         ;;
     [6])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=6 --membind=6 ${APP}
+        numactl --cpunodebind=6 --membind=6 ${APP}
         ;;
     [7])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=7 --membind=7 ${APP}
+        numactl --cpunodebind=7 --membind=7 ${APP}
         ;;
 esac
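The commented-out numactl bindings become the live launch path: each local rank now pins its CPU threads and memory allocations to one NUMA node before running the app, instead of launching ${APP} unpinned. Since the bound node always equals the rank, the whole case statement could be collapsed to a single line; a sketch, assuming local ranks 0-7 map one-to-one onto NUMA nodes 0-7 on this host (verify with numactl --hardware):

    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind="${LOCAL_RANK}" --membind="${LOCAL_RANK}" ${APP}

The explicit per-rank form does leave room to special-case a rank later if the GPU-to-NUMA topology turns out not to be the identity mapping. A second script in this commit receives the same set of changes, differing mainly in its pipeline-parallel setting: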
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
     --global-batch-size 256
     --lr 1e-4
     --train-iters 10
-    --lr-decay-iters 320000
+    --lr-decay-iters 10000
     --lr-decay-style cosine
-    --min-lr 1.0e-5
+    --min-lr 1.0e-6
     --weight-decay 0.1
-    --lr-warmup-iters 500
+    --lr-warmup-iters 2000
     --clip-grad 1.0
     --bf16
     --overlap-param-gather
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
     --pipeline-model-parallel-size 4
     --expert-model-parallel-size 8
     --expert-tensor-parallel-size 1
+    --context-parallel-size 1
     --use-distributed-optimizer
     --sequence-parallel
 )
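Identical change, but this variant runs with --pipeline-model-parallel-size 4 instead of 1. If its tensor-parallel size is likewise 2, the 2 x 4 = 8-way model parallelism consumes the whole node, leaving a data-parallel size of 1.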
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
     #--load $CHECKPOINT_PATH \
     --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
     --no-load-optim \
-    --no-load-rng
+    --no-load-rng \
+    --no-save-optim
 )
 if [ -n "${WANDB_API_KEY}" ]; then
@@ -175,43 +177,34 @@ fi
 case ${LOCAL_RANK} in
     [0])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=0 --membind=0 ${APP}
+        numactl --cpunodebind=0 --membind=0 ${APP}
         ;;
     [1])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=1 --membind=1 ${APP}
+        numactl --cpunodebind=1 --membind=1 ${APP}
         ;;
     [2])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=2 --membind=2 ${APP}
+        numactl --cpunodebind=2 --membind=2 ${APP}
         ;;
     [3])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=3 --membind=3 ${APP}
+        numactl --cpunodebind=3 --membind=3 ${APP}
         ;;
     [4])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=4 --membind=4 ${APP}
+        numactl --cpunodebind=4 --membind=4 ${APP}
         ;;
     [5])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=5 --membind=5 ${APP}
+        numactl --cpunodebind=5 --membind=5 ${APP}
         ;;
     [6])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=6 --membind=6 ${APP}
+        numactl --cpunodebind=6 --membind=6 ${APP}
         ;;
     [7])
         export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-        ${APP}
-        #numactl --cpunodebind=7 --membind=7 ${APP}
+        numactl --cpunodebind=7 --membind=7 ${APP}
         ;;
 esac
@@ -170,7 +170,7 @@ def model_provider(
         rope_scaling=args.use_rope_scaling,
         mtp_block_spec=mtp_block_spec,
     )
+    print_rank_0(model)
     return model
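print_rank_0 is Megatron-LM's rank-guarded print: when torch.distributed is initialized it prints only on global rank 0, so the (large) printed module tree shows up once per job rather than once per worker process.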
...