Commit 70368616 authored by silencealiang's avatar silencealiang
Browse files

update model parameters (lr schedule, parallelism sizes, numactl binding)

parent 8551c38e
......@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
......@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
......@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
if [ -n "${WANDB_API_KEY}" ]; then
......@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
......@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
......@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 4
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
if [ -n "${WANDB_API_KEY}" ]; then
......@@ -175,43 +177,34 @@ fi
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
......@@ -170,7 +170,7 @@ def model_provider(
rope_scaling=args.use_rope_scaling,
mtp_block_spec=mtp_block_spec,
)
print_rank_0(model)
return model
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment