Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
evt_fugx1
dcu_megatron
Commits
70368616
Commit
70368616
authored
Apr 30, 2025
by
silencealiang
Browse files
update model parameters
parent
8551c38e
Changes
43
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
30 additions
and
44 deletions
+30
-44
examples/mixtral/train_mixtral_8x7B_1nodes.sh
examples/mixtral/train_mixtral_8x7B_1nodes.sh
+15
-22
examples/mixtral/train_mixtral_8x7B_multinodes.sh
examples/mixtral/train_mixtral_8x7B_multinodes.sh
+14
-21
pretrain_gpt.py
pretrain_gpt.py
+1
-1
No files found.
examples/mixtral/train_mixtral_8x7B_1nodes.sh
View file @
70368616
...
...
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size
256
--lr
1e-4
--train-iters
10
--lr-decay-iters
32
0000
--lr-decay-iters
1
0000
--lr-decay-style
cosine
--min-lr
1.0e-
5
--min-lr
1.0e-
6
--weight-decay
0.1
--lr-warmup-iters
5
00
--lr-warmup-iters
20
00
--clip-grad
1.0
--bf16
--overlap-param-gather
...
...
@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
--profile-ranks
0 1 2 3 4 5 6 7
--profile-step-start
3
--profile-step-end
4
--profile-dir
torch_prof_mixtral8x
7
B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--profile-dir
torch_prof_mixtral8x
22
B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
...
...
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size
1
--expert-model-parallel-size
8
--expert-tensor-parallel-size
1
--context-parallel-size
1
--use-distributed-optimizer
--sequence-parallel
)
...
...
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir
"
${
CHECKPOINT_PATH
}
/tensorboard"
\
--no-load-optim
\
--no-load-rng
--no-load-rng
\
--no-save-optim
)
if
[
-n
"
${
WANDB_API_KEY
}
"
]
;
then
...
...
@@ -175,43 +177,34 @@ fi
case
${
LOCAL_RANK
}
in
[
0]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
[
1]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl
--cpunodebind
=
1
--membind
=
1
${
APP
}
;;
[
2]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl
--cpunodebind
=
2
--membind
=
2
${
APP
}
;;
[
3]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
[
4]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl
--cpunodebind
=
4
--membind
=
4
${
APP
}
;;
[
5]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl
--cpunodebind
=
5
--membind
=
5
${
APP
}
;;
[
6]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl
--cpunodebind
=
6
--membind
=
6
${
APP
}
;;
[
7]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl
--cpunodebind
=
7
--membind
=
7
${
APP
}
;;
esac
examples/mixtral/train_mixtral_8x7B_multinodes.sh
View file @
70368616
...
...
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
--global-batch-size
256
--lr
1e-4
--train-iters
10
--lr-decay-iters
32
0000
--lr-decay-iters
1
0000
--lr-decay-style
cosine
--min-lr
1.0e-
5
--min-lr
1.0e-
6
--weight-decay
0.1
--lr-warmup-iters
5
00
--lr-warmup-iters
20
00
--clip-grad
1.0
--bf16
--overlap-param-gather
...
...
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size
4
--expert-model-parallel-size
8
--expert-tensor-parallel-size
1
--context-parallel-size
1
--use-distributed-optimizer
--sequence-parallel
)
...
...
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir
"
${
CHECKPOINT_PATH
}
/tensorboard"
\
--no-load-optim
\
--no-load-rng
--no-load-rng
\
--no-save-optim
)
if
[
-n
"
${
WANDB_API_KEY
}
"
]
;
then
...
...
@@ -175,43 +177,34 @@ fi
case
${
LOCAL_RANK
}
in
[
0]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=0 --membind=0 ${APP}
numactl
--cpunodebind
=
0
--membind
=
0
${
APP
}
;;
[
1]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=1 --membind=1 ${APP}
numactl
--cpunodebind
=
1
--membind
=
1
${
APP
}
;;
[
2]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=2 --membind=2 ${APP}
numactl
--cpunodebind
=
2
--membind
=
2
${
APP
}
;;
[
3]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=3 --membind=3 ${APP}
numactl
--cpunodebind
=
3
--membind
=
3
${
APP
}
;;
[
4]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=4 --membind=4 ${APP}
numactl
--cpunodebind
=
4
--membind
=
4
${
APP
}
;;
[
5]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=5 --membind=5 ${APP}
numactl
--cpunodebind
=
5
--membind
=
5
${
APP
}
;;
[
6]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=6 --membind=6 ${APP}
numactl
--cpunodebind
=
6
--membind
=
6
${
APP
}
;;
[
7]
)
export
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
${
APP
}
#numactl --cpunodebind=7 --membind=7 ${APP}
numactl
--cpunodebind
=
7
--membind
=
7
${
APP
}
;;
esac
pretrain_gpt.py
View file @
70368616
...
...
@@ -170,7 +170,7 @@ def model_provider(
rope_scaling
=
args
.
use_rope_scaling
,
mtp_block_spec
=
mtp_block_spec
,
)
print_rank_0
(
model
)
return
model
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment