evt_fugx1 / dcu_megatron / Commits / 70368616

Commit 70368616, authored Apr 30, 2025 by silencealiang
update model parameters
Parent: 8551c38e
Showing 20 changed files with 1354 additions and 248 deletions (+1354 / -248)
examples/deepseek_v3/hostfile_deepseekv3_671B (+0 / -0)
examples/deepseek_v3/hostfile_deepseekv3_671B_4nodes (+0 / -0)
examples/deepseek_v3/run_deepseekv3_671B_1nodes.sh (+14 / -0)
examples/deepseek_v3/run_deepseekv3_671B_4nodes.sh (+17 / -0)
examples/deepseek_v3/run_deepseekv3_671B_multinodes.sh (+17 / -0)
examples/deepseek_v3/topo-input.xml (+162 / -0)
examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh (+138 / -99)
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh (+472 / -0)
examples/deepseek_v3/train_deepseekv3_671B_multinodes.sh (+472 / -0)
examples/gpt3/README.md (+0 / -57)
examples/gpt3/run_gpt_567B_1nodes.sh (+1 / -1)
examples/gpt3/run_gpt_567B_multinodes.sh (+2 / -2)
examples/gpt3/train_gpt_567B_1nodes.sh (+12 / -19)
examples/gpt3/train_gpt_567B_multinodes.sh (+14 / -23)
examples/mixtral/run_mixtral_8x22B_1nodes.sh (+1 / -1)
examples/mixtral/run_mixtral_8x22B_multinodes.sh (+1 / -1)
examples/mixtral/run_mixtral_8x7B_1nodes.sh (+1 / -1)
examples/mixtral/run_mixtral_8x7B_multinodes.sh (+1 / -1)
examples/mixtral/train_mixtral_8x22B_1nodes.sh (+14 / -21)
examples/mixtral/train_mixtral_8x22B_multinodes.sh (+15 / -22)
examples/deepseek_v3/hostfile_deepseekv3_671B (new file, mode 100644)

examples/deepseek_v3/hostfile_deepseekv3_671B_4nodes (new file, mode 100644)
examples/deepseek_v3/run_deepseek_v3_1node.sh → examples/deepseek_v3/run_deepseekv3_671B_1nodes.sh (renamed, mode 100644 → 100755)
@@ -2,14 +2,13 @@ for para in $*
 do
     if [[ $para == --profiling* ]]; then
         profiling=${para#*=}
-        export GPU_FLUSH_ON_EXECUTION=1
-        export HIP_DIRECT_DISPATCH=0
     fi
 done
 mpirun -np 8 --allow-run-as-root \
-    train_deepseek_v3_1node.sh localhost --profiling=$profiling > output.log 2>&1
+    train_deepseekv3_671B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
-rm -rf CKPT
+rm -rf output
+rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
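For reference, the renamed single-node launcher is still driven the same way; a minimal usage sketch (the working directory and the choice of --profiling value are assumptions, not part of the commit):

```
# Hypothetical invocation from examples/deepseek_v3 on one 8-GPU node.
# The log file name is produced by the script itself via `date +%F-%H%M`.
cd examples/deepseek_v3
bash run_deepseekv3_671B_1nodes.sh --profiling=torch
```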
examples/deepseek_v3/run_deepseekv3_671B_4nodes.sh (new file, mode 100755)
for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_deepseekv3_671B_4nodes.sh node002 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait

rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
examples/deepseek_v3/run_deepseekv3_671B_multinodes.sh (new file, mode 100755)
for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_deepseekv3_671B_multinodes.sh node001 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
wait

rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
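The two hostfiles referenced by these launchers are added with no visible content in this diff (+0 / -0 in the summary above), so their format is not shown here. A minimal sketch of a conventional OpenMPI hostfile that would satisfy the -np 32 four-node launch, purely as an assumption:

```
# hostfile_deepseekv3_671B_4nodes (hypothetical contents; node names are placeholders)
node001 slots=8
node002 slots=8
node003 slots=8
node004 slots=8
```

Four nodes with eight slots each provide the 32 ranks requested by mpirun; the 1024-rank launch would need the equivalent entries in hostfile_deepseekv3_671B.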
examples/deepseek_v3/topo-input.xml (new file, mode 100644)
<system
version=
"2"
>
<cpu
numaid=
"3"
affinity=
"00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000"
arch=
"x86_64"
vendor=
"HygonGenuine"
familyid=
"159"
modelid=
"4"
>
<pci
busid=
"0000:99:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:9d:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:9f:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"0"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"0"
gdr=
"1"
>
<xgmi
target=
"0000:56:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:5d:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:05:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:e5:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:ca:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:b1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:c1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
<pci
busid=
"0000:51:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:54:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:56:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"1"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"1"
gdr=
"1"
>
<xgmi
target=
"0000:9f:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:5d:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:05:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:e5:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:ca:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:b1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:c1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
</pci>
<pci
busid=
"0000:9b:00.0"
class=
"0x020000"
vendor=
"0x15b3"
device=
"0x1021"
subsystem_vendor=
"0x15b3"
subsystem_device=
"0x0022"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<nic>
<net
name=
"mlx5_2"
dev=
"2"
speed=
"200000"
port=
"1"
latency=
"0.000000"
guid=
"0x2227a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
<net
name=
"mlx5_3"
dev=
"3"
speed=
"200000"
port=
"2"
latency=
"0.000000"
guid=
"0x2227a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
</nic>
</pci>
</pci>
</cpu>
<cpu
numaid=
"0"
affinity=
"00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff"
arch=
"x86_64"
vendor=
"HygonGenuine"
familyid=
"159"
modelid=
"4"
>
<pci
busid=
"0000:01:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:03:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:05:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"3"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"3"
gdr=
"1"
>
<xgmi
target=
"0000:9f:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:56:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:5d:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:e5:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:ca:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:b1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:c1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
<pci
busid=
"0000:59:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:5b:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:5d:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"2"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"2"
gdr=
"1"
>
<xgmi
target=
"0000:9f:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:56:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:05:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:e5:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:ca:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:b1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:c1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
</pci>
<pci
busid=
"0000:06:00.0"
class=
"0x020000"
vendor=
"0x15b3"
device=
"0x1021"
subsystem_vendor=
"0x15b3"
subsystem_device=
"0x0022"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<nic>
<net
name=
"mlx5_4"
dev=
"4"
speed=
"200000"
port=
"1"
latency=
"0.000000"
guid=
"0x8228a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
<net
name=
"mlx5_5"
dev=
"5"
speed=
"200000"
port=
"2"
latency=
"0.000000"
guid=
"0x8228a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
</nic>
</pci>
</pci>
</cpu>
<cpu
numaid=
"7"
affinity=
"7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000"
arch=
"x86_64"
vendor=
"HygonGenuine"
familyid=
"159"
modelid=
"4"
>
<pci
busid=
"0000:e1:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:e3:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:e5:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"4"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"4"
gdr=
"1"
>
<xgmi
target=
"0000:9f:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:56:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:5d:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:05:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:ca:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:b1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:c1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
<pci
busid=
"0000:bd:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:bf:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:c1:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"5"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"5"
gdr=
"1"
>
<xgmi
target=
"0000:9f:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:56:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:5d:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:05:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:e5:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:ca:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:b1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
</pci>
<pci
busid=
"0000:e6:00.0"
class=
"0x020000"
vendor=
"0x15b3"
device=
"0x1021"
subsystem_vendor=
"0x15b3"
subsystem_device=
"0x0022"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<nic>
<net
name=
"mlx5_6"
dev=
"6"
speed=
"200000"
port=
"1"
latency=
"0.000000"
guid=
"0x6227a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
<net
name=
"mlx5_7"
dev=
"7"
speed=
"200000"
port=
"2"
latency=
"0.000000"
guid=
"0x6227a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
</nic>
</pci>
</pci>
</cpu>
<cpu
numaid=
"4"
affinity=
"00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000"
arch=
"x86_64"
vendor=
"HygonGenuine"
familyid=
"159"
modelid=
"4"
>
<pci
busid=
"0000:ab:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:af:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:b1:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"7"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"7"
gdr=
"1"
>
<xgmi
target=
"0000:9f:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:56:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:5d:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:05:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:e5:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:ca:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:c1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
<pci
busid=
"0000:c5:00.0"
class=
"0x060400"
vendor=
"0x1000"
device=
"0xc030"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:c8:00.0"
class=
"0x060400"
vendor=
"0x1d94"
device=
"0x23b7"
subsystem_vendor=
"0x1000"
subsystem_device=
"0x100b"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<pci
busid=
"0000:ca:00.0"
class=
"0x0b4000"
vendor=
"0x1d94"
device=
"0x6320"
subsystem_vendor=
"0x1d94"
subsystem_device=
"0x6310"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<gpu
dev=
"6"
sm=
"93"
gcn=
"gfx936"
arch=
"169983"
rank=
"6"
gdr=
"1"
>
<xgmi
target=
"0000:9f:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:56:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:5d:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:05:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:e5:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:b1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
<xgmi
target=
"0000:c1:00.0"
count=
"7"
tclass=
"0x0b4000"
/>
</gpu>
</pci>
</pci>
</pci>
<pci
busid=
"0000:ad:00.0"
class=
"0x020000"
vendor=
"0x15b3"
device=
"0x1021"
subsystem_vendor=
"0x15b3"
subsystem_device=
"0x0022"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<nic>
<net
name=
"mlx5_8"
dev=
"8"
speed=
"200000"
port=
"1"
latency=
"0.000000"
guid=
"0xd226a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
<net
name=
"mlx5_9"
dev=
"9"
speed=
"200000"
port=
"2"
latency=
"0.000000"
guid=
"0xd226a1000373255c"
maxconn=
"131072"
gdr=
"1"
/>
</nic>
</pci>
</pci>
</cpu>
<cpu
numaid=
"2"
affinity=
"00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000"
arch=
"x86_64"
vendor=
"HygonGenuine"
familyid=
"159"
modelid=
"4"
>
<pci
busid=
"0000:71:00.0"
class=
"0x020000"
vendor=
"0x15b3"
device=
"0xa2dc"
subsystem_vendor=
"0x15b3"
subsystem_device=
"0x0009"
link_speed=
"32.0 GT/s PCIe"
link_width=
"16"
>
<nic>
<net
name=
"mlx5_0"
dev=
"0"
speed=
"200000"
port=
"1"
latency=
"0.000000"
guid=
"0xc0d00a000324e9b8"
maxconn=
"131072"
gdr=
"1"
/>
<net
name=
"mlx5_1"
dev=
"1"
speed=
"40000"
port=
"2"
latency=
"0.000000"
guid=
"0xc0d00a000324e9b8"
maxconn=
"131072"
gdr=
"1"
/>
</nic>
</pci>
</cpu>
</system>
examples/deepseek_v3/train_deepseek_v3_1node.sh → examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh (renamed, mode 100644 → 100755)
 #!/bin/bash
 for para in $*
 do
     if [[ $para == --profiling* ]]; then
         profiling=${para#*=}
+        # export GPU_FLUSH_ON_EXECUTION=1
+        # export HIP_DIRECT_DISPATCH=0
     fi
 done
+# Runs DeepseekV3 671B model
+source /opt/dtk/env.sh
+# default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
+export GLOG_minloglevel=3
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export HSA_FORCE_FINE_GRAIN_PCIE=1
 export OMP_NUM_THREADS=1
 export GPU_MAX_HW_QUEUES=10
+# nccl env
 export NCCL_ALGO=Ring
 export NCCL_MIN_NCHANNELS=32
 export NCCL_MAX_NCHANNELS=32

@@ -22,22 +32,20 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="/public/home/yuguo/check/rccl-tests-0204/topo-input.xml" #"your topo file"
+export NCCL_TOPO_FILE="./topo-input.xml"
-export GLOG_minloglevel=3
-export GROUPED_GEMM_BatchLinear=1
-export LD_LIBRARY_PATH=/public/home/yuguo/data/rocblas-install-0224/lib:$LD_LIBRARY_PATH
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-RANK=$OMPI_COMM_WORLD_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+# enable BatchLinear
+export GROUPED_GEMM_BatchLinear=1
+#export MP_PP0_LAYERS=2 # enable as appropriate for the actual setup
 ### BASE CONFIG ###
 MODEL_SIZE=A37B
 BATCH_SIZE=1
 GLOBAL_BATCH_SIZE=256
-LR=1e-5
+LR=1e-4
 MIN_LR=1e-6
 SEQ_LEN=4096
+PAD_LEN=4096
 PR=bf16
 ### BASE CONFIG ###

@@ -45,6 +53,7 @@ PR=bf16
 TP=1
 PP=2
 CP=1
+ETP=1
 EP=4
 SP=true
 DO=true

@@ -56,13 +65,14 @@ SFT=false
 AC=none
 OPTIMIZER_OFFLOAD=false
 SAVE_INTERVAL=500
-DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
+DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
-VALID_DATASET_PATH=${MEGATRON_PATH}/deepseekv3_dataset/mmap_deepseekv3_datasets_text_document #"your data path"
+VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
-PRETRAIN_CHECKPOINT_PATH=${MEGATRON_PATH}/deepseekv3_dataset #"your model path"
+PRETRAIN_CHECKPOINT_PATH="./output"
+TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
 # the following two values will not be used when SFT is true
-TRAIN_TOKENS=100000000
+TRAIN_TOKENS=$(( 10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN} ))
-WARMUP_TOKENS=10000
+WARMUP_TOKENS=$(( 2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN} ))
 ###############################
 OUTPUT_BASEPATH=./output

@@ -72,20 +82,19 @@ if [ $FL = true ]; then
     :
     #exit -1
 elif [ $FL = false ]; then
-    export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
     attn_backend_option=" \
-        --attention-backend fused
+        --attention-backend auto
        "
 fi
 if [ $MODEL_SIZE = A37B ]; then
-TRAIN_ITERS=2
+TRAIN_ITERS=10
 HIDDEN_SIZE=7168
 NUM_ATTENTION_HEADS=128
 NUM_LAYERS=2
 INTERMEDIATE_SIZE=18432
 MOE_INTERMEDIATE_SIZE=2048
-MAX_POSITION_EMBEDDINGS=${SEQ_LEN}
+MAX_POSITION_EMBEDDINGS=163840
 EXTRA_VOCAB_SIZE=467
 Q_LORA_RANK=1536
 KV_LORA_RANK=512

@@ -94,32 +103,43 @@ if [ $MODEL_SIZE = A37B ]; then
 V_HEAD_DIM=128
 ROPE_THETA=10000
 SCALE_FACTOR=40
-NUM_EXPERTS=8 #256
+NUM_EXPERTS=8
 ROUTER_TOPK=8
 NUM_SHARED_EXPERTS=1
 RMS_NORM_EPS=1e-6
 moe_options=" \
     --moe-grouped-gemm \
-    --moe-expert-capacity-factor 1 \
+    --moe-expert-capacity-factor 0.5 \
     --moe-pad-expert-input-to-capacity \
     --moe-token-dispatcher-type alltoall \
     --moe-router-topk ${ROUTER_TOPK} \
-    --num-experts ${NUM_EXPERTS} \
-    --expert-model-parallel-size ${EP} \
-    --expert-tensor-parallel-size 1 \
-    --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
-    --moe-router-load-balancing-type aux_loss \
-    --moe-aux-loss-coeff 0.001 \
-    --moe-layer-freq ([0]*0+[1]*2) \
-    --q-lora-rank ${Q_LORA_RANK} \
-    --kv-lora-rank ${KV_LORA_RANK} \
-    --qk-head-dim ${QK_NOPE_HEAD_DIM} \
-    --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
-    --v-head-dim ${V_HEAD_DIM} \
-    --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
-    "
+    --moe-router-group-topk 2 \
+    --moe-router-num-groups 4 \
+    --num-experts ${NUM_EXPERTS} \
+    --expert-model-parallel-size ${EP} \
+    --expert-tensor-parallel-size ${ETP} \
+    --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
+    --moe-router-load-balancing-type seq_aux_loss \
+    --moe-router-topk-scaling-factor 2.5 \
+    --moe-shared-expert-overlap \
+    --moe-router-enable-expert-bias \
+    --mscale 1.0 \
+    --mscale-all-dim 1.0 \
+    --moe-router-score-function sigmoid \
+    --moe-router-bias-update-rate 0.001 \
+    --moe-aux-loss-coeff 0.001 \
+    --moe-layer-freq ([0]*1+[1]*1) \
+    --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
+    --q-lora-rank ${Q_LORA_RANK} \
+    --kv-lora-rank ${KV_LORA_RANK} \
+    --qk-head-dim ${QK_NOPE_HEAD_DIM} \
+    --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
+    --v-head-dim ${V_HEAD_DIM} \
+    --mtp-num-layers 1 \
+    "
+mtp_options=""
 fi
 # Here are some configs controled by env

@@ -147,6 +167,14 @@ comm_overlap_option="\
     --overlap-grad-reduce \
     --overlap-param-gather"
+# if [ $TP_COMM_OVERLAP -eq 1 ]; then
+#     comm_overlap_option="\
+#         --tp-comm-overlap \
+#         --overlap-grad-reduce \
+#         --overlap-param-gather"
+# fi
 if [ $AC = full ]; then
     _check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
     if [ $_check != 0 ]; then

@@ -154,9 +182,9 @@ if [ $AC = full ]; then
         exit -1
     fi
     activation_checkpoint_options=" \
        --recompute-method uniform \
        --recompute-num-layers ${MP_AC_LAYERS} \
        --recompute-granularity full"
 elif [ $AC = sel ]; then
     activation_checkpoint_options=" \
        --recompute-activations"

@@ -165,8 +193,8 @@ elif [ $AC = none ]; then
    "
 elif [ $AC = offload ]; then
     activation_checkpoint_options=" \
        --cpu-offloading \
        --cpu-offloading-num-layers ${MP_AC_LAYERS}"
     if [ $TP_COMM_OVERLAP -eq 1 ]; then
         echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
         comm_overlap_option=" \

@@ -179,8 +207,8 @@ fi
 if [ $PR = fp16 ]; then
     pr_options=" \
        --fp16 \
        --apply-query-key-layer-scaling"
     export NVTE_APPLY_QK_LAYER_SCALING=1
 elif [ $PR = bf16 ]; then
     pr_options=" \

@@ -200,7 +228,7 @@ fi
 if [ $DO = true ]; then
     do_option=" \
        --use-distributed-optimizer"
 elif [ $DO = false ]; then
     do_option=" \

@@ -210,7 +238,7 @@ fi
 if [ $SP = true ] && [ $TP -gt 1 ]; then
     sp_option=" \
        --sequence-parallel"
 elif [ $SP = false ]; then
     sp_option=" \

@@ -236,7 +264,7 @@ fi
 if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
     load_option=" \
-        --tokenizer-model $PRETRAIN_CHECKPOINT_PATH"
+        --load $PRETRAIN_CHECKPOINT_PATH"
 fi
 if [ $OPTIMIZER_OFFLOAD != false ]; then

@@ -247,15 +275,21 @@ if [ $OPTIMIZER_OFFLOAD != false ]; then
 fi
 if [ $SFT = true ]; then
-    TRAIN_ITERS=${24}
+    TRAIN_ITERS=${25}
-    LR_WARMUP_ITERS=${25}
+    LR_WARMUP_ITERS=${26}
     LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
-    PREFIX="finetune-mcore-deepseek-v3"
+    PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
+    sft_options=" \
+        --eod-mask-loss \
+        --calculate-per-token-loss \
+        --train-mode finetune"
 else
     # TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
     LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
     LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
-    PREFIX="pretrain-mcore-deepseek-v3"
+    PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
+    sft_options=" \
+        --train-mode pretrain"
 fi
 if [ ${MP_DATASET_TYPE} = "raw" ]; then

@@ -278,16 +312,18 @@ else
 fi
 ##### Prepare logdirs #######
-NAME="${PREFIX}"
+NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
 mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
 mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
 mkdir -p "${OUTPUT_BASEPATH}/log/"
-TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}"
+current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
 mkdir -p ${TENSORBOARD_DIR}
 SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
 mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
-find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
+#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
+#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
 megatron_options=" \
     --lr ${LR} \

@@ -314,7 +350,7 @@ megatron_options=" \
     --log-interval 1 \
     --log-throughput \
     --eval-interval 10000 \
-    --eval-iters 5 \
+    --eval-iters 3 \
     --save-interval ${SAVE_INTERVAL} \
     --tensorboard-queue-size 1 \
     --tensorboard-dir ${TENSORBOARD_DIR} \

@@ -328,13 +364,12 @@ megatron_options=" \
     --num-workers 8 \
     --extra-vocab-size ${EXTRA_VOCAB_SIZE} \
     --tokenizer-type DeepSeekV2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL_PATH} \
     --swiglu \
     --normalization RMSNorm \
     --norm-epsilon ${RMS_NORM_EPS} \
     --use-rotary-position-embeddings \
-    --no-bias-swiglu-fusion \
     --no-rope-fusion \
-    --position-embedding-type rope \
     --untie-embeddings-and-output-weights \
     --disable-bias-linear \
     --rotary-base ${ROPE_THETA} \

@@ -342,12 +377,11 @@ megatron_options=" \
     --no-save-optim \
     --kv-channels ${V_HEAD_DIM} \
     --qk-layernorm \
+    --multi-latent-attention \
     --ckpt-format torch \
     --transformer-impl transformer_engine \
+    --no-masked-softmax-fusion \
     --use-rope-scaling \
-    --multi-latent-attention \
-    --mtp-num-layers 1 \
-    --use-mcore-models \
     "
 TORCH_PROFIE_ARGS=" \

@@ -355,7 +389,7 @@ TORCH_PROFIE_ARGS=" \
     --profile-ranks 0 1 2 3 4 5 6 7 \
     --profile-step-start 3 \
     --profile-step-end 4 \
-    --profile-dir torch_prof_data_16nodes_dcu \
+    --profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
     --use-pytorch-profiler \
     "

@@ -367,26 +401,30 @@ HIP_PROFIE_ARGS=" \
     --use-hip-profiler \
     "
-APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
-    ${megatron_options} \
-    ${dataset_options} \
-    ${pr_options} \
-    ${load_option} \
-    ${activation_checkpoint_options} \
-    ${do_option} \
-    ${sp_option} \
-    ${moe_options} \
-    ${offload_option} \
-    ${sft_options} \
-    ${vp_option} \
-    ${packing_options} \
-    ${uneven_split_option} \
-    ${attn_backend_option} \
-    ${comm_overlap_option} \
-    --rank ${RANK} \
-    --world-size ${WORLD_SIZE} \
-    --local-rank ${LOCAL_RANK} \
-    --dist-url tcp://${1}:25900 \
-    "
+DISTRIBUTED_ARGS=" \
+    --rank ${RANK} \
+    --world-size ${WORLD_SIZE} \
+    --local-rank ${LOCAL_RANK} \
+    --dist-url tcp://${DIST_URL}:${DIST_PORT} \
+    "
+APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
+    ${megatron_options} \
+    ${dataset_options} \
+    ${pr_options} \
+    ${load_option} \
+    ${activation_checkpoint_options} \
+    ${do_option} \
+    ${sp_option} \
+    ${moe_options} \
+    ${offload_option} \
+    ${vp_option} \
+    ${packing_options} \
+    ${uneven_split_option} \
+    ${attn_backend_option} \
+    ${mtp_options} \
+    ${comm_overlap_option} \
+    ${DISTRIBUTED_ARGS} \
+    "
 if [[ $profiling == "torch" ]]; then

@@ -397,37 +435,38 @@ elif [[ $profiling == "hip" ]]; then
     APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
 fi
+#for hygon cpu
 case ${LOCAL_RANK} in
 [0])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [1])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [2])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=2 --membind=2 ${APP}
     ;;
 [3])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=3 --membind=3 ${APP}
     ;;
 [4])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=4 --membind=4 ${APP}
     ;;
 [5])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=5 --membind=5 ${APP}
     ;;
 [6])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=6 --membind=6 ${APP}
     ;;
 [7])
-    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
+    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    numactl --cpunodebind=7 --membind=7 ${APP}
     ;;
 esac
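A quick check of the token-based schedule introduced above, assuming the 1-node defaults (GLOBAL_BATCH_SIZE=256, SEQ_LEN=4096). Because TRAIN_TOKENS and WARMUP_TOKENS are now defined as multiples of GLOBAL_BATCH_SIZE * SEQ_LEN, the derived iteration counts reduce to the bare multipliers; TRAIN_ITERS itself stays at the hard-coded 10 because the token-based line remains commented out:

```
# Sketch of the schedule arithmetic in the pretrain branch (values from the 1-node config).
GLOBAL_BATCH_SIZE=256; SEQ_LEN=4096
TRAIN_TOKENS=$(( 10000 * GLOBAL_BATCH_SIZE * SEQ_LEN ))    # ~10.5B tokens
WARMUP_TOKENS=$((  2000 * GLOBAL_BATCH_SIZE * SEQ_LEN ))
echo $(( WARMUP_TOKENS / GLOBAL_BATCH_SIZE / SEQ_LEN ))    # LR_WARMUP_ITERS = 2000
echo $(( TRAIN_TOKENS  / GLOBAL_BATCH_SIZE / SEQ_LEN ))    # LR_DECAY_ITERS  = 10000
```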
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh (new file, mode 100755)
#!/bin/bash
for para in $*
do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# Runs DeepseekV3 671B model
source /opt/dtk/env.sh

# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$(dirname $(dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10

# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"

# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export MP_PP0_LAYERS=2 # enable as appropriate for the actual setup

### BASE CONFIG ###
MODEL_SIZE=A37B
BATCH_SIZE=1
GLOBAL_BATCH_SIZE=512
LR=1e-4
MIN_LR=1e-6
SEQ_LEN=4096
PAD_LEN=4096
PR=bf16
### BASE CONFIG ###

### PARALLEL / BOOL OPTION ###
TP=2
PP=2
CP=1
ETP=1
EP=16
SP=true
DO=true
FL=true
SFT=false
### PARALLEL / BOOL OPTION ###

### OTHERS ###
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$(( 10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN} ))
WARMUP_TOKENS=$(( 2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN} ))
###############################
OUTPUT_BASEPATH=./output
### OTHERS ###

if [ $FL = true ]; then
    :
    #exit -1
elif [ $FL = false ]; then
    attn_backend_option=" \
        --attention-backend auto
       "
fi

if [ $MODEL_SIZE = A37B ]; then
TRAIN_ITERS=10
HIDDEN_SIZE=7168
NUM_ATTENTION_HEADS=128
NUM_LAYERS=3
INTERMEDIATE_SIZE=18432
MOE_INTERMEDIATE_SIZE=2048
MAX_POSITION_EMBEDDINGS=163840
EXTRA_VOCAB_SIZE=467
Q_LORA_RANK=1536
KV_LORA_RANK=512
QK_NOPE_HEAD_DIM=128
QK_ROPE_HEAD_DIM=64
V_HEAD_DIM=128
ROPE_THETA=10000
SCALE_FACTOR=40
NUM_EXPERTS=256
ROUTER_TOPK=8
NUM_SHARED_EXPERTS=1
RMS_NORM_EPS=1e-6
moe_options=" \
    --moe-grouped-gemm \
    --moe-expert-capacity-factor 0.5 \
    --moe-pad-expert-input-to-capacity \
    --moe-token-dispatcher-type alltoall \
    --moe-router-topk ${ROUTER_TOPK} \
    --moe-router-group-topk 4 \
    --moe-router-num-groups 8 \
    --num-experts ${NUM_EXPERTS} \
    --expert-model-parallel-size ${EP} \
    --expert-tensor-parallel-size ${ETP} \
    --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \
    --moe-router-load-balancing-type seq_aux_loss \
    --moe-router-topk-scaling-factor 2.5 \
    --moe-shared-expert-overlap \
    --moe-router-enable-expert-bias \
    --mscale 1.0 \
    --mscale-all-dim 1.0 \
    --moe-router-score-function sigmoid \
    --moe-router-bias-update-rate 0.001 \
    --moe-aux-loss-coeff 0.001 \
    --moe-layer-freq ([0]*1+[1]*2) \
    --moe-shared-expert-intermediate-size $((${MOE_INTERMEDIATE_SIZE} * ${NUM_SHARED_EXPERTS} )) \
    --q-lora-rank ${Q_LORA_RANK} \
    --kv-lora-rank ${KV_LORA_RANK} \
    --qk-head-dim ${QK_NOPE_HEAD_DIM} \
    --qk-pos-emb-head-dim ${QK_ROPE_HEAD_DIM} \
    --v-head-dim ${V_HEAD_DIM} \
    --mtp-num-layers 1 \
    "
mtp_options=""
fi

# Here are some configs controled by env
if [ -z ${MP_DATASET_TYPE} ]; then
    MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ]; then
    MP_AC_LAYERS=1
fi
if [ -z ${MP_VP} ]; then
    vp_option=""
else
    vp_option=" \
        --num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
    MP_SFT_PACKING=false
fi

TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option=" \
    --overlap-grad-reduce \
    --overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
#     comm_overlap_option="\
#         --tp-comm-overlap \
#         --overlap-grad-reduce \
#         --overlap-param-gather"
# fi

if [ $AC = full ]; then
    _check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
    if [ $_check != 0 ]; then
        echo "the num layers per pp rank must be a multiple of the recompute layers."
        exit -1
    fi
    activation_checkpoint_options=" \
        --recompute-method uniform \
        --recompute-num-layers ${MP_AC_LAYERS} \
        --recompute-granularity full"
elif [ $AC = sel ]; then
    activation_checkpoint_options=" \
        --recompute-activations"
elif [ $AC = none ]; then
    activation_checkpoint_options=" \
    "
elif [ $AC = offload ]; then
    activation_checkpoint_options=" \
        --cpu-offloading \
        --cpu-offloading-num-layers ${MP_AC_LAYERS}"
    if [ $TP_COMM_OVERLAP -eq 1 ]; then
        echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
        comm_overlap_option=" \
            --tp-comm-overlap"
    else
        echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
        comm_overlap_option=""
    fi
fi

if [ $PR = fp16 ]; then
    pr_options=" \
        --fp16 \
        --apply-query-key-layer-scaling"
    export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
    pr_options=" \
        --bf16"
elif [ $PR = fp8 ]; then
    pr_options=" \
        --bf16 \
        --fp8-format hybrid \
        --fp8-amax-compute-algo max \
        --fp8-amax-history-len 1024"
fi

if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
    echo "Offload optimizer is valid only if \$DO=true"
    DO=true
fi

if [ $DO = true ]; then
    do_option=" \
        --use-distributed-optimizer"
elif [ $DO = false ]; then
    do_option=" \
    "
fi

if [ $SP = true ] && [ $TP -gt 1 ]; then
    sp_option=" \
        --sequence-parallel"
elif [ $SP = false ]; then
    sp_option=" \
    "
fi

if [ -z ${MP_PP0_LAYERS} ]; then
    uneven_split_option=""
elif [ ${PP} -gt 1 ]; then
    _check=$(( ($NUM_LAYERS - ${MP_PP0_LAYERS}) % (${PP} - 1) ))
    if [ $_check != 0 ]; then
        echo "With uneven pipelineing the left over layers must be divisible by left over stages."
        exit -1
    fi
    uneven_split_option=" \
        --decoder-first-pipeline-num-layers ${MP_PP0_LAYERS}"
else
    echo "uneven pipeline split must be used when PP > 1"
    exit -1
fi

if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
    load_option=" \
        --load $PRETRAIN_CHECKPOINT_PATH"
fi

if [ $OPTIMIZER_OFFLOAD != false ]; then
    offload_option=" \
        --optimizer-cpu-offload \
        --use-precision-aware-optimizer \
        --optimizer-offload-fraction ${OPTIMIZER_OFFLOAD}"
fi

if [ $SFT = true ]; then
    TRAIN_ITERS=${25}
    LR_WARMUP_ITERS=${26}
    LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
    PREFIX="finetune-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
    sft_options=" \
        --eod-mask-loss \
        --calculate-per-token-loss \
        --train-mode finetune"
else
    # TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
    LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
    LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
    PREFIX="pretrain-mcore-deepseek-v3-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
    sft_options=" \
        --train-mode pretrain"
fi

if [ ${MP_DATASET_TYPE} = "raw" ]; then
    dataset_options=" \
        --train-data-path ${DATASET_PATH} \
        --valid-data-path ${VALID_DATASET_PATH} \
        --dataloader-type cyclic \
        --dataset JSON-SFT"
else
    dataset_options=" \
        --data-path ${DATASET_PATH} \
        --split 99,1,0"
fi

if [ ${MP_SFT_PACKING} = true ]; then
    echo "Currently MLA do not support THD format attention, thus sequence packing can not be used..."
    packing_options=""
else
    packing_options=""
fi

##### Prepare logdirs #######
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}

megatron_options=" \
    --lr ${LR} \
    --min-lr ${MIN_LR} \
    --lr-decay-style cosine \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --clip-grad 1.0 \
    --init-method-std 0.008 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --lr-decay-iters ${LR_DECAY_ITERS} \
    --lr-warmup-iters ${LR_WARMUP_ITERS} \
    --train-iters ${TRAIN_ITERS} \
    --micro-batch-size ${BATCH_SIZE} \
    --global-batch-size ${GLOBAL_BATCH_SIZE} \
    --num-layers ${NUM_LAYERS} \
    --hidden-size ${HIDDEN_SIZE} \
    --num-attention-heads ${NUM_ATTENTION_HEADS} \
    --ffn-hidden-size ${INTERMEDIATE_SIZE} \
    --seq-length ${SEQ_LEN} \
    --max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
    --log-interval 1 \
    --log-throughput \
    --eval-interval 10000 \
    --eval-iters 3 \
    --save-interval ${SAVE_INTERVAL} \
    --tensorboard-queue-size 1 \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --log-timers-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --context-parallel-size ${CP} \
    --no-load-optim \
    --no-load-rng \
    --num-workers 8 \
    --extra-vocab-size ${EXTRA_VOCAB_SIZE} \
    --tokenizer-type DeepSeekV2Tokenizer \
    --tokenizer-model ${TOKENIZER_MODEL_PATH} \
    --swiglu \
    --normalization RMSNorm \
    --norm-epsilon ${RMS_NORM_EPS} \
    --use-rotary-position-embeddings \
    --no-rope-fusion \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --rotary-base ${ROPE_THETA} \
    --rotary-scaling-factor ${SCALE_FACTOR} \
    --no-save-optim \
    --kv-channels ${V_HEAD_DIM} \
    --qk-layernorm \
    --multi-latent-attention \
    --ckpt-format torch \
    --transformer-impl transformer_engine \
    --no-masked-softmax-fusion \
    --use-rope-scaling \
    "

TORCH_PROFIE_ARGS=" \
    --profile \
    --profile-ranks 0 1 2 3 4 5 6 7 \
    --profile-step-start 3 \
    --profile-step-end 4 \
    --profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
    --use-pytorch-profiler \
    "

HIP_PROFIE_ARGS=" \
    --profile \
    --profile-ranks 0 1 2 3 4 5 6 7 \
    --profile-step-start 4 \
    --profile-step-end 5 \
    --use-hip-profiler \
    "

DISTRIBUTED_ARGS=" \
    --rank ${RANK} \
    --world-size ${WORLD_SIZE} \
    --local-rank ${LOCAL_RANK} \
    --dist-url tcp://${DIST_URL}:${DIST_PORT} \
    "

APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py
    ${megatron_options} \
    ${dataset_options} \
    ${pr_options} \
    ${load_option} \
    ${activation_checkpoint_options} \
    ${do_option} \
    ${sp_option} \
    ${moe_options} \
    ${offload_option} \
    ${vp_option} \
    ${packing_options} \
    ${uneven_split_option} \
    ${attn_backend_option} \
    ${mtp_options} \
    ${comm_overlap_option} \
    ${DISTRIBUTED_ARGS} \
    "

if [[ $profiling == "torch" ]]; then
    APP+=" ${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+=" ${HIP_PROFIE_ARGS}"
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi

#for hygon cpu
case ${LOCAL_RANK} in
[0])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
[4])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
[5])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
[6])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
[7])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac
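The 4-node script exports MP_PP0_LAYERS=2 while using NUM_LAYERS=3 and PP=2, so the uneven-pipeline check it performs passes. A minimal sketch of that check with the values above (the layer-placement comment is an interpretation of --decoder-first-pipeline-num-layers, not stated in the script):

```
# Uneven pipeline split check as done in the script, with the 4-node values.
NUM_LAYERS=3; PP=2; MP_PP0_LAYERS=2
echo $(( (NUM_LAYERS - MP_PP0_LAYERS) % (PP - 1) ))   # 0 -> valid: stage 0 gets 2 layers, stage 1 gets 1
```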
examples/deepseek_v3/train_deepseekv3_671B_multinodes.sh (new file, mode 100755)
#!/bin/bash
for
para
in
$*
do
if
[[
$para
==
--profiling
*
]]
;
then
profiling
=
${
para
#*=
}
fi
done
# Runs DeepseekV3 671B model
source
/opt/dtk/env.sh
# default env
DIST_URL
=
${
1
}
DIST_PORT
=
25900
RANK
=
$OMPI_COMM_WORLD_RANK
LOCAL_RANK
=
$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE
=
$OMPI_COMM_WORLD_SIZE
CURRENT_DIR
=
"
$(
cd
"
$(
dirname
"
$0
"
)
"
&&
pwd
)
"
MEGATRON_PATH
=
$(
dirname
$(
dirname
${
CURRENT_DIR
}
))
export
GLOG_minloglevel
=
3
export
CUDA_DEVICE_MAX_CONNECTIONS
=
1
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
OMP_NUM_THREADS
=
1
export
GPU_MAX_HW_QUEUES
=
10
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_MIN_NCHANNELS
=
32
export
NCCL_MAX_NCHANNELS
=
32
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"./topo-input.xml"
# enable BatchLinear
export
GROUPED_GEMM_BatchLinear
=
1
export
MP_PP0_LAYERS
=
2
# 是否使能视实际情况而定
### BASE CONFIG ###
MODEL_SIZE
=
A37B
BATCH_SIZE
=
1
GLOBAL_BATCH_SIZE
=
4096
LR
=
1e-4
MIN_LR
=
1e-6
SEQ_LEN
=
4096
PAD_LEN
=
4096
PR
=
bf16
### BASE CONFIG ###
### PARALLEL / BOOL OPTION ###
TP
=
4
PP
=
8
CP
=
1
ETP
=
2
EP
=
64
SP
=
true
DO
=
true
FL
=
true
SFT
=
false
### PARALLEL / BOOL OPTION ###
### OTHERS ###
AC
=
none
OPTIMIZER_OFFLOAD
=
false
SAVE_INTERVAL
=
500
DATASET_PATH
=
"path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH
=
"path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH
=
"./output"
TOKENIZER_MODEL_PATH
=
"path to deepseekv3_dataset"
# the following two values will not be used when SFT is true
TRAIN_TOKENS
=
$((
10000
*
${
GLOBAL_BATCH_SIZE
}
*
${
SEQ_LEN
}))
WARMUP_TOKENS
=
$((
2000
*
${
GLOBAL_BATCH_SIZE
}
*
${
SEQ_LEN
}))
###############################
OUTPUT_BASEPATH
=
./output
### OTHERS ###
if
[
$FL
=
true
]
;
then
:
#exit -1
elif
[
$FL
=
false
]
;
then
attn_backend_option
=
"
\
--attention-backend auto
"
fi
if
[
$MODEL_SIZE
=
A37B
]
;
then
TRAIN_ITERS
=
10
HIDDEN_SIZE
=
7168
NUM_ATTENTION_HEADS
=
128
NUM_LAYERS
=
61
INTERMEDIATE_SIZE
=
18432
MOE_INTERMEDIATE_SIZE
=
2048
MAX_POSITION_EMBEDDINGS
=
163840
EXTRA_VOCAB_SIZE
=
467
Q_LORA_RANK
=
1536
KV_LORA_RANK
=
512
QK_NOPE_HEAD_DIM
=
128
QK_ROPE_HEAD_DIM
=
64
V_HEAD_DIM
=
128
ROPE_THETA
=
10000
SCALE_FACTOR
=
40
NUM_EXPERTS
=
256
ROUTER_TOPK
=
8
NUM_SHARED_EXPERTS
=
1
RMS_NORM_EPS
=
1e-6
moe_options
=
"
\
--moe-grouped-gemm
\
--moe-expert-capacity-factor 0.5
\
--moe-pad-expert-input-to-capacity
\
--moe-token-dispatcher-type alltoall
\
--moe-router-topk
${
ROUTER_TOPK
}
\
--moe-router-group-topk 4
\
--moe-router-num-groups 8
\
--num-experts
${
NUM_EXPERTS
}
\
--expert-model-parallel-size
${
EP
}
\
--expert-tensor-parallel-size
${
ETP
}
\
--moe-ffn-hidden-size
${
MOE_INTERMEDIATE_SIZE
}
\
--moe-router-load-balancing-type seq_aux_loss
\
--moe-router-topk-scaling-factor 2.5
\
--moe-shared-expert-overlap
\
--moe-router-enable-expert-bias
\
--mscale 1.0
\
--mscale-all-dim 1.0
\
--moe-router-score-function sigmoid
\
--moe-router-bias-update-rate 0.001
\
--moe-aux-loss-coeff 0.001
\
--moe-layer-freq ([0]*3+[1]*58)
\
--moe-shared-expert-intermediate-size
$((${
MOE_INTERMEDIATE_SIZE
}
*
${
NUM_SHARED_EXPERTS
}
))
\
--q-lora-rank
${
Q_LORA_RANK
}
\
--kv-lora-rank
${
KV_LORA_RANK
}
\
--qk-head-dim
${
QK_NOPE_HEAD_DIM
}
\
--qk-pos-emb-head-dim
${
QK_ROPE_HEAD_DIM
}
\
--v-head-dim
${
V_HEAD_DIM
}
\
--mtp-num-layers 1
\
"
mtp_options
=
""
fi
# Here are some configs controled by env
if
[
-z
${
MP_DATASET_TYPE
}
]
;
then
MP_DATASET_TYPE
=
"idxmap"
fi
if
[
-z
${
MP_AC_LAYERS
}
]
;
then
MP_AC_LAYERS
=
1
fi
if
[
-z
${
MP_VP
}
]
;
then
vp_option
=
""
else
vp_option
=
"
\
--num-layers-per-virtual-pipeline-stage
${
MP_VP
}
"
fi
if
[
-z
${
MP_SFT_PACKING
}
]
;
then
MP_SFT_PACKING
=
false
fi
TP_COMM_OVERLAP
=
$((
(
$TP
>
1
)
?
1
:
0
))
comm_overlap_option
=
"
\
--overlap-grad-reduce
\
--overlap-param-gather"
# if [ $TP_COMM_OVERLAP -eq 1 ]; then
# comm_overlap_option="\
# --tp-comm-overlap \
# --overlap-grad-reduce \
# --overlap-param-gather"
# fi
if
[
$AC
=
full
]
;
then
_check
=
$((
(
$NUM_LAYERS
/
$PP
)
%
${
MP_AC_LAYERS
}
))
if
[
$_check
!=
0
]
;
then
echo
"the num layers per pp rank must be a multiple of the recompute layers."
exit
-1
fi
activation_checkpoint_options
=
"
\
--recompute-method uniform
\
--recompute-num-layers
${
MP_AC_LAYERS
}
\
--recompute-granularity full"
elif
[
$AC
=
sel
]
;
then
activation_checkpoint_options
=
"
\
--recompute-activations"
elif
[
$AC
=
none
]
;
then
activation_checkpoint_options
=
"
\
"
elif
[
$AC
=
offload
]
;
then
activation_checkpoint_options
=
"
\
--cpu-offloading
\
--cpu-offloading-num-layers
${
MP_AC_LAYERS
}
"
if
[
$TP_COMM_OVERLAP
-eq
1
]
;
then
echo
"Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option
=
"
\
--tp-comm-overlap"
else
echo
"Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option
=
""
fi
fi
if
[
$PR
=
fp16
]
;
then
pr_options
=
"
\
--fp16
\
--apply-query-key-layer-scaling"
export
NVTE_APPLY_QK_LAYER_SCALING
=
1
elif
[
$PR
=
bf16
]
;
then
pr_options
=
"
\
--bf16"
elif
[
$PR
=
fp8
]
;
then
pr_options
=
"
\
--bf16
\
--fp8-format hybrid
\
--fp8-amax-compute-algo max
\
--fp8-amax-history-len 1024"
fi
if
[
$OPTIMIZER_OFFLOAD
!=
false
]
&&
[
$DO
=
false
]
;
then
echo
"Offload optimizer is valid only if
\$
DO=true"
DO
=
true
fi
if
[
$DO
=
true
]
;
then
do_option
=
"
\
--use-distributed-optimizer"
elif
[
$DO
=
false
]
;
then
do_option
=
"
\
"
fi
if
[
$SP
=
true
]
&&
[
$TP
-gt
1
]
;
then
sp_option
=
"
\
--sequence-parallel"
elif
[
$SP
=
false
]
;
then
sp_option
=
"
\
"
fi
if
[
-z
${
MP_PP0_LAYERS
}
]
;
then
uneven_split_option
=
""
elif
[
${
PP
}
-gt
1
]
;
then
_check
=
$((
(
$NUM_LAYERS
-
${
MP_PP0_LAYERS
}
)
%
(
${
PP
}
-
1
)
))
if
[
$_check
!=
0
]
;
then
echo
"With uneven pipelineing the left over layers must be divisible by left over stages."
exit
-1
fi
uneven_split_option
=
"
\
--decoder-first-pipeline-num-layers
${
MP_PP0_LAYERS
}
"
else
echo
"uneven pipeline split must be used when PP > 1"
exit
-1
fi
if
[
$PRETRAIN_CHECKPOINT_PATH
!=
none
]
;
then
load_option
=
"
\
--load
$PRETRAIN_CHECKPOINT_PATH
"
fi
if
[
$OPTIMIZER_OFFLOAD
!=
false
]
;
then
offload_option
=
"
\
--optimizer-cpu-offload
\
--use-precision-aware-optimizer
\
--optimizer-offload-fraction
${
OPTIMIZER_OFFLOAD
}
"
fi
if
[
$SFT
=
true
]
;
then
TRAIN_ITERS
=
${
25
}
LR_WARMUP_ITERS
=
${
26
}
LR_DECAY_ITERS
=
$((
${
TRAIN_ITERS
}
-
${
LR_WARMUP_ITERS
}))
PREFIX
=
"finetune-mcore-deepseek-v3-
${
MODEL_SIZE
}
-lr-
${
LR
}
-minlr-
${
MIN_LR
}
-bs-
${
BATCH_SIZE
}
-gbs-
${
GLOBAL_BATCH_SIZE
}
-seqlen-
${
SEQ_LEN
}
"
sft_options
=
"
\
--eod-mask-loss
\
--calculate-per-token-loss
\
--train-mode finetune"
else
# TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS
=
$((
${
WARMUP_TOKENS
}
/
${
GLOBAL_BATCH_SIZE
}
/
${
SEQ_LEN
}
))
LR_DECAY_ITERS
=
$((
${
TRAIN_TOKENS
}
/
${
GLOBAL_BATCH_SIZE
}
/
${
SEQ_LEN
}
))
PREFIX
=
"pretrain-mcore-deepseek-v3-
${
MODEL_SIZE
}
-lr-
${
LR
}
-minlr-
${
MIN_LR
}
-bs-
${
BATCH_SIZE
}
-gbs-
${
GLOBAL_BATCH_SIZE
}
-seqlen-
${
SEQ_LEN
}
"
sft_options
=
"
\
--train-mode pretrain"
fi
if
[
${
MP_DATASET_TYPE
}
=
"raw"
]
;
then
dataset_options
=
"
\
--train-data-path
${
DATASET_PATH
}
\
--valid-data-path
${
VALID_DATASET_PATH
}
\
--dataloader-type cyclic
\
--dataset JSON-SFT"
else
dataset_options
=
"
\
--data-path
${
DATASET_PATH
}
\
--split 99,1,0"
fi
if
[
${
MP_SFT_PACKING
}
=
true
]
;
then
echo
"Currently MLA do not support THD format attention, thus sequence packing can not be used..."
packing_options
=
""
else
packing_options
=
""
fi
##### Prepare logdirs #######
NAME
=
"
${
PREFIX
}
-pr-
${
PR
}
-tp-
${
TP
}
-pp-
${
PP
}
-cp-
${
CP
}
-ac-
${
AC
}
-do-
${
DO
}
-sp-
${
SP
}
-ti-
${
TRAIN_ITERS
}
-wi-
${
LR_WARMUP_ITERS
}
"
mkdir
-p
"
${
OUTPUT_BASEPATH
}
/tensorboard/"
mkdir
-p
"
${
OUTPUT_BASEPATH
}
/checkpoint/"
mkdir
-p
"
${
OUTPUT_BASEPATH
}
/log/"
current_time
=
$(
date
"+%Y.%m.%d-%H.%M.%S"
)
TENSORBOARD_DIR
=
"
${
OUTPUT_BASEPATH
}
/tensorboard/
${
NAME
}
_
${
current_time
}
"
mkdir
-p
${
TENSORBOARD_DIR
}
SAVED_PRETRAIN_CHECKPOINT_PATH
=
"
${
OUTPUT_BASEPATH
}
/checkpoint/
${
NAME
}
"
mkdir
-p
${
SAVED_PRETRAIN_CHECKPOINT_PATH
}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
#find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "merges.txt" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options=" \
    --lr ${LR} \
    --min-lr ${MIN_LR} \
    --lr-decay-style cosine \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --clip-grad 1.0 \
    --init-method-std 0.008 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --lr-decay-iters ${LR_DECAY_ITERS} \
    --lr-warmup-iters ${LR_WARMUP_ITERS} \
    --train-iters ${TRAIN_ITERS} \
    --micro-batch-size ${BATCH_SIZE} \
    --global-batch-size ${GLOBAL_BATCH_SIZE} \
    --num-layers ${NUM_LAYERS} \
    --hidden-size ${HIDDEN_SIZE} \
    --num-attention-heads ${NUM_ATTENTION_HEADS} \
    --ffn-hidden-size ${INTERMEDIATE_SIZE} \
    --seq-length ${SEQ_LEN} \
    --max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
    --log-interval 1 \
    --log-throughput \
    --eval-interval 10000 \
    --eval-iters 3 \
    --save-interval ${SAVE_INTERVAL} \
    --tensorboard-queue-size 1 \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --log-timers-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --context-parallel-size ${CP} \
    --no-load-optim \
    --no-load-rng \
    --num-workers 8 \
    --extra-vocab-size ${EXTRA_VOCAB_SIZE} \
    --tokenizer-type DeepSeekV2Tokenizer \
    --tokenizer-model ${TOKENIZER_MODEL_PATH} \
    --swiglu \
    --normalization RMSNorm \
    --norm-epsilon ${RMS_NORM_EPS} \
    --use-rotary-position-embeddings \
    --no-rope-fusion \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --rotary-base ${ROPE_THETA} \
    --rotary-scaling-factor ${SCALE_FACTOR} \
    --no-save-optim \
    --kv-channels ${V_HEAD_DIM} \
    --qk-layernorm \
    --multi-latent-attention \
    --ckpt-format torch \
    --transformer-impl transformer_engine \
    --no-masked-softmax-fusion \
    --use-rope-scaling \
    "
TORCH_PROFIE_ARGS=" \
    --profile \
    --profile-ranks 0 1 2 3 4 5 6 7 \
    --profile-step-start 3 \
    --profile-step-end 4 \
    --profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
    --use-pytorch-profiler \
    "

HIP_PROFIE_ARGS=" \
    --profile \
    --profile-ranks 0 1 2 3 4 5 6 7 \
    --profile-step-start 4 \
    --profile-step-end 5 \
    --use-hip-profiler \
    "

DISTRIBUTED_ARGS=" \
    --rank ${RANK} \
    --world-size ${WORLD_SIZE} \
    --local-rank ${LOCAL_RANK} \
    --dist-url tcp://${DIST_URL}:${DIST_PORT} \
    "
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py ${megatron_options} \
    ${dataset_options} \
    ${pr_options} \
    ${load_option} \
    ${activation_checkpoint_options} \
    ${do_option} \
    ${sp_option} \
    ${moe_options} \
    ${offload_option} \
    ${vp_option} \
    ${packing_options} \
    ${uneven_split_option} \
    ${attn_backend_option} \
    ${mtp_options} \
    ${comm_overlap_option} \
    ${DISTRIBUTED_ARGS} \
    "
if [[ $profiling == "torch" ]]; then
    APP+="${TORCH_PROFIE_ARGS}"
elif [[ $profiling == "hip" ]]; then
    mkdir -p hip_prof_data
    APP+="${HIP_PROFIE_ARGS}"
    APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
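# Usage sketch (assuming the run_deepseekv3_671B_*.sh wrapper forwards the flag):
#     bash run_deepseekv3_671B_4nodes.sh --profiling=torch   # PyTorch profiler, steps 3-4
#     bash run_deepseekv3_671B_4nodes.sh --profiling=hip     # hipprof trace written to hip_prof_data/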
# for hygon cpu
case ${LOCAL_RANK} in
[0])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    ;;
[4])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    ;;
[5])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    ;;
[6])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    ;;
[7])
    export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    ;;
esac
examples/gpt3/README.md
deleted 100644 → 0
View file @ 8551c38e
# GPT3 MODEL

## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)

## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>

To run the model using a docker container, run it as follows:

```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH=""        # <Specify path>
TENSORBOARD_LOGS_PATH=""  # <Specify path>
VOCAB_FILE=""             # <Specify path to file>/gpt2-vocab.json
MERGE_FILE=""             # <Specify path to file>/gpt2-merges.txt
DATA_PATH=""              # <Specify path and file prefix>_text_document

docker run \
  --gpus=all \
  --ipc=host \
  --workdir /workspace/megatron-lm \
  -v /path/to/data:/path/to/data \
  -v /path/to/megatron-lm:/workspace/megatron-lm \
  megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
  bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```

NOTE: Depending on the environment you are running it in, the above command might look slightly different.

## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>

The example in this folder shows how to run the 175B model. There are other configs you could run as well:

### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```

### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
examples/gpt3/run_gpt_567B_1nodes.sh
View file @ 70368616
...
@@ -6,7 +6,7 @@ do
 done
 mpirun -np 8 --allow-run-as-root \
-         train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+         train_gpt_567B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
...
examples/gpt3/run_gpt_567B_multinodes.sh
View file @ 70368616
...
@@ -5,11 +5,11 @@ do
   fi
 done
-mpirun -np 512 --hostfile hostfile_gpt_567B \
+mpirun -np 1024 --hostfile hostfile_gpt_567B \
        --allow-run-as-root \
        --bind-to none \
        --mca plm_rsh_no_tree_spawn 1 \
-        train_gpt_567B_multinodes.sh node059 --profiling=$profiling > output.log 2>&1
+        train_gpt_567B_multinodes.sh node059 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
 wait
...
examples/gpt3/train_gpt_567B_1nodes.sh
View file @ 70368616
...
@@ -93,11 +93,11 @@ TRAINING_ARGS=(
     --global-batch-size 256
     --lr 1e-4
     --train-iters 10
-    --lr-decay-iters 320000
+    --lr-decay-iters 10000
     --lr-decay-style cosine
-    --min-lr 1.0e-5
+    --min-lr 1.0e-6
     --weight-decay 0.1
-    --lr-warmup-iters 500
+    --lr-warmup-iters 2000
     --clip-grad 1.0
     --bf16
     --overlap-param-gather
...
@@ -126,6 +126,7 @@ MODEL_PARALLEL_ARGS=(
     --pipeline-model-parallel-size 1
     --expert-model-parallel-size 4
     --expert-tensor-parallel-size 2
+    --context-parallel-size 1
     --use-distributed-optimizer
     --sequence-parallel
 )
...
@@ -173,42 +174,34 @@ fi
 case ${LOCAL_RANK} in
 [0])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [1])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=1 --membind=1 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [2])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=2 --membind=2 ${APP}
+    numactl --cpunodebind=2 --membind=2 ${APP}
     ;;
 [3])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=3 --membind=3 ${APP}
+    numactl --cpunodebind=3 --membind=3 ${APP}
     ;;
 [4])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=4 --membind=4 ${APP}
+    numactl --cpunodebind=4 --membind=4 ${APP}
     ;;
 [5])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=5 --membind=5 ${APP}
+    numactl --cpunodebind=5 --membind=5 ${APP}
     ;;
 [6])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=6 --membind=6 ${APP}
+    numactl --cpunodebind=6 --membind=6 ${APP}
     ;;
 [7])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=7 --membind=7 ${APP}
+    numactl --cpunodebind=7 --membind=7 ${APP}
     ;;
 esac
examples/gpt3/train_gpt_567B_multinodes.sh
View file @ 70368616
...
@@ -90,14 +90,14 @@ DATA_ARGS=(
 TRAINING_ARGS=(
     --micro-batch-size 1
-    --global-batch-size 1024
+    --global-batch-size 2048
     --lr 1e-4
     --train-iters 10
-    --lr-decay-iters 320000
+    --lr-decay-iters 10000
     --lr-decay-style cosine
-    --min-lr 1.0e-5
+    --min-lr 1.0e-6
     --weight-decay 0.1
-    --lr-warmup-iters 500
+    --lr-warmup-iters 2000
     --clip-grad 1.0
     --bf16
     --overlap-param-gather
...
@@ -109,7 +109,7 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 4 5 6 7
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_gpt_64nodes_tp4-pp8-ep16-ep_tp4-cp2
+    --profile-dir torch_prof_gpt_64nodes_tp4-pp16-ep16-ep_tp4-cp2
     --use-pytorch-profiler
 )
...
@@ -123,11 +123,10 @@ HIP_PROFIE_ARGS=(
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 4
-    --pipeline-model-parallel-size 8
+    --pipeline-model-parallel-size 16
     --expert-model-parallel-size 16
     --expert-tensor-parallel-size 4
     --context-parallel-size 2
-    #--num-layers-per-virtual-pipeline-stage 2
     --use-distributed-optimizer
     --sequence-parallel
 )
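A quick sanity check of the updated layout (a sketch using only the values shown above): one model replica now spans TP x PP x CP = 4 x 16 x 2 = 128 ranks, so the 1024-rank mpirun launch in run_gpt_567B_multinodes.sh corresponds to a data-parallel size of 8.

TP=4; PP=16; CP=2; NP=1024
echo "data-parallel size = $(( NP / (TP * PP * CP) ))"   # 1024 / 128 = 8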
...
@@ -175,42 +174,34 @@ fi
 case ${LOCAL_RANK} in
 [0])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [1])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=1 --membind=1 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [2])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=2 --membind=2 ${APP}
+    numactl --cpunodebind=2 --membind=2 ${APP}
     ;;
 [3])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=3 --membind=3 ${APP}
+    numactl --cpunodebind=3 --membind=3 ${APP}
     ;;
 [4])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=4 --membind=4 ${APP}
+    numactl --cpunodebind=4 --membind=4 ${APP}
     ;;
 [5])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=5 --membind=5 ${APP}
+    numactl --cpunodebind=5 --membind=5 ${APP}
     ;;
 [6])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=6 --membind=6 ${APP}
+    numactl --cpunodebind=6 --membind=6 ${APP}
     ;;
 [7])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=7 --membind=7 ${APP}
+    numactl --cpunodebind=7 --membind=7 ${APP}
     ;;
 esac
examples/mixtral/run_mixtral_8x22B_1nodes.sh
View file @ 70368616
...
@@ -6,7 +6,7 @@ do
 done
 mpirun -np 8 --allow-run-as-root \
-         train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+         train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
...
examples/mixtral/run_mixtral_8x22B_multinodes.sh
View file @ 70368616
...
@@ -9,7 +9,7 @@ mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
        --allow-run-as-root \
        --bind-to none \
        --mca plm_rsh_no_tree_spawn 1 \
-        train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1
+        train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > log-8nodes-`date +%F-%H%M`.log 2>&1
 wait
...
examples/mixtral/run_mixtral_8x7B_1nodes.sh
View file @ 70368616
...
@@ -6,7 +6,7 @@ do
 done
 mpirun -np 8 --allow-run-as-root \
-         train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
+         train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
 wait
...
examples/mixtral/run_mixtral_8x7B_multinodes.sh
View file @ 70368616
...
@@ -9,7 +9,7 @@ mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
        --allow-run-as-root \
        --bind-to none \
        --mca plm_rsh_no_tree_spawn 1 \
-        train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > output.log 2>&1
+        train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
 wait
...
examples/mixtral/train_mixtral_8x22B_1nodes.sh
View file @ 70368616
...
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
     --global-batch-size 256
     --lr 1e-4
     --train-iters 10
-    --lr-decay-iters 320000
+    --lr-decay-iters 10000
     --lr-decay-style cosine
-    --min-lr 1.0e-5
+    --min-lr 1.0e-6
     --weight-decay 0.1
-    --lr-warmup-iters 500
+    --lr-warmup-iters 2000
     --clip-grad 1.0
     --bf16
     --overlap-param-gather
...
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
     --pipeline-model-parallel-size 1
     --expert-model-parallel-size 8
     --expert-tensor-parallel-size 1
+    --context-parallel-size 1
     --use-distributed-optimizer
     --sequence-parallel
 )
...
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
     #--load $CHECKPOINT_PATH \
     --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
     --no-load-optim \
-    --no-load-rng
+    --no-load-rng \
+    --no-save-optim
 )
 if [ -n "${WANDB_API_KEY}" ]; then
...
@@ -175,43 +177,34 @@ fi
 case ${LOCAL_RANK} in
 [0])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [1])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=1 --membind=1 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [2])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=2 --membind=2 ${APP}
+    numactl --cpunodebind=2 --membind=2 ${APP}
     ;;
 [3])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=3 --membind=3 ${APP}
+    numactl --cpunodebind=3 --membind=3 ${APP}
     ;;
 [4])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=4 --membind=4 ${APP}
+    numactl --cpunodebind=4 --membind=4 ${APP}
     ;;
 [5])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=5 --membind=5 ${APP}
+    numactl --cpunodebind=5 --membind=5 ${APP}
     ;;
 [6])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=6 --membind=6 ${APP}
+    numactl --cpunodebind=6 --membind=6 ${APP}
     ;;
 [7])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=7 --membind=7 ${APP}
+    numactl --cpunodebind=7 --membind=7 ${APP}
     ;;
 esac
examples/mixtral/train_mixtral_8x22B_multinodes.sh
View file @ 70368616
...
@@ -96,11 +96,11 @@ TRAINING_ARGS=(
     --global-batch-size 256
     --lr 1e-4
     --train-iters 10
-    --lr-decay-iters 320000
+    --lr-decay-iters 10000
     --lr-decay-style cosine
-    --min-lr 1.0e-5
+    --min-lr 1.0e-6
     --weight-decay 0.1
-    --lr-warmup-iters 500
+    --lr-warmup-iters 2000
     --clip-grad 1.0
     --bf16
     --overlap-param-gather
...
@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 4 5 6 7
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
+    --profile-dir torch_prof_mixtral8x22B_1nodes_tp4-pp8-ep8-ep_tp1-cp1
     --use-pytorch-profiler
 )
...
@@ -129,6 +129,7 @@ MODEL_PARALLEL_ARGS=(
     --pipeline-model-parallel-size 8
     --expert-model-parallel-size 8
     --expert-tensor-parallel-size 1
+    --context-parallel-size 1
     --use-distributed-optimizer
     --sequence-parallel
 )
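As a rough consistency check (assuming TP=4, as the profile-dir name tp4-pp8-ep8-ep_tp1-cp1 above suggests): one Mixtral 8x22B replica spans TP x PP x CP = 4 x 8 x 1 = 32 ranks, so the 64-rank mpirun launch in run_mixtral_8x22B_multinodes.sh corresponds to a data-parallel size of 2.

TP=4; PP=8; CP=1; NP=64
echo "data-parallel size = $(( NP / (TP * PP * CP) ))"   # 64 / 32 = 2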
...
@@ -143,7 +144,8 @@ LOGGING_ARGS=(
     #--load $CHECKPOINT_PATH \
     --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
     --no-load-optim \
-    --no-load-rng
+    --no-load-rng \
+    --no-save-optim
 )
 if [ -n "${WANDB_API_KEY}" ]; then
...
@@ -175,43 +177,34 @@ fi
 case ${LOCAL_RANK} in
 [0])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=0 --membind=0 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [1])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=1 --membind=1 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [2])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=2 --membind=2 ${APP}
+    numactl --cpunodebind=2 --membind=2 ${APP}
     ;;
 [3])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=3 --membind=3 ${APP}
+    numactl --cpunodebind=3 --membind=3 ${APP}
     ;;
 [4])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=4 --membind=4 ${APP}
+    numactl --cpunodebind=4 --membind=4 ${APP}
     ;;
 [5])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=5 --membind=5 ${APP}
+    numactl --cpunodebind=5 --membind=5 ${APP}
     ;;
 [6])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=6 --membind=6 ${APP}
+    numactl --cpunodebind=6 --membind=6 ${APP}
     ;;
 [7])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    ${APP}
-    #numactl --cpunodebind=7 --membind=7 ${APP}
+    numactl --cpunodebind=7 --membind=7 ${APP}
     ;;
 esac