evt_fugx1 / dcu_megatron · Commits

Commit fe0b03b5
Authored May 12, 2025 by silencealiang
Parent: ee3ff5df

    fix llama2 bug and update file format

Changes: 26
Showing 6 changed files with 130 additions and 280 deletions (+130, -280)
examples/mixtral/topo-input.xml                                          +0    -162
examples/mixtral/train_mixtral_8x22B_1nodes.sh                           +33   -30
examples/mixtral/train_mixtral_8x22B_multinodes.sh                       +33   -30
examples/mixtral/train_mixtral_8x7B_1nodes.sh                            +32   -29
examples/mixtral/train_mixtral_8x7B_multinodes.sh                        +32   -29
topo-input.xml (moved from examples/deepseek_v3/topo-input.xml)          +0    -0
examples/mixtral/topo-input.xml  (deleted, file mode 100644 → 0)
<system version="2">
  <cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
      <nic>
        <net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
        <net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
      </nic>
    </pci>
  </cpu>
</system>
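
The per-example copy of this topology file is removed; the training scripts below now point RCCL/NCCL at the single topo-input.xml kept at the repository root (moved from examples/deepseek_v3/ at the end of this commit). A minimal sketch of the intended lookup, assuming MEGATRON_PATH points at the checkout; the fallback path below is only a placeholder:

    # Sketch only: MEGATRON_PATH is expected to be set by the caller; the
    # fallback path here is a placeholder, not something defined by this commit.
    export MEGATRON_PATH="${MEGATRON_PATH:-/workspace/dcu_megatron}"

    # The updated scripts resolve the topology file from the repository root
    # instead of the deleted per-example "./topo-input.xml" copy shown above.
    export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"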
examples/mixtral/train_mixtral_8x22B_1nodes.sh

@@ -2,17 +2,25 @@
for para in $*
do
    if [[ $para == --profiling* ]]; then
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}

@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL}
    --data-path $DATA_PATH
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 99990,8,2
)

@@ -107,23 +110,6 @@ TRAINING_ARGS=(
    --overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 1

@@ -148,10 +134,27 @@ LOGGING_ARGS=(
    --no-save-optim
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-etp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
    LOGGING_ARGS+=(
        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
    )
fi
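
The argument-parsing loop added at the top of this script (and repeated in the three scripts below) can be exercised on its own. A standalone sketch of the same pattern; the quoted "$@" and the echo lines are illustrative additions, while the scripts themselves iterate over $*:

    #!/bin/bash
    # Parse --key=value flags the way the updated launchers do: ${para#*=}
    # strips everything up to and including the first "=".
    for para in "$@"
    do
        if [[ $para == --data_path* ]]; then
            data_path=${para#*=}
        elif [[ $para == --tokenizer_path* ]]; then
            tokenizer_path=${para#*=}
        elif [[ $para == --checkpoint_path* ]]; then
            checkpoint_path=${para#*=}
        elif [[ $para == --profiling* ]]; then
            profiling=${para#*=}
        fi
    done

    # Illustration only: show what was parsed.
    echo "data_path=${data_path}"
    echo "tokenizer_path=${tokenizer_path}"
    echo "checkpoint_path=${checkpoint_path}"
    echo "profiling=${profiling}"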
examples/mixtral/train_mixtral_8x22B_multinodes.sh

@@ -2,17 +2,25 @@
for para in $*
do
    if [[ $para == --profiling* ]]; then
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}

@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL}
    --data-path $DATA_PATH
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 99990,8,2
)

@@ -107,23 +110,6 @@ TRAINING_ARGS=(
    --overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 8

@@ -148,10 +134,27 @@ LOGGING_ARGS=(
    --no-save-optim
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-etp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
    LOGGING_ARGS+=(
        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
    )
fi
examples/mixtral/train_mixtral_8x7B_1nodes.sh

@@ -2,17 +2,25 @@
for para in $*
do
    if [[ $para == --profiling* ]]; then
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}

@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL}
    --data-path $DATA_PATH
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 99990,8,2
)

@@ -107,23 +110,6 @@ TRAINING_ARGS=(
    --overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 1

@@ -148,6 +134,23 @@ LOGGING_ARGS=(
    --no-save-optim
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-etp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
    LOGGING_ARGS+=(
        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
examples/mixtral/train_mixtral_8x7B_multinodes.sh

@@ -2,17 +2,25 @@
for para in $*
do
    if [[ $para == --profiling* ]]; then
    if [[ $para == --data_path* ]]; then
        data_path=${para#*=}
    elif [[ $para == --tokenizer_path* ]]; then
        tokenizer_path=${para#*=}
    elif [[ $para == --checkpoint_path* ]]; then
        checkpoint_path=${para#*=}
    elif [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}

@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${TOKENIZER_MODEL}
    --data-path $DATA_PATH
    --tokenizer-model ${TOKENIZER_MODEL_PATH}
    --data-path ${DATA_PATH}
    --split 99990,8,2
)

@@ -107,23 +110,6 @@ TRAINING_ARGS=(
    --overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 8 9 10 11
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-ep_tp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 2
    --pipeline-model-parallel-size 4

@@ -148,6 +134,23 @@ LOGGING_ARGS=(
    --no-save-optim
)
TORCH_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 8 9 10 11
    --profile-step-start 3
    --profile-step-end 4
    --profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-etp1-cp1
    --use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
    --profile
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-step-start 4
    --profile-step-end 5
    --use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
    LOGGING_ARGS+=(
        --wandb-project ${WANDB_PROJECT:-"Mixtral"}
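
Taken together, the reworked launchers take the rendezvous address and port as the first two positional arguments (DIST_URL and DIST_PORT) and the data, tokenizer, and checkpoint locations as --key=value flags, while the rank variables come from OpenMPI. A hedged usage sketch; the host, port, and every path below are placeholders rather than values from this commit:

    # Placeholder values throughout; adjust to the actual cluster and data layout.
    export MEGATRON_PATH=/workspace/dcu_megatron        # assumed checkout location
    mpirun -np 8 \
        bash ${MEGATRON_PATH}/examples/mixtral/train_mixtral_8x7B_1nodes.sh \
            10.0.0.1 25900 \
            --data_path=/data/my-mixtral_text_document \
            --tokenizer_path=/data/tokenizer.model \
            --checkpoint_path=/ckpt/mixtral_8x7B \
            --profiling=false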
examples/deepseek_v3/topo-input.xml → topo-input.xml  (file moved, no content changes)