evt_fugx1/dcu_megatron
"vscode:/vscode.git/clone" did not exist on "c67cf3c41ec77cca96ec81e8b0acaa5da6a96895"
Commit fe0b03b5, authored May 12, 2025 by silencealiang
fix llama2 bug and update file format
parent ee3ff5df

Changes 26
Showing 6 changed files with 130 additions and 280 deletions
examples/mixtral/topo-input.xml                       +0   -162
examples/mixtral/train_mixtral_8x22B_1nodes.sh        +33  -30
examples/mixtral/train_mixtral_8x22B_multinodes.sh    +33  -30
examples/mixtral/train_mixtral_8x7B_1nodes.sh         +32  -29
examples/mixtral/train_mixtral_8x7B_multinodes.sh     +32  -29
topo-input.xml                                        +0   -0
examples/mixtral/topo-input.xml (deleted, 100644 → 0)
<system version="2">
  <cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
          <gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
            <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
            <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
          </gpu>
        </pci>
      </pci>
      <pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
        <pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
          <pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
            <gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
              <xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
              <xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
            </gpu>
          </pci>
        </pci>
      </pci>
      <pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
          <net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
    <pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
      <nic>
        <net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
        <net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
      </nic>
    </pci>
  </cpu>
</system>
examples/mixtral/train_mixtral_8x22B_1nodes.sh
@@ -2,17 +2,25 @@
 for para in $*
 do
-    if [[ $para == --profiling* ]]; then
+    if [[ $para == --data_path* ]]; then
+        data_path=${para#*=}
+    elif [[ $para == --tokenizer_path* ]]; then
+        tokenizer_path=${para#*=}
+    elif [[ $para == --checkpoint_path* ]]; then
+        checkpoint_path=${para#*=}
+    elif [[ $para == --profiling* ]]; then
         profiling=${para#*=}
     fi
 done
 # Runs Mixtral 8x22B model
+# data path
+source /opt/dtk/env.sh
+DATA_PATH=${data_path}
+TOKENIZER_MODEL_PATH=${tokenizer_path}
+CHECKPOINT_PATH=${checkpoint_path}
 # default env
 DIST_URL=${1}
-DIST_PORT=25900
+DIST_PORT=${2}
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="./topo-input.xml"
+export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-# data path
-CHECKPOINT_PATH="path to CKPT"
-TOKENIZER_MODEL="path to tokenizer.model"
-DATA_PATH="path to my-mixtral_text_document"
 DISTRIBUTED_ARGS=(
     --rank ${RANK}
     --world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
 DATA_ARGS=(
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model ${TOKENIZER_MODEL}
-    --data-path $DATA_PATH
+    --tokenizer-model ${TOKENIZER_MODEL_PATH}
+    --data-path ${DATA_PATH}
     --split 99990,8,2
 )
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
     --overlap-grad-reduce
 )
-TORCH_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 3
-    --profile-step-end 4
-    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
-    --use-pytorch-profiler
-)
-HIP_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 4
-    --profile-step-end 5
-    --use-hip-profiler
-)
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 1
@@ -148,10 +134,27 @@ LOGGING_ARGS=(
     --no-save-optim
 )
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-etp1-cp1
+    --use-pytorch-profiler
+)
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 if [ -n "${WANDB_API_KEY}" ]; then
     LOGGING_ARGS+=(
         --wandb-project ${WANDB_PROJECT:-"Mixtral"}
-        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
+        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
     )
 fi
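With this commit the launch scripts stop hard-coding CHECKPOINT_PATH, TOKENIZER_MODEL, and DATA_PATH and instead parse --data_path=, --tokenizer_path=, and --checkpoint_path= arguments, taking the rendezvous address and port from the first two positional parameters. A minimal invocation sketch for the updated single-node script follows; the repository location, host address, rank count, and data/checkpoint paths are placeholders, not values from this commit:

# Hypothetical launch under Open MPI (the scripts read OMPI_COMM_WORLD_* for rank/world size).
# MEGATRON_PATH is assumed to point at the repository root so that
# NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml" resolves to the relocated topology file.
export MEGATRON_PATH=/path/to/dcu_megatron
cd "${MEGATRON_PATH}"
mpirun -np 8 bash examples/mixtral/train_mixtral_8x22B_1nodes.sh \
    127.0.0.1 25900 \
    --data_path=/data/my-mixtral_text_document \
    --tokenizer_path=/data/tokenizer.model \
    --checkpoint_path=/ckpt/mixtral_8x22B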
examples/mixtral/train_mixtral_8x22B_multinodes.sh
@@ -2,17 +2,25 @@
 for para in $*
 do
-    if [[ $para == --profiling* ]]; then
+    if [[ $para == --data_path* ]]; then
+        data_path=${para#*=}
+    elif [[ $para == --tokenizer_path* ]]; then
+        tokenizer_path=${para#*=}
+    elif [[ $para == --checkpoint_path* ]]; then
+        checkpoint_path=${para#*=}
+    elif [[ $para == --profiling* ]]; then
         profiling=${para#*=}
     fi
 done
 # Runs Mixtral 8x22B model
+# data path
+source /opt/dtk/env.sh
+DATA_PATH=${data_path}
+TOKENIZER_MODEL_PATH=${tokenizer_path}
+CHECKPOINT_PATH=${checkpoint_path}
 # default env
 DIST_URL=${1}
-DIST_PORT=25900
+DIST_PORT=${2}
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="./topo-input.xml"
+export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-# data path
-CHECKPOINT_PATH="path to CKPT"
-TOKENIZER_MODEL="path to tokenizer.model"
-DATA_PATH="path to my-mixtral_text_document"
 DISTRIBUTED_ARGS=(
     --rank ${RANK}
     --world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
 DATA_ARGS=(
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model ${TOKENIZER_MODEL}
-    --data-path $DATA_PATH
+    --tokenizer-model ${TOKENIZER_MODEL_PATH}
+    --data-path ${DATA_PATH}
     --split 99990,8,2
 )
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
     --overlap-grad-reduce
 )
-TORCH_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 3
-    --profile-step-end 4
-    --profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
-    --use-pytorch-profiler
-)
-HIP_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 4
-    --profile-step-end 5
-    --use-hip-profiler
-)
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 4
     --pipeline-model-parallel-size 8
@@ -148,10 +134,27 @@ LOGGING_ARGS=(
     --no-save-optim
 )
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-etp1-cp1
+    --use-pytorch-profiler
+)
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 if [ -n "${WANDB_API_KEY}" ]; then
     LOGGING_ARGS+=(
         --wandb-project ${WANDB_PROJECT:-"Mixtral"}
-        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
+        --wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
     )
 fi
examples/mixtral/train_mixtral_8x7B_1nodes.sh
@@ -2,17 +2,25 @@
 for para in $*
 do
-    if [[ $para == --profiling* ]]; then
+    if [[ $para == --data_path* ]]; then
+        data_path=${para#*=}
+    elif [[ $para == --tokenizer_path* ]]; then
+        tokenizer_path=${para#*=}
+    elif [[ $para == --checkpoint_path* ]]; then
+        checkpoint_path=${para#*=}
+    elif [[ $para == --profiling* ]]; then
         profiling=${para#*=}
     fi
 done
 # Runs Mixtral 8x7B model
+# data path
+source /opt/dtk/env.sh
+DATA_PATH=${data_path}
+TOKENIZER_MODEL_PATH=${tokenizer_path}
+CHECKPOINT_PATH=${checkpoint_path}
 # default env
 DIST_URL=${1}
-DIST_PORT=25900
+DIST_PORT=${2}
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="./topo-input.xml"
+export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-# data path
-CHECKPOINT_PATH="path to CKPT"
-TOKENIZER_MODEL="path to tokenizer.model"
-DATA_PATH="path to my-mixtral_text_document"
 DISTRIBUTED_ARGS=(
     --rank ${RANK}
     --world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
 DATA_ARGS=(
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model ${TOKENIZER_MODEL}
-    --data-path $DATA_PATH
+    --tokenizer-model ${TOKENIZER_MODEL_PATH}
+    --data-path ${DATA_PATH}
     --split 99990,8,2
 )
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
     --overlap-grad-reduce
 )
-TORCH_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 3
-    --profile-step-end 4
-    --profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
-    --use-pytorch-profiler
-)
-HIP_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 4
-    --profile-step-end 5
-    --use-hip-profiler
-)
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 1
@@ -148,6 +134,23 @@ LOGGING_ARGS=(
     --no-save-optim
 )
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-etp1-cp1
+    --use-pytorch-profiler
+)
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 if [ -n "${WANDB_API_KEY}" ]; then
     LOGGING_ARGS+=(
         --wandb-project ${WANDB_PROJECT:-"Mixtral"}
examples/mixtral/train_mixtral_8x7B_multinodes.sh
@@ -2,17 +2,25 @@
 for para in $*
 do
-    if [[ $para == --profiling* ]]; then
+    if [[ $para == --data_path* ]]; then
+        data_path=${para#*=}
+    elif [[ $para == --tokenizer_path* ]]; then
+        tokenizer_path=${para#*=}
+    elif [[ $para == --checkpoint_path* ]]; then
+        checkpoint_path=${para#*=}
+    elif [[ $para == --profiling* ]]; then
         profiling=${para#*=}
     fi
 done
 # Runs Mixtral 8x7B model
+# data path
+source /opt/dtk/env.sh
+DATA_PATH=${data_path}
+TOKENIZER_MODEL_PATH=${tokenizer_path}
+CHECKPOINT_PATH=${checkpoint_path}
 # default env
 DIST_URL=${1}
-DIST_PORT=25900
+DIST_PORT=${2}
 RANK=$OMPI_COMM_WORLD_RANK
 LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
 export NCCL_NET_GDR_READ=1
 export RCCL_SDMA_COPY_ENABLE=0
 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
-export NCCL_TOPO_FILE="./topo-input.xml"
+export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-# data path
-CHECKPOINT_PATH="path to CKPT"
-TOKENIZER_MODEL="path to tokenizer.model"
-DATA_PATH="path to my-mixtral_text_document"
 DISTRIBUTED_ARGS=(
     --rank ${RANK}
     --world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
 DATA_ARGS=(
     --tokenizer-type Llama2Tokenizer
-    --tokenizer-model ${TOKENIZER_MODEL}
-    --data-path $DATA_PATH
+    --tokenizer-model ${TOKENIZER_MODEL_PATH}
+    --data-path ${DATA_PATH}
     --split 99990,8,2
 )
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
     --overlap-grad-reduce
 )
-TORCH_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 8 9 10 11
-    --profile-step-start 3
-    --profile-step-end 4
-    --profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-ep_tp1-cp1
-    --use-pytorch-profiler
-)
-HIP_PROFIE_ARGS=(
-    --profile
-    --profile-ranks 0 1 2 3 4 5 6 7
-    --profile-step-start 4
-    --profile-step-end 5
-    --use-hip-profiler
-)
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
     --pipeline-model-parallel-size 4
@@ -148,6 +134,23 @@ LOGGING_ARGS=(
     --no-save-optim
 )
+TORCH_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 8 9 10 11
+    --profile-step-start 3
+    --profile-step-end 4
+    --profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-etp1-cp1
+    --use-pytorch-profiler
+)
+HIP_PROFIE_ARGS=(
+    --profile
+    --profile-ranks 0 1 2 3 4 5 6 7
+    --profile-step-start 4
+    --profile-step-end 5
+    --use-hip-profiler
+)
 if [ -n "${WANDB_API_KEY}" ]; then
     LOGGING_ARGS+=(
         --wandb-project ${WANDB_PROJECT:-"Mixtral"}
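The multi-node variants parse the same arguments; only the parallel layout and the launcher invocation differ. Below is a sketch of a 4-node, 32-rank launch of the 8x7B multi-node script, assuming an Open MPI hostfile; every host name, path, and rank count here is a placeholder rather than a value taken from this commit:

# Hypothetical multi-node launch; hosts.txt, node0, and all paths are placeholders.
export MEGATRON_PATH=/path/to/dcu_megatron
cd "${MEGATRON_PATH}"
mpirun -np 32 --hostfile hosts.txt -x MEGATRON_PATH \
    bash examples/mixtral/train_mixtral_8x7B_multinodes.sh \
    node0 25900 \
    --data_path=/data/my-mixtral_text_document \
    --tokenizer_path=/data/tokenizer.model \
    --checkpoint_path=/ckpt/mixtral_8x7B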
examples/deepseek_v3/topo-input.xml → topo-input.xml (file moved)