Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
evt_fugx1
dcu_megatron
Commits
2a0c4358
Commit
2a0c4358
authored
Jun 10, 2025
by
silencealiang
Browse files
fix parameters bug
parent
3b081313
Changes
16
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
25 additions
and
56 deletions
+25
-56
examples/deepseek_v3/train_deepseekv3_671B_128nodes.sh
examples/deepseek_v3/train_deepseekv3_671B_128nodes.sh
+1
-1
examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh
examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh
+1
-1
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh
+1
-1
examples/gpt3/train_gpt_567B_128nodes.sh
examples/gpt3/train_gpt_567B_128nodes.sh
+1
-1
examples/gpt3/train_gpt_567B_1nodes.sh
examples/gpt3/train_gpt_567B_1nodes.sh
+1
-1
examples/mixtral/train_mixtral_8x22B_1nodes.sh
examples/mixtral/train_mixtral_8x22B_1nodes.sh
+1
-1
examples/mixtral/train_mixtral_8x22B_8nodes.sh
examples/mixtral/train_mixtral_8x22B_8nodes.sh
+1
-1
examples/mixtral/train_mixtral_8x7B_1nodes.sh
examples/mixtral/train_mixtral_8x7B_1nodes.sh
+1
-1
examples/mixtral/train_mixtral_8x7B_4nodes.sh
examples/mixtral/train_mixtral_8x7B_4nodes.sh
+1
-1
requirements/launch_with_binding.sh
requirements/launch_with_binding.sh
+6
-1
requirements/nccl_zz/env.sh
requirements/nccl_zz/env.sh
+10
-5
requirements/nccl_zz/lib-v8/librccl-net.a
requirements/nccl_zz/lib-v8/librccl-net.a
+0
-0
requirements/nccl_zz/lib-v8/librccl-net.la
requirements/nccl_zz/lib-v8/librccl-net.la
+0
-41
requirements/nccl_zz/lib-v8/librccl-net.so
requirements/nccl_zz/lib-v8/librccl-net.so
+0
-0
requirements/nccl_zz/lib-v8/librccl-net.so.0
requirements/nccl_zz/lib-v8/librccl-net.so.0
+0
-0
requirements/nccl_zz/lib-v8/librccl-net.so.0.0.0
requirements/nccl_zz/lib-v8/librccl-net.so.0.0.0
+0
-0
No files found.
examples/deepseek_v3/train_deepseekv3_671B_128nodes.sh
View file @
2a0c4358
...
...
@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then
moe_options
=
"
\
--moe-grouped-gemm
\
--moe-expert-capacity-factor
0.5
\
--moe-expert-capacity-factor
1
\
--moe-pad-expert-input-to-capacity
\
--moe-token-dispatcher-type alltoall
\
--moe-router-topk
${
ROUTER_TOPK
}
\
...
...
examples/deepseek_v3/train_deepseekv3_671B_1nodes.sh
View file @
2a0c4358
...
...
@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then
moe_options
=
"
\
--moe-grouped-gemm
\
--moe-expert-capacity-factor
0.5
\
--moe-expert-capacity-factor
1
\
--moe-pad-expert-input-to-capacity
\
--moe-token-dispatcher-type alltoall
\
--moe-router-topk
${
ROUTER_TOPK
}
\
...
...
examples/deepseek_v3/train_deepseekv3_671B_4nodes.sh
View file @
2a0c4358
...
...
@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then
moe_options
=
"
\
--moe-grouped-gemm
\
--moe-expert-capacity-factor
0.5
\
--moe-expert-capacity-factor
1
\
--moe-pad-expert-input-to-capacity
\
--moe-token-dispatcher-type alltoall
\
--moe-router-topk
${
ROUTER_TOPK
}
\
...
...
examples/gpt3/train_gpt_567B_128nodes.sh
View file @
2a0c4358
...
...
@@ -70,7 +70,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type
aux_loss
--moe-aux-loss-coeff
1e-2
--moe-token-dispatcher-type
alltoall
--moe-expert-capacity-factor
0.5
--moe-expert-capacity-factor
1
--moe-pad-expert-input-to-capacity
#--moe-grouped-gemm
)
...
...
examples/gpt3/train_gpt_567B_1nodes.sh
View file @
2a0c4358
...
...
@@ -70,7 +70,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type
aux_loss
--moe-aux-loss-coeff
1e-2
--moe-token-dispatcher-type
alltoall
--moe-expert-capacity-factor
0.5
--moe-expert-capacity-factor
1
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
)
...
...
examples/mixtral/train_mixtral_8x22B_1nodes.sh
View file @
2a0c4358
...
...
@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type
aux_loss
--moe-aux-loss-coeff
1e-3
--moe-token-dispatcher-type
alltoall
--moe-expert-capacity-factor
0.5
--moe-expert-capacity-factor
1
--moe-pad-expert-input-to-capacity
#--moe-grouped-gemm
)
...
...
examples/mixtral/train_mixtral_8x22B_8nodes.sh
View file @
2a0c4358
...
...
@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type
aux_loss
--moe-aux-loss-coeff
1e-3
--moe-token-dispatcher-type
alltoall
--moe-expert-capacity-factor
0.5
--moe-expert-capacity-factor
1
--moe-pad-expert-input-to-capacity
#--moe-grouped-gemm
)
...
...
examples/mixtral/train_mixtral_8x7B_1nodes.sh
View file @
2a0c4358
...
...
@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type
aux_loss
--moe-aux-loss-coeff
1e-2
--moe-token-dispatcher-type
alltoall
--moe-expert-capacity-factor
0.5
--moe-expert-capacity-factor
1
--moe-pad-expert-input-to-capacity
#--moe-grouped-gemm
)
...
...
examples/mixtral/train_mixtral_8x7B_4nodes.sh
View file @
2a0c4358
...
...
@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type
aux_loss
--moe-aux-loss-coeff
1e-2
--moe-token-dispatcher-type
alltoall
--moe-expert-capacity-factor
0.5
--moe-expert-capacity-factor
1
--moe-pad-expert-input-to-capacity
#--moe-grouped-gemm
)
...
...
requirements/launch_with_binding.sh
View file @
2a0c4358
#!/bin/bash
# wz
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
numa_map
=(
0 1 2 3 4 5 6 7
)
# 508
# export HIP_VISIBLE_DEVICES=0,1,2,3,5,4,7,6
# numa_map=(0 3 2 1 7 4 5 6)
LOCAL_RANK
=
$1
shift
...
...
requirements/nccl_zz/env.sh
View file @
2a0c4358
# nccl env
module load compiler/dtk/25.04.1
module load app/rccl/shca_rdma_plugins/v8
module load app/rccl/tests
module load app/rccl/topos/shca
module load mpi/openmpi/5.0.3/gcc-8.5.0/shca_ucx-1.15.0
CURRENT_DIR
=
"
$(
cd
"
$(
dirname
"
$0
"
)
"
&&
pwd
)
"
MEGATRON_PATH
=
$(
dirname
$(
dirname
${
CURRENT_DIR
}
))
export
NCCL_ALGO
=
Ring
...
...
@@ -7,18 +13,17 @@ export NCCL_MIN_NCHANNELS=16
export
NCCL_NCHANNELS_PER_PEER
=
16
export
NCCL_MIN_P2P_NCHANNELS
=
16
export
NCCL_MAX_P2P_NCHANNELS
=
16
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_LEVEL
=
4
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
shca_0:1,shca_1:1,shca_2:1,shca_3:1
export
NCCL_TOPO_FILE
=
${
MEGATRON_PATH
}
/requirements/nccl_zz/topo-input.xml
export
NCCL_IB_PCI_RELAXED_ORDERING
=
0
export
NCCL_PLUGIN_P2P
=
ucx
export
NCCL_SOCKET_IFNAME
=
eno1
export
NCCL_SOCKET_IFNAME
=
ib0
#
eno1
export
SHCA_DEBUG_MASK
=
0
export
SHCA_CMR_LOG_LEVEL
=
1
export
SHCA_SHUT_UP_FWB
=
1
export
SHCA_UCT_CQ_SIZE_INC
=
5
export
UCX_RNDV_PUT_FORCE_FLUSH
=
y
export
NCCL_PXN_DISABLE
=
0
export
LD_LIBRARY_PATH
=
${
MEGATRON_PATH
}
/requirements/nccl_zz/lib-v8:
$LD_LIBRARY_PATH
\ No newline at end of file
export
NCCL_PXN_DISABLE
=
1
export
NCCL_NET_PLUGIN
=
shca
\ No newline at end of file
requirements/nccl_zz/lib-v8/librccl-net.a
deleted
100644 → 0
View file @
3b081313
File deleted
requirements/nccl_zz/lib-v8/librccl-net.la
deleted
100755 → 0
View file @
3b081313
# librccl-net.la - a libtool library file
# Generated by libtool (GNU libtool) 2.4.6
#
# Please DO NOT delete this file!
# It is necessary for linking the library.
# The name that we can dlopen(3).
dlname='librccl-net.so.0'
# Names of this library.
library_names='librccl-net.so.0.0.0 librccl-net.so.0 librccl-net.so'
# The name of the static archive.
old_library='librccl-net.a'
# Linker flags that cannot go in dependency_libs.
inherited_linker_flags=''
# Libraries that this one depends upon.
dependency_libs=' -L/usr/lib64 -L/usr/lib -L/opt/dtk-25.04/hip/lib -lucp -lucs -lucm -luct -libverbs -lamdhip64'
# Names of additional weak libraries provided by this library
weak_library_names=''
# Version information for librccl-net.
current=0
age=0
revision=0
# Is this an already installed library?
installed=yes
# Should we warn about portability when linking against -modules?
shouldnotlink=no
# Files to dlopen/dlpreopen
dlopen=''
dlpreopen=''
# Directory that this library needs to be installed in:
libdir='/home/shanxs/rccl/508/install-v8/lib'
requirements/nccl_zz/lib-v8/librccl-net.so
deleted
100755 → 0
View file @
3b081313
File deleted
requirements/nccl_zz/lib-v8/librccl-net.so.0
deleted
100755 → 0
View file @
3b081313
File deleted
requirements/nccl_zz/lib-v8/librccl-net.so.0.0.0
deleted
100755 → 0
View file @
3b081313
File deleted
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment