Commit 2a0c4358 authored by silencealiang's avatar silencealiang
Browse files

fix parameters bug

parent 3b081313
...@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then ...@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then
moe_options=" \ moe_options=" \
--moe-grouped-gemm \ --moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \ --moe-expert-capacity-factor 1 \
--moe-pad-expert-input-to-capacity \ --moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \ --moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \ --moe-router-topk ${ROUTER_TOPK} \
......
...@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then ...@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then
moe_options=" \ moe_options=" \
--moe-grouped-gemm \ --moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \ --moe-expert-capacity-factor 1 \
--moe-pad-expert-input-to-capacity \ --moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \ --moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \ --moe-router-topk ${ROUTER_TOPK} \
......
...@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then ...@@ -104,7 +104,7 @@ if [ $MODEL_SIZE = A37B ]; then
moe_options=" \ moe_options=" \
--moe-grouped-gemm \ --moe-grouped-gemm \
--moe-expert-capacity-factor 0.5 \ --moe-expert-capacity-factor 1 \
--moe-pad-expert-input-to-capacity \ --moe-pad-expert-input-to-capacity \
--moe-token-dispatcher-type alltoall \ --moe-token-dispatcher-type alltoall \
--moe-router-topk ${ROUTER_TOPK} \ --moe-router-topk ${ROUTER_TOPK} \
......
...@@ -70,7 +70,7 @@ MOE_ARGS=( ...@@ -70,7 +70,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type aux_loss --moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2 --moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall --moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5 --moe-expert-capacity-factor 1
--moe-pad-expert-input-to-capacity --moe-pad-expert-input-to-capacity
#--moe-grouped-gemm #--moe-grouped-gemm
) )
......
...@@ -70,7 +70,7 @@ MOE_ARGS=( ...@@ -70,7 +70,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type aux_loss --moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2 --moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall --moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5 --moe-expert-capacity-factor 1
--moe-pad-expert-input-to-capacity --moe-pad-expert-input-to-capacity
--moe-grouped-gemm --moe-grouped-gemm
) )
......
...@@ -73,7 +73,7 @@ MOE_ARGS=( ...@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type aux_loss --moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-3 --moe-aux-loss-coeff 1e-3
--moe-token-dispatcher-type alltoall --moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5 --moe-expert-capacity-factor 1
--moe-pad-expert-input-to-capacity --moe-pad-expert-input-to-capacity
#--moe-grouped-gemm #--moe-grouped-gemm
) )
......
...@@ -73,7 +73,7 @@ MOE_ARGS=( ...@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type aux_loss --moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-3 --moe-aux-loss-coeff 1e-3
--moe-token-dispatcher-type alltoall --moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5 --moe-expert-capacity-factor 1
--moe-pad-expert-input-to-capacity --moe-pad-expert-input-to-capacity
#--moe-grouped-gemm #--moe-grouped-gemm
) )
......
...@@ -73,7 +73,7 @@ MOE_ARGS=( ...@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type aux_loss --moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2 --moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall --moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5 --moe-expert-capacity-factor 1
--moe-pad-expert-input-to-capacity --moe-pad-expert-input-to-capacity
#--moe-grouped-gemm #--moe-grouped-gemm
) )
......
...@@ -73,7 +73,7 @@ MOE_ARGS=( ...@@ -73,7 +73,7 @@ MOE_ARGS=(
--moe-router-load-balancing-type aux_loss --moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2 --moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall --moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5 --moe-expert-capacity-factor 1
--moe-pad-expert-input-to-capacity --moe-pad-expert-input-to-capacity
#--moe-grouped-gemm #--moe-grouped-gemm
) )
......
#!/bin/bash #!/bin/bash
# wz
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numa_map=(0 1 2 3 4 5 6 7) numa_map=(0 1 2 3 4 5 6 7)
# 508
# export HIP_VISIBLE_DEVICES=0,1,2,3,5,4,7,6
# numa_map=(0 3 2 1 7 4 5 6)
LOCAL_RANK=$1 LOCAL_RANK=$1
shift shift
......
# nccl env # nccl env
module load compiler/dtk/25.04.1
module load app/rccl/shca_rdma_plugins/v8
module load app/rccl/tests
module load app/rccl/topos/shca
module load mpi/openmpi/5.0.3/gcc-8.5.0/shca_ucx-1.15.0
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR})) MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export NCCL_ALGO=Ring export NCCL_ALGO=Ring
...@@ -7,18 +13,17 @@ export NCCL_MIN_NCHANNELS=16 ...@@ -7,18 +13,17 @@ export NCCL_MIN_NCHANNELS=16
export NCCL_NCHANNELS_PER_PEER=16 export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_P2P_NCHANNELS=16 export NCCL_MIN_P2P_NCHANNELS=16
export NCCL_MAX_P2P_NCHANNELS=16 export NCCL_MAX_P2P_NCHANNELS=16
export NCCL_NET_GDR_LEVEL=7 export NCCL_NET_GDR_LEVEL=4
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=shca_0:1,shca_1:1,shca_2:1,shca_3:1 export NCCL_IB_HCA=shca_0:1,shca_1:1,shca_2:1,shca_3:1
export NCCL_TOPO_FILE=${MEGATRON_PATH}/requirements/nccl_zz/topo-input.xml
export NCCL_IB_PCI_RELAXED_ORDERING=0 export NCCL_IB_PCI_RELAXED_ORDERING=0
export NCCL_PLUGIN_P2P=ucx export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=eno1 export NCCL_SOCKET_IFNAME=ib0 #eno1
export SHCA_DEBUG_MASK=0 export SHCA_DEBUG_MASK=0
export SHCA_CMR_LOG_LEVEL=1 export SHCA_CMR_LOG_LEVEL=1
export SHCA_SHUT_UP_FWB=1 export SHCA_SHUT_UP_FWB=1
export SHCA_UCT_CQ_SIZE_INC=5 export SHCA_UCT_CQ_SIZE_INC=5
export UCX_RNDV_PUT_FORCE_FLUSH=y export UCX_RNDV_PUT_FORCE_FLUSH=y
export NCCL_PXN_DISABLE=0 export NCCL_PXN_DISABLE=1
export LD_LIBRARY_PATH=${MEGATRON_PATH}/requirements/nccl_zz/lib-v8:$LD_LIBRARY_PATH export NCCL_NET_PLUGIN=shca
\ No newline at end of file \ No newline at end of file
# librccl-net.la - a libtool library file
# Generated by libtool (GNU libtool) 2.4.6
#
# Please DO NOT delete this file!
# It is necessary for linking the library.
# The name that we can dlopen(3).
dlname='librccl-net.so.0'
# Names of this library.
library_names='librccl-net.so.0.0.0 librccl-net.so.0 librccl-net.so'
# The name of the static archive.
old_library='librccl-net.a'
# Linker flags that cannot go in dependency_libs.
inherited_linker_flags=''
# Libraries that this one depends upon.
dependency_libs=' -L/usr/lib64 -L/usr/lib -L/opt/dtk-25.04/hip/lib -lucp -lucs -lucm -luct -libverbs -lamdhip64'
# Names of additional weak libraries provided by this library
weak_library_names=''
# Version information for librccl-net.
current=0
age=0
revision=0
# Is this an already installed library?
installed=yes
# Should we warn about portability when linking against -modules?
shouldnotlink=no
# Files to dlopen/dlpreopen
dlopen=''
dlpreopen=''
# Directory that this library needs to be installed in:
libdir='/home/shanxs/rccl/508/install-v8/lib'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment