Commit 6c3cfb1d authored by silencealiang
Browse files

Update the model parameter format

parent 935bfd74
# Parse --profiling=<value> from the command-line arguments; everything else
# is ignored.  "$@" (not $*) keeps arguments containing spaces intact.
profiling=""
for para in "$@"; do
  if [[ ${para} == --profiling* ]]; then
    profiling=${para#*=}
  fi
done

# --- Variables that need to be modified before running ---
GPUS=""                   # how many gpus to use (a multiple of 8; one node = 8 gpus)
DTK_ENV=""                # where env.sh of dtk
NCCL_ENV=""               # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                   # hostname
PORT=""                   # port id
DATA_PATH=""              # path to my-mixtral_text_document
TOKENIZER_MODEL_PATH=""   # path to tokenizer.model
CHECKPOINT_PATH=""        # path to ckpt

# Fail fast with a clear message if any required variable above was left
# empty; otherwise the arithmetic and mpirun below fail cryptically.
for required in GPUS DTK_ENV NCCL_ENV HOST PORT DATA_PATH TOKENIZER_MODEL_PATH CHECKPOINT_PATH; do
  if [[ -z ${!required} ]]; then
    echo "error: ${required} must be set at the top of this script" >&2
    exit 1
  fi
done

NODES=$((GPUS / 8))                               # 8 GPUs per node
LOG_FILE="log-${NODES}nodes-$(date +%F-%H%M).log"

# Runs Mixtral 8x22B model: one MPI rank per GPU across the nodes listed in
# hostfile_mixtral_8x22B; each rank sources the dtk/nccl environments and
# launches the per-node-count training script.  All output goes to LOG_FILE.
mpirun -np "${GPUS}" --hostfile hostfile_mixtral_8x22B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "
    source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_mixtral_8x22B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=${DATA_PATH} \
    --tokenizer_path=${TOKENIZER_MODEL_PATH} \
    --checkpoint_path=${CHECKPOINT_PATH} \
    --profiling=${profiling}" > "${LOG_FILE}" 2>&1
wait
\ No newline at end of file
# Parse --profiling=<value> from the command-line arguments ("$@" keeps
# arguments with spaces intact; $* would word-split them).
profiling=""
for para in "$@"; do
  if [[ ${para} == --profiling* ]]; then
    profiling=${para#*=}
  fi
done

# Single-node (8 GPU) Mixtral 8x22B training run; all output goes to output.log.
mpirun -np 8 --allow-run-as-root \
    train_mixtral_8x22B_1nodes.sh localhost --profiling="${profiling}" > output.log 2>&1
wait

# Clean up the checkpoint dir and the preprocessed dataset produced by the run.
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
# Parse --profiling=<value> from the command-line arguments ("$@" keeps
# arguments with spaces intact; $* would word-split them).
profiling=""
for para in "$@"; do
  if [[ ${para} == --profiling* ]]; then
    profiling=${para#*=}
  fi
done

# 8-node (64 GPU) Mixtral 8x22B training run across the hosts listed in
# hostfile_mixtral_8x22B, rooted at node067; all output goes to output.log.
mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_mixtral_8x22B_multinodes.sh node067 --profiling="${profiling}" > output.log 2>&1
wait

# Clean up the checkpoint dir and the preprocessed dataset produced by the run.
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
# Parse --profiling=<value> from the command-line arguments; everything else
# is ignored.  "$@" (not $*) keeps arguments containing spaces intact.
profiling=""
for para in "$@"; do
  if [[ ${para} == --profiling* ]]; then
    profiling=${para#*=}
  fi
done

# --- Variables that need to be modified before running ---
GPUS=""                   # how many gpus to use (a multiple of 8; one node = 8 gpus)
DTK_ENV=""                # where env.sh of dtk
NCCL_ENV=""               # where env.sh of nccl (requirements/nccl_wz/env.sh or requirements/nccl_zz/env.sh)
HOST=""                   # hostname
PORT=""                   # port id
DATA_PATH=""              # path to my-mixtral_text_document
TOKENIZER_MODEL_PATH=""   # path to tokenizer.model
CHECKPOINT_PATH=""        # path to ckpt

# Fail fast with a clear message if any required variable above was left
# empty; otherwise the arithmetic and mpirun below fail cryptically.
for required in GPUS DTK_ENV NCCL_ENV HOST PORT DATA_PATH TOKENIZER_MODEL_PATH CHECKPOINT_PATH; do
  if [[ -z ${!required} ]]; then
    echo "error: ${required} must be set at the top of this script" >&2
    exit 1
  fi
done

NODES=$((GPUS / 8))                               # 8 GPUs per node
LOG_FILE="log-${NODES}nodes-$(date +%F-%H%M).log"

# Runs Mixtral 8x7B model: one MPI rank per GPU across the nodes listed in
# hostfile_mixtral_8x7B; each rank sources the dtk/nccl environments and
# launches the per-node-count training script.  All output goes to LOG_FILE.
mpirun -np "${GPUS}" --hostfile hostfile_mixtral_8x7B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    bash -c "
    source ${DTK_ENV} && \
    source ${NCCL_ENV} && \
    ./train_mixtral_8x7B_${NODES}nodes.sh \
    ${HOST} \
    ${PORT} \
    --data_path=${DATA_PATH} \
    --tokenizer_path=${TOKENIZER_MODEL_PATH} \
    --checkpoint_path=${CHECKPOINT_PATH} \
    --profiling=${profiling}" > "${LOG_FILE}" 2>&1
wait
\ No newline at end of file
# Parse --profiling=<value> from the command-line arguments ("$@" keeps
# arguments with spaces intact; $* would word-split them).
profiling=""
for para in "$@"; do
  if [[ ${para} == --profiling* ]]; then
    profiling=${para#*=}
  fi
done

# Single-node (8 GPU) Mixtral 8x7B training run; all output goes to output.log.
mpirun -np 8 --allow-run-as-root \
    train_mixtral_8x7B_1nodes.sh localhost --profiling="${profiling}" > output.log 2>&1
wait

# Clean up the checkpoint dir and the preprocessed dataset produced by the run.
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
# Parse --profiling=<value> from the command-line arguments ("$@" keeps
# arguments with spaces intact; $* would word-split them).
profiling=""
for para in "$@"; do
  if [[ ${para} == --profiling* ]]; then
    profiling=${para#*=}
  fi
done

# 4-node (32 GPU) Mixtral 8x7B training run across the hosts listed in
# hostfile_mixtral_8x7B, rooted at node067; all output goes to output.log.
mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
    --allow-run-as-root \
    --bind-to none \
    --mca plm_rsh_no_tree_spawn 1 \
    train_mixtral_8x7B_multinodes.sh node067 --profiling="${profiling}" > output.log 2>&1
wait

# Clean up the checkpoint dir and the preprocessed dataset produced by the run.
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -86,8 +80,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
......@@ -96,39 +90,23 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -143,13 +121,31 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
)
fi
......@@ -173,45 +169,28 @@ fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -86,8 +80,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
......@@ -96,39 +90,23 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -143,13 +121,31 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 8 9 16 17 24 25
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
)
fi
......@@ -173,45 +169,28 @@ fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -86,8 +80,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
......@@ -96,39 +90,23 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -143,7 +121,25 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
......@@ -173,45 +169,28 @@ fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
0)
export HIP_VISIBLE_DEVICES=0
numactl --cpunodebind=0 --membind=0 ${APP} ;;
1)
export HIP_VISIBLE_DEVICES=1
numactl --cpunodebind=1 --membind=1 ${APP} ;;
2)
export HIP_VISIBLE_DEVICES=2
numactl --cpunodebind=2 --membind=2 ${APP} ;;
3)
export HIP_VISIBLE_DEVICES=3
numactl --cpunodebind=3 --membind=3 ${APP} ;;
4)
export HIP_VISIBLE_DEVICES=4
numactl --cpunodebind=4 --membind=4 ${APP} ;;
5)
export HIP_VISIBLE_DEVICES=5
numactl --cpunodebind=5 --membind=5 ${APP} ;;
6)
export HIP_VISIBLE_DEVICES=6
numactl --cpunodebind=6 --membind=6 ${APP} ;;
7)
export HIP_VISIBLE_DEVICES=7
numactl --cpunodebind=7 --membind=7 ${APP} ;;
esac
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -23,25 +31,11 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export PYTHONPATH=${MEGATRON_PATH}/Megatron-LM:$PYTHONPATH
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -86,8 +80,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
......@@ -96,39 +90,23 @@ TRAINING_ARGS=(
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-iters 10000
--lr-decay-style cosine
--min-lr 1.0e-5
--min-lr 1.0e-6
--weight-decay 0.1
--lr-warmup-iters 500
--lr-warmup-iters 2000
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 4
--expert-model-parallel-size 8
--expert-tensor-parallel-size 1
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
)
......@@ -143,7 +121,25 @@ LOGGING_ARGS=(
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
--no-load-rng \
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
......@@ -173,45 +169,28 @@ fi
#for hygon cpu
# Launch the training app with the local rank pinned to one GPU and the
# matching NUMA node, so host-memory traffic stays NUMA-local.
# (Diff residue had fused two complete, conflicting versions of this case
# statement — one exporting CUDA_VISIBLE_DEVICES=0-7 with no NUMA binding,
# one exporting a single HIP device per rank with numactl. Keep the latter,
# collapsed into one guarded launch since gpu id == numa id == local rank.)
# NOTE(review): assumes device i is attached to NUMA node i — confirm
# against the node topology (topo-input.xml) before changing.
if [[ "${LOCAL_RANK:-}" =~ ^[0-7]$ ]]; then
    export HIP_VISIBLE_DEVICES=${LOCAL_RANK}
    # APP is intentionally unquoted: it is a full command line with arguments.
    numactl --cpunodebind="${LOCAL_RANK}" --membind="${LOCAL_RANK}" ${APP}
fi
\ No newline at end of file
# nccl env
# NCCL/RCCL tuning for the wz cluster (Mellanox mlx5 HCAs).
# Resolve the repo root from this script's location; all substitutions are
# quoted so paths containing spaces do not break word-splitting.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH="$( dirname "$( dirname "${CURRENT_DIR}" )" )"
export NCCL_ALGO=Ring                  # force the Ring collective algorithm
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_NCHANNELS=32           # pin channel count to exactly 32
export NCCL_NET_GDR_LEVEL=7            # allow GPUDirect RDMA up to the widest topology distance
export NCCL_NET_GDR_READ=1             # enable GPUDirect RDMA for reads
export RCCL_SDMA_COPY_ENABLE=0         # disable RCCL SDMA copy path
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/requirements/nccl_wz/topo-input.xml"
\ No newline at end of file
# nccl env
# NCCL/RCCL tuning for the zz cluster (shca_* HCAs, ucx net plugin).
# Resolve the repo root from this script's location; all substitutions are
# quoted so paths containing spaces do not break word-splitting.
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH="$( dirname "$( dirname "${CURRENT_DIR}" )" )"
export NCCL_ALGO=Ring                  # force the Ring collective algorithm
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_NCHANNELS=32           # pin channel count to exactly 32
export NCCL_NET_GDR_LEVEL=4            # allow GPUDirect RDMA up to the same-CPU distance
export NCCL_NET_GDR_READ=1             # enable GPUDirect RDMA for reads
export RCCL_SDMA_COPY_ENABLE=0         # disable RCCL SDMA copy path
export NCCL_IB_HCA=shca_0:1,shca_1:1,shca_2:1,shca_3:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/requirements/nccl_zz/topo-input.xml"
export NCCL_IB_PCI_RELAXED_ORDERING=0
export NCCL_PLUGIN_P2P=ucx             # use the UCX point-to-point plugin
export NCCL_PXN_DISABLE=0
export NCCL_SOCKET_IFNAME=eno1         # bootstrap/socket traffic over eno1
# Prepend the bundled rccl-net libraries. The ${VAR:+...} form avoids leaving
# a trailing ':' (an empty search-path entry) when LD_LIBRARY_PATH was unset.
export LD_LIBRARY_PATH="${MEGATRON_PATH}/requirements/nccl_zz/lib-v8${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
\ No newline at end of file
# librccl-net.la - a libtool library file
# Generated by libtool (GNU libtool) 2.4.6
#
# Please DO NOT delete this file!
# It is necessary for linking the library.
# The name that we can dlopen(3).
dlname='librccl-net.so.0'
# Names of this library.
library_names='librccl-net.so.0.0.0 librccl-net.so.0 librccl-net.so'
# The name of the static archive.
old_library='librccl-net.a'
# Linker flags that cannot go in dependency_libs.
inherited_linker_flags=''
# Libraries that this one depends upon.
dependency_libs=' -L/usr/lib64 -L/usr/lib -L/opt/dtk-25.04/hip/lib -lucp -lucs -lucm -luct -libverbs -lamdhip64'
# Names of additional weak libraries provided by this library
weak_library_names=''
# Version information for librccl-net.
current=0
age=0
revision=0
# Is this an already installed library?
installed=yes
# Should we warn about portability when linking against -modules?
shouldnotlink=no
# Files to dlopen/dlpreopen
dlopen=''
dlpreopen=''
# Directory that this library needs to be installed in:
libdir='/home/shanxs/rccl/508/install-v8/lib'
<system version="2">
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:07:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:09:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:53:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:55:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:03:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xb669ff21ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:71:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:31:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:34:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:36:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:75:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:77:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:73:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_1" dev="1" speed="200000" port="1" latency="0.000000" guid="0xf646ebc0ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:81:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:83:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:85:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:d1:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:d3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:d5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:86:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x765d9008ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:f1:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:f3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:f5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:f6:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_3" dev="3" speed="200000" port="1" latency="0.000000" guid="0xb3550287ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
</system>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment