Commit 57944e55 authored by silencealiang's avatar silencealiang
Browse files

update model parameters format

parent 90ae7f5c
# Runs Mixtral 8x7B model
#
# Single-node launcher: starts 8 MPI ranks of train_mixtral_8x7B_1nodes.sh
# and captures stdout+stderr in a timestamped log file in the current
# directory.
#
# Usage: <this script> [--profiling=<value>]
#   --profiling=<value>  forwarded to the training script as
#                        --profiling=<value>. If given several times the
#                        last one wins; if omitted, the value of any
#                        pre-existing $profiling (possibly empty) is used.

# Pick up --profiling* from the command line. "$@" keeps every argument as
# one word, so arguments containing whitespace or glob characters are not
# split or expanded before being inspected.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    # Keep everything after the first '='.
    profiling=${para#*=}
  fi
done

source /opt/dtk/env.sh

HOST=localhost
PORT=25900

# Placeholders: edit these before launching.
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"

# Every expansion is quoted so that paths containing spaces reach the
# training script as a single --key=value argument instead of being split
# into several words.
mpirun -np 8 --allow-run-as-root \
  train_mixtral_8x7B_1nodes.sh \
  "${HOST}" \
  "${PORT}" \
  "--data_path=${DATA_PATH}" \
  "--tokenizer_path=${TOKENIZER_MODEL_PATH}" \
  "--checkpoint_path=${CHECKPOINT_PATH}" \
  "--profiling=${profiling}" > "log-1nodes-$(date +%F-%H%M).log" 2>&1

# NOTE(review): mpirun runs in the foreground, so this only matters if the
# sourced env.sh left background jobs behind — confirm whether it is needed.
wait
\ No newline at end of file
# Runs Mixtral 8x7B model
#
# Multi-node launcher: starts 32 MPI ranks of
# train_mixtral_8x7B_multinodes.sh on the hosts listed in
# hostfile_mixtral_8x7B and captures stdout+stderr in a timestamped log
# file in the current directory.
#
# Usage: <this script> [--profiling=<value>]
#   --profiling=<value>  forwarded to the training script as
#                        --profiling=<value>. If given several times the
#                        last one wins; if omitted, the value of any
#                        pre-existing $profiling (possibly empty) is used.

# Pick up --profiling* from the command line. "$@" keeps every argument as
# one word, so arguments containing whitespace or glob characters are not
# split or expanded before being inspected.
for para in "$@"; do
  if [[ $para == --profiling* ]]; then
    # Keep everything after the first '='.
    profiling=${para#*=}
  fi
done

source /opt/dtk/env.sh

HOST="" # modify this variable
PORT=25900

# Placeholders: edit these before launching.
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"

# HOST is the first positional argument of the training script. Left empty
# and expanded unquoted it would vanish from the command line and PORT would
# silently take its place, so refuse to launch until it has been filled in.
if [[ -z "${HOST}" ]]; then
  echo "HOST is empty: edit this script and set HOST before launching" >&2
  exit 1
fi

# Every expansion is quoted so that paths containing spaces reach the
# training script as a single --key=value argument instead of being split
# into several words.
mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
  --allow-run-as-root \
  --bind-to none \
  --mca plm_rsh_no_tree_spawn 1 \
  train_mixtral_8x7B_multinodes.sh \
  "${HOST}" \
  "${PORT}" \
  "--data_path=${DATA_PATH}" \
  "--tokenizer_path=${TOKENIZER_MODEL_PATH}" \
  "--checkpoint_path=${CHECKPOINT_PATH}" \
  "--profiling=${profiling}" > "log-4nodes-$(date +%F-%H%M).log" 2>&1

# NOTE(review): mpirun runs in the foreground, so this only matters if the
# sourced env.sh left background jobs behind — confirm whether it is needed.
wait
\ No newline at end of file
......@@ -32,16 +32,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......
......@@ -32,16 +32,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......@@ -136,7 +126,7 @@ LOGGING_ARGS=(
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-ranks 0 1 8 9 16 17 24 25
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-etp1-cp1
......
......@@ -32,16 +32,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......
......@@ -32,16 +32,6 @@ export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......
# nccl env
#
# Exports the NCCL/RCCL settings that go with the topology file under
# requirements/nccl_wz.
#
# MEGATRON_PATH is taken to be two directory levels above the directory
# that contains "$0".
# NOTE(review): "$0" is the invoking script, not this file, when this
# snippet is sourced — confirm the caller sits two levels below the
# repository root.
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Quoted at every level so a checkout path containing spaces or glob
# characters is passed to dirname as a single argument.
MEGATRON_PATH="$(dirname "$(dirname "${CURRENT_DIR}")")"

export NCCL_ALGO=Ring
# Min and max are identical, i.e. the channel count is fixed at 32.
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_NCHANNELS=32
# GPU Direct RDMA tuning.
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
# NOTE(review): RCCL-specific switch; presumably disables SDMA-engine
# copies — confirm against the RCCL build in use.
export RCCL_SDMA_COPY_ENABLE=0
# InfiniBand devices to use, as <device>:<port>.
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/requirements/nccl_wz/topo-input.xml"
\ No newline at end of file
# nccl env
#
# Exports the NCCL/RCCL settings that go with the topology file and the
# bundled libraries under requirements/nccl_zz.
#
# MEGATRON_PATH is taken to be two directory levels above the directory
# that contains "$0".
# NOTE(review): "$0" is the invoking script, not this file, when this
# snippet is sourced — confirm the caller sits two levels below the
# repository root.
CURRENT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Quoted at every level so a checkout path containing spaces or glob
# characters is passed to dirname as a single argument.
MEGATRON_PATH="$(dirname "$(dirname "${CURRENT_DIR}")")"

export NCCL_ALGO=Ring
# Min and max are identical, i.e. the channel count is fixed at 32.
export NCCL_MAX_NCHANNELS=32
export NCCL_MIN_NCHANNELS=32
# GPU Direct RDMA tuning.
export NCCL_NET_GDR_LEVEL=4
export NCCL_NET_GDR_READ=1
# NOTE(review): RCCL-specific switch; presumably disables SDMA-engine
# copies — confirm against the RCCL build in use.
export RCCL_SDMA_COPY_ENABLE=0
# Network devices to use, as <device>:<port>.
export NCCL_IB_HCA=shca_0:1,shca_1:1,shca_2:1,shca_3:1
export NCCL_TOPO_FILE="${MEGATRON_PATH}/requirements/nccl_zz/topo-input.xml"
export NCCL_IB_PCI_RELAXED_ORDERING=0
# NOTE(review): read by the network plugin rather than NCCL core; presumably
# selects the UCX transport — confirm against the plugin in lib-v8.
export NCCL_PLUGIN_P2P=ucx
export NCCL_PXN_DISABLE=0
# Network interface used for socket-based communication.
export NCCL_SOCKET_IFNAME=eno1
# Prepend the bundled libraries. The ':' separator is only emitted when
# LD_LIBRARY_PATH already has content: a trailing empty entry would make
# the dynamic loader search the current working directory.
export LD_LIBRARY_PATH="${MEGATRON_PATH}/requirements/nccl_zz/lib-v8${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
\ No newline at end of file
# librccl-net.la - a libtool library file
# Generated by libtool (GNU libtool) 2.4.6
#
# Please DO NOT delete this file!
# It is necessary for linking the library.
# The name that we can dlopen(3).
dlname='librccl-net.so.0'
# Names of this library.
library_names='librccl-net.so.0.0.0 librccl-net.so.0 librccl-net.so'
# The name of the static archive.
old_library='librccl-net.a'
# Linker flags that cannot go in dependency_libs.
inherited_linker_flags=''
# Libraries that this one depends upon.
dependency_libs=' -L/usr/lib64 -L/usr/lib -L/opt/dtk-25.04/hip/lib -lucp -lucs -lucm -luct -libverbs -lamdhip64'
# Names of additional weak libraries provided by this library
weak_library_names=''
# Version information for librccl-net.
current=0
age=0
revision=0
# Is this an already installed library?
installed=yes
# Should we warn about portability when linking against -modules?
shouldnotlink=no
# Files to dlopen/dlpreopen
dlopen=''
dlpreopen=''
# Directory that this library needs to be installed in:
libdir='/home/shanxs/rccl/508/install-v8/lib'
<system version="2">
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:07:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:09:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:53:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:55:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:03:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xb669ff21ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:71:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:31:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:34:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:36:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:75:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:77:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:73:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_1" dev="1" speed="200000" port="1" latency="0.000000" guid="0xf646ebc0ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:81:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:83:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:85:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:d1:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:d3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:d5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:86:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x765d9008ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="7">
<pci busid="0000:f1:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x060400" vendor="0x1eb6" device="0x6011" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:f5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:f3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x0000" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:f5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6330" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:09:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:55:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:36:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:77:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:85:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:d5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b5:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:f6:00.0" class="0x020700" vendor="0x1eb6" device="0x10c1" subsystem_vendor="0x1eb6" subsystem_device="0x0000" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="shca_3" dev="3" speed="200000" port="1" latency="0.000000" guid="0xb3550287ffff" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
</system>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment