Commit fe0b03b5 authored by silencealiang

Fix Llama2 bug and update file format

parent ee3ff5df
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
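The new topo-input.xml above is an RCCL/NCCL topology description of one training node: eight gfx936 GPUs fully meshed over xGMI, plus eight 200 Gb/s Mellanox ports (mlx5_2 through mlx5_9) on PCIe Gen5 x16 links. The scripts changed below consume it through the NCCL_TOPO_FILE environment variable; a minimal sketch, assuming MEGATRON_PATH is exported by the surrounding environment (the path shown is illustrative, not from the commit):

    # Point RCCL/NCCL at the committed topology file.
    export MEGATRON_PATH=/path/to/this/repo      # illustrative; set to the actual checkout
    export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
    export NCCL_DEBUG=INFO                       # the loaded topology file shows up in the debug log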
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
@@ -148,10 +134,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
)
fi
......
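The hunks above update the single-node Mixtral 8x22B script: data, tokenizer, and checkpoint locations now come from --data_path, --tokenizer_path, and --checkpoint_path options instead of hard-coded placeholder strings, the rendezvous port becomes the second positional argument (previously fixed at 25900), an optional --profiling option is still parsed, and NCCL_TOPO_FILE is resolved under MEGATRON_PATH. A minimal invocation sketch under the new interface; the script name, host address, and paths are placeholders, since the real file names are not shown on this page:

    # Hypothetical launch. Ranks are read from the OMPI_COMM_WORLD_* variables,
    # so the script is started through mpirun (8 processes for the 8-GPU node).
    mpirun -np 8 --bind-to none \
        bash pretrain_mixtral_8x22b.sh 10.10.0.1 25900 \
            --data_path=/data/my-mixtral_text_document \
            --tokenizer_path=/models/llama2/tokenizer.model \
            --checkpoint_path=/ckpt/mixtral_8x22b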
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
@@ -148,10 +134,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_8nodes_tp4-pp8-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x22B"}
)
fi
......
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x22B_1nodes_tp2-pp1-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
@@ -148,6 +134,23 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_1nodes_tp2-pp1-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
......
@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to my-mixtral_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
@@ -86,8 +89,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 99990,8,2
)
@@ -107,23 +110,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-ep_tp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 4
@@ -148,6 +134,23 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 8 9 10 11
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_mixtral8x7B_4nodes_tp2-pp4-ep8-etp1-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
......
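Across all four scripts the TORCH_PROFIE_ARGS and HIP_PROFIE_ARGS blocks are also moved below LOGGING_ARGS and their profile output directories are renamed (ep_tp1 becomes etp1; the single-node 8x7B script additionally drops the wrong 8x22B prefix). How the parsed profiling value selects between the two argument sets is not part of the shown hunks; the following is only one plausible wiring, with the accepted values assumed rather than taken from the commit:

    # Assumption: this selection logic is not shown in the diff. The array names
    # and the profiling variable come from the hunks above; the values are illustrative.
    PROFILE_ARGS=()
    if [[ "${profiling}" == "torch" ]]; then
        PROFILE_ARGS=("${TORCH_PROFIE_ARGS[@]}")
    elif [[ "${profiling}" == "hip" ]]; then
        PROFILE_ARGS=("${HIP_PROFIE_ARGS[@]}")
    fi
    # ...later appended to the training command alongside the other *_ARGS arrays.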