Commit 9dabea91 authored by silencealiang

update

parent 66d982b8
Pipeline #2563 passed
@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh
 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
@@ -33,13 +38,8 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
-CHECKPOINT_PATH=./CKPT
+# data path
+CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
...
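Note (not part of the diff itself): in each of the four launch scripts the change above is the same relocation, moving DIST_URL, DIST_PORT and the OpenMPI rank variables out of the data-path section and into the "default env" block near the top, so they are defined before the Megatron paths and data settings. As a rough illustration of how such variables are usually consumed, the sketch below maps them onto the environment variables that torch.distributed's env:// rendezvous reads; the DIST_URL/DIST_PORT mapping and the pretrain entry point are assumptions, since the actual launch command is not shown in this diff.

# Hypothetical usage sketch (assumed, not taken from the diff):
export MASTER_ADDR=${DIST_URL}      # rank-0 host, passed in as the first script argument
export MASTER_PORT=${DIST_PORT}     # rendezvous TCP port
export RANK LOCAL_RANK WORLD_SIZE   # per-process values derived from OpenMPI above
python ${MEGATRON_PATH}/pretrain_gpt.py ...   # placeholder entry point and arguments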
@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh
 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
@@ -33,13 +38,8 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
-CHECKPOINT_PATH=./CKPT
+# data path
+CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
...
@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh
 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
@@ -33,12 +38,7 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
+# data path
 CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
@@ -112,7 +112,7 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 4 5 6 7
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_mixtral_1nodes_tp2-pp1-ep8-ep_tp1
+    --profile-dir torch_prof_mixtral_1nodes_tp2-pp1-ep8-ep_tp1-cp1
     --use-pytorch-profiler
 )
...
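Note (not part of the diff itself): the only change in this script is the "-cp1" suffix on the profiler output directory, so the name now spells out the full parallel layout. Assuming one 8-GPU node, which the --profile-ranks 0 1 2 3 4 5 6 7 setting suggests, the implied sizes work out as follows.

# world size           = 1 node x 8 GPUs = 8
# model-parallel ranks = TP x PP x CP = 2 x 1 x 1 = 2
# data-parallel size   = 8 / (TP x PP x CP) = 8 / 2 = 4
# expert layout        = EP 8 with expert-TP 1, i.e. one expert-parallel rank per expert
#                        for an 8-expert Mixtral model (assumed configuration)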
@@ -11,6 +11,11 @@ done
 source /opt/dtk/env.sh
 # default env
+DIST_URL=${1}
+DIST_PORT=25900
+RANK=$OMPI_COMM_WORLD_RANK
+LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
 CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
 MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
 export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
@@ -33,12 +38,7 @@ export NCCL_TOPO_FILE="./topo-input.xml"
 # enable BatchLinear
 export GROUPED_GEMM_BatchLinear=1
-RANK=$OMPI_COMM_WORLD_RANK
-LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
-WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
-DIST_URL=${1}
-DIST_PORT=25900
+# data path
 CHECKPOINT_PATH=./CKPT
 TOKENIZER_MODEL="path to tokenizer.model"
 DATA_PATH="path to my-mixtral_text_document"
@@ -81,7 +81,7 @@ MOE_ARGS=(
     --moe-token-dispatcher-type alltoall
     --moe-expert-capacity-factor 0.5
     --moe-pad-expert-input-to-capacity
-    --moe-grouped-gemm
+    #--moe-grouped-gemm
 )
 DATA_ARGS=(
@@ -112,14 +112,14 @@ TORCH_PROFIE_ARGS=(
     --profile-ranks 0 1 2 3 8 9 10 11
     --profile-step-start 3
     --profile-step-end 4
-    --profile-dir torch_prof_mixtral_4nodes_tp2-pp8-ep2-ep_tp1
+    --profile-dir torch_prof_mixtral_4nodes_tp2-pp4-ep8-ep_tp1-cp1
     --use-pytorch-profiler
 )
 MODEL_PARALLEL_ARGS=(
     --tensor-model-parallel-size 2
-    --pipeline-model-parallel-size 8
-    --expert-model-parallel-size 2
+    --pipeline-model-parallel-size 4
+    --expert-model-parallel-size 8
     --expert-tensor-parallel-size 1
     --use-distributed-optimizer
     --sequence-parallel
...
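Note (not part of the diff itself): the 4-node script changes more than the directory name. Commenting out --moe-grouped-gemm disables the grouped-GEMM expert path (in upstream Megatron this falls back to the sequential per-expert MLP implementation), and MODEL_PARALLEL_ARGS moves from TP2/PP8/EP2 to TP2/PP4/EP8, which the renamed profiler directory mirrors. Assuming 8 GPUs per node, 32 ranks in total, the arithmetic is:

# world size = 4 nodes x 8 GPUs = 32
# old layout: TP x PP = 2 x 8 = 16  ->  data-parallel size = 32 / 16 = 2, EP = 2
# new layout: TP x PP = 2 x 4 = 8   ->  data-parallel size = 32 / 8  = 4, EP = 8
# with EP = 8 and expert-TP = 1, an 8-expert Mixtral model (assumed) places one
# expert per expert-parallel rank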