Commit fe0b03b5 authored by silencealiang

fix llama2 bug and update file format

parent ee3ff5df
...@@ -163,6 +163,11 @@ class CoreAdaptation(MegatronAdaptationABC): ...@@ -163,6 +163,11 @@ class CoreAdaptation(MegatronAdaptationABC):
staticmethod, staticmethod,
apply_wrapper=True) apply_wrapper=True)
# reduce_scatter_to_sequence_parallel_region
MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
torch._dynamo.disable,
apply_wrapper=True)
# flux # flux
if int(os.getenv("USE_FLUX_OVERLAP", "0")): if int(os.getenv("USE_FLUX_OVERLAP", "0")):
from ..core.tensor_parallel.layers import ( from ..core.tensor_parallel.layers import (
......
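For orientation, a minimal sketch of what this wrapper-style registration is assumed to boil down to (the exact semantics of `apply_wrapper=True` are defined by `MegatronAdaptation`, so treat this as an illustration rather than the framework's implementation): the patched symbol becomes the original collective wrapped with `torch._dynamo.disable`, so the reduce-scatter always runs eagerly instead of being captured into a compiled graph.

```python
# Sketch only: assumes apply_wrapper=True applies the registered object as a
# decorator around the original function at patch time.
import torch
from megatron.core.tensor_parallel import mappings

# Wrap the collective so TorchDynamo skips tracing it and falls back to eager.
mappings.reduce_scatter_to_sequence_parallel_region = torch._dynamo.disable(
    mappings.reduce_scatter_to_sequence_parallel_region
)
```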
...@@ -6,6 +6,7 @@ from functools import wraps ...@@ -6,6 +6,7 @@ from functools import wraps
from megatron.training import get_args from megatron.training import get_args
from megatron.core import tensor_parallel from megatron.core import tensor_parallel
from megatron.legacy.model.enums import AttnType from megatron.legacy.model.enums import AttnType
from megatron.core.utils import deprecate_inference_params
from megatron.core.models.common.embeddings import apply_rotary_pos_emb from megatron.core.models.common.embeddings import apply_rotary_pos_emb
from megatron.legacy.model.module import MegatronModule from megatron.legacy.model.module import MegatronModule
...@@ -94,18 +95,20 @@ class ParallelAttentionPatch(MegatronModule): ...@@ -94,18 +95,20 @@ class ParallelAttentionPatch(MegatronModule):
""" """
def forward(self, hidden_states, attention_mask, def forward(self, hidden_states, attention_mask,
encoder_output=None, inference_params=None, encoder_output=None, inference_context=None,
rotary_pos_emb=None): rotary_pos_emb=None, *, inference_params=None):
# hidden_states: [sq, b, h] # hidden_states: [sq, b, h]
inference_context = deprecate_inference_params(inference_context, inference_params)
# ================================================= # =================================================
# Pre-allocate memory for key-values for inference. # Pre-allocate memory for key-values for inference.
# ================================================= # =================================================
is_first_step = False is_first_step = False
if inference_params: if inference_context:
if self.layer_number not in inference_params.key_value_memory_dict: if self.layer_number not in inference_context.key_value_memory_dict:
inf_max_seq_len = inference_params.max_sequence_length inf_max_seq_len = inference_context.max_sequence_length
inf_max_batch_size = inference_params.max_batch_size inf_max_batch_size = inference_context.max_batch_size
inference_key_memory = self._allocate_memory( inference_key_memory = self._allocate_memory(
inf_max_seq_len, inf_max_batch_size, inf_max_seq_len, inf_max_batch_size,
self.num_query_groups_per_partition) self.num_query_groups_per_partition)
...@@ -113,12 +116,12 @@ class ParallelAttentionPatch(MegatronModule): ...@@ -113,12 +116,12 @@ class ParallelAttentionPatch(MegatronModule):
inf_max_seq_len, inf_max_batch_size, inf_max_seq_len, inf_max_batch_size,
self.num_query_groups_per_partition) self.num_query_groups_per_partition)
inference_params.key_value_memory_dict[self.layer_number] = ( inference_context.key_value_memory_dict[self.layer_number] = (
inference_key_memory, inference_value_memory) inference_key_memory, inference_value_memory)
is_first_step = True is_first_step = True
else: else:
inference_key_memory, inference_value_memory = \ inference_key_memory, inference_value_memory = \
inference_params.key_value_memory_dict[self.layer_number] inference_context.key_value_memory_dict[self.layer_number]
# ===================== # =====================
# Query, Key, and Value # Query, Key, and Value
...@@ -188,13 +191,14 @@ class ParallelAttentionPatch(MegatronModule): ...@@ -188,13 +191,14 @@ class ParallelAttentionPatch(MegatronModule):
else: else:
rotary_pos_emb = ((rotary_pos_emb,) * 2) rotary_pos_emb = ((rotary_pos_emb,) * 2)
if inference_params: if inference_context:
batch_start = inference_params.batch_size_offset batch_start = inference_context.batch_size_offset
batch_end = batch_start + key_layer.size(1) batch_end = batch_start + key_layer.size(1)
assert batch_end <= inference_key_memory.size(1) assert batch_end <= inference_key_memory.size(1)
sequence_start = inference_params.sequence_len_offset sequence_start = inference_context.sequence_len_offset
sequence_end = sequence_start + key_layer.size(0) sequence_end = sequence_start + key_layer.size(0)
assert sequence_end <= inference_key_memory.size(0) assert sequence_end <= inference_key_memory.size(0), ("Current sequence length is "
"longer than expected maximum sequence length! Increase inference_max_seq_length.")
# Copy key and values. # Copy key and values.
inference_key_memory[sequence_start:sequence_end, inference_key_memory[sequence_start:sequence_end,
batch_start:batch_end, ...] = key_layer batch_start:batch_end, ...] = key_layer
......
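The forward above keeps a keyword-only `inference_params` purely for backward compatibility and immediately folds it into `inference_context`. A minimal sketch of that migration pattern, assuming `deprecate_inference_params` simply prefers the new argument and warns when the legacy one is passed (the real helper in `megatron.core.utils` may differ in details):

```python
import warnings

def resolve_inference_context(inference_context=None, inference_params=None):
    """Prefer the new `inference_context`; accept the deprecated `inference_params`."""
    if inference_context is not None:
        return inference_context
    if inference_params is not None:
        warnings.warn(
            "`inference_params` is deprecated, pass `inference_context` instead.",
            DeprecationWarning,
        )
        return inference_params
    return None
```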
...@@ -5,10 +5,22 @@ do ...@@ -5,10 +5,22 @@ do
fi fi
done done
mpirun -np 8 --allow-run-as-root \ # Runs DeepseekV3 671B model
train_deepseekv3_671B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1 source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
wait mpirun -np 8 --allow-run-as-root \
train_deepseekv3_671B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf output wait
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document \ No newline at end of file
...@@ -5,13 +5,25 @@ do ...@@ -5,13 +5,25 @@ do
fi fi
done done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \ mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_4nodes.sh node002 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1 train_deepseekv3_671B_4nodes.sh \
${HOST} \
wait ${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf output wait
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document \ No newline at end of file
\ No newline at end of file
...@@ -5,13 +5,25 @@ do ...@@ -5,13 +5,25 @@ do
fi fi
done done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \ mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_multinodes.sh node001 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1 train_deepseekv3_671B_multinodes.sh \
${HOST} \
wait ${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
rm -rf output wait
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document \ No newline at end of file
\ No newline at end of file
...@@ -2,17 +2,20 @@ ...@@ -2,17 +2,20 @@
for para in $* for para in $*
do do
if [[ $para == --profiling* ]];then if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=} profiling=${para#*=}
fi fi
done done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env # default env
DIST_URL=${1} DIST_URL=${1}
DIST_PORT=25900 DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
...@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7 ...@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml" export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear # enable BatchLinear
export GROUPED_GEMM_BatchLinear=1 export GROUPED_GEMM_BatchLinear=1
...@@ -51,7 +54,7 @@ PR=bf16 ...@@ -51,7 +54,7 @@ PR=bf16
### PARALLEL / BOOL OPTION ### ### PARALLEL / BOOL OPTION ###
TP=1 TP=1
PP=2 PP=1
CP=1 CP=1
ETP=1 ETP=1
EP=4 EP=4
...@@ -65,17 +68,17 @@ SFT=false ...@@ -65,17 +68,17 @@ SFT=false
AC=none AC=none
OPTIMIZER_OFFLOAD=false OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500 SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document" DATASET_PATH=${data_path}
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document" VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH="./output" PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset" TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true # the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN})) TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN})) WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
############################### ###############################
OUTPUT_BASEPATH=./output OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ### ### OTHERS ###
if [ $FL = true ]; then if [ $FL = true ]; then
...@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \ ...@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \ --profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \ --profile-step-start 3 \
--profile-step-end 4 \ --profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \ --profile-dir torch_prof_deepseekv3_1nodes_tp1-pp1-ep4-etp1-cp1 \
--use-pytorch-profiler \ --use-pytorch-profiler \
" "
......
...@@ -2,17 +2,20 @@ ...@@ -2,17 +2,20 @@
for para in $* for para in $*
do do
if [[ $para == --profiling* ]];then if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=} profiling=${para#*=}
fi fi
done done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env # default env
DIST_URL=${1} DIST_URL=${1}
DIST_PORT=25900 DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
...@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7 ...@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml" export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear # enable BatchLinear
export GROUPED_GEMM_BatchLinear=1 export GROUPED_GEMM_BatchLinear=1
...@@ -65,17 +68,17 @@ SFT=false ...@@ -65,17 +68,17 @@ SFT=false
AC=none AC=none
OPTIMIZER_OFFLOAD=false OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500 SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document" DATASET_PATH=${data_path}
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document" VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH="./output" PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset" TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true # the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN})) TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN})) WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
############################### ###############################
OUTPUT_BASEPATH=./output OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ### ### OTHERS ###
if [ $FL = true ]; then if [ $FL = true ]; then
......
...@@ -2,17 +2,20 @@ ...@@ -2,17 +2,20 @@
for para in $* for para in $*
do do
if [[ $para == --profiling* ]];then if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=} profiling=${para#*=}
fi fi
done done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env # default env
DIST_URL=${1} DIST_URL=${1}
DIST_PORT=25900 DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
...@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7 ...@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml" export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear # enable BatchLinear
export GROUPED_GEMM_BatchLinear=1 export GROUPED_GEMM_BatchLinear=1
...@@ -65,17 +68,17 @@ SFT=false ...@@ -65,17 +68,17 @@ SFT=false
AC=none AC=none
OPTIMIZER_OFFLOAD=false OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500 SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document" DATASET_PATH=${data_path}
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document" VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH="./output" PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset" TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true # the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN})) TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN})) WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
############################### ###############################
OUTPUT_BASEPATH=./output OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ### ### OTHERS ###
if [ $FL = true ]; then if [ $FL = true ]; then
...@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \ ...@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \ --profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \ --profile-step-start 3 \
--profile-step-end 4 \ --profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \ --profile-dir torch_prof_deepseekv3_128nodes_tp4-pp8-ep64-etp2-cp1 \
--use-pytorch-profiler \ --use-pytorch-profiler \
" "
......
...@@ -5,10 +5,22 @@ do ...@@ -5,10 +5,22 @@ do
fi fi
done done
mpirun -np 8 --allow-run-as-root \ # Runs GPT 567B model
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1 source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT wait
rm -rf gpt_dataset/redpajama_text_document \ No newline at end of file
...@@ -5,13 +5,25 @@ do ...@@ -5,13 +5,25 @@ do
fi fi
done done
mpirun -np 1024 --hostfile hostfile_gpt_567B \ # Runs GPT 567B model
--allow-run-as-root \ source /opt/dtk/env.sh
--bind-to none \ HOST="" # modify this variable
--mca plm_rsh_no_tree_spawn 1 \ PORT=25900
train_gpt_567B_multinodes.sh node059 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1 DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait mpirun -np 1024 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_gpt_567B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT wait
rm -rf gpt_dataset/redpajama_text_document \ No newline at end of file
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
...@@ -2,17 +2,25 @@ ...@@ -2,17 +2,25 @@
for para in $* for para in $*
do do
if [[ $para == --profiling* ]];then if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=} profiling=${para#*=}
fi fi
done done
# Runs GPT 567B model # data path
source /opt/dtk/env.sh DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env # default env
DIST_URL=${1} DIST_URL=${1}
DIST_PORT=25900 DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
...@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7 ...@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml" export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear # enable BatchLinear
export GROUPED_GEMM_BatchLinear=1 export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=( DISTRIBUTED_ARGS=(
--rank ${RANK} --rank ${RANK}
--world-size ${WORLD_SIZE} --world-size ${WORLD_SIZE}
...@@ -83,8 +86,8 @@ MOE_ARGS=( ...@@ -83,8 +86,8 @@ MOE_ARGS=(
DATA_ARGS=( DATA_ARGS=(
--tokenizer-type Llama2Tokenizer --tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL} --tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path $DATA_PATH --data-path ${DATA_PATH}
--split 98,2,0 --split 98,2,0
) )
...@@ -104,23 +107,6 @@ TRAINING_ARGS=( ...@@ -104,23 +107,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce --overlap-grad-reduce
) )
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-ep_tp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=( MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2 --tensor-model-parallel-size 2
--pipeline-model-parallel-size 1 --pipeline-model-parallel-size 1
...@@ -145,10 +131,27 @@ LOGGING_ARGS=( ...@@ -145,10 +131,27 @@ LOGGING_ARGS=(
--no-save-optim --no-save-optim
) )
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-etp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=( LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"} --wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"} --wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
) )
fi fi
...@@ -204,4 +207,4 @@ case ${LOCAL_RANK} in ...@@ -204,4 +207,4 @@ case ${LOCAL_RANK} in
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP} numactl --cpunodebind=7 --membind=7 ${APP}
;; ;;
esac esac
\ No newline at end of file
...@@ -2,17 +2,25 @@ ...@@ -2,17 +2,25 @@
for para in $* for para in $*
do do
if [[ $para == --profiling* ]];then if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=} profiling=${para#*=}
fi fi
done done
# Runs GPT 567B model # data path
source /opt/dtk/env.sh DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env # default env
DIST_URL=${1} DIST_URL=${1}
DIST_PORT=25900 DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
...@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7 ...@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml" export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear # enable BatchLinear
export GROUPED_GEMM_BatchLinear=1 export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=( DISTRIBUTED_ARGS=(
--rank ${RANK} --rank ${RANK}
--world-size ${WORLD_SIZE} --world-size ${WORLD_SIZE}
...@@ -83,8 +86,8 @@ MOE_ARGS=( ...@@ -83,8 +86,8 @@ MOE_ARGS=(
DATA_ARGS=( DATA_ARGS=(
--tokenizer-type Llama2Tokenizer --tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL} --tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path $DATA_PATH --data-path ${DATA_PATH}
--split 98,2,0 --split 98,2,0
) )
...@@ -104,23 +107,6 @@ TRAINING_ARGS=( ...@@ -104,23 +107,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce --overlap-grad-reduce
) )
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp16-ep16-ep_tp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=( MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4 --tensor-model-parallel-size 4
--pipeline-model-parallel-size 16 --pipeline-model-parallel-size 16
...@@ -145,10 +131,27 @@ LOGGING_ARGS=( ...@@ -145,10 +131,27 @@ LOGGING_ARGS=(
--no-save-optim --no-save-optim
) )
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_128nodes_tp4-pp16-ep16-etp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=( LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"} --wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"} --wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
) )
fi fi
......
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Llama2 7B model
source /opt/dtk/env.sh
DATA_PATH="path to oscar-1GB_head-llama2_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 8 --allow-run-as-root \
train_llama2_7b_1nodes.sh \
localhost \
34577 \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file
#!/bin/bash #!/bin/bash
# set -eux
for para in $* for para in $*
do do
if [[ $para == --profiling* ]];then if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=} profiling=${para#*=}
fi fi
done done
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR})) MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export HSA_FORCE_FINE_GRAIN_PCIE=1 export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1 export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS export GPU_MAX_HW_QUEUES=10
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
# nccl env # nccl env
export NCCL_ALGO=Ring export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16 export NCCL_MIN_NCHANNELS=32
export NCCL_MIN_NCHANNELS=32 # 20 export NCCL_MAX_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export NCCL_NET_GDR_LEVEL=7 export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1 export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0 export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml" export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# hipBLASLt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
# torch: collapse multiple streams into a single stream # torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1 export ALLREDUCE_STREAM_WITH_COMPUTE=1
...@@ -48,187 +49,143 @@ export SENDRECV_STREAM_WITH_COMPUTE=1 ...@@ -48,187 +49,143 @@ export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache size # increase the compile cache size
export cache_size_limit=64 export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b # DISTRIBUTED_ARGS=(
SAVE_PATH=./tmp_7b --rank ${RANK}
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path> --world-size ${WORLD_SIZE}
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document --local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
GPT_MODEL_ARGS=( GPT_MODEL_ARGS=(
--seq-length 4096
--num-layers 32 --num-layers 32
--hidden-size 4096 --hidden-size 4096
--ffn-hidden-size 11008 --ffn-hidden-size 11008
--num-attention-heads 32 --num-attention-heads 32
--max-position-embeddings 4096 --max-position-embeddings 4096
--normalization LightopRMSNorm
--normalization RMSNorm # LightopRMSNorm --position-embedding-type rope
--position-embedding-type rope # none # --untie-embeddings-and-output-weights
--untie-embeddings-and-output-weights # untie embedding and output weights for more flexibility
) )
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash-attention path
# --transformer-impl transformer_engine # use these two options for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=( TRAINING_ARGS=(
--transformer-impl local # use these two options for the legacy path --transformer-impl local
--use-legacy-models --use-legacy-models
--micro-batch-size 1 --micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64 --global-batch-size 256
--train-iters 50 --train-iters 10
--weight-decay 0.1 --weight-decay 0.1
--adam-beta1 0.9 --adam-beta1 0.9
--adam-beta2 0.95 --adam-beta2 0.95
--init-method-std 0.006 --init-method-std 0.006
--clip-grad 1.0 --clip-grad 1.0
--bf16 --bf16
# --fp16 # enabling fp16 requires setting loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear --disable-bias-linear
--attention-dropout 0 --attention-dropout 0
--hidden-dropout 0 --hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu --swiglu
--lr 3.0e-5 --lr 3.0e-5
--lr-decay-style cosine --lr-decay-style cosine
--min-lr 3.0e-6 --min-lr 3.0e-6
--lr-warmup-iters 1 --lr-warmup-iters 1
--ckpt-format torch --ckpt-format torch
--ddp-average-in-collective # in DP communication, gradients/params are averaged directly instead of summed (to one device) first and then averaged --ddp-average-in-collective
# --recompute-granularity full # enable recomputation: lower memory use at the cost of extra time --overlap-grad-reduce
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce --use-flash-attn
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization is not adapted yet )
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--use-flash-attn --use-flash-attn
) )
# environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=( MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel --sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --context-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # when enabled
) )
DATA_ARGS=( DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer --tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model --tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 949,50,1
) )
EVAL_AND_LOGGING_ARGS=( EVAL_AND_LOGGING_ARGS=(
--log-throughput --log-throughput
--eval-iters 50 --eval-iters 5
--log-interval 1 --log-interval 1
--save-interval 1000 --save-interval 1000
--eval-interval 1000 --eval-interval 1000
--save $SAVE_PATH --save $CHECKPOINT_PATH
--load $SAVE_PATH --load $CHECKPOINT_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH --tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
) )
# FINETUNE_ARGS=( TORCH_PROFIE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
--profile --profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7 --profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data --profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
--use-pytorch-profiler
) )
RANK=$OMPI_COMM_WORLD_RANK HIP_PROFIE_ARGS=(
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK --profile
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE --profile-ranks 0 1 2 3 4 5 6 7
DIST_URL=${1} --profile-step-start 4
DIST_PORT=34577 --profile-step-end 5
--use-hip-profiler
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
) )
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \ APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \ ${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \ ${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \ ${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \ ${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \ ${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \ ${DISTRIBUTED_ARGS[@]} \
"
"
# enable profiling if [[ $profiling == "torch" ]]; then
# ${PROFILE_ARGS[@]} \ APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #, mkdir -p hip_prof_data
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #, APP+=" ${HIP_PROFIE_ARGS[@]}"
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3, APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
# ${APP} fi
#for hygon cpu
case ${LOCAL_RANK} in case ${LOCAL_RANK} in
[0]) [0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=0 --membind=0 ${APP} numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[1]) [1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=1 --membind=1 ${APP} numactl --cpunodebind=1 --membind=1 ${APP}
;; ;;
[2]) [2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
numactl --cpunodebind=2 --membind=2 ${APP} numactl --cpunodebind=2 --membind=2 ${APP}
;; ;;
[3]) [3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP} numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[4]) [4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP} numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[5]) [5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP} numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[6]) [6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP} numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
[7]) [7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP} numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;; ;;
esac esac
\ No newline at end of file
# Mixtral 8x7B Model Inference and Finetuning
## Download Mixtral 8x7B Checkpoints
Download the Mixtral 8x7B HF-format checkpoint from the [HF hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/),
or run the following script to download Mixtral 8x7B into a specific folder.
```python
from huggingface_hub import snapshot_download
SAVED_DIR = "" # Specify the saved directory
# Download HF checkpoints
snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False)
```
## Convert Mixtral 8x7B checkpoints from HF to MCore
The HF checkpoint can be converted to Megatron format with the provided HF checkpoint converter.
The target model-parallel sizes (e.g. TP, PP, EP) must be specified.
The converter does not yet support distributed checkpointing, so each parallel configuration requires its own converted checkpoint.
- For training, the recommended model parallel config is TP1EP8PP4
- For inference, the recommended model parallel config is TP1EP1PP2
```
TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model
MEGATRON_PATH="/workspace/megatron-lm"
export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
TARGET_TP_SIZE=""
TARGET_EP_SIZE=""
TARGET_PP_SIZE=""
HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf
MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE}
python tools/checkpoint/convert.py \
--model-type GPT \
--loader loader_mixtral_hf \
--saver mcore \
--target-tensor-parallel-size ${TARGET_TP_SIZE} \
--target-pipeline-parallel-size ${TARGET_PP_SIZE} \
--target-expert-parallel-size ${TARGET_EP_SIZE} \
--load-dir ${HF_FORMAT_DIR} \
--save-dir ${MEGATRON_FORMAT_DIR} \
--tokenizer-model ${TOKENIZER_MODEL}
```
## Text generation with Mixtral 8x7B
Inference with Mixtral 8x7B requires at least 2 GPUs, so a distributed checkpoint with EP>=2 or PP>=2 (converted with the script above) is needed.
Megatron-LM includes a simple REST server for text generation in `tools/run_text_generation_server.py`; launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 2 \
--expert-model-parallel-size 1 \
--load ${CHECKPOINT} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--use-mcore-models \
--max-position-embeddings 32768 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--normalization RMSNorm \
--disable-bias-linear \
--position-embedding-type rope \
--no-position-embedding \
--swiglu \
--untie-embeddings-and-output-weights \
--group-query-attention \
--num-query-groups 8 \
--bf16 \
--micro-batch-size 1 \
--seq-length 1024 \
--seed 42 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-token-dispatcher-type alltoall \
--moe-grouped-gemm \
--mock-data \
--rotary-base 1000000
```
Once the server is running, you can use `tools/text_generation_cli.py` to query it; it takes a single argument, the host (and port) the server is listening on.
```
python tools/text_generation_cli.py localhost:5000
```
## Finetuning from pretrained Mixtral 8x7B
To finetune the pretrained Mixtral 8x7B, use the following script:
```bash
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
CHECKPOINT_PATH="" # Speicfy path to checkpoint dir
TOKENIZER_MODEL="" # Specify path to tokenizer.model
DATA_PATH="" # Specify path to data
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH
```
The same workflow also applies to Mixtral 8x22B; set the model config (including hidden_size, num_attention_heads, num_layers, and ffn_hidden_size) according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
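As a convenience, a small hedged sketch that reads the Hugging Face `config.json` and prints the matching Megatron-LM model flags; the field-to-flag mapping and the local path are illustrative, so verify the values against the linked config before launching.

```python
import json

def hf_config_to_megatron_flags(config_path: str) -> str:
    """Map Mixtral HF config fields to the Megatron-LM model flags used above."""
    with open(config_path) as f:
        cfg = json.load(f)
    flags = {
        "--num-layers": cfg["num_hidden_layers"],
        "--hidden-size": cfg["hidden_size"],
        "--ffn-hidden-size": cfg["intermediate_size"],
        "--num-attention-heads": cfg["num_attention_heads"],
        "--num-query-groups": cfg["num_key_value_heads"],
        "--num-experts": cfg["num_local_experts"],
        "--moe-router-topk": cfg["num_experts_per_tok"],
    }
    return " \\\n".join(f"{flag} {value}" for flag, value in flags.items())

# Hypothetical local path to the downloaded 8x22B checkpoint directory.
print(hf_config_to_megatron_flags("Mixtral-8x22B-v0.1/config.json"))
```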
## Acknowledgements
Contributors outside NVIDIA to the Hugging Face converter and the Mixtral examples in Megatron-Core:
- Peng Li <jerry.lp@alibaba-inc.com>
- Jun Huang <huangjun.hj@alibaba-inc.com>
...@@ -5,10 +5,22 @@ do ...@@ -5,10 +5,22 @@ do
fi fi
done done
mpirun -np 8 --allow-run-as-root \ # Runs Mixtral 8x22B model
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1 source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT wait
rm -rf mixtral_dataset/my-mixtral_text_document \ No newline at end of file
...@@ -5,13 +5,25 @@ do ...@@ -5,13 +5,25 @@ do
fi fi
done done
mpirun -np 64 --hostfile hostfile_mixtral_8x22B \ # Runs Mixtral 8x22B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 32 --hostfile hostfile_mixtral_8x22B \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > log-8nodes-`date +%F-%H%M`.log 2>&1 train_mixtral_8x22B_multinodes.sh \
${HOST} \
wait ${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT wait
rm -rf mixtral_dataset/my-mixtral_text_document \ No newline at end of file
\ No newline at end of file
...@@ -5,10 +5,22 @@ do ...@@ -5,10 +5,22 @@ do
fi fi
done done
mpirun -np 8 --allow-run-as-root \ # Runs Mixtral 8x7B model
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1 source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT wait
rm -rf mixtral_dataset/my-mixtral_text_document \ No newline at end of file
...@@ -5,13 +5,25 @@ do ...@@ -5,13 +5,25 @@ do
fi fi
done done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 32 --hostfile hostfile_mixtral_8x7B \ mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \ --allow-run-as-root \
--bind-to none \ --bind-to none \
--mca plm_rsh_no_tree_spawn 1 \ --mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1 train_mixtral_8x7B_multinodes.sh \
${HOST} \
wait ${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT wait
rm -rf mixtral_dataset/my-mixtral_text_document \ No newline at end of file
\ No newline at end of file