Commit fe0b03b5 authored by silencealiang

fix llama2 bug and update file format

parent ee3ff5df
......@@ -163,6 +163,11 @@ class CoreAdaptation(MegatronAdaptationABC):
staticmethod,
apply_wrapper=True)
# reduce_scatter_to_sequence_parallel_region
MegatronAdaptation.register('megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region',
torch._dynamo.disable,
apply_wrapper=True)
# flux
if int(os.getenv("USE_FLUX_OVERLAP", "0")):
from ..core.tensor_parallel.layers import (
......
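For context, the registration above swaps Megatron's `reduce_scatter_to_sequence_parallel_region` for a `torch._dynamo.disable`-wrapped version so that TorchDynamo skips tracing it. A minimal sketch of that wrapper pattern, with a simplified, hypothetical `register` helper standing in for `MegatronAdaptation.register`:

```python
import importlib

import torch


def register(dotted_path: str, wrapper, apply_wrapper: bool = True):
    """Hypothetical helper: replace `module.attr` with `wrapper(attr)` in place."""
    module_path, attr_name = dotted_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    original = getattr(module, attr_name)
    setattr(module, attr_name, wrapper(original) if apply_wrapper else wrapper)


# torch._dynamo.disable(fn) returns a callable that Dynamo will not trace,
# which keeps the reduce-scatter collective out of compiled graphs.
register(
    "megatron.core.tensor_parallel.mappings.reduce_scatter_to_sequence_parallel_region",
    torch._dynamo.disable,
    apply_wrapper=True,
)
```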
......@@ -6,6 +6,7 @@ from functools import wraps
from megatron.training import get_args
from megatron.core import tensor_parallel
from megatron.legacy.model.enums import AttnType
from megatron.core.utils import deprecate_inference_params
from megatron.core.models.common.embeddings import apply_rotary_pos_emb
from megatron.legacy.model.module import MegatronModule
......@@ -94,18 +95,20 @@ class ParallelAttentionPatch(MegatronModule):
"""
def forward(self, hidden_states, attention_mask,
encoder_output=None, inference_params=None,
rotary_pos_emb=None):
encoder_output=None, inference_context=None,
rotary_pos_emb=None, *, inference_params=None):
# hidden_states: [sq, b, h]
inference_context = deprecate_inference_params(inference_context, inference_params)
# =================================================
# Pre-allocate memory for key-values for inference.
# =================================================
is_first_step = False
if inference_params:
if self.layer_number not in inference_params.key_value_memory_dict:
inf_max_seq_len = inference_params.max_sequence_length
inf_max_batch_size = inference_params.max_batch_size
if inference_context:
if self.layer_number not in inference_context.key_value_memory_dict:
inf_max_seq_len = inference_context.max_sequence_length
inf_max_batch_size = inference_context.max_batch_size
inference_key_memory = self._allocate_memory(
inf_max_seq_len, inf_max_batch_size,
self.num_query_groups_per_partition)
......@@ -113,12 +116,12 @@ class ParallelAttentionPatch(MegatronModule):
inf_max_seq_len, inf_max_batch_size,
self.num_query_groups_per_partition)
inference_params.key_value_memory_dict[self.layer_number] = (
inference_context.key_value_memory_dict[self.layer_number] = (
inference_key_memory, inference_value_memory)
is_first_step = True
else:
inference_key_memory, inference_value_memory = \
inference_params.key_value_memory_dict[self.layer_number]
inference_context.key_value_memory_dict[self.layer_number]
# =====================
# Query, Key, and Value
......@@ -188,13 +191,14 @@ class ParallelAttentionPatch(MegatronModule):
else:
rotary_pos_emb = ((rotary_pos_emb,) * 2)
if inference_params:
batch_start = inference_params.batch_size_offset
if inference_context:
batch_start = inference_context.batch_size_offset
batch_end = batch_start + key_layer.size(1)
assert batch_end <= inference_key_memory.size(1)
sequence_start = inference_params.sequence_len_offset
sequence_start = inference_context.sequence_len_offset
sequence_end = sequence_start + key_layer.size(0)
assert sequence_end <= inference_key_memory.size(0)
assert sequence_end <= inference_key_memory.size(0), ("Current sequence length is "
"longer than expected maximum sequence length! Increase inference_max_seq_length.")
# Copy key and values.
inference_key_memory[sequence_start:sequence_end,
batch_start:batch_end, ...] = key_layer
......
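The patch above migrates the attention forward pass from the deprecated `inference_params` keyword to `inference_context`, funneling the old name through `deprecate_inference_params` from `megatron.core.utils`. A minimal sketch of what such a shim typically looks like (the real implementation may differ; this version is an assumption for illustration):

```python
import warnings


def deprecate_inference_params(inference_context, inference_params):
    """Prefer the new `inference_context`; warn and fall back to the old name."""
    if inference_context is None and inference_params is not None:
        warnings.warn(
            "`inference_params` is deprecated; pass `inference_context` instead.",
            DeprecationWarning,
        )
        return inference_params
    return inference_context
```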
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_deepseekv3_671B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
wait
mpirun -np 8 --allow-run-as-root \
train_deepseekv3_671B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
mpirun -np 32 --hostfile hostfile_deepseekv3_671B_4nodes \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_4nodes.sh node002 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
train_deepseekv3_671B_4nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to mmap_deepseekv3_datasets_text_document"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
CHECKPOINT_PATH="path to output"
mpirun -np 1024 --hostfile hostfile_deepseekv3_671B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_deepseekv3_671B_multinodes.sh node001 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
wait
train_deepseekv3_671B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-128nodes-`date +%F-%H%M`.log 2>&1
rm -rf output
rm -rf deepseekv3_dataset/mmap_deepseekv3_datasets_text_document
\ No newline at end of file
wait
\ No newline at end of file
......@@ -2,17 +2,20 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......@@ -51,7 +54,7 @@ PR=bf16
### PARALLEL / BOOL OPTION ###
TP=1
PP=2
PP=1
CP=1
ETP=1
EP=4
......@@ -65,17 +68,17 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
......@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--profile-dir torch_prof_deepseekv3_1nodes_tp1-pp1-ep4-etp1-cp1 \
--use-pytorch-profiler \
"
......
......@@ -2,17 +2,20 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......@@ -65,17 +68,17 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
......
......@@ -2,17 +2,20 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs DeepseekV3 671B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,7 +35,7 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
......@@ -65,17 +68,17 @@ SFT=false
AC=none
OPTIMIZER_OFFLOAD=false
SAVE_INTERVAL=500
DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
VALID_DATASET_PATH="path to mmap_deepseekv3_datasets_text_document"
PRETRAIN_CHECKPOINT_PATH="./output"
TOKENIZER_MODEL_PATH="path to deepseekv3_dataset"
DATASET_PATH=${data_path}
VALID_DATASET_PATH=${data_path}
PRETRAIN_CHECKPOINT_PATH=${checkpoint_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
# the following two values will not be used when SFT is true
TRAIN_TOKENS=$((10000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
WARMUP_TOKENS=$((2000 * ${GLOBAL_BATCH_SIZE} * ${SEQ_LEN}))
###############################
OUTPUT_BASEPATH=./output
OUTPUT_BASEPATH=${checkpoint_path}
### OTHERS ###
if [ $FL = true ]; then
......@@ -389,7 +392,7 @@ TORCH_PROFIE_ARGS=" \
--profile-ranks 0 1 2 3 4 5 6 7 \
--profile-step-start 3 \
--profile-step-end 4 \
--profile-dir torch_prof_deepseekv3_4nodes_tp2-pp2-ep16-etp1-cp1 \
--profile-dir torch_prof_deepseekv3_128nodes_tp4-pp8-ep64-etp2-cp1 \
--use-pytorch-profiler \
"
......
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs GPT 567B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
mpirun -np 1024 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_gpt_567B_multinodes.sh node059 --profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
# Runs GPT 567B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to redpajama_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 1024 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_gpt_567B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1024nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
\ No newline at end of file
wait
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,8 +86,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
......@@ -104,23 +107,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-ep_tp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
......@@ -145,10 +131,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-etp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......@@ -204,4 +207,4 @@ case ${LOCAL_RANK} in
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
esac
\ No newline at end of file
......@@ -2,17 +2,25 @@
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=25900
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
......@@ -32,16 +40,11 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
......@@ -83,8 +86,8 @@ MOE_ARGS=(
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 98,2,0
)
......@@ -104,23 +107,6 @@ TRAINING_ARGS=(
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp16-ep16-ep_tp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 16
......@@ -145,10 +131,27 @@ LOGGING_ARGS=(
--no-save-optim
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_128nodes_tp4-pp16-ep16-etp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
--wandb-project ${WANDB_PROJECT:-"GPT"}
--wandb-exp-name ${WANDB_NAME:-"GPT_567B"}
)
fi
......
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs Llama2 7B model
source /opt/dtk/env.sh
DATA_PATH="path to oscar-1GB_head-llama2_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 8 --allow-run-as-root \
train_llama2_7b_1nodes.sh \
localhost \
34577 \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
wait
\ No newline at end of file
#!/bin/bash
# set -eux
for para in $*
do
if [[ $para == --profiling* ]];then
if [[ $para == --data_path* ]];then
data_path=${para#*=}
elif [[ $para == --tokenizer_path* ]];then
tokenizer_path=${para#*=}
elif [[ $para == --checkpoint_path* ]];then
checkpoint_path=${para#*=}
elif [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# data path
DATA_PATH=${data_path}
TOKENIZER_MODEL_PATH=${tokenizer_path}
CHECKPOINT_PATH=${checkpoint_path}
# default env
DIST_URL=${1}
DIST_PORT=${2}
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # only print error-level NCCL logs
source /opt/dtk/env.sh
# hipblaslt library
export LD_LIBRARY_PATH=/data/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
# rocblas
export LD_LIBRARY_PATH=/data/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH
export NCCL_TOPO_FILE="${MEGATRON_PATH}/topo-input.xml"
# torch: collapse multi-stream execution onto a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
......@@ -48,187 +49,143 @@ export SENDRECV_STREAM_WITH_COMPUTE=1
# increase the compile cache size
export cache_size_limit=64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document" #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
GPT_MODEL_ARGS=(
--seq-length 4096
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 11008
--num-attention-heads 32
--max-position-embeddings 4096
--normalization RMSNorm # LightopRMSNorm
--position-embedding-type rope # none #
--untie-embeddings-and-output-weights # handle embedding and output weights separately for more flexibility
--normalization LightopRMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
)
export NVTE_FLASH_ATTN=1 # use the cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # use the Triton flash-attention path
# --transformer-impl transformer_engine # use these two options for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
--transformer-impl local # use these two options for the legacy path
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 256 #256 #240 #60 #512 #64
--train-iters 50
--global-batch-size 256
--train-iters 10
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
# --fp16 # enabling fp16 requires specifying loss-scale
# --loss-scale 1024
--use-distributed-optimizer
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
# --no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
--ckpt-format torch
--ddp-average-in-collective # in DP communication, average gradients/params directly instead of summing (to one device) and then averaging
# --recompute-granularity full # enable recomputation: lower memory use at the cost of extra runtime
# --recompute-num-layers 5 #0 #
# --recompute-method block
--overlap-grad-reduce # overlap DDP grad reduce
# --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization is not yet adapted
# --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
--ddp-average-in-collective
--overlap-grad-reduce
--use-flash-attn
)
# environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
--context-parallel-size 1
--use-distributed-optimizer
--sequence-parallel
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
# --context-parallel-size 2
# --num-layers-per-virtual-pipeline-stage 4
# --microbatch-group-size-per-virtual-pipeline-stage 1
# --no-overlap-p2p-communication # when enabled
)
DATA_ARGS=(
--data-path $DATA_PATH
--seq-length 4096 #4096
--split 949,50,1
--tokenizer-type Llama2Tokenizer
--tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
--tokenizer-model ${TOKENIZER_MODEL_PATH}
--data-path ${DATA_PATH}
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-throughput
--eval-iters 50
--eval-iters 5
--log-interval 1
--save-interval 1000
--eval-interval 1000
--save $SAVE_PATH
--load $SAVE_PATH
--tensorboard-dir $TENSORBOARD_LOGS_PATH
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard"
)
# FINETUNE_ARGS=(
# # --finetune
# # --pretrained-checkpoint $CHECKPOINT_PATH
# --load $CHECKPOINT_PATH
# --no-load-optim
# --no-load-rng
# )
PROFILE_ARGS=(
TORCH_PROFIE_ARGS=(
--profile
--profile-step-start 4
--profile-step-end 5
--use-pytorch-profiler
--profile-ranks 0 1 2 3 4 5 6 7
--profile-dir prof_data
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_llama_1nodes_tp1-pp2-cp1
--use-pytorch-profiler
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
# enable profiling
# ${PROFILE_ARGS[@]} \
# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]} \
${DISTRIBUTED_ARGS[@]} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
# for Hygon CPUs
case ${LOCAL_RANK} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=3 --membind=3 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=4 --membind=4 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=5 --membind=5 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=6 --membind=6 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
numactl --cpunodebind=7 --membind=7 ${APP}
# hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
;;
esac
\ No newline at end of file
# Mixtral 8x7B Model Inference and Finetuning
## Download Mixtral 8x7B Checkpoints
Download the Mixtral 8x7B HF-format checkpoint from the [HF hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/).
Alternatively, run the following script to download Mixtral 8x7B into a specific folder.
```python
from huggingface_hub import snapshot_download
SAVED_DIR = "" # Specify the saved directory
# Download HF checkpoints
snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False)
```
## Convert Mixtral 8x7B checkpoints from HF to MCore
The HF checkpoints can be converted to Megatron format with the provided HF checkpoint converter.
The target model parallel sizes (e.g. TP, PP, EP) must be specified.
The converter does not yet support distributed checkpointing, so each parallel configuration requires its own converted checkpoint.
- For training, the recommended model parallel config is TP1EP8PP4
- For inference, the recommended model parallel config is TP1EP1PP2
```
TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model
MEGATRON_PATH="/workspace/megatron-lm"
export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
TARGET_TP_SIZE=""
TARGET_EP_SIZE=""
TARGET_PP_SIZE=""
HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf
MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE}
python tools/checkpoint/convert.py \
--model-type GPT \
--loader loader_mixtral_hf \
--saver mcore \
--target-tensor-parallel-size ${TARGET_TP_SIZE} \
--target-pipeline-parallel-size ${TARGET_PP_SIZE} \
--target-expert-parallel-size ${TARGET_EP_SIZE} \
--load-dir ${HF_FORMAT_DIR} \
--save-dir ${MEGATRON_FORMAT_DIR} \
--tokenizer-model ${TOKENIZER_MODEL}
```
## Text generation with Mixtral 8x7B
Inference with Mixtral 8x7B requires at least 2 GPUs, so a distributed checkpoint with EP>=2 or PP>=2 (converted with the script above) is needed.
Megatron-LM includes a simple REST server for text generation in `tools/run_text_generation_server.py`; launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 2 \
--expert-model-parallel-size 1 \
--load ${CHECKPOINT} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--use-mcore-models \
--max-position-embeddings 32768 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--normalization RMSNorm \
--disable-bias-linear \
--position-embedding-type rope \
--no-position-embedding \
--swiglu \
--untie-embeddings-and-output-weights \
--group-query-attention \
--num-query-groups 8 \
--bf16 \
--micro-batch-size 1 \
--seq-length 1024 \
--seed 42 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-token-dispatcher-type alltoall \
--moe-grouped-gemm \
--mock-data \
--rotary-base 1000000
```
Once the server is running, you can use `tools/text_generation_cli.py` to query it; it takes a single argument, the host the server is running on.
```
python tools/text_generation_cli.py localhost:5000
```
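If you prefer to query the server programmatically rather than through the CLI, a minimal sketch with `requests` is shown below. It assumes the server exposes the same PUT `/api` endpoint that the bundled CLI uses, with a JSON body carrying `prompts` and `tokens_to_generate`; verify the endpoint and fields against your Megatron-LM version.

```python
import requests

# Hypothetical client for the text generation REST server started above.
response = requests.put(
    "http://localhost:5000/api",
    json={"prompts": ["The capital of France is"], "tokens_to_generate": 32},
    headers={"Content-Type": "application/json"},
)
print(response.json())
```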
## Finetuning from pretrained Mixtral 8x7B
To finetune the pretrained Mixtral 8x7B model, use the following script:
```bash
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
CHECKPOINT_PATH="" # Specify path to checkpoint dir
TOKENIZER_MODEL="" # Specify path to tokenizer.model
DATA_PATH="" # Specify path to data
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH
```
The above functionality also applies to Mixtral 8x22B; set the model config (including hidden_size/head_num/num_layers/ffn_hidden_size) according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
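As a rough illustration, the snippet below reads those fields from the Hugging Face `config.json` and prints matching Megatron-style flags. The key names follow the HF Mixtral config; the flag mapping is an assumption to check against your training script, and the file path is a placeholder.

```python
import json

# Path to the downloaded HF checkpoint directory (placeholder).
with open("Mixtral-8x22B-v0.1/config.json") as f:
    cfg = json.load(f)

flags = {
    "--hidden-size": cfg["hidden_size"],
    "--ffn-hidden-size": cfg["intermediate_size"],
    "--num-layers": cfg["num_hidden_layers"],
    "--num-attention-heads": cfg["num_attention_heads"],
    "--num-query-groups": cfg["num_key_value_heads"],
    "--num-experts": cfg["num_local_experts"],
}
print(" ".join(f"{k} {v}" for k, v in flags.items()))
```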
## Acknowledgements
Contributors outside NVIDIA to the Hugging Face converter and the Mixtral example in Megatron-Core:
- Peng Li <jerry.lp@alibaba-inc.com>
- Jun Huang <huangjun.hj@alibaba-inc.com>
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
mpirun -np 64 --hostfile hostfile_mixtral_8x22B \
# Runs Mixtral 8x22B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 32 --hostfile hostfile_mixtral_8x22B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x22B_multinodes.sh node067 --profiling=$profiling > log-8nodes-`date +%F-%H%M`.log 2>&1
wait
train_mixtral_8x22B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
wait
\ No newline at end of file
......@@ -5,10 +5,22 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh localhost --profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
HOST=localhost
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
wait
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x7B_1nodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-1nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
wait
\ No newline at end of file
......@@ -5,13 +5,25 @@ do
fi
done
# Runs Mixtral 8x7B model
source /opt/dtk/env.sh
HOST="" # modify this variable
PORT=25900
DATA_PATH="path to my-mixtral_text_document"
TOKENIZER_MODEL_PATH="path to tokenizer.model"
CHECKPOINT_PATH="path to ckpt"
mpirun -np 32 --hostfile hostfile_mixtral_8x7B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_multinodes.sh node067 --profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
wait
train_mixtral_8x7B_multinodes.sh \
${HOST} \
${PORT} \
--data_path=$DATA_PATH \
--tokenizer_path=$TOKENIZER_MODEL_PATH \
--checkpoint_path=$CHECKPOINT_PATH \
--profiling=$profiling > log-4nodes-`date +%F-%H%M`.log 2>&1
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
wait
\ No newline at end of file