Commit 688448db authored by silencealiang

Update code

parent a02a5490
#!/bin/bash
# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>
# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
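# A minimal filled-in sketch (hypothetical paths; substitute your cluster's
# locations). Note that MEGATRON_DATA is the prefix of the .bin/.idx pair
# produced by Megatron's tools/preprocess_data.py, given without extension:
#   export DOCKER_MOUNT_DIR=/lustre/datasets/gpt3
#   MEGATRON_DATA=/lustre/datasets/gpt3/my-gpt2_text_document
#   BPE_VOCAB_FILE=/lustre/datasets/gpt3/gpt2-vocab.json
#   BPE_MERGE_FILE=/lustre/datasets/gpt3/gpt2-merges.txt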
# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --micro-batch-size ${MBS} \
    --global-batch-size ${GBS} \
    --num-layers ${NLS} \
    --hidden-size ${HS} \
    --num-attention-heads ${NAH} \
    --DDP-impl ${DDP} \
    --data-path ${MEGATRON_DATA} \
    --vocab-file ${BPE_VOCAB_FILE} \
    --merge-file ${BPE_MERGE_FILE} \
    --log-interval 5 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --train-iters 500 \
    --lr-decay-iters 320 \
    --lr 0.0001 \
    --min-lr 0.00001 \
    --lr-decay-style cosine \
    --lr-warmup-fraction 0.01 \
    --split 969,30,1 \
    --eval-iters 100 \
    --eval-interval 1000 \
    --clip-grad 1.0 \
    --fp16 \
    --loss-scale 8192 "
#!/bin/bash
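# SBATCH.sh submits SRUN.sh to SLURM. `--export=NAME1,NAME2,...` propagates
# only the listed environment variables (plus SLURM's own) into the job, so
# anything SRUN.sh reads must appear in this list.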
sbatch -p ${SLURM_PARTITION} \
    -A ${SLURM_ACCOUNT} \
    --job-name=${JOB_NAME} \
    --nodes=${NNODES} \
    --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
exit 0
#!/bin/bash
#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs
CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
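# In --output below, %x expands to the SLURM job name and %j to the job ID,
# so every submission writes a unique file under ${THIS_DIR}/logs.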
srun -l \
    --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
    --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
    --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1
# Batch size (global batch size) options = [8, 128].
GBS=8
# Set pipeline-parallel size options.
NLS=$((3*PP))
NNODES=${PP}
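# Weak-scaling arithmetic: with TP=8 (set below) and 8 GPUs per node, each
# pipeline stage occupies exactly one node, and layers per stage stay fixed
# at 3: PP=1 -> NLS=3, NNODES=1; PP=2 -> NLS=6, NNODES=2; ...;
# PP=8 -> NLS=24, NNODES=8.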
# Other params.
TP=8
MBS=1
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Interleaved schedule options = [YES, NO].
INTERLEAVED=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set interleaved schedule options.
if [ "${INTERLEAVED}" == "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ "${INTERLEAVED}" == "NO" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
    echo "Invalid configuration"
    exit 1
fi
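# Interleaving arithmetic: NLS=96 layers over PP=12 stages (set below) gives
# 8 layers per GPU; at 2 layers per virtual stage, each GPU hosts 4 model
# chunks, which is what shrinks the pipeline bubble versus the
# non-interleaved schedule.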
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 128].
GBS=32
# Set pipeline-parallel and tensor-parallel size options.
TP=$((64/PP))
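# TP*PP is pinned to 64 model-parallel GPUs, so the sweep trades tensor for
# pipeline parallelism: PP=2 -> TP=32, PP=4 -> TP=16, ..., PP=32 -> TP=2.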
# Other params.
MBS=1
NLS=32
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 512].
GBS=32
# Set pipeline-parallel and data-parallel size options.
DP=$((64/PP))
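# PP*DP is pinned to 64 GPUs (TP=1 below), trading pipeline for data
# parallelism: PP=2 -> DP=32, ..., PP=32 -> DP=2. In the default PP=2 case,
# GBS=32 with MBS=1 leaves each of the 32 data-parallel replicas exactly
# one microbatch per step.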
# Other params.
TP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Set tensor-parallel and data-parallel size options.
DP=$((64/TP))
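# TP*DP is pinned to 64 GPUs (PP=1 below), trading tensor for data
# parallelism: TP=2 -> DP=32, ..., TP=32 -> DP=2.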
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
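# With TP=8 and PP=8 on 64 GPUs (8 nodes x 8 GPUs), the data-parallel size
# is 1, so a step pushes GBS/MBS microbatches through the pipeline: 128
# microbatches at MBS=1 down to 16 at MBS=8 for GBS=128.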
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ "${ACTIVATION_RECOMPUTATION}" == "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${ACTIVATION_RECOMPUTATION}" == "NO" ]; then
    MEGATRON_EXTRA_PARAMS=""
else
    echo "Invalid configuration"
    exit 1
fi
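# Uniform activation checkpointing keeps only layer-boundary activations and
# recomputes the rest during the backward pass, trading roughly one extra
# forward pass (~1/3 more compute) for a large cut in activation memory.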
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ "${SCATTER_GATHER}" == "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ "${SCATTER_GATHER}" == "NO" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
    echo "Invalid configuration"
    exit 1
fi
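# With the optimization on, each inter-stage send/recv is split across the
# TP=8 tensor-parallel ranks (scatter before sending, gather after
# receiving), reducing pipeline communication volume by the tensor-parallel
# factor; --no-scatter-gather-tensors-in-pipeline makes every rank send the
# full activation tensor instead.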
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T].
MODEL_SIZE=1.7B
if [ "${MODEL_SIZE}" == "1.7B" ]; then
    TP=1
    PP=1
    MBS=16
    GBS=512
    NLS=24
    HS=2304
    NAH=24
    DDP=torch
    NNODES=4
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "3.6B" ]; then
    TP=2
    PP=1
    MBS=16
    GBS=512
    NLS=30
    HS=3072
    NAH=32
    DDP=torch
    NNODES=8
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "7.5B" ]; then
    TP=4
    PP=1
    MBS=16
    GBS=512
    NLS=36
    HS=4096
    NAH=32
    DDP=torch
    NNODES=16
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "18B" ]; then
    TP=8
    PP=1
    MBS=8
    GBS=1024
    NLS=40
    HS=6144
    NAH=48
    DDP=torch
    NNODES=32
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "39B" ]; then
    TP=8
    PP=2
    MBS=4
    GBS=1536
    NLS=48
    HS=8192
    NAH=64
    DDP=local
    NNODES=64
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "76B" ]; then
    TP=8
    PP=4
    MBS=2
    GBS=1792
    NLS=60
    HS=10240
    NAH=80
    DDP=local
    NNODES=128
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ "${MODEL_SIZE}" == "145B" ]; then
    TP=8
    PP=8
    MBS=2
    GBS=2304
    NLS=80
    HS=12288
    NAH=96
    DDP=local
    NNODES=192
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ "${MODEL_SIZE}" == "310B" ]; then
    TP=8
    PP=16
    MBS=1
    GBS=2160
    NLS=96
    HS=16384
    NAH=128
    DDP=local
    NNODES=240
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ "${MODEL_SIZE}" == "530B" ]; then
    TP=8
    PP=35
    MBS=1
    GBS=2520
    NLS=105
    HS=20480
    NAH=128
    DDP=local
    NNODES=315
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ "${MODEL_SIZE}" == "1T" ]; then
    TP=8
    PP=64
    MBS=1
    GBS=3072
    NLS=128
    HS=25600
    NAH=160
    DDP=local
    NNODES=384
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
    echo "Invalid configuration"
    exit 1
fi
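# Rough sanity check of the selected size, assuming the paper's padded
# vocabulary of 51,200 and the approximation P ~= 12*NLS*HS^2 + V*HS.
# For the 1.7B case: 12*24*2304^2 ~= 1.53e9 plus 51200*2304 ~= 0.12e9,
# i.e. ~1.7e9 parameters.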
# Name of the job.
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
@@ -18,7 +18,7 @@ make -C docker release_build
 Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support:
 ```sh
 pip install "nvidia-modelopt[all]~=0.13.0" --extra-index-url https://pypi.nvidia.com
-pip install zarr tensorstore==0.1.45
+pip install zarr tensorstore!=0.1.46
 ```
 TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`.
 You can find more documentation about `nvidia-modelopt` [here](https://nvidia.github.io/TensorRT-Model-Optimizer/).
@@ -292,4 +292,4 @@ export trtllm_options=" \
 trtllm-build ${trtllm_options}
 python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mixtral-8x7B-v0.1
 ```
\ No newline at end of file