Commit 688448db authored by silencealiang

Update code

parent a02a5490
#!/bin/bash
# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>
# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
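# A minimal filled-in sketch (hypothetical paths; substitute your cluster's
# locations). Note that MEGATRON_DATA is the prefix of the .bin/.idx pair
# produced by Megatron's tools/preprocess_data.py, given without extension:
#   export DOCKER_MOUNT_DIR=/lustre/datasets/gpt3
#   MEGATRON_DATA=/lustre/datasets/gpt3/my-gpt2_text_document
#   BPE_VOCAB_FILE=/lustre/datasets/gpt3/gpt2-vocab.json
#   BPE_MERGE_FILE=/lustre/datasets/gpt3/gpt2-merges.txt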
# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --micro-batch-size ${MBS} \
    --global-batch-size ${GBS} \
    --num-layers ${NLS} \
    --hidden-size ${HS} \
    --num-attention-heads ${NAH} \
    --DDP-impl ${DDP} \
    --data-path ${MEGATRON_DATA} \
    --vocab-file ${BPE_VOCAB_FILE} \
    --merge-file ${BPE_MERGE_FILE} \
    --log-interval 5 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --train-iters 500 \
    --lr-decay-iters 320 \
    --lr 0.0001 \
    --min-lr 0.00001 \
    --lr-decay-style cosine \
    --lr-warmup-fraction 0.01 \
    --split 969,30,1 \
    --eval-iters 100 \
    --eval-interval 1000 \
    --clip-grad 1.0 \
    --fp16 \
    --loss-scale 8192 "
#!/bin/bash
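# SBATCH.sh submits SRUN.sh to SLURM. `--export=NAME1,NAME2,...` propagates
# only the listed environment variables (plus SLURM's own) into the job, so
# anything SRUN.sh reads must appear in this list.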
sbatch -p ${SLURM_PARTITION} \
    -A ${SLURM_ACCOUNT} \
    --job-name=${JOB_NAME} \
    --nodes=${NNODES} \
    --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
exit 0
#!/bin/bash
#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs
CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
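# In --output below, %x expands to the SLURM job name and %j to the job ID,
# so every submission writes a unique file under ${THIS_DIR}/logs.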
srun -l \
    --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
    --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
    --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1
# Batch size (global batch size) options = [8, 128].
GBS=8
# Set pipeline-parallel size options.
NLS=$((3*PP))
NNODES=${PP}
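# Weak-scaling arithmetic: with TP=8 (set below) and 8 GPUs per node, each
# pipeline stage occupies exactly one node, and layers per stage stay fixed
# at 3: PP=1 -> NLS=3, NNODES=1; PP=2 -> NLS=6, NNODES=2; ...;
# PP=8 -> NLS=24, NNODES=8.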
# Other params.
TP=8
MBS=1
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Interleaved schedule options = [YES, NO].
INTERLEAVED=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set interleaved schedule options.
if [ "${INTERLEAVED}" == "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ "${INTERLEAVED}" == "NO" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
    echo "Invalid configuration"
    exit 1
fi
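# Interleaving arithmetic: NLS=96 layers over PP=12 stages (set below) gives
# 8 layers per GPU; at 2 layers per virtual stage, each GPU hosts 4 model
# chunks, which is what shrinks the pipeline bubble versus the
# non-interleaved schedule.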
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 128].
GBS=32
# Set pipeline-parallel and tensor-parallel size options.
TP=$((64/PP))
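# TP*PP is pinned to 64 model-parallel GPUs, so the sweep trades tensor for
# pipeline parallelism: PP=2 -> TP=32, PP=4 -> TP=16, ..., PP=32 -> TP=2.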
# Other params.
MBS=1
NLS=32
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 512].
GBS=32
# Set pipeline-parallel and data-parallel size options.
DP=$((64/PP))
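# PP*DP is pinned to 64 GPUs (TP=1 below), trading pipeline for data
# parallelism: PP=2 -> DP=32, ..., PP=32 -> DP=2. In the default PP=2 case,
# GBS=32 with MBS=1 leaves each of the 32 data-parallel replicas exactly
# one microbatch per step.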
# Other params.
TP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Set tensor-parallel and data-parallel size options.
DP=$((64/TP))
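# TP*DP is pinned to 64 GPUs (PP=1 below), trading tensor for data
# parallelism: TP=2 -> DP=32, ..., TP=32 -> DP=2.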
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
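# With TP=8 and PP=8 on 64 GPUs (8 nodes x 8 GPUs), the data-parallel size
# is 1, so a step pushes GBS/MBS microbatches through the pipeline: 128
# microbatches at MBS=1 down to 16 at MBS=8 for GBS=128.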
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ "${ACTIVATION_RECOMPUTATION}" == "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${ACTIVATION_RECOMPUTATION}" == "NO" ]; then
    MEGATRON_EXTRA_PARAMS=""
else
    echo "Invalid configuration"
    exit 1
fi
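# Uniform activation checkpointing keeps only layer-boundary activations and
# recomputes the rest during the backward pass, trading roughly one extra
# forward pass (~1/3 more compute) for a large cut in activation memory.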
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ "${SCATTER_GATHER}" == "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ "${SCATTER_GATHER}" == "NO" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
    echo "Invalid configuration"
    exit 1
fi
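# With the optimization on, each inter-stage send/recv is split across the
# TP=8 tensor-parallel ranks (scatter before sending, gather after
# receiving), reducing pipeline communication volume by the tensor-parallel
# factor; --no-scatter-gather-tensors-in-pipeline makes every rank send the
# full activation tensor instead.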
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T].
MODEL_SIZE=1.7B
if [ "${MODEL_SIZE}" == "1.7B" ]; then
    TP=1
    PP=1
    MBS=16
    GBS=512
    NLS=24
    HS=2304
    NAH=24
    DDP=torch
    NNODES=4
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "3.6B" ]; then
    TP=2
    PP=1
    MBS=16
    GBS=512
    NLS=30
    HS=3072
    NAH=32
    DDP=torch
    NNODES=8
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "7.5B" ]; then
    TP=4
    PP=1
    MBS=16
    GBS=512
    NLS=36
    HS=4096
    NAH=32
    DDP=torch
    NNODES=16
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "18B" ]; then
    TP=8
    PP=1
    MBS=8
    GBS=1024
    NLS=40
    HS=6144
    NAH=48
    DDP=torch
    NNODES=32
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "39B" ]; then
    TP=8
    PP=2
    MBS=4
    GBS=1536
    NLS=48
    HS=8192
    NAH=64
    DDP=local
    NNODES=64
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ "${MODEL_SIZE}" == "76B" ]; then
    TP=8
    PP=4
    MBS=2
    GBS=1792
    NLS=60
    HS=10240
    NAH=80
    DDP=local
    NNODES=128
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ "${MODEL_SIZE}" == "145B" ]; then
    TP=8
    PP=8
    MBS=2
    GBS=2304
    NLS=80
    HS=12288
    NAH=96
    DDP=local
    NNODES=192
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ "${MODEL_SIZE}" == "310B" ]; then
    TP=8
    PP=16
    MBS=1
    GBS=2160
    NLS=96
    HS=16384
    NAH=128
    DDP=local
    NNODES=240
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ "${MODEL_SIZE}" == "530B" ]; then
    TP=8
    PP=35
    MBS=1
    GBS=2520
    NLS=105
    HS=20480
    NAH=128
    DDP=local
    NNODES=315
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ "${MODEL_SIZE}" == "1T" ]; then
    TP=8
    PP=64
    MBS=1
    GBS=3072
    NLS=128
    HS=25600
    NAH=160
    DDP=local
    NNODES=384
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
    echo "Invalid configuration"
    exit 1
fi
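# Rough sanity check of the selected size, assuming the paper's padded
# vocabulary of 51,200 and the approximation P ~= 12*NLS*HS^2 + V*HS.
# For the 1.7B case: 12*24*2304^2 ~= 1.53e9 plus 51200*2304 ~= 0.12e9,
# i.e. ~1.7e9 parameters.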
# Name of the job.
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
@@ -18,7 +18,7 @@ make -C docker release_build
 Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support:
 ```sh
 pip install "nvidia-modelopt[all]~=0.13.0" --extra-index-url https://pypi.nvidia.com
-pip install zarr tensorstore==0.1.45
+pip install zarr tensorstore!=0.1.46
 ```
 TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`.
 You can find more documentation about `nvidia-modelopt` [here](https://nvidia.github.io/TensorRT-Model-Optimizer/).
@@ -292,4 +292,4 @@ export trtllm_options=" \
 trtllm-build ${trtllm_options}
 python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mixtral-8x7B-v0.1
 ```
\ No newline at end of file