First add

d3dd8642 · Rayyyyy · d3dd8642 · d3dd8642 · d3dd8642 · d3dd8642
Commit d3dd8642 authored Jun 26, 2024 by Rayyyyy
20 changed files
--- a/examples/pretrain_bert.sh
+++ b/examples/pretrain_bert.sh
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/bert-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+BERT_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 4 \
+    --global-batch-size 8 \
+    --lr 0.0001 \
+    --train-iters 2000000 \
+    --lr-decay-iters 990000 \
+    --lr-decay-style linear \
+    --min-lr 0.00001 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch BERT pretraining via torchrun with no explicit distributed flags.
+# The *_ARGS strings are left unquoted on purpose so the shell splits them
+# into individual flags; the checkpoint path is quoted so a directory name
+# containing spaces or glob characters stays a single argument.
+torchrun pretrain_bert.py \
+    $BERT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_bert_distributed.sh
+++ b/examples/pretrain_bert_distributed.sh
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/bert-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+BERT_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 4 \
+    --global-batch-size 32 \
+    --lr 0.0001 \
+    --train-iters 1000000 \
+    --lr-decay-iters 990000 \
+    --lr-decay-style linear \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch distributed BERT pretraining with the NCCL backend.
+# $DISTRIBUTED_ARGS and the other *_ARGS strings must stay unquoted so they
+# word-split into separate flags; the checkpoint path is quoted so a path
+# containing spaces or glob characters is passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
+    $BERT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_bert_distributed_with_mp.sh
+++ b/examples/pretrain_bert_distributed_with_mp.sh
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/bert-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+BERT_ARGS="
+    --tensor-model-parallel-size 2 \
+    --pipeline-model-parallel-size 2 \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 2 \
+    --global-batch-size 16 \
+    --lr 0.0001 \
+    --train-iters 1000000 \
+    --lr-decay-iters 990000 \
+    --lr-decay-style linear \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch distributed BERT pretraining (tensor/pipeline parallel sizes are set
+# in BERT_ARGS) with the NCCL backend.
+# The *_ARGS strings are intentionally unquoted so they split into flags;
+# the checkpoint path is quoted so it is always passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
+    $BERT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_gpt.sh
+++ b/examples/pretrain_gpt.sh
+#!/bin/bash
+
+# Runs the "345M" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=<Specify path to file>/gpt2-merges.txt
+DATA_PATH=<Specify path and file prefix>_text_document
+
+GPT_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 4 \
+    --global-batch-size 8 \
+    --lr 0.00015 \
+    --train-iters 500000 \
+    --lr-decay-iters 320000 \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch GPT pretraining via torchrun with no explicit distributed flags.
+# The *_ARGS strings are left unquoted on purpose so the shell splits them
+# into individual flags; the checkpoint path is quoted so a directory name
+# containing spaces or glob characters stays a single argument.
+torchrun pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_gpt3_175B.sh
+++ b/examples/pretrain_gpt3_175B.sh
+#!/bin/bash
+
+
+#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
+
+
+# Working directory the job is submitted from; pretrain_gpt.py and the logs
+# directory are both resolved relative to it.
+DIR=$(pwd)
+# Timestamp embedded in the log file name so repeated runs do not collide.
+DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
+# srun writes its log under this directory, so stop early if it cannot be
+# created. Quoted so a working directory containing spaces is handled.
+mkdir -p "${DIR}/logs" || exit 1
+
+
+DATASET_1="<PATH TO THE FIRST DATASET>"
+DATASET_2="<PATH TO THE SECOND DATASET>"
+DATASET_3="<PATH TO THE THIRD DATASET>"
+DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
+
+
+options=" \
+	--tensor-model-parallel-size 8 \
+	--pipeline-model-parallel-size 16 \
+        --num-layers 96 \
+        --hidden-size 12288 \
+        --num-attention-heads 96 \
+        --seq-length 2048 \
+        --max-position-embeddings 2048 \
+	--micro-batch-size 1 \
+	--global-batch-size 1536 \
+	--rampup-batch-size 16 16 5859375 \
+	--train-samples 146484375 \
+       	--lr-decay-samples 126953125 \
+        --lr-warmup-samples 183105 \
+        --lr 6.0e-5 \
+	--min-lr 6.0e-6 \
+        --lr-decay-style cosine \
+        --log-interval 10 \
+        --eval-iters 40 \
+        --eval-interval 1000 \
+	--data-path ${DATASET} \
+	--vocab-file <PATH TO gpt-vocab.json> \
+	--merge-file <PATH TO gpt-merges.txt> \
+	--save-interval 1000 \
+	--save <PATH TO CHECKPOINTS DIRECTORY> \
+	--load <PATH TO CHECKPOINTS DIRECTORY> \
+	--split 98,2,0 \
+	--clip-grad 1.0 \
+	--weight-decay 0.1 \
+	--adam-beta1 0.9 \
+	--adam-beta2 0.95 \
+	--init-method-std 0.006 \
+	--tensorboard-dir <TENSORBOARD DIRECTORY> \
+	--fp16 "
+
+
+# Build the command line as one string because it is handed to 'sh -c' below.
+# "$*" (rather than "$@") is used since the script's arguments are being
+# joined into a single string here, not forwarded as separate words.
+run_cmd="python -u ${DIR}/pretrain_gpt.py $* ${options}"
+
+
+# Run the training command inside the container on the allocated nodes.
+# %x and %j are left for srun to expand in the log file name; the path is
+# quoted so a working directory containing spaces does not split the option.
+srun -l \
+     --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
+     --container-mounts "<DIRECTORIES TO MOUNT>" \
+     --output="${DIR}/logs/%x_%j_${DATETIME}.log" sh -c "${run_cmd}"
+
+
+set +x
+
--- a/examples/pretrain_gpt_distributed.sh
+++ b/examples/pretrain_gpt_distributed.sh
+#!/bin/bash
+
+# Runs the "345M" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=<Specify path to file>/gpt2-merges.txt
+DATA_PATH=<Specify path and file prefix>_text_document
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 8 \
+    --global-batch-size 64 \
+    --lr 0.00015 \
+    --train-iters 500000 \
+    --lr-decay-iters 320000 \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch distributed GPT pretraining with the NCCL backend.
+# $DISTRIBUTED_ARGS and the other *_ARGS strings must stay unquoted so they
+# word-split into separate flags; the checkpoint path is quoted so a path
+# containing spaces or glob characters is passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_gpt_distributed_with_mp.sh
+++ b/examples/pretrain_gpt_distributed_with_mp.sh
+#!/bin/bash
+
+# Runs the "345M" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=<Specify path to file>/gpt2-merges.txt
+DATA_PATH=<Specify path and file prefix>_text_document
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size 2 \
+    --pipeline-model-parallel-size 2 \
+    --sequence-parallel \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 4 \
+    --global-batch-size 16 \
+    --lr 0.00015 \
+    --train-iters 500000 \
+    --lr-decay-iters 320000 \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch distributed GPT pretraining (tensor/pipeline/sequence parallel
+# settings come from GPT_ARGS) with the NCCL backend.
+# The *_ARGS strings are intentionally unquoted so they split into flags;
+# the checkpoint path is quoted so it is always passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
+
--- a/examples/pretrain_ict.sh
+++ b/examples/pretrain_ict.sh
+#! /bin/bash
+
+# Runs the "217M" parameter biencoder model for ICT retriever
+
+RANK=0
+WORLD_SIZE=1
+
+PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
+TEXT_DATA_PATH=<Specify path and file prefix of the text data>
+TITLE_DATA_PATH=<Specify path and file prefix of the titles>
+CHECKPOINT_PATH=<Specify path>
+
+
+# Launch ICT biencoder pretraining directly with python.
+# All user-supplied paths are quoted so values containing spaces or glob
+# characters reach pretrain_ict.py as single arguments.
+python pretrain_ict.py \
+        --num-layers 12 \
+        --hidden-size 768 \
+        --num-attention-heads 12 \
+        --tensor-model-parallel-size 1 \
+        --micro-batch-size 32 \
+        --seq-length 256 \
+        --max-position-embeddings 512 \
+        --train-iters 100000 \
+        --vocab-file bert-vocab.txt \
+        --tokenizer-type BertWordPieceLowerCase \
+        --DDP-impl torch \
+        --bert-load "${PRETRAINED_BERT_PATH}" \
+        --log-interval 100 \
+        --eval-interval 1000 \
+        --eval-iters 10 \
+        --retriever-report-topk-accuracies 1 5 10 20 100 \
+        --retriever-score-scaling \
+        --load "$CHECKPOINT_PATH" \
+        --save "$CHECKPOINT_PATH" \
+        --data-path "${TEXT_DATA_PATH}" \
+        --titles-data-path "${TITLE_DATA_PATH}" \
+        --lr 0.0001 \
+        --lr-decay-style linear \
+        --weight-decay 1e-2 \
+        --clip-grad 1.0 \
+        --lr-warmup-fraction 0.01 \
+        --save-interval 4000 \
+        --exit-interval 8000 \
+        --query-in-block-prob 0.1 \
+        --fp16
--- a/examples/pretrain_t5.sh
+++ b/examples/pretrain_t5.sh
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/t5-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+T5_ARGS="
+    --num-layers 12 \
+    --hidden-size 768 \
+    --num-attention-heads 12 \
+    --kv-channels 64 \
+    --ffn-hidden-size 3072 \
+    --encoder-seq-length 512 \
+    --decoder-seq-length 128 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 16 \
+    --global-batch-size 16 \
+    --lr 0.0001 \
+    --train-iters 1000000 \
+    --lr-decay-iters 1000000 \
+    --lr-decay-style linear \
+    --min-lr 0.00001 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --vocab-extra-ids 100
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch T5 pretraining via torchrun with no explicit distributed flags.
+# The *_ARGS strings are left unquoted on purpose so the shell splits them
+# into individual flags; the checkpoint path is quoted so a directory name
+# containing spaces or glob characters stays a single argument.
+torchrun pretrain_t5.py \
+    $T5_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_t5_distributed.sh
+++ b/examples/pretrain_t5_distributed.sh
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/t5-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+T5_ARGS="
+    --num-layers 12 \
+    --hidden-size 768 \
+    --num-attention-heads 12 \
+    --kv-channels 64 \
+    --ffn-hidden-size 3072 \
+    --encoder-seq-length 512 \
+    --decoder-seq-length 128 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 16 \
+    --global-batch-size 128 \
+    --lr 0.0001 \
+    --train-iters 1000000 \
+    --lr-decay-iters 1000000 \
+    --lr-decay-style linear \
+    --min-lr 0.00001 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16 \
+    --vocab-extra-ids 100
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch distributed T5 pretraining with the NCCL backend.
+# $DISTRIBUTED_ARGS and the other *_ARGS strings must stay unquoted so they
+# word-split into separate flags; the checkpoint path is quoted so a path
+# containing spaces or glob characters is passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
+    $T5_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_t5_distributed_with_mp.sh
+++ b/examples/pretrain_t5_distributed_with_mp.sh
+#!/bin/bash
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/t5-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+T5_ARGS="
+    --tensor-model-parallel-size 2 \
+    --num-layers 12 \
+    --hidden-size 768 \
+    --num-attention-heads 12 \
+    --kv-channels 64 \
+    --ffn-hidden-size 3072 \
+    --encoder-seq-length 512 \
+    --decoder-seq-length 128 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 16 \
+    --global-batch-size 128 \
+    --lr 0.0001 \
+    --train-iters 1000000 \
+    --lr-decay-iters 1000000 \
+    --lr-decay-style linear \
+    --min-lr 0.00001 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16  \
+    --vocab-extra-ids 100
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
+
+# Launch distributed T5 pretraining (tensor parallel size is set in T5_ARGS)
+# with the NCCL backend.
+# The *_ARGS strings are intentionally unquoted so they split into flags;
+# the checkpoint path is quoted so it is always passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
+    $T5_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
--- a/examples/pretrain_yuan2.0_102B.sh
+++ b/examples/pretrain_yuan2.0_102B.sh
+#!/bin/bash
+
+# Runs the "Yuan-102B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 32 \
+    --pipeline-model-parallel-method block \
+    --pipeline-model-parallel-blocks 2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 84 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.5 \
+    --fim-spm-rate 0.5 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --micro-batch-size 1 \
+    --global-batch-size 1152 \
+    --lr 0.00003 \
+    --train-iters 63578 \
+    --lr-decay-iters 63578 \
+    --lr-decay-style cosine \
+    --min-lr 0.3e-5 \
+    --weight-decay 1e-1 \
+    --use-distributed-optimizer \
+    --lr-warmup-iters 1300 \
+    --clip-grad 1.0 \
+    --recompute-method block \
+    --recompute-granularity full \
+    --recompute-num-layers 2 \
+    --bf16
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+# Launch distributed Yuan 102B pretraining with the NCCL backend.
+# $DISTRIBUTED_ARGS and the other *_ARGS strings must stay unquoted so they
+# word-split into separate flags; the checkpoint path is quoted so a path
+# containing spaces or glob characters is passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
+
--- a/examples/pretrain_yuan2.0_102B_sft.sh
+++ b/examples/pretrain_yuan2.0_102B_sft.sh
+#!/bin/bash
+
+# Runs the "Yuan-102B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 32 \
+    --pipeline-model-parallel-method block \
+    --pipeline-model-parallel-blocks 2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 84 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.5 \
+    --fim-spm-rate 0.5 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --micro-batch-size 1 \
+    --global-batch-size 1152 \
+    --lr 0.00003 \
+    --train-iters 63578 \
+    --lr-decay-iters 63578 \
+    --lr-decay-style cosine \
+    --min-lr 0.3e-5 \
+    --weight-decay 1e-1 \
+    --use-distributed-optimizer \
+    --lr-warmup-iters 1300 \
+    --clip-grad 1.0 \
+    --recompute-method block \
+    --recompute-granularity full \
+    --recompute-num-layers 2 \
+    --bf16 \
+    --sft-stage \
+    --override-opt-param-scheduler \
+    --train-reset \
+    --finetune
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+# Launch the distributed Yuan 102B run configured above (GPT_ARGS includes
+# --sft-stage and --finetune) with the NCCL backend.
+# The *_ARGS strings are intentionally unquoted so they split into flags;
+# the checkpoint path is quoted so it is always passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
+
--- a/examples/pretrain_yuan2.0_2.1B.sh
+++ b/examples/pretrain_yuan2.0_2.1B.sh
+#!/bin/bash
+
+# Runs the "Yuan-2.1B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 32 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.5 \
+    --fim-spm-rate 0.5 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --micro-batch-size 2 \
+    --global-batch-size 384 \
+    --lr 0.0002 \
+    --train-iters 95367 \
+    --lr-decay-iters 95367 \
+    --lr-decay-style cosine \
+    --min-lr 2.0e-5 \
+    --weight-decay 1e-1 \
+    --lr-warmup-iters 1900 \
+    --clip-grad 1.0 \
+    --recompute-method uniform \
+    --recompute-granularity full \
+    --recompute-num-layers 1 \
+    --bf16
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+# Launch distributed Yuan 2.1B pretraining with the NCCL backend.
+# $DISTRIBUTED_ARGS and the other *_ARGS strings must stay unquoted so they
+# word-split into separate flags; the checkpoint path is quoted so a path
+# containing spaces or glob characters is passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
+
--- a/examples/pretrain_yuan2.0_2.1B_sft.sh
+++ b/examples/pretrain_yuan2.0_2.1B_sft.sh
+#!/bin/bash
+
+# Runs the "Yuan-2.1B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 32 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.5 \
+    --fim-spm-rate 0.5 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --micro-batch-size 2 \
+    --global-batch-size 384 \
+    --lr 0.0002 \
+    --train-iters 95367 \
+    --lr-decay-iters 95367 \
+    --lr-decay-style cosine \
+    --min-lr 2.0e-5 \
+    --weight-decay 1e-1 \
+    --lr-warmup-iters 1900 \
+    --clip-grad 1.0 \
+    --recompute-method uniform \
+    --recompute-granularity full \
+    --recompute-num-layers 1 \
+    --bf16 \
+    --sft-stage \
+    --override-opt-param-scheduler \
+    --train-reset \
+    --finetune
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+# Launch the distributed Yuan 2.1B run configured above (GPT_ARGS includes
+# --sft-stage and --finetune) with the NCCL backend.
+# The *_ARGS strings are intentionally unquoted so they split into flags;
+# the checkpoint path is quoted so it is always passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
+
--- a/examples/pretrain_yuan2.0_51B.sh
+++ b/examples/pretrain_yuan2.0_51B.sh
+#!/bin/bash
+
+# Runs the "Yuan-51B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 16 \
+    --pipeline-model-parallel-method block \
+    --pipeline-model-parallel-blocks 2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,2 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 42 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.5 \
+    --fim-spm-rate 0.5 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --micro-batch-size 1 \
+    --global-batch-size 1152 \
+    --lr 0.00009 \
+    --train-iters 63578 \
+    --lr-decay-iters 63578 \
+    --lr-decay-style cosine \
+    --min-lr 0.9e-5 \
+    --weight-decay 1e-1 \
+    --use-distributed-optimizer \
+    --lr-warmup-iters 1300 \
+    --clip-grad 1.0 \
+    --recompute-method block \
+    --recompute-granularity full \
+    --recompute-num-layers 1 \
+    --bf16
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+# Launch distributed Yuan 51B pretraining with the NCCL backend.
+# $DISTRIBUTED_ARGS and the other *_ARGS strings must stay unquoted so they
+# word-split into separate flags; the checkpoint path is quoted so a path
+# containing spaces or glob characters is passed as one argument.
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save "$CHECKPOINT_PATH" \
+    --load "$CHECKPOINT_PATH"
+
--- a/examples/pretrain_yuan2.0_51B_sft.sh
+++ b/examples/pretrain_yuan2.0_51B_sft.sh
+#!/bin/bash
+
+# Runs the "Yuan-51B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 16 \
+    --pipeline-model-parallel-method block \
+    --pipeline-model-parallel-blocks 2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,2 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 42 \
+    --hidden-size 8192 \
+    --num-attention-heads 64 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.5 \
+    --fim-spm-rate 0.5 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --micro-batch-size 1 \
+    --global-batch-size 1152 \
+    --lr 0.00009 \
+    --train-iters 63578 \
+    --lr-decay-iters 63578 \
+    --lr-decay-style cosine \
+    --min-lr 0.9e-5 \
+    --weight-decay 1e-1 \
+    --use-distributed-optimizer \
+    --lr-warmup-iters 1300 \
+    --clip-grad 1.0 \
+    --recompute-method block \
+    --recompute-granularity full \
+    --recompute-num-layers 1 \
+    --bf16 \
+    --sft-stage \
+    --override-opt-param-scheduler \
+    --train-reset \
+    --finetune
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
+
--- a/examples/pretrain_yuan2.0_moe_2x32B.sh
+++ b/examples/pretrain_yuan2.0_moe_2x32B.sh
+#!/bin/bash
+
+# Runs the "Yuan 2.0 MoE 2x32B" model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+# Model, parallelism, optimizer and MoE flags passed to pretrain_yuan.py.
+# The string is expanded unquoted at the torchrun call, so whitespace splits
+# it into separate arguments. The last flag carries no continuation backslash.
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 8 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --kv-channels 256 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.5 \
+    --fim-spm-rate 0.5 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --micro-batch-size 2 \
+    --global-batch-size 1536 \
+    --lr 0.0001 \
+    --train-iters 318000 \
+    --lr-decay-iters 318000 \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-1 \
+    --lr-warmup-iters 6400 \
+    --clip-grad 1.0 \
+    --recompute-method block \
+    --recompute-granularity full \
+    --recompute-num-layers 1 \
+    --bf16 \
+    --rotary-percent 0.5 \
+    --use-attention-router \
+    --no-masked-softmax-fusion \
+    --use-fp32-router \
+    --num-experts 32 \
+    --moe-router-load-balancing-type none \
+    --moe-router-topk 2 \
+    --moe-grouped-gemm
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
+
--- a/examples/pretrain_yuan2.0_moe_2x32B_sft.sh
+++ b/examples/pretrain_yuan2.0_moe_2x32B_sft.sh
+#!/bin/bash
+
+# Runs supervised fine-tuning (SFT) of the "Yuan 2.0 MoE 2x32B" model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+CHECKPOINT_PATH=<Specify path>
+DATA_PATH=<Specify path and file prefix>_text_document
+TOKENIZER_MODEL_PATH=<Specify path to file>
+TENSORBOARD_PATH=<Specify path to file>
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+# Model, parallelism, optimizer, MoE and SFT flags passed to pretrain_yuan.py.
+# The string is expanded unquoted at the torchrun call, so whitespace splits
+# it into separate arguments. Keep every continuation backslash as the very
+# last character of its line: inside double quotes a backslash followed by a
+# space is kept literally and would reach the script as a stray argument.
+GPT_ARGS="
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 8 \
+    --timing-log-level 2 \
+    --num-workers 2 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --kv-channels 256 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 1 \
+    --rotary-base 40890 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --flash-attn-drop 0.1 \
+    --fim-rate 0.0 \
+    --fim-spm-rate 0.0 \
+    --norm-dtype RMSNorm \
+    --attention-dropout 0 \
+    --hidden-dropout 0 \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --use-flash-attn \
+    --swiglu \
+    --use-distributed-optimizer \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --seq-length 16384 \
+    --max-position-embeddings 16384 \
+    --micro-batch-size 1 \
+    --global-batch-size 1152 \
+    --lr 8.0e-5 \
+    --train-iters 4220 \
+    --lr-decay-style constant \
+    --min-lr 8.0e-5 \
+    --weight-decay 1e-1 \
+    --clip-grad 1.0 \
+    --recompute-method uniform \
+    --recompute-granularity full \
+    --recompute-num-layers 1 \
+    --bf16 \
+    --rotary-percent 0.5 \
+    --use-attention-router \
+    --num-attention-router-heads 16384 \
+    --num-experts 32 \
+    --no-masked-softmax-fusion \
+    --use-fp32-router \
+    --moe-router-load-balancing-type none \
+    --moe-router-topk 2 \
+    --moe-grouped-gemm \
+    --sft-stage \
+    --override-opt-param-scheduler \
+    --train-reset
+"
+
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type YuanTokenizer \
+    --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+    --data-impl mmap \
+    --split 10,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 10000 \
+    --eval-interval 1000000 \
+    --eval-iters 10
+"
+
+LOG_ARGS="
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-log-interval 1 \
+    --tensorboard-queue-size 1000 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-memory-to-tensorboard \
+    --log-world-size-to-tensorboard
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_yuan.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    $LOG_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
+
--- a/examples/run_inference_server_102B.sh
+++ b/examples/run_inference_server_102B.sh
+#!/bin/bash
+
+# Runs the "Yuan-102B" parameter model
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+GPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6074
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# Defaults for the --temperature, --top_p and --top_k options below: a
+# non-empty value already present in the environment is kept, otherwise the
+# fallback given here is used.
+TEMP=${TEMP:-1}
+TOP_P=${TOP_P:-0.0}
+TOP_K=${TOP_K:-1}
+
+TOKENIZER_MODEL_PATH=./tokenizer
+CHECKPOINT_PATH=<Specify path>
+
+GPT_ARGS="
+    --micro-batch-size 1 \
+    --tensor-model-parallel-size 8 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 84 \
+    --distributed-timeout-minutes 120 \
+    --hidden-size 8192 \
+    --use-lf-gate \
+    --lf-conv2d-group 1 \
+    --lf-conv2d-num-pad 0 \
+    --position-embedding-type rope \
+    --no-embedding-dropout \
+    --use-flash-attn \
+    --flash-attn-drop 0.0 \
+    --attention-dropout 0 \
+    --fim-rate 0.0 \
+    --hidden-dropout 0 \
+    --norm-dtype RMSNorm \
+    --disable-bias-linear \
+    --reset-position-ids \
+    --swiglu \
+    --num-attention-heads 64 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --bf16 \
+    --temperature $TEMP \
+    --top_p $TOP_P \
+    --top_k $TOP_K \
+    --seed $RANDOM
+"
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 PORT=8000 NCCL_IB_TIMEOUT=22 NCCL_TIMEOUT=60000000000 torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
+       $GPT_ARGS \
+       --tokenizer-type "YuanTokenizer" \
+       --inference-server \
+       --tokenizer-model-path $TOKENIZER_MODEL_PATH \
+       --distributed-backend nccl \
+       --load $CHECKPOINT_PATH