ModelZoo / LLama_fastertransformer · Commit 0211193c

Authored Aug 17, 2023 by zhuwenwen

    initial llama

Pipeline #509 failed with stages in 0 seconds.
Changes: 260 · Pipelines: 1
Showing 20 changed files with 891 additions and 0 deletions (+891, -0).
3rdparty/Megatron-LM/examples/msdp/eval_knwl_generation.sh                           +43  -0
3rdparty/Megatron-LM/examples/msdp/eval_resp_generation.sh                           +64  -0
3rdparty/Megatron-LM/examples/msdp/prep_resp_gen.sh                                  +18  -0
3rdparty/Megatron-LM/examples/msdp/prompt_knwl_gen.sh                                +46  -0
3rdparty/Megatron-LM/examples/msdp/prompt_resp_gen.sh                                +46  -0
3rdparty/Megatron-LM/examples/pretrain_bert.sh                                       +34  -0
3rdparty/Megatron-LM/examples/pretrain_bert_distributed.sh                           +44  -0
3rdparty/Megatron-LM/examples/pretrain_bert_distributed_with_mp.sh                   +47  -0
3rdparty/Megatron-LM/examples/pretrain_gpt.sh                                        +41  -0
3rdparty/Megatron-LM/examples/pretrain_gpt3_175B.sh                                  +65  -0
3rdparty/Megatron-LM/examples/pretrain_gpt_distributed.sh                            +48  -0
3rdparty/Megatron-LM/examples/pretrain_gpt_distributed_with_mp.sh                    +50  -0
3rdparty/Megatron-LM/examples/pretrain_ict.sh                                        +44  -0
3rdparty/Megatron-LM/examples/pretrain_t5.sh                                         +39  -0
3rdparty/Megatron-LM/examples/pretrain_t5_distributed.sh                             +48  -0
3rdparty/Megatron-LM/examples/pretrain_t5_distributed_with_mp.sh                     +48  -0
3rdparty/Megatron-LM/examples/run_text_generation_server_345M.sh                     +32  -0
3rdparty/Megatron-LM/examples/run_text_generation_server_345M_8_tensor_parallel.sh   +32  -0
3rdparty/Megatron-LM/examples/sc21/CONFIG.sh                                         +57  -0
3rdparty/Megatron-LM/examples/sc21/README.md                                         +45  -0
Too many changes to show: to preserve performance, only 260 of 260+ files are displayed.
3rdparty/Megatron-LM/examples/msdp/eval_knwl_generation.sh  (new file, mode 100644)

#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

MODEL_GEN_PATH=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
               (e.g., /testseen_knowledge_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \
                  (e.g., /testseen_knowledge_reference.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task MSDP-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate on these metrics, please setup the environments based on
# the nlg-eval github, and run the corresponding evaluation commands.

nlg-eval \
    --hypothesis=<PATH_OF_THE_KNOWLEDGE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE>
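As a minimal usage sketch (not part of the commit), the angle-bracket placeholders above are meant to be replaced with real paths before the script is run; the values below are purely hypothetical and the same pattern applies to the other msdp scripts in this diff:

# Hypothetical example values; the real paths depend on where the MSDP
# outputs were written.
MODEL_GEN_PATH=/data/msdp/testseen_knowledge_generations.txt
GROUND_TRUTH_PATH=/data/msdp/testseen_knowledge_reference.txt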
3rdparty/Megatron-LM/examples/msdp/eval_resp_generation.sh  (new file, mode 100644)

#!/bin/bash

#########################
# Evaluate the F1 scores.
#########################

WORLD_SIZE=1

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \
               (e.g., /testseen_response_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_RESPONSE> \
                  (e.g., /testseen_response_reference.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task MSDP-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}


##########################
# Evaluate the KF1 scores.
##########################

MODEL_GEN_PATH=<PATH_OF_THE_RESPONSE_GENERATION> \
               (e.g., /testseen_response_generations.txt)
GROUND_TRUTH_PATH=<PATH_OF_THE_GROUND_TRUTH_KNOWLEDGE> \
                  (e.g., /testseen_knowledge_reference.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 4 \
        --task MSDP-EVAL-F1 \
        --guess-file ${MODEL_GEN_PATH} \
        --answer-file ${GROUND_TRUTH_PATH}


############################################
# Evaluate BLEU, METEOR, and ROUGE-L scores.
############################################

# We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to
# evaluate the BLEU, METEOR, and ROUGE-L scores.

# To evaluate on these metrics, please setup the environments based on
# the nlg-eval github, and run the corresponding evaluation commands.

nlg-eval \
    --hypothesis=<PATH_OF_THE_RESPONSE_GENERATION> \
    --references=<PATH_OF_THE_GROUND_TRUTH_RESPONSE>
3rdparty/Megatron-LM/examples/msdp/prep_resp_gen.sh  (new file, mode 100644)

#!/bin/bash

# Preparing the input file for the response generation (second-stage prompting)

DIR=`pwd`

TEST_FILE=<PATH_OF_PROCESSED_TEST_DATA> \
          (e.g., /testseen_processed.txt)
KNOWLEDGE_FILE=<PATH_OF_GENERATED_KNOWLEDGE_DATA> \
               (e.g., /testseen_knowledge_generations.txt)
PROCESSED_FILE=<PATH_OF_INPUT_FILE_FOR_RESPONSE_GENERATION> \
               (e.g., /testseen_processed_with_generated_knowledge.txt)

python ${DIR}/tasks/msdp/preprocessing.py \
        --func prepare_input \
        --test_file ${TEST_FILE} \
        --knwl_gen_file ${KNOWLEDGE_FILE} \
        --processed_file ${PROCESSED_FILE}
3rdparty/Megatron-LM/examples/msdp/prompt_knwl_gen.sh  (new file, mode 100644)

#!/bin/bash

# Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge
# The input contains prompts and current dialogue context, the output is the relevant knowledge
# The size of the pretrained language model is 357M

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
INPUT_PATH=<PATH_OF_PROCESSED_TEST_DATA_FILE> \
           (e.g., /testseen_processed.txt)
PROMPT_PATH=<PATH_OF_KNOWLEDGE_GENERATION_PROMPTS> \
            (e.g., /testseen_knowledge_prompts.json)
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
            (e.g., /testseen_knowledge_generations.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 1 \
        --vocab-file ${VOCAB_PATH} \
        --merge-file ${MERGE_PATH} \
        --load ${CHECKPOINT_PATH} \
        --fp16 \
        --DDP-impl torch \
        --tokenizer-type GPT2BPETokenizer \
        --sample-input-file ${INPUT_PATH} \
        --sample-output-file ${OUTPUT_PATH} \
        --prompt-file ${PROMPT_PATH} \
        --prompt-type knowledge \
        --num-prompt-examples 10 \
        --task MSDP-PROMPT

# NOTE: If you use an API for the model generation, please use
# the "--api-prompt" flag (setting this value as True).
3rdparty/Megatron-LM/examples/msdp/prompt_resp_gen.sh  (new file, mode 100644)

#!/bin/bash

# Stage-2: Prompt a pretrained language model to generate the corresponding response
# The input contains prompts, current dialogue context, and generated knowledge in Stage-1
# The output is the corresponding response.
# The size of the pretrained language model is 357M

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT_PATH=<PATH_OF_LANGUAGE_MODEL> (e.g., /357m)
VOCAB_PATH=<PATH_OF_VOCAB_FILE> (e.g., /gpt2-vocab.json)
MERGE_PATH=<PATH_OF_MERGE_FILE> (e.g., /gpt2-merges.txt)
INPUT_PATH=<PATH_OF_INPUT_TEST_DATA_FILE> (e.g., /testseen_processed.txt)
PROMPT_PATH=<PATH_OF_RESPONSE_GENERATION_PROMPTS> \
            (e.g., /response_prompts.txt)
OUTPUT_PATH=<PATH_OF_OUTPUT_GENERATION_FILE> \
            (e.g., /output_testseen_response_generations.txt)

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --micro-batch-size 1 \
        --vocab-file ${VOCAB_PATH} \
        --merge-file ${MERGE_PATH} \
        --load ${CHECKPOINT_PATH} \
        --fp16 \
        --DDP-impl torch \
        --tokenizer-type GPT2BPETokenizer \
        --sample-input-file ${INPUT_PATH} \
        --sample-output-file ${OUTPUT_PATH} \
        --prompt-file ${PROMPT_PATH} \
        --prompt-type response \
        --num-prompt-examples 20 \
        --task MSDP-PROMPT

# NOTE: If you use an API for the model generation, please use
# the "--api-prompt" flag (setting this value as True).
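Taken together, the msdp scripts above describe a two-stage prompting pipeline. A hedged sketch of the overall ordering, inferred from the scripts' own comments and assuming each script has had its placeholders filled in and is run from the Megatron-LM root:

# Assumed ordering; each step reads the previous step's output file.
bash examples/msdp/prompt_knwl_gen.sh       # Stage-1: generate context-relevant knowledge
bash examples/msdp/prep_resp_gen.sh         # build the Stage-2 input from the Stage-1 output
bash examples/msdp/prompt_resp_gen.sh       # Stage-2: generate responses
bash examples/msdp/eval_knwl_generation.sh  # evaluate knowledge (F1, BLEU/METEOR/ROUGE-L)
bash examples/msdp/eval_resp_generation.sh  # evaluate responses (F1, KF1, BLEU/METEOR/ROUGE-L)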
3rdparty/Megatron-LM/examples/pretrain_bert.sh  (new file, mode 100644)

#!/bin/bash

RANK=0
WORLD_SIZE=1

DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

python pretrain_bert.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --micro-batch-size 4 \
       --global-batch-size 8 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --train-iters 2000000 \
       --lr-decay-iters 990000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file bert-vocab.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --lr 0.0001 \
       --min-lr 0.00001 \
       --lr-decay-style linear \
       --lr-warmup-fraction .01 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
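The `_text_sentence` suffix in DATA_PATH refers to a dataset produced by Megatron-LM's data preprocessing tool. A hedged sketch of how such a dataset is typically built; the input and output names are hypothetical, and the exact flags should be checked against tools/preprocess_data.py in this tree:

# Hypothetical preprocessing step producing my-bert_text_sentence.{bin,idx}
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-bert \
       --vocab bert-vocab.txt \
       --dataset-impl mmap \
       --tokenizer-type BertWordPieceLowerCase \
       --split-sentences
# DATA_PATH would then point at the my-bert_text_sentence prefix.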
3rdparty/Megatron-LM/examples/pretrain_bert_distributed.sh  (new file, mode 100644)

#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --micro-batch-size 4 \
       --global-batch-size 32 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file bert-vocab.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --min-lr 1.0e-5 \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --lr-warmup-fraction .01 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
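The "# Change for multinode config" comment covers the launcher variables at the top. A hedged sketch of a two-node launch (the address is hypothetical); the same script runs on every node, with only NODE_RANK differing:

# Hypothetical 2-node setup; values for node 0 ...
GPUS_PER_NODE=8
MASTER_ADDR=10.0.0.1   # hypothetical address of the rank-0 node
MASTER_PORT=6000
NNODES=2
NODE_RANK=0
# ... and on node 1, set NODE_RANK=1 with the same MASTER_ADDR/MASTER_PORT.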
3rdparty/Megatron-LM/examples/pretrain_bert_distributed_with_mp.sh  (new file, mode 100644)

#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_sentence
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_bert.py \
       --tensor-model-parallel-size 2 \
       --pipeline-model-parallel-size 2 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --micro-batch-size 2 \
       --global-batch-size 16 \
       --seq-length 512 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file $VOCAB_FILE \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --min-lr 1.0e-5 \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --lr-warmup-fraction .01 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
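For the settings above, the standard Megatron-LM accounting works out as follows (an editorial sketch of the arithmetic, not text from the commit):

# 8 GPUs total, tensor-parallel 2 x pipeline-parallel 2  =>  data-parallel = 8 / (2*2) = 2
# micro-batches per iteration per data-parallel replica:
#   global-batch-size / (micro-batch-size * data-parallel) = 16 / (2 * 2) = 4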
3rdparty/Megatron-LM/examples/pretrain_gpt.sh  (new file, mode 100644)

#! /bin/bash

# Runs the "345M" parameter model

RANK=0
WORLD_SIZE=1

DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

python pretrain_gpt.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --micro-batch-size 4 \
       --global-batch-size 8 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 500000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --min-lr 1.0e-5 \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --lr-warmup-fraction .01 \
       --activations-checkpoint-method uniform \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
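As with the BERT example, the `_text_document` prefix refers to a preprocessed dataset. A hedged sketch for GPT-style data; the corpus and output names are hypothetical, and the flags should be confirmed against tools/preprocess_data.py:

# Hypothetical preprocessing step producing my-gpt2_text_document.{bin,idx}
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-gpt2 \
       --vocab gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --dataset-impl mmap \
       --tokenizer-type GPT2BPETokenizer \
       --append-eod
# DATA_PATH would then point at the my-gpt2_text_document prefix.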
3rdparty/Megatron-LM/examples/pretrain_gpt3_175B.sh  (new file, mode 100644)

#!/bin/bash

#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b

DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs

DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"

options=" \
    --tensor-model-parallel-size 8 \
    --pipeline-model-parallel-size 16 \
    --num-layers 96 \
    --hidden-size 12288 \
    --num-attention-heads 96 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --micro-batch-size 1 \
    --global-batch-size 1536 \
    --rampup-batch-size 16 16 5859375 \
    --train-samples 146484375 \
    --lr-decay-samples 126953125 \
    --lr-warmup-samples 183105 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --log-interval 10 \
    --eval-iters 40 \
    --eval-interval 1000 \
    --data-path ${DATASET} \
    --vocab-file <PATH TO gpt-vocab.json> \
    --merge-file <PATH TO gpt-merges.txt> \
    --save-interval 1000 \
    --save <PATH TO CHECKPOINTS DIRECTORY> \
    --load <PATH TO CHECKPOINTS DIRECTORY> \
    --split 98,2,0 \
    --clip-grad 1.0 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --init-method-std 0.006 \
    --tensorboard-dir <TENSORBOARD DIRECTORY> \
    --fp16 \
    --activations-checkpoint-method uniform "

run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"

srun -l \
     --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
     --container-mounts "<DIRECTORIES TO MOUNT>" \
     --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"

set +x
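Two details of this script are worth spelling out (an editorial sketch of the arithmetic, not part of the commit): the SBATCH line requests 128 nodes with 8 tasks each, and DATASET is a weighted blend of three corpora.

# 128 nodes x 8 GPUs = 1024 GPUs
# model-parallel group = tensor-parallel 8 x pipeline-parallel 16 = 128 GPUs
# data-parallel size   = 1024 / 128 = 8
# DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" samples the three
# corpora with weights 0.2 / 0.3 / 0.5 through the weighted --data-path syntax.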
3rdparty/Megatron-LM/examples/pretrain_gpt_distributed.sh  (new file, mode 100644)

#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_gpt.py \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --micro-batch-size 8 \
       --global-batch-size 64 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 500000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --lr-decay-style cosine \
       --min-lr 1.0e-5 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --lr-warmup-fraction .01 \
       --activations-checkpoint-method uniform \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
3rdparty/Megatron-LM/examples/pretrain_gpt_distributed_with_mp.sh  (new file, mode 100644)

#! /bin/bash

# Runs the "345M" parameter model

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_gpt.py \
       --tensor-model-parallel-size 2 \
       --pipeline-model-parallel-size 2 \
       --num-layers 24 \
       --hidden-size 1024 \
       --num-attention-heads 16 \
       --micro-batch-size 4 \
       --global-batch-size 16 \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --train-iters 500000 \
       --lr-decay-iters 320000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file gpt2-vocab.json \
       --merge-file gpt2-merges.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --lr-decay-style cosine \
       --min-lr 1.0e-5 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --lr-warmup-fraction .01 \
       --activations-checkpoint-method uniform \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16
3rdparty/Megatron-LM/examples/pretrain_ict.sh  (new file, mode 100644)

#! /bin/bash

# Runs the "217M" parameter biencoder model for ICT retriever

RANK=0
WORLD_SIZE=1

PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
TEXT_DATA_PATH=<Specify path and file prefix of the text data>
TITLE_DATA_PATH=<Specify path and file prefix of the titles>
CHECKPOINT_PATH=<Specify path>

python pretrain_ict.py \
        --num-layers 12 \
        --hidden-size 768 \
        --num-attention-heads 12 \
        --tensor-model-parallel-size 1 \
        --micro-batch-size 32 \
        --seq-length 256 \
        --max-position-embeddings 512 \
        --train-iters 100000 \
        --vocab-file bert-vocab.txt \
        --tokenizer-type BertWordPieceLowerCase \
        --DDP-impl torch \
        --bert-load ${PRETRAINED_BERT_PATH} \
        --log-interval 100 \
        --eval-interval 1000 \
        --eval-iters 10 \
        --retriever-report-topk-accuracies 1 5 10 20 100 \
        --retriever-score-scaling \
        --load $CHECKPOINT_PATH \
        --save $CHECKPOINT_PATH \
        --data-path ${TEXT_DATA_PATH} \
        --titles-data-path ${TITLE_DATA_PATH} \
        --lr 0.0001 \
        --lr-decay-style linear \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
        --lr-warmup-fraction 0.01 \
        --save-interval 4000 \
        --exit-interval 8000 \
        --query-in-block-prob 0.1 \
        --fp16
3rdparty/Megatron-LM/examples/pretrain_t5.sh  (new file, mode 100644)

#!/bin/bash

RANK=0
WORLD_SIZE=1

DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>

python pretrain_t5.py \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --kv-channels 64 \
       --ffn-hidden-size 3072 \
       --encoder-seq-length 512 \
       --decoder-seq-length 128 \
       --micro-batch-size 16 \
       --global-batch-size 16 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --lr-decay-iters 1000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file $VOCAB_FILE \
       --data-impl mmap \
       --split 949,50,1 \
       --lr 0.0001 \
       --min-lr 0.00001 \
       --lr-decay-style linear \
       --lr-warmup-fraction .01 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16 \
       --vocab-extra-ids 100
3rdparty/Megatron-LM/examples/pretrain_t5_distributed.sh  (new file, mode 100644)

#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_t5.py \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --kv-channels 64 \
       --ffn-hidden-size 3072 \
       --encoder-seq-length 512 \
       --decoder-seq-length 128 \
       --micro-batch-size 16 \
       --global-batch-size 128 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --lr-decay-iters 1000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file $VOCAB_FILE \
       --data-impl mmap \
       --split 949,50,1 \
       --lr 0.0001 \
       --min-lr 0.00001 \
       --lr-decay-style linear \
       --lr-warmup-fraction .01 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16 \
       --vocab-extra-ids 100
3rdparty/Megatron-LM/examples/pretrain_t5_distributed_with_mp.sh  (new file, mode 100644)

#!/bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=<Specify path and file prefix>
CHECKPOINT_PATH=<Specify path>

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_t5.py \
       --tensor-model-parallel-size 2 \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --kv-channels 64 \
       --ffn-hidden-size 3072 \
       --encoder-seq-length 512 \
       --decoder-seq-length 128 \
       --micro-batch-size 16 \
       --global-batch-size 128 \
       --max-position-embeddings 512 \
       --train-iters 1000000 \
       --lr-decay-iters 1000000 \
       --save $CHECKPOINT_PATH \
       --load $CHECKPOINT_PATH \
       --data-path $DATA_PATH \
       --vocab-file t5-vocab.txt \
       --data-impl mmap \
       --split 949,50,1 \
       --lr 0.0001 \
       --min-lr 0.00001 \
       --lr-decay-style linear \
       --lr-warmup-fraction .01 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --log-interval 100 \
       --save-interval 10000 \
       --eval-interval 1000 \
       --eval-iters 10 \
       --fp16 \
       --vocab-extra-ids 100
3rdparty/Megatron-LM/examples/run_text_generation_server_345M.sh  (new file, mode 100644)

#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

python -m torch.distributed.run $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
       --tensor-model-parallel-size 1 \
       --pipeline-model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --load ${CHECKPOINT} \
       --num-attention-heads 16 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
       --seq-length 1024 \
       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --top_p 0.9 \
       --seed 42
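Once the server is up it exposes a small REST interface (hence the `pip install flask-restful`). A hedged request sketch, assuming the port and route used by tools/run_text_generation_server.py; check that file for the actual host, port, and JSON fields:

# Hypothetical query against the text generation server.
curl 'http://localhost:5000/api' \
     -X PUT \
     -H 'Content-Type: application/json; charset=UTF-8' \
     -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'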
3rdparty/Megatron-LM/examples/run_text_generation_server_345M_8_tensor_parallel.sh  (new file, mode 100644)

#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel
DISTRIBUTED_ARGS="--nproc_per_node 8 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>

pip install flask-restful

python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
       --tensor-model-parallel-size 8 \
       --pipeline-model-parallel-size 1 \
       --num-layers 24 \
       --hidden-size 1024 \
       --load ${CHECKPOINT} \
       --num-attention-heads 16 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --fp16 \
       --micro-batch-size 1 \
       --seq-length 1024 \
       --out-seq-length 1024 \
       --temperature 1.0 \
       --vocab-file $VOCAB_FILE \
       --merge-file $MERGE_FILE \
       --top_p 0.9 \
       --seed 42
3rdparty/Megatron-LM/examples/sc21/CONFIG.sh  (new file, mode 100644)

#!/bin/bash

# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>

# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>

# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>

# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>

# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
        --tensor-model-parallel-size ${TP} \
        --pipeline-model-parallel-size ${PP} \
        --micro-batch-size ${MBS} \
        --global-batch-size ${GBS} \
        --num-layers ${NLS} \
        --hidden-size ${HS} \
        --num-attention-heads ${NAH} \
        --DDP-impl ${DDP} \
        --data-path ${MEGATRON_DATA} \
        --vocab-file ${BPE_VOCAB_FILE} \
        --merge-file ${BPE_MERGE_FILE} \
        --log-interval 5 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --train-iters 500 \
        --lr-decay-iters 320 \
        --lr 0.0001 \
        --min-lr 0.00001 \
        --lr-decay-style cosine \
        --lr-warmup-fraction 0.01 \
        --split 969,30,1 \
        --eval-iters 100 \
        --eval-interval 1000 \
        --clip-grad 1.0 \
        --fp16 \
        --loss-scale 8192 "
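A hedged sketch of what the cluster-dependent part of CONFIG.sh might look like once filled in; every value below is hypothetical, and the TP/PP/MBS/GBS/NLS/HS/NAH/DDP variables referenced above are presumably set by the individual run_*.sh scripts:

# Hypothetical example values for the <...> placeholders.
export SLURM_PARTITION=batch                       # hypothetical partition name
export SLURM_ACCOUNT=my_account                    # hypothetical account
export MEGATRON_CODE_DIR=/home/user/Megatron-LM    # hypothetical checkout path
export DOCKER_MOUNT_DIR=/datasets/gpt2             # hypothetical data/tokenizer mount
MEGATRON_DATA=/datasets/gpt2/my-gpt2_text_document
BPE_VOCAB_FILE=/datasets/gpt2/gpt2-vocab.json
BPE_MERGE_FILE=/datasets/gpt2/gpt2-merges.txt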
3rdparty/Megatron-LM/examples/sc21/README.md  (new file, mode 100644)

# Reproducing Figures in SC21 Paper

This directory contains some of the scripts that were used to produce the
results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
schedulers as well.

## Setup

All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
update the unspecified values (in angle brackets `<...>`) before launching any
scripts.

## Scripts

Below is a list of scripts that can be used to reproduce various figures in our
[paper](https://arxiv.org/pdf/2104.04473.pdf):

* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
  for GPT models ranging from 1 billion to 1 trillion parameters.
* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
  performance of pipeline parallelism.
* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
  the interleaved schedule on a 175B GPT model.
* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
  different degrees of pipeline and tensor model parallelism on a model with
  162.2 billion parameters.
* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
  different degrees of data and pipeline model parallelism on a model with
  5.9 billion parameters.
* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
  different degrees of data and tensor model parallelism on a model with
  5.9 billion parameters.
* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
  microbatch size.
* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
  activation recomputation.
* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
  the scatter-gather communication optimization.