GPT-2 based on Megatron-DeepSpeed

#!/bin/bash
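# Merge the tensor-parallel partitions of the BERT 345M checkpoint into a single checkpoint.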
TENSOR_MODEL_PARALLEL_SIZE=2
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m
WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
--model-type BERT \
--tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--load $CHECKPOINT_PATH
#!/bin/bash
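# Pretrain BERT 345M on a single GPU.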
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
python pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 2000000 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
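# Pretrain BERT 345M with data parallelism (8 GPUs per node by default).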
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
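# Pretrain BERT 345M with 2-way tensor and 2-way pipeline model parallelism.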
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_sentence
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 2 \
--global-batch-size 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--lr-decay-style linear \
--min-lr 1.0e-5 \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
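# Trains a 175B-parameter GPT-3-scale model (96 layers, hidden size 12288, 96 heads)
# with 8-way tensor and 16-way pipeline parallelism across 128 nodes.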
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
options=" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 16 \
--num-layers 96 \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--global-batch-size 1536 \
--rampup-batch-size 16 16 5859375 \
--train-samples 146484375 \
--lr-decay-samples 126953125 \
--lr-warmup-samples 183105 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 10 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file <PATH TO gpt-vocab.json> \
--merge-file <PATH TO gpt-merges.txt> \
--save-interval 1000 \
--save <PATH TO CHECKPOINTS DIRECTORY> \
--load <PATH TO CHECKPOINTS DIRECTORY> \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.006 \
--tensorboard-dir <TENSORBOARD DIRECTORY> \
--fp16 \
--checkpoint-activations "
run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
#! /bin/bash
# Runs the "345M" parameter model
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 8 \
--global-batch-size 64 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#! /bin/bash
# Runs the "345M" parameter model
RANK=0
WORLD_SIZE=1
# paths to multilingual preprocessed datasets
DATA_PATH_EN=<Specify path and file prefix>_text_document
DATA_PATH_AR=<Specify path and file prefix>_text_document
DATA_PATH_KR=<Specify path and file prefix>_text_document
DATA_PATH_JP=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
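# Weighted-split syntax: "NAME: WEIGHT START:END PATH, ..." where START:END selects a
# contiguous fraction of the dataset, e.g. 0:0.6 takes the first 60% of each corpus.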
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_PATH_EN 1 0:0.6 $DATA_PATH_AR 1 0:0.6 $DATA_PATH_KR 1 0:0.6 $DATA_PATH_JP" \
--valid-weighted-split-paths \
"VALID_EN: 1 0.6:0.8 $DATA_PATH_EN" \
"VALID_AR: 1 0.6:0.8 $DATA_PATH_AR" \
"VALID_JP: 1 0.6:0.8 $DATA_PATH_JP" \
"VALID_KR: 1 0.6:0.8 $DATA_PATH_KR" \
"VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_PATH_EN, 1 0.6:0.8 $DATA_PATH_AR, 1 0.6:0.8 $DATA_PATH_JP, 1 0.6:0.8 $DATA_PATH_KR" \
--test-weighted-split-paths \
"TEST_EN: 1 0.8:1 $DATA_PATH_EN" \
"TEST_AR: 1 0.8:1 $DATA_PATH_AR" \
"TEST_JP: 1 0.8:1 $DATA_PATH_JP" \
"TEST_KR: 1 0.8:1 $DATA_PATH_KR" \
"TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_PATH_EN, 1 0.8:1 $DATA_PATH_AR, 1 0.8:1 $DATA_PATH_JP, 1 0.8:1 $DATA_PATH_KR" \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
# Adapted to use deepspeed on a single node
#
# Multi-node will require either a `hostfile` or switching to `torch.distributed.launch`
# adjust to the number of GPUs to use
N_GPUS=1
CHECKPOINT_PATH=checkpoints/gpt2
VOCAB_FILE=data/gpt2-vocab.json
MERGE_FILE=data/gpt2-merges.txt
DATA_PATH=data/meg-gpt2_text_document
GPT_ARGS=" \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 4 \
--global-batch-size 8 \
--lr-decay-iters 320000 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--train-iters 5000 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--fp16 \
"
OUTPUT_ARGS=" \
--log-interval 10 \
--save-interval 500 \
--eval-interval 100 \
--eval-iters 10 \
--checkpoint-activations \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"
ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS"
LAUNCHER="deepspeed --num_gpus $N_GPUS"
CMD="$LAUNCHER pretrain_gpt.py $ALL_ARGS"
echo $CMD
$CMD
#! /bin/bash
# Runs the "345M" parameter model
RANK=0
WORLD_SIZE=1
DATA_PATH=GPT2/c4_en_partial_gpt2_text_document
CHECKPOINT_PATH=GPT2
deepspeed --num_gpus 1 pretrain_gpt.py \
--num-layers 2 \
--hidden-size 128 \
--num-attention-heads 4 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 256 \
--max-position-embeddings 256 \
--train-iters 10000 \
--lr-decay-iters 5000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path t5-small \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16 \
--tensorboard-dir GPT2
# --vocab-file GPT2/gpt2-vocab.json \
# --merge-file GPT2/gpt2-merges.txt \
#! /bin/bash
# Runs the "217M" parameter biencoder model for ICT retriever
RANK=0
WORLD_SIZE=1
PRETRAINED_BERT_PATH=<Specify path of pretrained BERT model>
TEXT_DATA_PATH=<Specify path and file prefix of the text data>
TITLE_DATA_PATH=<Specify path and file prefix of the titles>
CHECKPOINT_PATH=<Specify path>
python pretrain_ict.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--micro-batch-size 32 \
--seq-length 256 \
--max-position-embeddings 512 \
--train-iters 100000 \
--vocab-file bert-vocab.txt \
--tokenizer-type BertWordPieceLowerCase \
--DDP-impl torch \
--bert-load ${PRETRAINED_BERT_PATH} \
--log-interval 100 \
--eval-interval 1000 \
--eval-iters 10 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--retriever-score-scaling \
--load $CHECKPOINT_PATH \
--save $CHECKPOINT_PATH \
--data-path ${TEXT_DATA_PATH} \
--titles-data-path ${TITLE_DATA_PATH} \
--lr 0.0001 \
--lr-decay-style linear \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction 0.01 \
--save-interval 4000 \
--exit-interval 8000 \
--query-in-block-prob 0.1 \
--fp16
#!/bin/bash
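# Pretrain a T5-base-sized model (12 layers, hidden size 768) on a single GPU.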
RANK=0
WORLD_SIZE=1
DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
python pretrain_t5.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
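# Pretrain a T5-base-sized model with data parallelism (8 GPUs per node by default).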
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>
VOCAB_FILE=<Specify path to vocab.txt>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_t5.py \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
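# Pretrain a T5-base-sized model with 2-way tensor model parallelism.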
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=<Specify path and file prefix>
CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_t5.py \
--tensor-model-parallel-size 2 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--kv-channels 64 \
--ffn-hidden-size 3072 \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file t5-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
#!/bin/bash
# Run the lm-eval harness on a Megatron-DeepSpeed checkpoint (see the doc below for the full workflow).
CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/
PP_SIZE=1
TP_SIZE=1
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
export HF_DATASETS_OFFLINE=1
#dummy arguments to make megatron happy.
MEGATRON_REQUIRED_ARGS="\
--num-layers -1\
--hidden-size -1\
--num-attention-heads -1\
--seq-length -1 \
--max-position-embeddings -1
"
CMD="./tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH\
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE\
--vocab-file $VOCAB_FILE\
--merge-file $MERGE_FILE\
--micro-batch-size 64\
--adaptive_seq_len\
--eval_fp32\
--task_list hellaswag,mrpc,piqa\
$MEGATRON_REQUIRED_ARGS\
"
N_GPUS=1
LAUNCHER="deepspeed --num_gpus $N_GPUS"
$LAUNCHER $CMD
# How to run lm-eval on a Megatron-DeepSpeed checkpoint using the original setup
This particular setup uses the normal DeepSpeed checkpoint and requires no conversion to Megatron-LM.
This doc assumes usage on JZ, so there are some peculiar requirements in places. Ignore these if you're not running this on JZ.
## Prerequisites
1. Install software
On a login console with external network access, get the lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7`, which is needed to download some tasks.
```
start-prod
pip install best-download==0.0.7
pip install git+https://github.com/EleutherAI/lm-evaluation-harness
```
2. Pre-download needed datasets
Create some symlinks to work around lm-harness's issues with the relative position of data:
```
mkdir data
ln -s `pwd`/data tasks/eval_harness/data
```
Also make sure `data` is not on one of the limited partitions like WORKSF.
Then download the datasets for the tasks:
```
python ./tasks/eval_harness/download.py --task_list \
arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc
```
and make sure that `export HF_DATASETS_OFFLINE=1` is set in the script.
If there are things like custom tokenizers, pre-download those too, e.g.:
```
python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('bigscience/oscar_13_languages_alpha_weight')"
```
and make sure that `export TRANSFORMERS_OFFLINE=1` is in the script.
You know there is a custom tokenizer if the training script had something like:
```
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path bigscience/oscar_13_languages_alpha_weight \
```
3. Prepare the slurm script
Prepare the run script, replacing `variant` with a unique identifier for the current eval (e.g. `tr9c-1B3-swiglu`) so that multiple evals can run in parallel without all logging into the same `results.json` file.
```
cp examples/run_evalharness_deepspeed.slurm run_evalharness-variant.slurm
```
Now edit `run_evalharness-variant.slurm`.
Note that the eval code knows to pull the original training args from the checkpoint, so we don't need to pass any of those; we only need to set up the evaluation args.
1. Edit:
```
PP_SIZE=1
TP_SIZE=1
```
to match the eval topology. If the model fits on one GPU, there is nothing to change.
The eval script will automatically reshape the model if it was trained with a different topology.
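For example, to shard the eval over two GPUs with tensor parallelism you would set:
```
PP_SIZE=1
TP_SIZE=2
```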
2. Adjust the following to fit the chosen GPU. As of the last check, the settings for a 1.3B model are one of:
```
EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model
EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
```
If you get OOM, lower it further.
3. If you're not using the DeepSpeed path, disable it by removing:
```
--deepspeed \
--deepspeed_config ds_config.json \
```
If you didn't disable it and the program crashes on checkpoint loading, unable to find some key, disable DeepSpeed as explained above.
4. Additional flags
- To reduce the number of iterations for the stderr estimation, use e.g. `--bootstrap_iters 2`. This saves 1-2 minutes per dataset.
- To print intermediate results when running multiple tasks, use `--intermed_results`.
- To reduce the pipeline bubble when setting PP, use the flag `--micro_bs_multiplier`. Reducing `--micro-batch-size` may be needed when increasing the multiplier.
- Running the 176B model with PP=8, `--micro_bs_multiplier 8` and `--micro-batch-size 4` produced the fastest results for PiQA on 1 node (2min18s); these flags are combined in the sketch below.
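For instance, the additions to the eval command for the PP=8 run above might look like this (a sketch; adjust to your topology and memory):
```
--micro-batch-size 4 \
--micro_bs_multiplier 8 \
--bootstrap_iters 2 \
--intermed_results \
```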
## Eval
Currently a 1.3B model takes 2-3 hours on a 32GB GPU and 6-7 hours on a 16GB GPU, so a 20h slurm job should be enough.
When ready, launch:
```
sbatch ./run_evalharness-variant.slurm
```
To monitor progress:
```
tail -f $VARIANT-eval-harness.log
```
where the variant is what you set `$VARIANT` to in the slurm script.
The template is set up for a 16GB GPU since they are easier to come by. If you change to 32GB, adjust:
```
#SBATCH --constraint=v100-32g
...
EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
```
Note that the original ETA at the start of the run can be 10x longer than the actual run time. For example, it may suggest 18 hours but complete in 2 hours.
## Short eval
If you just want to quickly test that everything can run to the end, edit `tasks/eval_harness/evaluate.py`, e.g. to run only 10 batches:
```
- results = evaluator.evaluate(adaptor, task_dict, False, 0, None)
+ results = evaluator.evaluate(adaptor, task_dict, False, 0, 10)
```
(XXX: could be a cmd line option so that code won't need to be modified)
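A minimal sketch of what that option could look like, assuming a hypothetical `--eval_limit` flag added wherever `evaluate.py` registers its extra arguments (the names here are illustrative, not part of the repo):
```
+ parser.add_argument('--eval_limit', type=int, default=None,
+                     help='hypothetical: limit batches per task for smoke tests')
  ...
- results = evaluator.evaluate(adaptor, task_dict, False, 0, None)
+ results = evaluator.evaluate(adaptor, task_dict, False, 0, args.eval_limit)
```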
## Import into spreadsheet
https://docs.google.com/spreadsheets/d/1CI8Q9RCblLRzUOPJ6ViqBmo284-8ojluQ-CmaEuhuv0/edit?usp=sharing
Note that the spreadsheet format is quite different, so use this script:
```
./tasks/eval_harness/report-to-csv.py results.json
```
to reformat the JSON results into CSV while changing its shape to match the spreadsheet format.
Since some records might be missing or extraneous, here is the best way to do it:
1. Copy the data from the first 2 columns to some place under the main spreadsheet.
2. Put the pointer in the 3rd column, next to where the first 2 columns were copied.
3. Import `results.csv` using File -> Import -> File, with Import location: "Replace data at selected cell".
4. Now it should be easy to align the new records with the old ones: delete irrelevant records and use Insert -> Cells where data is missing until the first 2 columns match.
5. Now create 2 columns in the main table on top; it should then be safe to copy-and-paste the 2-column data range, without the task/metrics columns, into the newly created space.
#!/bin/bash
#SBATCH --job-name=eval-harness-deepspeed
#SBATCH --constraint=v100-16g
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=40 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:1 # number of gpus
#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@gpu
set -x -e
source $six_ALL_CCFRWORK/start-prod
echo "START TIME: $(date)"
# a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same "results.json" file.
VARIANT="tr9c-1B3-swiglu"
CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023
MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed
# you want these 2 on JZ, and pre-download/cache any datasets/tokenizers/models
# but comment these out if you're running on a node with Internet access
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
cd $MEGATRON_DEEPSPEED_REPO
# eval topology
PP_SIZE=1
TP_SIZE=1
VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json
MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt
SEQ_LEN=2048
# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS
# make as big as it can fit into gpu w/o OOM, but not too close to 100%
EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model
#EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
#dummy arguments to make megatron happy.
MEGATRON_REQUIRED_ARGS=" \
--num-layers -1 \
--hidden-size -1 \
--num-attention-heads -1 \
--seq-length -1 \
--max-position-embeddings -1
"
ZERO_STAGE=0
config_json="./ds_config.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 1,
"zero_optimization": { "stage": $ZERO_STAGE },
"fp16": { "enabled": true },
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
CMD="./tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH \
--results_path $VARIANT-results.json \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--micro-batch-size $EVAL_MICRO_BATCH_SIZE \
--no-load-optim \
--no-load-rng \
--inference \
--deepspeed \
--deepspeed_config ds_config.json \
--seq-length $SEQ_LEN \
--adaptive_seq_len \
--eval_fp32 \
--task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \
$MEGATRON_REQUIRED_ARGS \
"
N_GPUS=1
LAUNCHER="deepspeed --num_gpus $N_GPUS"
echo $LAUNCHER $CMD
export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO
$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log
#!/bin/bash
#SBATCH --job-name=run_evalharness-tr11-176b-ml
#SBATCH --partition=gpu_p5
#SBATCH --constraint=a100
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=64 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --gres=gpu:8 # number of gpus
#SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out # output file name
#SBATCH --account=six@a100
set -x -e
source $six_ALL_CCFRWORK/start-py38-pt111
echo "START TIME: $(date)"
# a unique identifier for the current eval, ideally corresponding to the model name
VARIANT="tr11-176b-ml"
CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step50000
MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models
export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets
export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules
export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics
cd $MEGATRON_DEEPSPEED_REPO
TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles
PP_SIZE=8
TP_SIZE=1
SEQ_LEN=2048
# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS
# make as big as it can fit into gpu w/o OOM, but not too close to 100%
EVAL_MICRO_BATCH_SIZE=1
#dummy arguments to make megatron happy.
MEGATRON_REQUIRED_ARGS=" \
--num-layers -1 \
--hidden-size -1 \
--num-attention-heads -1 \
--seq-length -1 \
--max-position-embeddings -1 \
"
ZERO_STAGE=0
config_json="./ds_config.json"
# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 1,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"bf16": {
"enabled": true
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
CMD="./tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH \
--results_path $VARIANT-results.json \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \
--micro-batch-size $EVAL_MICRO_BATCH_SIZE \
--no-load-optim \
--no-load-rng \
--bf16 \
--inference \
--seq-length $SEQ_LEN \
--task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \
--deepspeed \
--deepspeed_config ds_config.json \
--bootstrap_iters 2 \
--intermed_results \
--adaptive_seq_len \
--micro_bs_multiplier 4 \
$MEGATRON_REQUIRED_ARGS \
"
GPUS_PER_NODE=8
NNODES=$SLURM_NNODES
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CUDA_LAUNCH_BLOCKING=1
echo $LAUNCHER $CMD
export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO
$LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log
"""Multitask Finetuning T0"""
import torch
from megatron import get_args, get_tokenizer, print_rank_0, mpu
from megatron.data.decoder_packed_mtf_dataset import build_train_valid_test_datasets, build_dataset_group
from megatron.enums import PositionEmbeddingType, AttnMaskType
from megatron.model import GPTModelPipe
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask
import deepspeed
from deepspeed.runtime.utils import see_memory_usage
try:
from torch.distributed.elastic.multiprocessing.errors import record
except ImportError:
# noop
def record(fn):
return fn
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0("building GPT model ...")
see_memory_usage(f"Before Building Model", force=True)
args = get_args()
with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
remote_device=None if args.remote_device == "none" else args.remote_device,
config_dict_or_path=args.deepspeed_config,
enabled=args.zero_stage == 3,
mpu=mpu):
if args.deepspeed:
model = GPTModelPipe(
num_tokentypes=0,
parallel_output=True,
attn_mask_type=AttnMaskType.custom
)
# This is a hack to give us a reference to get_batch_pipe from within training.py
# We need to call model.set_batch_fn after deepspeed.initialize
model._megatron_batch_fn = get_batch_pipe
else:
raise NotImplementedError("DeepSpeed is required for T0")
see_memory_usage(f"After Building Model", force=True)
return model
def get_batch_pipe(data):
"""
Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator` & in packed fashion
data:
decoder_tokens = [[6, 7, 8, 3, 4, 5, 0]]
decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]
"""
args = get_args()
tokenizer = get_tokenizer()
# Broadcast data.
data_b = mpu.broadcast_data(["decoder_token_ids", "decoder_segment_ids"], data, torch.int64)
data_c = mpu.broadcast_data(["decoder_is_inputs"], data, torch.bool)
# Unpack.
tokens_ = data_b["decoder_token_ids"].long()
labels = tokens_[:, 1:].contiguous()
tokens = tokens_[:, :-1].contiguous()
segment_ids = data_b["decoder_segment_ids"].long()[:, :-1]
decoder_is_inputs = data_c["decoder_is_inputs"][:, :-1]
# Get the masks and position ids.
causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss,
prefix_indices=None,
loss_on_targets_only=False # This is done below
)
# Only compute loss over causal target tokens, i.e. ignore input_tokens & padding
loss_on_targets_only = ~data_c["decoder_is_inputs"][:, 1:]
loss_on_non_pad_only = (tokens != tokenizer.pad)
loss_mask *= loss_on_targets_only * loss_on_non_pad_only
attention_mask = get_packed_attention_mask(
# Run non-causal decoder
is_causal=False,
causal_mask=~(causal_mask.bool()),
decoder_is_inputs=decoder_is_inputs.bool(),
segment_ids=segment_ids.long(),
)
if args.position_embedding_type not in [PositionEmbeddingType.alibi, PositionEmbeddingType.rotary]:
raise NotImplementedError("absolute positional embeddings require us to reset position_ids accordingly.")
return (tokens, position_ids, attention_mask), (labels, loss_mask)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()
train_ds, valid_ds, test_ds = None, None, None
tokenizer = get_tokenizer()
print_rank_0("> building train, validation, and test datasets for T0 ...")
# Option 1 of data loading using --data-path
if args.data_path:
# TODO: Not yet compatible with dataset weights (Will break at prefixes, weights = analyze_data_prefix(args.data_path))
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
seq_length=args.seq_length + 1,
pad_token=tokenizer.pad,
eos_token=tokenizer.eos,
train_valid_test_num_samples=train_val_test_num_samples,
seed=args.seed,
skip_warmup=(not args.mmap_warmup)
)
# Option 2 of data loading using --(train|valid|test)-weighted-split-paths
elif args.train_weighted_split_paths:
assigned_train_valid_test = []
if args.train_weighted_split_paths is not None:
train_ds = []
assigned_train_valid_test.append("train")
if args.valid_weighted_split_paths is not None:
valid_ds = []
assigned_train_valid_test.append("valid")
if args.test_weighted_split_paths is not None:
test_ds = []
assigned_train_valid_test.append("test")
for s in assigned_train_valid_test:
data_groups = zip(eval(f"args.{s}_weighted_split_paths"),
eval(f"args.{s}_weighted_split_weights"),
eval(f"args.{s}_weighted_split_splits"),
eval(f"args.{s}_weighted_split_names"))
for paths, weights, splits, name in data_groups:
d = build_dataset_group(
dataset_group_name=name,
paths=paths,
weights=weights,
splits=splits,
data_impl=args.data_impl,
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length + 1,
pad_token=tokenizer.pad,
eos_token=tokenizer.eos,
seed=args.seed,
skip_warmup=(not args.mmap_warmup),
train_valid_test=s
)
eval(f"{s}_ds").append(d)
else:
raise NotImplementedError("No dataloading argument passed")
print_rank_0("> finished creating T0 datasets ...")
return train_ds, valid_ds, test_ds
@record
def main():
pretrain(
train_valid_test_datasets_provider,
model_provider,
forward_step_func=None,
args_defaults={}
)
if __name__ == "__main__":
main()