2. Install system-wide `cuda` if you don't have it already ([NVIDIA instructions](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)). Ideally use [the premade packages for your distro](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#package-manager-installation).
Use the same major version as PyTorch's CUDA build. To check both versions:
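Something like the following works (assuming the system-wide `nvcc` is on your `PATH`; otherwise call it via its full path, e.g. `/usr/local/cuda/bin/nvcc`):

```bash
# CUDA version the PyTorch binary was built with
python -c "import torch; print(torch.version.cuda)"
# system-wide CUDA version
nvcc --version
```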
If the PyTorch and system-wide CUDA minor versions mismatch, it's not a problem; you just need to work around `apex`'s build-time version check by applying this patch first and then building it.
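The patch itself is not reproduced here; the sketch below only illustrates the idea. The exact file and line depend on the apex revision you check out (recent revisions do the check in `check_cuda_torch_binary_vs_bare_metal()` in `setup.py`), and the build flags are the ones from apex's README at the time, so double-check them against the current README:

```bash
git clone https://github.com/NVIDIA/apex
cd apex
# edit setup.py: in check_cuda_torch_binary_vs_bare_metal(), keep the
# major-version comparison but drop (or downgrade to a warning) the minor-version one
pip install -v --disable-pip-version-check --no-cache-dir \
    --global-option="--cpp_ext" --global-option="--cuda_ext" ./
```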
You can, of course, run this as a slurm script; here is [a full slurm script example](https://github.com/bigscience-workshop/bigscience/blob/d57b76bb592832bb4d2054cd5cbf132796be2d83/train/tr11-176B-ml/setup-test-n2.slurm), which includes tweaks to get `MASTER_ADDR` and a few other bits right under the SLURM environment on JeanZay. These tweaks may or may not be needed if you run it elsewhere.
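If you adapt it to another cluster, the usual pattern for deriving `MASTER_ADDR` under SLURM looks roughly like this (a generic sketch, not the JeanZay-specific logic; the port is just an arbitrary free one):

```bash
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
export MASTER_ADDR MASTER_PORT
```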
Remember to wipe out `$CHECKPOINT_PATH` if you change the model shape and a checkpoint with the old shapes has already been saved there.
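For example (assuming `$CHECKPOINT_PATH` is the same variable your launch script points at):

```bash
rm -rf "$CHECKPOINT_PATH"
```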
                       help='Should the sequence length be adapted to the batch during evaluation, if in fp16 the results will be slightly different due to numerical errors but greatly speed up evaluation.')
    group.add_argument('--eval_fp32', default=False, action='store_true', help='Should the evaluation run in fp32')
    group.add_argument('--intermed_results', default=False, action='store_true', help='Whether to print & write intermediate results for each task')
    group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation')
    group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel')
    return parser
from megatron.global_vars import _parse_args

def main():
    # parse the megatron args, but wait with initializing megatron;
    # avoid printing the arguments, since they will later be overridden.
    args = _parse_args(tasks_args)
    load_path = args.load
    model = load_ds_checkpoint_and_setup_megatron(args)

    args = get_args()
    if args.deepspeed and args.adaptive_seq_len:
        # adaptive_seq_len hack #1:
        # CL (curriculum learning) automatically enables reset_activation_shape(), which allows us to change input shapes,
        # and it also reshapes the attention scores in attention_mask_func