return"It's a wonderful world. I'm just walking on air. Talk of heaven on earth. I've got more than my share. Haven't got a care. Happy all day through. It's a wonderful world. Loving wonderful you!"
Note: the tiny vocab was set to 5000 items after experimenting with the resulting index file sizes. Using a tiny vocab of 500 (with correspondingly adjusted merge entries) generated very large index files, so it actually ends up costing more in final file size. A vocab of 5000 generated index files almost identical in size to those produced with the original 50k vocab.
# Make tiny pre-processed index
This index is to be used in test training.
```
./openwebtext-to-jsonl.py
```
generates:
```
openwebtext-10000.jsonl
```
We don't want to store the jsonl file in the repo, to keep the repo size small, so it's a temporary file.
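The jsonl file is then fed to Megatron's `preprocess_data.py` to build the tiny pre-processed index. The following is only a sketch: the flag names follow the upstream Megatron-LM README and may differ in this fork, and `gpt2-tiny-vocab.json`, `gpt2-tiny-merges.txt`, and the output prefix are hypothetical names for the tiny vocab and merges files described above.
```
# Hypothetical invocation: tiny vocab/merges file names and output prefix are placeholders.
python tools/preprocess_data.py \
    --input openwebtext-10000.jsonl \
    --output-prefix openwebtext-10k \
    --vocab gpt2-tiny-vocab.json \
    --merge-file gpt2-tiny-merges.txt \
    --dataset-impl mmap \
    --tokenizer-type GPT2BPETokenizer \
    --append-eod \
    --workers 4
```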
- [sample_idxs_to_text.py](./sample_idxs_to_text.py) - want to see which text was fed at specific iterations, for example to understand why the training went astray? Then use this script. The preamble of the script contains the documentation and usage examples.
## A few notes on how we created the datasets:
### Creating the JSON Lines text file
First you need to create a jsonl file containing your dataset. For this, we exported from the HF `datasets` format. For example, for C4:
```
from datasets import load_dataset
c4 = load_dataset("c4", "en")
c4["train"].to_json("c4_en_train.jsonl")
c4["validation"].to_json("c4_en_valid.jsonl")
```
This creates quite a large file compared to the size of the HF dataset on disk (810GB vs 305GB for C4, for example).
### Megatron pre-processing
Then you need to pass that text file to the `preprocess_data.py` script for tokenization and memory-mapping. This creates two files: one to store the token indices and one to store the document starts and ends. The result will be slightly bigger than the text dataset (360GB vs 305GB for C4, for example). You can choose one of the default Megatron tokenizers (but then you have to pass merges and vocab files) or one from HF tokenizers. For example, in our GPT-like models, we reused a T5 sentencepiece-bpe tokenizer.
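A sketch of such an invocation (the `PretrainedFromHF` tokenizer type and the `--tokenizer-name-or-path` flag are assumed to be available in this fork's `preprocess_data.py`, and `t5-small` stands in for whichever T5 tokenizer was actually used):
```
python tools/preprocess_data.py \
    --input c4_en_train.jsonl \
    --output-prefix c4_en_train \
    --dataset-impl mmap \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path t5-small \
    --append-eod \
    --workers 30
```
The `--output-prefix` determines the names of the resulting `.bin`/`.idx` pair described above.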
Do note that adding too many workers can be counterproductive for very large datasets: as the bottleneck becomes disk writing, the intermediate per-process results pool up and can flood the RAM. In our experiments on GCP machines, running with 60 workers on C4 inevitably led the program to fail.
This folder is a collection of scripts for converting checkpoints of one training framework (e.g., DeepSpeed) into those of a different framework (e.g., Megatron-LM, HF Transformers).
The folder also contains scripts for inspecting checkpoint files and folders, which can be useful when developing checkpoint conversion logic. At the time of creation, this folder contains scripts to convert DeepSpeed checkpoints to Megatron-LM and HF Transformers checkpoints (the conversions that motivated this effort as part of the BigScience project).
Here are the checkpoint conversions provided by the available scripts:
1. [Megatron-DeepSpeed to Megatron-LM](#Megatron-DeepSpeed-to-Megatron)
1. [Megatron-DeepSpeed to HF Transformers](#Megatron-DeepSpeed-to-HF-Transformers)
## Megatron-DeepSpeed to Megatron
The (current implementation of the) converter extracts args and model parameters from a DeepSpeed checkpoint (i.e., it excludes other training state such as the optimizer and scheduler) and converts them into a Megatron-LM checkpoint that similarly contains only model parameters. The converter also makes a best-effort attempt to reshape the tensor-parallelism and pipeline-parallelism degrees of the checkpoint. The resulting Megatron-LM checkpoint can be loaded into the Megatron-LM framework for finetuning or inference. Tensor parallelism (TP) and pipeline parallelism (PP) are supported in the sense that the generated Megatron-LM checkpoint (folders and files) will have the same TP and PP as the training that created the input DeepSpeed checkpoint. The entry point of the converter is `deepspeed_to_megatron.py`; run it with `-h` for the full usage.
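As an illustration, a hypothetical invocation (the `--input_folder`/`--output_folder` flag names and the paths are assumptions; `--target_tp`/`--target_pp` are the reshaping flags also mentioned in the next section):
```
# Paths and folder flags are placeholders; check `-h` for the actual options.
python deepspeed_to_megatron.py \
    --input_folder /path/to/deepspeed/checkpoint/global_step1000 \
    --output_folder /path/to/megatron/checkpoint \
    --target_tp 1 \
    --target_pp 1
```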
## Megatron-DeepSpeed to HF Transformers
Since `transformers` currently only works with PP=1/TP=1, we use the defaults `--target_tp 1 --target_pp 1`.
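A sketch of the conversion call (the script name `deepspeed_to_transformers.py` and its flags are assumptions mirroring the Megatron converter above; the paths are placeholders):
```
# Hypothetical invocation for the DeepSpeed -> HF Transformers conversion.
python deepspeed_to_transformers.py \
    --input_folder /path/to/Megatron-DeepSpeed/checkpoint/global_step1000 \
    --output_folder /path/to/transformers/checkpoint
```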
The script taps into `transformers` and, as of this writing, requires `transformers@master` (or `transformers==4.11` once that version is released).
Note that you may run into problems with `megatron.enums` not being defined, since `Megatron-Deepspeed` in the `bigscience-workshop` tree has diverged from the `microsoft` tree. In such cases you can fix this on the fly by ensuring that the former appears first in the `sys.path`. For example:
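A minimal sketch, assuming the `bigscience-workshop` checkout lives at the placeholder path below:
```
# Prepend the bigscience-workshop Megatron-DeepSpeed checkout so that
# `megatron.enums` resolves from it first (the path is a placeholder).
export PYTHONPATH=/path/to/bigscience-workshop/Megatron-DeepSpeed:$PYTHONPATH
```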