Commit bf95e032 authored by hepj987

dtk23.04 initialization

{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 1
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"zero_allow_untested_optimizer": true,
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 0
},
"bf16": {
"enabled": true
},
"zero_allow_untested_optimizer": true,
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 1
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"curriculum_learning": {
"enabled": true,
"curriculum_type": "seqlen",
"min_difficulty": 8,
"max_difficulty": 128,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": 30,
"difficulty_step": 4
}
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
{
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 16,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"zero_allow_untested_optimizer": false,
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
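# The JSON blocks above are DeepSpeed config files used by the tests below (fp16 + ZeRO-1,
# bf16 + ZeRO-0, fp16 + curriculum learning, and a plain fp16 config). As a minimal,
# purely illustrative sketch (the file name "ds_config.json" and the helper below are
# assumptions, not part of this commit), a config like these can be sanity-checked before
# being passed to the training scripts via --deepspeed_config:
import json

def check_ds_config(path="ds_config.json"):
    """Load a DeepSpeed JSON config and verify the basic batch-size invariant."""
    with open(path) as f:
        cfg = json.load(f)
    # In DeepSpeed, train_batch_size must equal
    # train_micro_batch_size_per_gpu * gradient_accumulation_steps * data_parallel_size,
    # so at minimum it has to be a multiple of the per-GPU micro batch size.
    assert cfg["train_batch_size"] % cfg["train_micro_batch_size_per_gpu"] == 0
    return cfg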
import random
import unittest
import torch
from torch.nn import functional as F
from megatron.model.glu_activations import GLU_ACTIVATIONS, geglu, liglu, reglu, swiglu
from megatron.testing_utils import set_seed, torch_assert_equal
class TestActivations(unittest.TestCase):
def setUp(self):
"""setup an input of reasonable size"""
set_seed()
self.batch_size = random.randint(2, 64)
self.seq_len = random.randint(256, 1025)
self.num_channels = random.randint(1, 384) * 2
self.x = torch.randn(self.batch_size, self.seq_len, self.num_channels)
self.x1, self.x2 = self.x.chunk(2, dim=-1)
# glu should halve the last dimension
self.output_shape = [self.batch_size, self.seq_len, self.num_channels // 2]
def test_shapes(self):
for activation_fn in GLU_ACTIVATIONS.values():
output = activation_fn(self.x)
self.assertEqual(list(output.shape), self.output_shape)
def test_liglu(self):
expected = self.x1 * self.x2
torch_assert_equal(liglu(self.x), expected)
def test_geglu(self):
expected = self.x1 * F.gelu(self.x2)
torch_assert_equal(geglu(self.x), expected)
def test_reglu(self):
expected = self.x1 * F.relu(self.x2)
torch_assert_equal(reglu(self.x), expected)
def test_swiglu(self):
expected = self.x1 * F.silu(self.x2)
torch_assert_equal(swiglu(self.x), expected)
# from megatron.testing_utils import require_torch_bf16
# @require_torch_bf16
# def test_bf16_jit(self):
# x_bf16 = self.x.to(torch.bfloat16)
# for activation_fn in GLU_ACTIVATIONS.values():
# output = activation_fn(x_bf16)
# self.assertEqual(list(output.shape), self.output_shape)
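# For reference, the behaviour asserted in the tests above can be summarised by a small
# standalone sketch (an illustration of the GLU family, not the megatron implementation
# itself): a GLU-style activation splits the last dimension into two halves and uses one
# half to gate the other, which is why the output has half the input channels.
import torch
from torch.nn import functional as F

def reference_glu(x: torch.Tensor, act=F.gelu) -> torch.Tensor:
    """Split the last dim in two and gate: act=F.gelu ~ geglu, F.silu ~ swiglu, F.relu ~ reglu,
    and the identity function ~ liglu."""
    x1, x2 = x.chunk(2, dim=-1)
    return x1 * act(x2)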
def test_import():
import megatron
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import pytest
from pathlib import Path
from parameterized import parameterized
from megatron.testing_utils import (
CaptureStdout,
TestCasePlus,
execute_subprocess_async,
get_gpu_count,
require_deepspeed,
require_torch_gpu,
require_torch_multi_gpu,
set_seed
)
set_seed(42)
def parameterized_custom_name_func(func, param_num, param):
# customize the test name generator function as we want both params to appear in the sub-test
# name, as by default it shows only the first param
param_based_name = parameterized.to_safe_name("_to_".join(str(x) for x in param.args))
return f"{func.__name__}_{param_based_name}"
params = [
# each entry is [source, target] topology, encoded as TP_PP_DP
["1_1_1", "1_1_1"],
["2_1_1", "1_1_1"],
["1_2_1", "1_1_1"],
["1_1_2", "1_1_1"],
["2_1_1", "2_1_1"],
["1_1_1", "2_1_1"],
["1_1_1", "1_2_1"],
["1_1_1", "1_1_2"],
["1_1_2", "1_1_2"],
["1_1_2", "2_1_1"],
["1_1_2", "1_2_1"],
["1_2_1", "1_2_1"],
["1_2_1", "2_1_1"],
["1_2_1", "1_1_2"],
["2_1_1", "2_1_1"],
["2_1_1", "1_2_1"],
["2_1_1", "1_1_2"],
["2_2_2", "1_1_1"],
["2_2_2", "2_2_2"],
["1_1_1", "2_2_2"],
["1_1_8", "2_2_2"],
]
def get_launcher(num_gpus):
# 1. explicitly set --num_nodes=1 just in case these tests end up running on a multi-node setup
# - it won't be able to handle that
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
@require_deepspeed
@require_torch_gpu
class MegDSTestCheckpoints(TestCasePlus):
""" """
def setUp(self):
super().setUp()
# at times megatron fails to build kernels and doesn't remove the lock file, which makes
# subsequent runs hang - so make sure there is no lock when starting the testing
meg_lock_file_path = self.repo_root_dir_str + "/megatron/fused_kernels/build/lock"
if os.path.exists(meg_lock_file_path):
os.unlink(meg_lock_file_path)
def get_config(self, output_dir, tp_size, pp_size, dp_size):
data_dir = f"{self.data_dir}/gpt2"
num_gpus = pp_size * tp_size * dp_size
print(f"Using {num_gpus} GPUs")
n_samples = 300 # about 56 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
seq_len = 128
# XXX: for now while testing shapes make it really short and fast
exit_interval = 1
seq_len = 8
# common/shared configs
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json
--zero-stage 0
--deepspeed-activation-checkpointing
""".split()
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--log-interval 1
--save-interval 1
--eval-interval 10
--eval-iters 1
--checkpoint-activations
--partition-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--num-layers 2
--hidden-size 8
--num-attention-heads 2
--seq-length {seq_len}
--max-position-embeddings 8
--micro-batch-size 1
--global-batch-size 16
--train-samples {n_samples}
--embed-layernorm
--position-embedding-type alibi
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--lr-decay-samples 6
--clip-grad 1.0
--weight-decay 1e-1
--bf16
--log-level debug
--log-level-replica info
""".split()
# XXX: fails to handle:
#--embed-layernorm
#
# stderr: RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding:
# stderr: size mismatch for norm.weight: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
# stderr: size mismatch for norm.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]).
return args, ds_args, num_gpus
def train_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1):
src_dir = self.src_dir
script = [f"{src_dir}/pretrain_gpt.py"]
args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size)
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
#print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
def convert_checkpoint_to_universal(self, output_dir, step):
cmd = f"""
python tools/convert_checkpoint/ds_to_universal.py
--input_folder {output_dir}/checkpoints/global_step{step}
--output_folder {output_dir}/checkpoints/global_step{step}_universal
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
self.assertIn("Convert DeepSpeed Checkpoint to Universal Checkpoint", cs.out)
def resume_from_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1):
src_dir = self.src_dir
script = [f"{src_dir}/pretrain_gpt.py"]
args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size)
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
def resume_from_universal_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1):
src_dir = self.src_dir
script = [f"{src_dir}/pretrain_gpt.py"]
args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size)
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args + ["--universal-checkpoint"]
# keep for quick debug
#print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
@require_torch_multi_gpu
@parameterized.expand(params, name_func=parameterized_custom_name_func)
def test_checkpoint_reshaping_main(self, src, tgt):
# this test needs at least 2 gpus - if there are more gpus it will do more extensive testing
tp_size_src, pp_size_src, dp_size_src = list(map(int, src.split('_')))
tp_size_tgt, pp_size_tgt, dp_size_tgt = list(map(int, tgt.split('_')))
n_gpus = get_gpu_count()
n_gpus_src = tp_size_src * pp_size_src * dp_size_src
n_gpus_tgt = tp_size_tgt * pp_size_tgt * dp_size_tgt
if n_gpus_src > n_gpus:
pytest.skip(f"the test requires {n_gpus_src} gpus for source topology but have only {n_gpus}")
if n_gpus_tgt > n_gpus:
pytest.skip(f"the test requires {n_gpus_tgt} gpus for target topology but have only {n_gpus}")
output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False)
# 1. train with initial topology defined in the first arg of params
self.train_checkpoint(output_dir, tp_size=tp_size_src , pp_size=pp_size_src , dp_size=dp_size_src )
# 2. convert the checkpoint to a universal checkpoint
self.convert_checkpoint_to_universal(output_dir=output_dir, step=1)
# 3. check we can resume training from a reshaped checkpoint to the target topology - the last arg of params
self.resume_from_universal_checkpoint(output_dir, tp_size=tp_size_tgt, pp_size=pp_size_tgt, dp_size=dp_size_tgt)
@require_torch_multi_gpu
def test_checkpoint_reshaping_empty_dir(self):
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
with self.assertRaises(RuntimeError) as context:
self.convert_checkpoint_to_universal(output_dir=output_dir, step=1)
import itertools
import os
import shutil
from typing import Set
from unittest.mock import patch
import deepspeed
import torch
import finetune_t0_non_causal_decoder
from megatron import global_vars, get_tokenizer, initialize_megatron, get_args
from megatron.data import mlm_dataset, mtf_dataset, decoder_packed_mtf_dataset
from megatron.data.data_samplers import build_pretraining_data_loader
from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context, torch_assert_equal
def get_default_args():
"""return a dictionary with key as argument name and value as additional arguments"""
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "512",
"--max-position-embeddings": "512",
"--micro-batch-size": "4",
"--global-batch-size": "8",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "PretrainedFromHF",
"--tokenizer-name-or-path": "gpt2",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
# DATA_ARGS
}
def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]):
seq_length += 1
num_segments = torch.randint(1, 5, ())
segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.long)
is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool)
for batch_id in range(micro_batch_size):
# - `*2`: Hack to ensure two start_new_segments are separated by at least two tokens
# - `+1`: Hack to ensure start_new_segments never starts at 0
start_new_segments = torch.sort(torch.randperm((seq_length - 2) // 2, )[:num_segments]).values * 2 + 1
segment_ids[batch_id, start_new_segments] = 1
end_inputs = [
torch.randint(low=start_segment, high=end_segment, size=())
for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length])
]
for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]):
is_inputs[batch_id][start_segment: end_input + 1] = True
segment_ids = torch.cumsum(segment_ids, dim=-1) + 1
tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length), dtype=torch.long)
flatten_token_view = tokens.view(-1,)
for token_id in range(len(flatten_token_view)):
token = flatten_token_view[token_id]
# While the token is a special token, change it to another token
while token in special_tokens_ids:
flatten_token_view[token_id] = (token + 1) % vocab_size
token = flatten_token_view[token_id]
return {
"decoder_token_ids": tokens,
"decoder_segment_ids": segment_ids,
"decoder_is_inputs": is_inputs
}
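# A minimal usage sketch of the helper above (the concrete sizes and the special token id
# below are arbitrary assumptions for illustration). The returned tensors all have
# seq_length + 1 positions because the extra position is later split into inputs vs labels.
def _example_dummy_packed_batch():
    batch = get_dummy_mtf_decoder_packed_data(
        micro_batch_size=2, seq_length=16, vocab_size=128, special_tokens_ids={0}
    )
    assert batch["decoder_token_ids"].shape == (2, 17)      # torch.long token ids
    assert batch["decoder_segment_ids"].shape == (2, 17)    # torch.long, 1-based segment ids
    assert batch["decoder_is_inputs"].dtype == torch.bool   # True where tokens are inputs
    return batch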
class TestDataLoading(TestCasePlus):
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT="9994", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)
def copy_data_to_temp(self, root_dir, prefix):
"""copy data to temp, and return paths to temp version"""
src_path = os.path.join(root_dir, prefix)
src_dirname = os.path.dirname(src_path)
tmp_dir = self.get_auto_remove_tmp_dir()
dest_path = os.path.join(tmp_dir, prefix)
dest_dirname = os.path.dirname(dest_path)
os.makedirs(dest_dirname, exist_ok=True)
for folder in os.listdir(src_dirname):
src_folder = os.path.join(src_dirname, folder)
dest_folder = os.path.join(dest_dirname, folder)
if src_folder.startswith(src_path):
if os.path.isdir(src_folder):
shutil.copytree(src_folder, dest_folder)
else:
shutil.copy2(src_folder, dest_folder)
return dest_path
def test_mlm_dataset(self):
command_args = get_default_args()
data_path = self.copy_data_to_temp(self.data_dir, "gpt2/meg-gpt2-openwebtext_text_document")
command_args["--data-path"] = data_path
command_args["--noise-density"] = "0.15"
command_args["--mean-noise-span-length"] = "3"
command_args["--vocab-extra-ids"] = "100"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
# tokenizer
tokenizer = get_tokenizer()
# A SEP token is required for the MLM preprocessing.
tokenizer.tokenizer.add_special_tokens({"sep_token": "<s>"})
args = get_args()
train_val_test_num_samples = [
args.train_iters * args.global_batch_size,
args.eval_iters * args.global_batch_size,
0
]
train_ds, valid_ds, test_ds = mlm_dataset.build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
# TODO @thomasw21 figure out how that value works
train_valid_test_num_samples=train_val_test_num_samples,
sequence_length=args.seq_length,
noise_density=args.noise_density,
mean_noise_span_length=args.mean_noise_span_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup)
)
sample = train_ds[0]
# +1 is needed to compute labels, as inputs and targets are simply concatenated.
self.assertEqual(len(sample["input_tokens"]) + len(sample["target_tokens"]), args.seq_length + 1)
# We make sure that inputs/targets end with <sep>
self.assertEqual(sample["input_tokens"][-1], tokenizer.sep)
self.assertEqual(sample["target_tokens"][-1], tokenizer.sep)
def test_decoder_packed_mtf_dataloader(self):
command_args = get_default_args()
data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
command_args["--data-path"] = data_path
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
# Hack: `gpt2` doesn't have a padding token, so we override that value.
tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
train_val_test_num_samples = [
args.train_iters * args.global_batch_size,
args.eval_iters * args.global_batch_size,
0
]
train_ds, valid_ds, test_ds = decoder_packed_mtf_dataset.build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
splits_string=args.split,
# TODO @thomasw21 figure out how that value works
train_valid_test_num_samples=train_val_test_num_samples,
seq_length=args.seq_length + 1,
pad_token=tokenizer.pad,
eos_token=tokenizer.eos,
seed=args.seed,
skip_warmup=(not args.mmap_warmup)
)
batch_iterator = build_pretraining_data_loader(
train_ds, consumed_samples=0, num_workers=4
)
last_padding_size = 0
for i, items in enumerate(batch_iterator):
micro_batch_size, seq_length = items["decoder_token_ids"].shape
# Check dtypes
self.assertEqual(items["decoder_token_ids"].dtype, torch.int64)
self.assertEqual(items["decoder_segment_ids"].dtype, torch.int64)
self.assertEqual(items["decoder_is_inputs"].dtype, torch.bool)
# `micro_batch_size` corresponds to the one passed as an argument
self.assertEqual(micro_batch_size, args.micro_batch_size)
# `seq_length` corresponds to the one passed as an argument, + 1 in order to get tokens/labels
self.assertEqual(seq_length, args.seq_length + 1)
original_samples_count = 0
for batch_id in range(micro_batch_size):
segment_ids = [k for k, _ in itertools.groupby(items["decoder_segment_ids"][batch_id])]
# `segment_ids` is [1,2,...]
self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids))))
# `0` signifies that the tokens are padding
self.assertIn(segment_ids[-1], [0, len(segment_ids)])
original_samples_count += len([segment_id for segment_id in segment_ids if segment_id != 0])
# Test that we actually pack, i.e. we have more samples than the `batch_size`
self.assertGreater(original_samples_count, micro_batch_size)
# Test that the first sample of each batch couldn't fit inside the previous batch
first_sample_segment_ids = next(itertools.groupby(items["decoder_segment_ids"][0]))[1]
first_sample_size = len(list(first_sample_segment_ids))
self.assertGreater(first_sample_size, last_padding_size)
# update `last_padding_size`
last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0])
def test_finetune_t0_non_causal_decoder_get_batch_pipe(self):
command_args = get_default_args()
command_args["--position-embedding-type"] = "alibi"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
# Hack: `gpt2` doesn't have a padding token, so we override that value.
tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
# Dummy data
data = get_dummy_mtf_decoder_packed_data(
micro_batch_size=args.micro_batch_size,
seq_length=args.seq_length,
vocab_size=args.padded_vocab_size,
special_tokens_ids={tokenizer.pad}
)
(tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data)
tokens = tokens.cpu()
position_ids = position_ids.cpu()
attention_mask = attention_mask.cpu()
labels = labels.cpu()
loss_mask = loss_mask.cpu()
self.assertEqual(loss_mask.dtype, torch.float)
torch_assert_equal(loss_mask.bool(), ~data["decoder_is_inputs"][:, 1:] * (data["decoder_token_ids"][:, :-1] != tokenizer.pad))
torch_assert_equal(tokens, data["decoder_token_ids"][:, :-1])
torch_assert_equal(labels, data["decoder_token_ids"][:, 1:])
for batch_id in range(args.micro_batch_size):
segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) + 1
for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]):
self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, :segment_start]))
self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, segment_end:]))
# TODO @thomasw21 make sure that we reset `position_ids`
from random import randint
from typing import Set
from unittest.mock import patch
import deepspeed
import torch
from parameterized import parameterized
from torch import nn
import torch.nn.functional as F
from megatron.enums import AttnMaskType
from megatron.model.fused_layer_norm import MixedFusedLayerNorm
from packaging import version
from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
from megatron.model.fused_softmax import ScaledMaskedSoftmax, FusedScaleMaskSoftmax
from megatron.model.utils import attention_mask_func
from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal, \
torch_assert_close, require_torch_bf16
from megatron.training import setup_model_and_optimizer
import pretrain_gpt
import pretrain_prefix_lm
import finetune_t0_non_causal_decoder
def get_default_args(test_file_dir: str):
"""return a dictionary with key as argument name and value as additional arguments"""
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "256",
"--max-position-embeddings": "256",
"--micro-batch-size": "2",
"--global-batch-size": "2",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "PretrainedFromHF",
"--tokenizer-name-or-path": "gpt2",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--inference": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
# DATA_ARGS
# DeepSpeed args
"--deepspeed": "",
"--deepspeed_config": f"{test_file_dir}/ds_config_inference.json",
"--zero-stage": "0",
}
def equal_vectors(tensor1, tensor2, dim=-1):
"""View tensor1 and tensor2 as a list of vectors, and compute equality"""
return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0
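# e.g. equal_vectors(torch.zeros(2, 3), torch.zeros(2, 3)) -> tensor([True, True]);
# any differing component in a row makes that row's entry False.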
def iter_out_of_one(one):
return iter([one])
def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]):
"""Code from `tests/test_dataloaders.py"""
seq_length += 1
num_segments = torch.randint(1, 5, ())
segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.long)
is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool)
for batch_id in range(micro_batch_size):
# - `*2`: Hack to ensure two start_new_segments are separated by at least two tokens
# - `+1`: Hack to ensure start_new_segments never starts at 0
start_new_segments = torch.sort(torch.randperm((seq_length - 2) // 2, )[:num_segments]).values * 2 + 1
segment_ids[batch_id, start_new_segments] = 1
end_inputs = [
torch.randint(low=start_segment, high=end_segment - 1, size=())
for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length])
]
for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]):
is_inputs[batch_id][start_segment: end_input + 1] = True
segment_ids = torch.cumsum(segment_ids, dim=-1) + 1
tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length), dtype=torch.long)
flatten_token_view = tokens.view(-1,)
for token_id in range(len(flatten_token_view)):
token = flatten_token_view[token_id]
# While the token is a special token, change it to another token
while token in special_tokens_ids:
flatten_token_view[token_id] = (token + 1) % vocab_size
token = flatten_token_view[token_id]
return {
"decoder_token_ids": tokens,
"decoder_segment_ids": segment_ids,
"decoder_is_inputs": is_inputs
}
class MyTestCase(TestCasePlus):
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT="9994", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)
def test_gpt(self):
"""Test causal invariance, ie past token don't depend on future tokens."""
command_args = get_default_args(self.test_file_dir_str)
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
# get a modified version of the first batch, we change a specific index
changed_index = randint(0, args.seq_length - 2)
token_ids_changed = token_ids.clone()
# We increment the token_id by one for that index in order to artificially change the sequence.
token_ids_changed[:, changed_index] = \
(token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size
output = model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False)
output_changed = model.eval_batch(iter_out_of_one({"text": token_ids_changed}), compute_loss=False)
# All tokens in the past should be unchanged
torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index])
# All tokens in the future should have changed
self.assertFalse(
torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
)
def test_prefix_lm_reset_attention_mask(self):
"""
Test prefix invariances when `reset_attention_mask=True`:
- Past target tokens don't depend on future target tokens.
- Target tokens depend on input tokens.
- Input tokens depend on all other input tokens, but never target tokens.
"""
command_args = get_default_args(self.test_file_dir_str)
command_args["--reset-attention-mask"] = ""
command_args["--loss-on-targets-only"] = ""
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
# we disable batch_fn and preprocess the batch manually
model.set_batch_fn(None)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token; this also guarantees that the whole row is considered as a single document.
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
# process batch to have non empty prefix
input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids})
for batch_id in range(len(prefix_indices)):
for id in prefix_indices[batch_id]:
self.assertTrue(loss_mask[batch_id, id] == 1)
self.assertTrue(id > 0)
# Make sure that the last prefix token predicts the first token.
self.assertTrue(loss_mask[batch_id, id -1] == 1)
output = model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False)
## --------------- CHANGE A TARGET TOKEN ---------------------------
# get a modified version of the first batch
# guaranteed to exist as each row has at least one partial document
changed_target_index = prefix_indices[0][0]
token_ids_changed_target = input_batch[0].clone()
# We increment the token id on the changed index.
token_ids_changed_target[0, changed_target_index] = \
(token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
# make sure we're not changing a token to eod as it's a special token
token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1
token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size
# Test change
output_changed_target = model.eval_batch(iter_out_of_one(((token_ids_changed_target, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False)
# All tokens in the past should be unchanged
torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index])
# All tokens in the future should have changed
self.assertFalse(
torch.any(
equal_vectors(output[0, changed_target_index:], output_changed_target[0, changed_target_index:])
)
)
# Rows that were not modified should not change either
torch_assert_equal(output[1, :], output_changed_target[1, :])
## --------------- CHANGE AN INPUT TOKEN ---------------------------
# Let's change the last prefix token and make sure that the first token changes
# guaranteed to be positive as we avoid pathological case previously
last_prefix_index = prefix_indices[0][0] - 1
token_ids_changed_input = input_batch[0].clone()
# We increment the token id on the changed index.
token_ids_changed_input[0, last_prefix_index] = \
(token_ids_changed_input[0, last_prefix_index] + 1) % args.padded_vocab_size
# make sure we're not changing a token to eod as it's a special token
token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size
output_changed_input = model.eval_batch(iter_out_of_one(((token_ids_changed_input, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False)
# All tokens should be changed
self.assertFalse(
torch.any(
equal_vectors(output[0, :], output_changed_input[0, :])
)
)
# Rows that were not modified should not change either
torch_assert_equal(output[1, :], output_changed_input[1, :])
def test_prefix_lm_wo_reset_attention_mask(self):
"""
Test prefix invariances when `reset_attention_mask=False`:
- Past target tokens don't depend on future target tokens.
- Target tokens depend on input tokens.
- Input tokens depend on all other input tokens, but never target tokens.
"""
command_args = get_default_args(self.test_file_dir_str)
command_args["--loss-on-targets-only"] = ""
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
# we disable batch_fn and preprocess the batch manually
model.set_batch_fn(None)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids})
for batch_id in range(len(prefix_indices)):
id = prefix_indices[batch_id]
self.assertTrue(loss_mask[batch_id, id] == 1)
self.assertTrue(id > 0)
# Make sure that the last prefix token predicts the first token.
self.assertTrue(loss_mask[batch_id, id -1] == 1)
model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False)
#TODO: Check all invariants
def test_gpt_rotary_embeddings(self):
"""Test rotary embeddings"""
command_args = get_default_args(self.test_file_dir_str)
del command_args["--max-position-embeddings"]
command_args["--position-embedding-type"] = "rotary"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False)
#TODO: Check all invariants
@require_torch_bf16
def test_fused_layer_norm(self):
command_args = get_default_args(self.test_file_dir_str)
# Condition to use custom cuda kernel
command_args["--bf16"] = ""
del command_args["--fp16"]
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
initialize_megatron()
args = get_args()
dummy_input = torch.randn(args.micro_batch_size, args.seq_length, args.hidden_size, device="cuda", dtype=torch.bfloat16)
normalized_shape = (args.hidden_size,)
epsilon = 1e-5
mfln = MixedFusedLayerNorm(normalized_shape, eps=epsilon)
self.assertTrue(mfln.use_meg_ds_fused_layer_norm, "Expected model to use Megatron-DeepSpeed custom cuda kernel for LayerNorm.")
self.assertTrue(args.bf16, "Test has to be done in half precision.")
# We set the weight manually to simulate a state that differs from the initialisation
weight = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
bias = torch.randn(args.hidden_size, device="cuda", dtype=torch.bfloat16)
mfln.weight = nn.Parameter(weight)
mfln.bias = nn.Parameter(bias)
mfln_output = mfln(dummy_input)
# We check that our layernorm matches pytorch 1.11 onwards
if version.parse(torch.__version__) >= version.parse("1.11.0"):
torch_layer_norm_output = F.layer_norm(dummy_input, normalized_shape, weight, bias, eps=epsilon)
else:
# In this case we can check that it basically corresponds to the fp32 version
torch_layer_norm_output = F.layer_norm(dummy_input.float(), normalized_shape, weight.float(), bias.float(), eps=epsilon).to(torch.bfloat16)
torch_assert_equal(mfln_output, torch_layer_norm_output)
@parameterized.expand([(attn_mask_type,) for attn_mask_type in AttnMaskType])
def test_fused_masked_softmax(self, attn_mask_type: AttnMaskType):
command_args = get_default_args(self.test_file_dir_str)
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
initialize_megatron()
args = get_args()
dummy_input = torch.randn(
args.micro_batch_size,
args.num_attention_heads,
args.seq_length,
args.seq_length,
device="cuda",
dtype=args.params_dtype
)
if attn_mask_type == AttnMaskType.causal:
dummy_attention_mask = None
else:
dummy_attention_mask = torch.randn(
args.micro_batch_size,
1, # `args.num_attention_heads` not implemented in our cuda kernel
args.seq_length,
args.seq_length,
device="cuda",
dtype=args.params_dtype
) < 0
scale = torch.rand(())
fused_scaled_softmax = FusedScaleMaskSoftmax(
input_in_fp16=args.params_dtype == torch.float16,
input_in_bf16=args.params_dtype == torch.bfloat16,
attn_mask_type=attn_mask_type,
scaled_masked_softmax_fusion=True,
mask_func=attention_mask_func,
softmax_in_fp32=True,
scale=scale,
)
unfused_scaled_softmax = FusedScaleMaskSoftmax(
input_in_fp16=args.params_dtype == torch.float16,
input_in_bf16=args.params_dtype == torch.bfloat16,
attn_mask_type=attn_mask_type,
scaled_masked_softmax_fusion=False,
mask_func=attention_mask_func,
softmax_in_fp32=True,
scale=scale,
)
self.assertTrue(fused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
fused_output = fused_scaled_softmax(dummy_input, dummy_attention_mask)
self.assertFalse(unfused_scaled_softmax.is_kernel_available(dummy_attention_mask, *dummy_input.size()))
unfused_output = unfused_scaled_softmax(dummy_input, dummy_attention_mask)
# Test that the nonzeros are the same with the mask
for i in range(args.num_attention_heads):
if dummy_attention_mask is None:
# Make sure it's causal: values in the lower triangle should be non-zero.
non_zero_values = torch.tril(torch.ones_like(fused_output[:, i]))
torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(non_zero_values))
else:
torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0]))
# Cuda kernel produces slightly different results
torch_assert_close(fused_output, unfused_output)
def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self):
command_args = get_default_args(self.test_file_dir_str)
command_args["--position-embedding-type"] = "alibi"
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**self.dist_env_1_gpu):
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
# Hack: `gpt2` doesn't have a padding token, so we override that value.
tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
data = get_dummy_mtf_decoder_packed_data(
micro_batch_size=args.micro_batch_size,
seq_length=args.seq_length,
vocab_size=args.padded_vocab_size,
special_tokens_ids={tokenizer.pad}
)
model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider)
model = model[0]
model._config.train_micro_batch_size_per_gpu = args.micro_batch_size
model.set_train_batch_size(args.micro_batch_size)
output = model.eval_batch(iter_out_of_one(data), compute_loss=False)
## --------------- CHANGE A TARGET TOKEN ---------------------------
# change the first token in the first batch to a random value
change_batch_id = 0
change_token_id = 0
token_ids_changed = data["decoder_token_ids"].clone()
# We increment the token id on the changed index.
token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size
while token_ids_changed[change_batch_id, change_token_id] in {tokenizer.eod, tokenizer.pad}:
token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size
# Test change
output_changed_target = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed}), compute_loss=False)
first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0]
# Check that values changed in segment 1 of batch_id 0
self.assertFalse(torch.any(
equal_vectors(
output[change_batch_id, change_token_id:first_segment_first_batch_id_end],
output_changed_target[change_batch_id, change_token_id:first_segment_first_batch_id_end]
)
))
# Check that values did not change in other segments of batch_id 0
torch_assert_equal(
output[change_batch_id, first_segment_first_batch_id_end:],
output_changed_target[change_batch_id, first_segment_first_batch_id_end:]
)
# Check that values did not change in other segments in other batches
non_change_ids = torch.arange(output.shape[0]) != change_batch_id
torch_assert_equal(output[non_change_ids], output_changed_target[non_change_ids])
## --------------- CHANGE A TARGET TOKEN ---------------------------
# change the last token in the first batch to a pad
token_ids_changed_pad = data["decoder_token_ids"].clone()
segment_ids_changed_pad = data["decoder_segment_ids"].clone()
# We replace the last token with the pad token and mark its segment id as padding.
token_ids_changed_pad[change_batch_id, -1] = tokenizer.pad
segment_ids_changed_pad[change_batch_id, -1] = 0
# Test model handles padding correctly
output_changed_pad = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed_pad, "decoder_segment_ids": segment_ids_changed_pad}), compute_loss=False)
self.assertFalse(torch.any(torch.isnan(output_changed_pad)))
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import filecmp
import io
import json
import re
import os
import unittest
import functools
from pathlib import Path
from megatron.testing_utils import (
TestCasePlus,
execute_subprocess_async,
set_seed
)
from datasets import load_dataset
set_seed(42)
def write_jsonl(path, lines_num=1000, line_length=1024):
def get_text_line(line_length):
# XXX: fix to generate line_length
return "It's a wonderful world. I'm just walking on air. Talk of heaven on earth. I've got more than my share. Haven't got a care. Happy all day through. It's a wonderful world. Loving wonderful you!"
with io.open(path, "w", encoding="utf-8") as f:
for i in range(lines_num):
rec = dict(text=get_text_line(line_length))
x = json.dumps(rec, indent=0, ensure_ascii=False)
x = re.sub(r'\n', ' ', x, 0, re.M)
f.write(x + "\n")
@functools.lru_cache()
def download_hf_dataset(dsetname):
return load_dataset(dsetname)
class MegDSTestPreprocessing(TestCasePlus):
""" """
def setUp(self):
super().setUp()
def test_preprocess_data(self):
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# autogenerate "input.jsonl"
input_path = f"{output_dir}/input.jsonl"
write_jsonl(input_path)
output_prefix =f"{output_dir}/test-ds"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {input_path}
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
--workers 2
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
for ext in ["bin", "idx"]:
tgt_path = f"{output_prefix}_text_document.{ext}"
self.assertTrue(Path(tgt_path).exists(), )
def compare_meg_data_files(self, tgt, ref):
for ext in ["bin", "idx"]:
tgt_path = f"{tgt}.{ext}"
ref_path = f"{ref}.{ext}"
self.assertTrue(Path(tgt_path).exists(), )
self.assertTrue(filecmp.cmp(tgt_path, ref_path, shallow=False))
def preprocess_partitioned_dataset(self, output_dir, dsetname, splitname, linelimit, numparts):
"""Preprocess a dataset as a whole and in shards to prepare environment for merge test.
Load specified HF dataset using given split and record limit.
Write the dataset to a jsonl file and preprocess.
Also split dataset into numparts contiguous shards, write each shard to its own jsonl, and preprocess each.
Return path to the full dataset and a list of paths for each shard."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dset = download_hf_dataset(dsetname)[splitname]
# limit the test to use the first linelimit entries to be faster
dset = dset.select(range(linelimit))
# write jsonl file of full dataset
json_ds = f"{output_dir}/ds-full.jsonl"
dset.to_json(json_ds)
# process full jsonl into indexed dataset file
ds_full = f"{output_dir}/ds-full"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {json_ds}
--output-prefix {ds_full}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
ds_full += '_text_document'
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# write each part to its own json file
ds_parts = []
for i in range(numparts):
json_part = f"{output_dir}/ds-part-{i}.jsonl"
dset.shard(numparts, i, contiguous=True).to_json(json_part)
ds_part = f"{output_dir}/ds-part-{i}"
ds_parts.append(ds_part + '_text_document')
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {json_part}
--output-prefix {ds_part}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
return ds_full, ds_parts
def test_merge_serial(self):
"""Check that serial merge of partial dataset files produces the same file as the full dataset."""
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# process full dataset, and process the full dataset as 3 contiguous chunks
ds_full, ds_parts = self.preprocess_partitioned_dataset(output_dir, 'stas/openwebtext-10k', 'train', 100, 3)
# merge the part files into a single indexed dataset
ds_merged = f"{output_dir}/ds-merged"
cmd = f"""
python {src_dir}/tools/merge_preprocessed_data.py
--datasets {" ".join(ds_parts)}
--output-prefix {ds_merged}
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# the full dataset and the merged dataset should be identical
self.compare_meg_data_files(ds_full, ds_merged)
def test_merge_distributed(self):
"""Check that serial merge of partial dataset files produces the same file as the full dataset."""
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
# process full dataset, and process the full dataset as 3 contiguous chunks
ds_full, ds_parts = self.preprocess_partitioned_dataset(output_dir, 'stas/openwebtext-10k', 'train', 100, 3)
# merge the part files into a single indexed dataset
ds_merged = f"{output_dir}/ds-merged"
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 6 {src_dir}/tools/merge_preprocessed_data.py
--merge distributed
--datasets {" ".join(ds_parts)}
--output-prefix {ds_merged}
--torch-backend gloo
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
# the full dataset and the merged dataset should be identical
self.compare_meg_data_files(ds_full, ds_merged)
def test_process_data_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
input_path = f"{self.tests_dir}/data/gpt2/openwebtext-1000.jsonl"
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext"
cmd = f"""
python {src_dir}/tools/preprocess_data.py
--input {input_path}
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
--workers 2
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
def test_process_data_dist_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext_1k"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dsetname = 'stas/openwebtext-10k'
download_hf_dataset(dsetname)
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 2 {src_dir}/tools/preprocess_data_dist.py
--input {dsetname}
--count 1000
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
def test_process_data_dist_serial_microsoft(self):
"""We want to be stable to Microsoft version."""
src_dir = self.src_dir
data_dir = f"{self.data_dir}/gpt2"
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext_1k"
# preprocess_data_dist requires one to have already downloaded the input HF dataset.
# We do that by running this script before the test.
dsetname = 'stas/openwebtext-10k'
download_hf_dataset(dsetname)
cmd = f"""
python -m torch.distributed.launch --nproc_per_node 2 {src_dir}/tools/preprocess_data_dist.py
--input {dsetname}
--count 1000
--merge serial
--output-prefix {output_prefix}
--dataset-impl mmap
--tokenizer-type GPT2BPETokenizer
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab {data_dir}/gpt2-tiny-vocab.json
--append-eod
""".split()
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
execute_subprocess_async(cmd, env=self.get_env())
self.compare_meg_data_files(f"{output_prefix}_text_document", f"{data_dir}/meg-gpt2-openwebtext_text_document")
import unittest
from random import randint
from unittest.mock import patch
import deepspeed
import torch
import logging
import numpy as np
import pytest
from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, require_deepspeed, require_torch_multi_gpu
from megatron.training import setup_model_and_optimizer
from megatron.mpu.mappings import gather_from_tensor_model_parallel_region
from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe
import multiprocessing as mp
from multiprocessing import Pool
from megatron.checkpointing import save_checkpoint
from megatron.utils import get_ltor_masks_and_position_ids
@require_deepspeed
@require_torch_multi_gpu
class MegDSTestTP(TestCasePlus):
def get_default_args(self):
"""return a dictionary with key as argument name and value as additional arguments"""
data_dir = f"{self.data_dir}/gpt2"
return {
# GPT_ARGS
"--num-layers": "2",
"--hidden-size": "128",
"--num-attention-heads": "4",
"--seq-length": "256",
"--max-position-embeddings": "256",
"--micro-batch-size": "4",
"--global-batch-size": "8",
"--lr-decay-iters": "320000",
"--lr-decay-style": "cosine",
"--lr": "0.00015",
"--min-lr": "1.0e-5",
"--train-iters": "5000",
"--tokenizer-type": "GPT2BPETokenizer",
"--merge-file": f"{data_dir}/gpt2-tiny-merges.txt",
"--vocab-file": f"{data_dir}/gpt2-tiny-vocab.json",
"--data-impl": "mmap",
"--split": "949,50,1",
"--distributed-backend": "nccl",
"--weight-decay": "1e-2",
"--clip-grad": "1.0",
"--lr-warmup-fraction": ".01",
"--fp16": "",
"--attention-dropout": "0",
"--hidden-dropout": "0",
# OUTPUT_ARGS
"--log-interval": "10",
"--save-interval": "500",
"--eval-interval": "100",
"--eval-iters": "10",
"--checkpoint-activations": "",
#ds args
"--deepspeed": "",
"--deepspeed_config":f"{self.test_file_dir_str}/ds_config.json",
"--zero-stage": "1",
"--deepspeed-activation-checkpointing": ""
# DATA_ARGS
}
def setUp(self) -> None:
super().setUp()
# We reset all global variables
global_vars._GLOBAL_ARGS = None
global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
global_vars._GLOBAL_TOKENIZER = None
global_vars._GLOBAL_TENSORBOARD_WRITER = None
global_vars._GLOBAL_ADLR_AUTORESUME = None
global_vars._GLOBAL_TIMERS = None
def infer_model(args):
tp_index, tp_size, command_args, token_ids, save, load = args
dist_env = dict(
MASTER_ADDR="localhost", MASTER_PORT="9991", RANK=str(tp_index), LOCAL_RANK=str(tp_index), WORLD_SIZE=str(tp_size)
)
logging.getLogger().critical("Process: starting")
#Hack
import megatron.initialize as init
init.git_ds_info = lambda: None
with patch('sys.argv', flatten_arguments(command_args)):
with mockenv_context(**dist_env):
def create_model_inputs(tokens):
args = get_args()
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
tokenizer.eod,
args.reset_position_ids,
args.reset_attention_mask,
args.eod_mask_loss,
prefix_indices=None,
loss_on_targets_only=False)
return (tokens, position_ids, attention_mask), (tokens, loss_mask)
deepspeed.init_distributed()
initialize_megatron()
args = get_args()
tokenizer = get_tokenizer()
model, _, _ = setup_model_and_optimizer(gpt_model_provider)
model = model[0]
if load is not None:
# Hack (same as in eval_harness/evaluate.py)
# Loading pipelined models in deepspeed with a different TP than they were originally trained on fails
# due to a sanity check that makes sure all state_dicts that we merge contain attention layers.
# This, however, is not true for pipelining, where we also merge the state_dict for the embeddings,
# which does not contain these attention-specific keys.
#
# Deepspeed does however manage to load the model if we just turn off this sanity check.
deepspeed.runtime.state_dict_factory.MegatronSDLoader.sanity_check = lambda self, ckpt_file_name: None
zero_enabled = model._config.zero_enabled
model._config.zero_enabled = False
_, _ = model.load_checkpoint(load, load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=True)
model._config.zero_enabled = zero_enabled
if token_ids is None:
token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
# eod is a special token
token_ids[token_ids == tokenizer.eod] += 1
token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
else:
token_ids = torch.tensor(token_ids)
model.micro_batches = 1
model.set_batch_fn(create_model_inputs)
# process batch
input_batch = get_gpt_batch_pipe({"text": token_ids})[0]
# get a modified version of the first batch, we change a specific index
changed_index = randint(0, args.seq_length - 2)
input_token_ids_changed = input_batch[0].clone()
# We increment the token_id by one for that index in order to artificially change the sequence.
input_token_ids_changed[:, changed_index] = \
(input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size
output = model.eval_batch(iter([token_ids]), compute_loss = False, reduce_output = None)[0]
output = gather_from_tensor_model_parallel_region(output)
if save is not None:
args.save = save
save_checkpoint(0, [model], None, None)
return (output[0].detach().cpu().numpy(), token_ids.detach().cpu().numpy())
def test_alibi_tp(self):
mp.set_start_method('spawn', force=True)
cp_dir = self.get_auto_remove_tmp_dir()
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--position-embedding-type"] = "alibi"
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, None, cp_dir, None))])
pool.close()
pool.join()
output, tokens = result[0]
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
result = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, cp_dir)), ((1, 2, command_args, tokens, None, cp_dir))])
pool.close()
pool.join()
output2, tokens = result[0]
logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")
def test_embedding_matrix_tp(self):
mp.set_start_method('spawn', force=True)
cp_dir = self.get_auto_remove_tmp_dir()
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5119, 0, 1, 5100],[0, 1, 5111, 5101]]
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
# tp_index, tp_size, command_args, token_ids, save, load
result = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, cp_dir, None))])
pool.close()
pool.join()
output, _ = result[0]
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
result = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, cp_dir)), ((1, 2, command_args, tokens, None, cp_dir))])
pool.close()
pool.join()
output2, _ = result[0]
logging.getLogger().critical(output-output2)
self.assertTrue(np.allclose(output,output2, atol=5e-3, rtol=0), "Different results when running with TP=1 and TP=2")
def test_embedding_matrix_tp_with_invalid_tokens_ids(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5120" # This is equal to 128 * 40 which is above the len of gp2-tiny vocabulary
command_args["--seq-length"] = "4"
command_args["--micro-batch-size"] = "2"
tokens = [[5120, 0, 1, 2],[0, 1, 3, 4]]
command_args["--tensor-model-parallel-size"] = "1"
pool = Pool(1)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 1, command_args, tokens, None, None))])
pool.close()
pool.join()
self.assertIn("There is an input id in the input that is greater than the highest possible input id" , str(exc_info.value))
logging.getLogger().info("First done!")
command_args["--tensor-model-parallel-size"] = "2"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, tokens, None, None)), ((1, 2, command_args, tokens, None, None))])
pool.close()
pool.join()
self.assertIn("There is an input id in the input that is greater than the highest possible input id", str(exc_info.value))
def test_tokenizer_vocab_size_multiple_of_tp_size(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"
command_args["--tensor-model-parallel-size"] = "2"
command_args["--make-vocab-size-divisible-by"] = "1"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()
self.assertEqual(str(exc_info.value), "5121 is not divisible by 2")
def test_tokenizer_raise_error_make_vocab_size_divisible_by(self):
mp.set_start_method('spawn', force=True)
command_args = self.get_default_args()
command_args["--pad-vocab-size-to"] = "5121" # This is equal to 128 * 40 + 1 which is above the len of gp2-tiny vocabulary
command_args["--micro-batch-size"] = "4"
pool = Pool(2)
with pytest.raises(Exception) as exc_info:
_ = pool.map(MegDSTestTP.infer_model, [((0, 2, command_args, None, None, None)), ((1, 2, command_args, None, None, None))])
pool.close()
pool.join()
self.assertEqual(str(exc_info.value), "5121 is not divisible by 128")
if __name__ == '__main__':
unittest.main()
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dataclasses
import io
import json
import os
import glob
import re
import shutil
import unittest
from pathlib import Path
from parameterized import parameterized
from megatron.testing_utils import (
CaptureStdout,
CaptureStd,
TestCasePlus,
execute_subprocess_async,
get_gpu_count,
require_bnb_non_decorator,
require_deepspeed,
require_torch_gpu,
set_seed
)
set_seed(42)
def get_launcher(num_gpus):
# 1. explicitly set --num_nodes=1 just in case these tests end up being run on a multi-node setup
# - it won't be able to handle that
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
def get_3d_dimensions():
num_gpus = get_gpu_count()
# with fewer gpus the preference is first to do PP>1, then TP>1, then DP>1
if num_gpus >= 8:
dp_size = 2
pp_size = 2
tp_size = 2
elif num_gpus >= 4:
dp_size = 1
pp_size = 2
tp_size = 2
elif num_gpus >= 2:
dp_size = 1
pp_size = 2
tp_size = 1
else:
dp_size = 1
pp_size = 1
tp_size = 1
return pp_size, tp_size, dp_size
@require_deepspeed
@require_torch_gpu
class MegDSTestTraining(TestCasePlus):
""" """
def setUp(self):
super().setUp()
# at times megatron fails to build kernels and doesn't remove the lock file, which makes
# subsequent runs hang - so make sure there is no lock when starting the testing
meg_lock_file_path = self.repo_root_dir_str + "/megatron/fused_kernels/build/lock"
if os.path.exists(meg_lock_file_path):
os.unlink(meg_lock_file_path)
def copy_data_to_temp(self, root_dir, prefix):
"""copy data to temp, and return paths to temp version"""
src_path = os.path.join(root_dir, prefix)
src_dirname = os.path.dirname(src_path)
tmp_dir = self.get_auto_remove_tmp_dir()
dest_path = os.path.join(tmp_dir, prefix)
dest_dirname = os.path.dirname(dest_path)
os.makedirs(dest_dirname, exist_ok=True)
for folder in os.listdir(src_dirname):
src_folder = os.path.join(src_dirname, folder)
dest_folder = os.path.join(dest_dirname, folder)
if src_folder.startswith(src_path):
if os.path.isdir(src_folder):
shutil.copytree(src_folder, dest_folder)
else:
shutil.copy2(src_folder, dest_folder)
return dest_path
def get_variation_config(self, variation, output_dir, n_samples=None):
data_dir = self.copy_data_to_temp(self.data_dir,"gpt2")
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
print(f"Using {num_gpus} GPUs")
if variation == "bnb":
# we want to make sure at least tp=2 is used, so we swap tp and pp
pp_size, tp_size = tp_size, pp_size
if n_samples is None:
n_samples = 300 # about 56 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
seq_len = 128
# common/shared configs
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--log-interval 1
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--partition-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length {seq_len}
--max-position-embeddings 1024
--micro-batch-size 1
--global-batch-size 16
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--embed-layernorm
--sync-tp-duplicated-parameters
--fp16
--log-level debug
--log-level-replica info
""".split()
if variation == "base":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "bnb":
# BitsAndBytes - 8-bit optimizer
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--use-bnb-optimizer
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "cl":
# CurriculumLearning
lr_decay_samples = 6
lr_decay_tokens = lr_decay_samples * seq_len
train_tokens = n_samples * seq_len
# XXX: if changing seq_len from 128, must adjust ds config to:
# curriculum_learning.max_difficulty: $SEQLEN
# XXX: probably we should write the ds config on the fly to keep everything in sync,
# rather than using the pre-saved config
new_args = f"""
--train-samples {n_samples*2}
--train-tokens {train_tokens}
--lr-decay-tokens {lr_decay_tokens}
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config_cl.json
""".split()
elif variation == "glu":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--no-bias-gelu-fusion
--glu-activation geglu
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
elif variation == "alibi":
new_args = f"""
--rampup-batch-size 2 2 {n_samples}
--train-samples {n_samples}
--lr-decay-samples 6
--position-embedding-type alibi
""".split()
new_ds_args = f"""
--deepspeed_config {self.test_file_dir_str}/ds_config.json
""".split()
else:
raise ValueError(f"Don't know of variation {variation}")
args.extend(new_args)
ds_args.extend(new_ds_args)
return args, ds_args, num_gpus
def test_kill_switch(self):
variation = "base"
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
kill_switch_path = os.path.join(output_dir, "kill-switch-xyz")
args, ds_args, num_gpus = self.get_variation_config(variation, output_dir)
args += f"--kill-switch-path {kill_switch_path}".split()
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. kill switch armed but not triggered
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# 2. trigger kill switch
open(kill_switch_path, "w").close()
with CaptureStd() as cs:
execute_subprocess_async(cmd, env=self.get_env())
self.assertIn(f"Detected kill switch at {kill_switch_path}", cs.out)
# test deepspeed wasn't run
self.assertNotIn("DeepSpeed info", cs.out)
@parameterized.expand(["base", "cl", "bnb", "glu", "alibi"])
def test_training_all(self, variation):
# optional runs
if variation == "bnb":
require_bnb_non_decorator()
# all in one test
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
args, ds_args, num_gpus = self.get_variation_config(variation, output_dir)
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
if variation == "glu":
self.assertIn("Using GLU activation: GELU", cs.out)
if variation == "alibi":
self.assertIn("Using Alibi", cs.out)
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
if variation == "glu":
self.assertIn("Using GLU activation: GELU", cs.out)
@parameterized.expand([(True, True), (False, False), (True, False), (False, True)])
def test_training_prefix_lm_all(self, loss_on_targets_only, reweight_loss_based_on_position_frequency):
# all in one test
src_dir = self.src_dir
data_dir = self.copy_data_to_temp(self.data_dir,"gpt2")
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
{"--loss-on-targets-only" if loss_on_targets_only else ""}
{"--reweight-loss-based-on-position-frequency" if reweight_loss_based_on_position_frequency else ""}
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--log-path {logs_dir}
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_dir}/meg-gpt2-openwebtext_text_document
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-level debug
""".split()
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
script = [f"{src_dir}/pretrain_prefix_lm.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
if reweight_loss_based_on_position_frequency:
self.assertIn("Using loss reweighting", cs.out)
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
def test_training_t0(self):
data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
output_dir = self.get_auto_remove_tmp_dir()
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 10 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--position-embedding-type alibi
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--tokenizer-type PretrainedFromHF
--tokenizer-name-or-path bigscience/tokenizer
--log-path {logs_dir}
--save {output_dir}/checkpoints
--load {output_dir}/checkpoints
--data-path {data_path}
--split 90,10,0
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--log-level debug
""".split()
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
script = [f"{self.src_dir}/finetune_t0_non_causal_decoder.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test there should be no checkpoint this round
self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
# 2. test training from checkpoint: resume
# now do it again, this time resuming from the checkpoint
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test checkpoint loading
self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard (1 file from the first run, plus 1 now)
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
@parameterized.expand(["gpt", "prefix", "no_eval"])
def test_mode2_dataloading(self, variation):
src_dir = self.src_dir
data_dir = self.copy_data_to_temp(self.data_dir, "gpt2")
output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False)
logs_dir = f"{output_dir}/logs"
Path(logs_dir).mkdir(parents=True, exist_ok=True)
pp_size, tp_size, dp_size = get_3d_dimensions()
num_gpus = pp_size * tp_size * dp_size
n_samples = 200 # about 37 iterations
exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume
args = f"""
--tensor-model-parallel-size {tp_size}
--pipeline-model-parallel-size {pp_size}
--distributed-backend nccl
--num-layers 2
--hidden-size 64
--num-attention-heads 2
--seq-length 128
--max-position-embeddings 1024
--micro-batch-size 1
--rampup-batch-size 2 2 {n_samples}
--global-batch-size 16
--train-samples {n_samples}
--loss-on-targets-only
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-8
--lr 1e-4
--lr-warmup-samples 5
--clip-grad 1.0
--weight-decay 1e-1
--fp16
--log-interval 5
--save-interval 10
--eval-interval 10
--eval-iters 5
--checkpoint-activations
--exit-interval {exit_interval}
--merge-file {data_dir}/gpt2-tiny-merges.txt
--vocab-file {data_dir}/gpt2-tiny-vocab.json
--log-path {logs_dir}
--save {output_dir}/checkpoints
--tensorboard-dir {output_dir}/tensorboard
--tensorboard-queue-size 5
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
""".split()
data_args = [
"--train-weighted-split-paths", f'TRAIN: 1 0:0.95 {data_dir}/meg-gpt2-openwebtext_text_document, 0.3 0:0.90 {data_dir}/meg-gpt2-openwebtext_text_document']
if variation != "no_eval":
data_args += ["--valid-weighted-split-paths", f'VALID1: 1 0.95:0.98 {data_dir}/meg-gpt2-openwebtext_text_document, 0.3 0.90:0.99 {data_dir}/meg-gpt2-openwebtext_text_document',
f'VALID2: 0.5 0.95:0.97 {data_dir}/meg-gpt2-openwebtext_text_document, 0.5 0.90:0.98 {data_dir}/meg-gpt2-openwebtext_text_document']
ds_args = f"""
--deepspeed
--deepspeed_config {self.test_file_dir_str}/ds_config.json
--zero-stage 1
--deepspeed-activation-checkpointing
""".split()
if variation == "prefix":
script = [f"{src_dir}/pretrain_prefix_lm.py"]
else:
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + data_args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
# 1. test training from scratch (no checkpoint)
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# test deepspeed is running
self.assertIn("DeepSpeed info", cs.out)
# test reports
self.assertIn("consumed samples", cs.out)
# test checkpoint saving
self.assertIn("successfully saved checkpoint at iteration", cs.out)
# test tensorboard
tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
def test_skip_train_iteration(self):
# skip iterations setup
extra_args = f"""
--skip-train-iteration-range 2-2 4-7
""".split()
src_dir = self.src_dir
output_dir = self.get_auto_remove_tmp_dir()
args, ds_args, num_gpus = self.get_variation_config("base", output_dir, n_samples=200)
args.extend(extra_args)
script = [f"{src_dir}/pretrain_gpt.py"]
launcher = get_launcher(num_gpus)
cmd = launcher + script + args + ds_args
# keep for quick debug
# print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
with CaptureStdout() as cs:
execute_subprocess_async(cmd, env=self.get_env())
# check skipped iterations
self.assertIn("Skipped iterations 2 to 2 due to --skip-train-iteration-range flag", cs.out)
self.assertIn("Skipped iterations 4 to 7 due to --skip-train-iteration-range flag", cs.out)
train_iterations = range(1,10)
for i in train_iterations:
self.assertTrue(f"iteration {i:8d}/" in cs.out)
# Test suite tools
## Make tiny tokenizer files
Currently, for gpt2, run:
```
./shrink-tokenizer.py
```
This generates tiny vocab and merge files under the `tiny` dir, which we then add to the repo under `data/gpt2`:
```
cp tiny/merges.txt ../data/gpt2/gpt2-tiny-merges.txt
cp tiny/vocab.json ../data/gpt2/gpt2-tiny-vocab.json
```
Note: the tiny vocab was set to 5000 items after experimenting with the size of the resulting index files. A tiny vocab of 500 (with adjusted merge entries) generated very large index files, so it actually ended up costing more in final file size. A vocab of 5000 generated index files almost identical in size to those produced with the original 50k vocab.
## Make tiny pre-processed index
This pre-processed index is used in the test training runs.
```
./openwebtext-to-jsonl.py
```
This generates:
```
openwebtext-1000.jsonl
```
We don't store the jsonl file in the repo, to keep the repo small, so it's a temporary file.
Now we pre-process it:
```
cd ../..
input=tests/tools/openwebtext-1000.jsonl
python tools/preprocess_data.py \
--input $input \
--output-prefix tests/data/gpt2/meg-gpt2-openwebtext \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file tests/data/gpt2/gpt2-tiny-merges.txt \
--vocab tests/data/gpt2/gpt2-tiny-vocab.json \
--append-eod \
--workers 6
```
And voilà, we now have:
```
ls -sh1 tests/data/gpt2/meg-gpt2-openwebtext*
2.6M tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin
20K tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx
```
which we can now commit and use in tests.
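To double-check that the generated `.bin`/`.idx` pair loads correctly, here is a minimal sketch (assuming the `megatron.data.indexed_dataset` API available in this repo, run from the repo root):
```
from megatron.data import indexed_dataset

prefix = "tests/data/gpt2/meg-gpt2-openwebtext_text_document"
# memory-map the .bin/.idx pair produced by preprocess_data.py
ds = indexed_dataset.make_dataset(prefix, impl="mmap", skip_warmup=True)
print(f"number of documents: {len(ds)}")
print(f"first 16 token ids of doc 0: {ds[0][:16]}")
```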
#!/usr/bin/env python
# generate a jsonl version of a small slice of a dataset that can be fed to megatron-lm preprocessor
import sys
from datasets import load_dataset
dataset_name = "stas/openwebtext-10k"
# subset to jsonlines
n_samples = 1000
ds = load_dataset(dataset_name, split='train')
ds_small = ds.select(range(n_samples))
path = f"openwebtext-{n_samples}.jsonl"
ds_small.to_json(path, orient="records", lines=True)
#!/usr/bin/env python
# produce a tiny tokenizer which we can use in testing (so that it won't take much space in the repo)
import json
from transformers import AutoTokenizer
from tokenizers import Tokenizer
mname = "gpt2"
vocab_keep_items = 5000
tokenizer = AutoTokenizer.from_pretrained(mname, use_fast=True)
assert tokenizer.is_fast, "This only works for fast tokenizers."
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
vocab = tokenizer_json["model"]["vocab"]
if tokenizer_json["model"]["type"] == "BPE":
if "gpt2" in mname:
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items-1 }
new_vocab["<|endoftext|>"] = vocab_keep_items-1
else:
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
merges = tokenizer_json["model"]["merges"]
new_merges = []
for i in range(len(merges)):
a, b = merges[i].split()
new_token = "".join((a, b))
if a in new_vocab and b in new_vocab and new_token in new_vocab:
new_merges.append(merges[i])
tokenizer_json["model"]["merges"] = new_merges
elif tokenizer_json["model"]["type"] == "Unigram":
new_vocab = vocab[:vocab_keep_items]
elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel":
new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items }
else:
raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}")
tokenizer_json["model"]["vocab"] = new_vocab
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
tokenizer.save_pretrained("tiny")
# Tools
- [sample_idxs_to_text.py](./sample_idxs_to_text.py) - want to see which text was fed at specific iterations, for example to understand why the training went astray? Then use this script. The preamble of the script contains the documentation and usage examples.
## A few notes on how we created the datasets:
### Creating the Json Lines text file
First you need to create a jsonl file containing your dataset. For this we export from the HF datasets format. For example, for C4:
```
from datasets import load_dataset
c4 = load_dataset("c4", "en")
c4["train"].to_json("c4_en_train.jsonl")
c4["validation"].to_json("c4_en_valid.jsonl")
```
This creates quite a large file compared to the size of the HF dataset on disk (810GB vs 305GB for C4, for example).
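For reference, the jsonl file is simply one JSON object per line, with the text under the `text` key (which is what `preprocess_data.py` reads by default via `--json-keys`). A minimal hand-rolled sketch, with `tiny_corpus.jsonl` as a purely illustrative name:
```
import json

# two toy documents in the format preprocess_data.py expects by default
records = [
    {"text": "First document goes here."},
    {"text": "Second document goes here."},
]
with open("tiny_corpus.jsonl", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")
```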
### Megatron pre-processing
Then you need to pass that text file to the `preprocess_data.py` script for tokenization and memory-mapping. This creates two files: one storing the token indices and one storing the document starts and ends. The result will be slightly bigger than the text dataset (360GB vs 305GB for C4, for example). You can choose one of the default Megatron tokenizers (but then you have to pass merges and vocab files) or one from HF tokenizers. For example, for our GPT-like models, reusing a T5 sentencepiece-bpe tokenizer:
`python tools/preprocess_data.py --input ~/c4_en_train.jsonl --output-prefix c4_en_train --dataset-impl mmap --tokenizer-type PretrainedFromHF --tokenizer-name-or-path t5-small --workers 30 --append-eod`
Do note that adding too many workers can be counterproductive for a very large dataset: as disk writing becomes the bottleneck, the intermediate results from the worker processes pile up and can flood the RAM. In our experiments on GCP machines, running with 60 workers on C4 inevitably led the program to fail.
# Introduction
This folder is a collection of scripts for converting checkpoints of one training framework (e.g., DeepSpeed) into that of a different framework (e.g., Megatron-LM, HF Transformers).
The folder also contains scripts for inspecting checkpoint files and folders, which can be useful when developing checkpoint conversion logic. At the time of creation, this folder contains scripts to convert DeepSpeed checkpoints to Megatron-LM and HF Transformers checkpoints (the need for this, as part of the BigScience project, motivated the effort).
Here is the list of checkpoint conversions provided by the available scripts:
1. [Megatron-DeepSpeed to Megatron-LM](#Megatron-DeepSpeed-to-Megatron)
1. [Megatron-DeepSpeed to HF Transformers](#Megatron-DeepSpeed-to-HF-Transformers)
## Megatron-DeepSpeed to Megatron
The (current implementation of the) converter extracts the args and model parameters from a DeepSpeed checkpoint (i.e., it excludes other training state such as the optimizer and scheduler) and converts them into a Megatron-LM checkpoint that similarly contains only model parameters. The converter also provides a best-effort attempt to reshape the tensor-parallelism and pipeline-parallelism degrees of the checkpoint. The resulting Megatron-LM checkpoint can be loaded into the Megatron-LM framework for finetuning or inference. Tensor parallelism (TP) and pipeline parallelism (PP) are supported in the sense that, by default, the generated Megatron-LM checkpoint (folders and files) will have the same TP and PP as the training that created the input DeepSpeed checkpoint. The entry point of the converter is `deepspeed_to_megatron.py`, which has the following usage:
```bash
python tools/convert_checkpoint/deepspeed_to_megatron.py -h
Convert DeepSpeed Checkpoint to Megatron Checkpoint
usage: deepspeed_to_megatron.py [-h] [--input_folder INPUT_FOLDER]
[--output_folder OUTPUT_FOLDER]
[--target_tp TARGET_TP]
[--target_pp TARGET_PP] [--for_release]
optional arguments:
-h, --help show this help message and exit
--input_folder INPUT_FOLDER
Input DeepSpeed Checkpoint folder
--output_folder OUTPUT_FOLDER
Output Megatron checkpoint folder
--target_tp TARGET_TP
Target TP degree
--target_pp TARGET_PP
Target PP degree
--for_release Convert for release purpose, reset some (progress)
counters.
```
The following scripts, which proved useful for debugging, are also included; a rough single-file sketch follows the list:
1. `inspect_deepspeed_checkpoint.py`: view the contents of a DeepSpeed checkpoint folder.
2. `inspect_checkpoint.py`: view the contents of a PyTorch checkpoint file.
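If you only need a quick peek at a single checkpoint file, here is a rough single-file sketch along the same lines (not the actual helper script):
```
import sys
import torch

# load the checkpoint on CPU and print top-level keys with tensor shapes
sd = torch.load(sys.argv[1], map_location="cpu")
for key, value in sd.items():
    if torch.is_tensor(value):
        print(f"{key}: tensor {tuple(value.shape)} {value.dtype}")
    else:
        print(f"{key}: {type(value).__name__}")
```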
## Megatron-DeepSpeed to HF Transformers
In order to convert from Megatron-DeepSpeed to HF Transformers, you can do so directly using:
```bash
python tools/convert_checkpoint/deepspeed_to_transformers.py \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/transformers/checkpoint
```
Since `transformers` currently only works with PP=1/TP=1, we use the defaults `--target_tp 1 --target_pp 1`.
The script taps into `transformers` and, as of this writing, requires `transformers@master` (or `transformers==4.11` if you are reading this later and that version has been released).
Note that you may run into problems with `megatron.enums` not being defined, since `Megatron-Deepspeed` in the `bigscience-workshop` tree has diverged from the `microsoft` tree. In such cases you can fix this on the fly by ensuring the former appears first in `sys.path`. For example:
```bash
PYTHONPATH=/hf/Megatron-DeepSpeed-bigscience:/hf/Megatron-DeepSpeed-microsoft \
python tools/convert_checkpoint/deepspeed_to_transformers.py \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/transformers/checkpoint
```
Alternatively, you can convert first from Megatron-DeepSpeed to Megatron and then to HF Transformers:
```bash
# 1. Megatron-DeepSpeed to Megatron
cd /hf/Megatron-DeepSpeed-bigscience
python tools/convert_checkpoint/deepspeed_to_megatron.py --target_tp 1 --target_pp 1 \
--input_folder /path/to/Megatron-Deepspeed/checkpoint/global_step97500 \
--output_folder /path/to/Megatron/checkpoint
# 2. Megatron to HF Transformers
cd /hf/transformers
python src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py \
/path/to/Megatron/checkpoint/iter_0097500/mp_rank_00/model_optim_rng.pt
```
#!/usr/bin/env python
import sys
import argparse
import os
import torch
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
from megatron.tokenizer.tokenizer import _vocab_size_with_padding
from deepspeed.checkpoint.deepspeed_checkpoint import (
ARGS_KEY,
CHECKPOINT_INFO_KEY,
)
from deepspeed.checkpoint import (
DeepSpeedCheckpoint,
get_model_ckpt_name_for_rank,
get_zero_ckpt_name_for_rank,
get_layer_ckpt_name_for_rank
)
CHECKPOINT_FILE_SUFFIX = '_model_states.pt'
MP_WORLD_SIZE = 'mp_world_size'
WORD_EMBEDDINGS_KEY = 'word_embeddings.weight'
ORIGINAL_VOCAB_SIZE = 'original_vocab_size'
PADDED_VOCAB_SIZE = 'padded_vocab_size'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder',
default=None,
type=str,
help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder',
default=None,
type=str,
help='Output Megatron checkpoint folder')
parser.add_argument('--target_tp',
default=None,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=None,
type=int,
help='Target PP degree')
parser.add_argument('--target_dp',
default=None,
type=int,
help='Target DP degree')
args = parser.parse_args()
print(f'args = {args}')
return args
def _save_checkpoint(file_path, chkpt_sd):
dir, _ = os.path.split(file_path)
os.makedirs(dir, exist_ok=True)
torch.save(chkpt_sd, file_path)
def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index):
sd_list = ds_checkpoint.get_transformer_state(tp_index, pp_index)
layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index)
assert len(sd_list) == len(layer_id_list)
for sd, layer_id in zip(sd_list, layer_id_list):
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
layer_id=layer_id,
tp_rank=tp_index)
_save_checkpoint(ckpt_path, sd)
def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor):
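# Recompute the padded vocab size for the target TP degree and drop the extra
# padding rows from the word-embedding tensor, keeping only the rows that are needed.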
target_args = ds_checkpoint.get_args()
checkpoint_info = ds_checkpoint.get_checkpoint_info()
target_args.tensor_model_parallel_size = ds_checkpoint.tp_degree
target_args.padded_vocab_size = _vocab_size_with_padding(checkpoint_info[ORIGINAL_VOCAB_SIZE], target_args)
assert target_args.padded_vocab_size <= padded_vocab_tensor.numel()
checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size
unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size)
return unpadded_vocab_tensor.clone()
def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index):
sd = ds_checkpoint.get_embedding_state(tp_index)
if ds_checkpoint.is_change_tp_degree():
sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY])
layer_id = ds_checkpoint.get_embedding_layer_id()
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
tp_rank=tp_index,
layer_id=layer_id)
_save_checkpoint(ckpt_path, sd)
def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index):
sd = ds_checkpoint.get_final_norm_state(tp_index)
layer_id = ds_checkpoint.get_final_norm_layer_id()
ckpt_path = get_layer_ckpt_name_for_rank(
base_folder=base_folder,
tp_rank=tp_index,
layer_id=layer_id)
_save_checkpoint(ckpt_path, sd)
def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index,
pp_index):
sd = ds_checkpoint.get_2d_parallel_state(tp_index=tp_index,
pp_index=pp_index)
sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree
file_id = pp_index * ds_checkpoint.tp_degree + tp_index
ckpt_path = get_model_ckpt_name_for_rank(base_folder, f'{file_id:02d}')
# Adjust specific fields
sd[ARGS_KEY] = ds_checkpoint.get_args()
sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
sd[CHECKPOINT_INFO_KEY][PADDED_VOCAB_SIZE] = sd[ARGS_KEY].padded_vocab_size
_save_checkpoint(ckpt_path, sd)
def _create_zero_checkpoint(ds_checkpoint, base_folder, dp_index, pp_index, tp_index):
_2d_rank = (pp_index * ds_checkpoint.tp_degree) + tp_index
sd = ds_checkpoint.get_zero_checkpoint_state(
pp_index=pp_index,
tp_index=tp_index,
dp_index=dp_index)
ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder,
dp_rank=dp_index,
mp_rank=_2d_rank)
_save_checkpoint(ckpt_path, sd)
def _create_latest_file(base_folder, file_name, latest_tag):
file_path = os.path.join(base_folder, file_name)
os.makedirs(base_folder, exist_ok=True)
with open(file_path, 'w') as f:
f.write(str(latest_tag))
def main():
print(f'Convert DeepSpeed Checkpoint to DeepSpeed Checkpoint')
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to DeepSpeed checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(
args.input_folder,
args.target_tp,
args.target_pp,
args.target_dp)
iteration = ds_checkpoint.get_iteration()
latest_tag = f'global_step{iteration}'
_create_latest_file(args.output_folder,
'latest_checkpointed_iteration.txt', iteration)
_create_latest_file(args.output_folder, 'latest', latest_tag)
base_folder = os.path.join(args.output_folder, latest_tag)
for i in range(ds_checkpoint.tp_degree):
_create_embedding_layer_checkpoint(ds_checkpoint, base_folder, i)
_create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, i)
for j in range(ds_checkpoint.pp_degree):
_create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, j)
_create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j)
for i in range(ds_checkpoint.dp_degree):
for j in range(ds_checkpoint.pp_degree):
for k in range(ds_checkpoint.tp_degree):
_create_zero_checkpoint(ds_checkpoint, base_folder, i, j, k)
if __name__ == "__main__":
main()
#!/usr/bin/env python
import argparse
import os
import torch
from collections import OrderedDict
from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint
MODEL_KEY = 'model'
ARGS_KEY = 'args'
LANGUGAGE_MODEL_KEY = 'language_model'
EMBEDDING_KEY = 'embedding'
ENCODER_KEY = 'encoder'
WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head'
WORD_EMBEDDINGS_KEY = 'word_embeddings'
FINAL_LAYER_NORM_KEY = 'final_layernorm'
CHECKPOINT_VERSION_KEY = 'checkpoint_version'
CHECKPOINT_VERSION_VALUE = 3.0
ITERATION_KEY = 'iteration'
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--input_folder',
default=None,
type=str,
help='Input DeepSpeed Checkpoint folder')
parser.add_argument('--output_folder',
default=None,
type=str,
help='Output Megatron checkpoint folder')
parser.add_argument('--target_tp',
default=1,
type=int,
help='Target TP degree')
parser.add_argument('--target_pp',
default=1,
type=int,
help='Target PP degree')
parser.add_argument(
'--for_release',
action='store_true',
help='Convert for release purpose, reset some (progress) counters.')
args = parser.parse_args()
print(f'args = {args}')
return args
def _convert_ds_transformer_state(sd_list):
new_sd = OrderedDict()
for i, sd in enumerate(sd_list):
for key, value in sd.items():
new_key = f'layers.{i}.{key}'
new_sd[new_key] = value
return new_sd
def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree):
path_list = []
iter_folder = f'iter_{iteration:07d}'
for i in range(0, tp_degree):
path_list.append([])
for j in range(0, pp_degree):
rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}'
ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt')
path_list[i].append(
os.path.join(base_folder, iter_folder, ckpt_path))
return path_list
def _create_megatron_dict():
language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}}
megatron_dict = {
MODEL_KEY: {
LANGUGAGE_MODEL_KEY: language_model_dict
},
CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE
}
return megatron_dict
def _save_checkpoint(file_path, chkpt_sd):
dir, _ = os.path.split(file_path)
os.makedirs(dir, exist_ok=True)
torch.save(chkpt_sd, file_path)
def _renest_sd(sd):
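# turn flat keys like 'word_embeddings.weight' into a nested dict {'word_embeddings': {'weight': value}}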
new_sd = OrderedDict()
for key, value in sd.items():
a, b = key.split('.')
new_sd[a] = {b: value}
return new_sd
def _create_rank_checkpoint(ds_checkpoint, tp_index, pp_index, for_release=False):
meg_encoder_sd = OrderedDict()
meg_embedding_sd = OrderedDict()
meg_embedding_for_head_sd = OrderedDict()
transformer_sd = ds_checkpoint.get_transformer_state(tp_index, pp_index)
meg_encoder_sd.update(_convert_ds_transformer_state(transformer_sd))
if pp_index in [0, ds_checkpoint.pp_degree - 1]:
embedding_sd = ds_checkpoint.get_embedding_state(tp_index)
nested_embedding_sd = _renest_sd(embedding_sd)
if pp_index == 0:
meg_embedding_sd.update(nested_embedding_sd)
if pp_index == ds_checkpoint.pp_degree - 1:
for key, value in embedding_sd.items():
if key.startswith(WORD_EMBEDDINGS_KEY):
fields = key.split('.')
new_fields = fields[1:]
new_key = '.'.join(new_fields)
meg_embedding_for_head_sd[new_key] = value
final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index)
new_final_norm_sd = {
f'{FINAL_LAYER_NORM_KEY}.{key}': value
for key, value in final_norm_sd.items()
}
meg_encoder_sd.update(new_final_norm_sd)
checkpoint_sd = _create_megatron_dict()
iteration = ds_checkpoint.get_iteration()
checkpoint_sd[ITERATION_KEY] = iteration
if pp_index == 0:
checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][
EMBEDDING_KEY] = meg_embedding_sd
checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd
if pp_index == ds_checkpoint.pp_degree - 1:
checkpoint_sd[MODEL_KEY][
WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd
checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args()
# Adjust specific fields
checkpoint_sd[
ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree
checkpoint_sd[
ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree
if for_release:
checkpoint_sd[ARGS_KEY].consumed_train_samples = 0
checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0
return checkpoint_sd
def _create_latest_file(base_folder, iteration):
file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt')
os.makedirs(base_folder, exist_ok=True)
with open(file_path, 'w') as f:
f.write(str(iteration))
def main():
print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint')
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp,
args.target_pp)
iteration = ds_checkpoint.get_iteration()
_create_latest_file(args.output_folder, iteration)
checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration,
ds_checkpoint.tp_degree,
ds_checkpoint.pp_degree)
for i in range(0, ds_checkpoint.tp_degree):
for j in range(0, ds_checkpoint.pp_degree):
sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release)
_save_checkpoint(checkpoint_paths[i][j], sd)
if __name__ == "__main__":
main()
#!/usr/bin/env python
import os
import torch
import json
import sys
from pathlib import Path
# insert megatron's root dir into sys.path
root_repo_path = str(Path(__file__).resolve().parents[2])
if root_repo_path not in sys.path:
sys.path.insert(0, root_repo_path)
from deepspeed.checkpoint import DeepSpeedCheckpoint
from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments
# the import was tested to work with this version
# https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider
# copying that version here instead
from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint
from transformers import GPT2Config
def main():
# this first part comes mainly from deepspeed_to_megatron.main
args = parse_arguments()
print(
f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}'
)
ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp,
args.target_pp)
iteration = ds_checkpoint.get_iteration()
input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0,
args.for_release)
# the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main
# Spell out all parameters in case the defaults change.
config = GPT2Config(
vocab_size=50257,
n_positions=1024,
n_ctx=1024,
n_embd=1024,
n_layer=24,
n_head=16,
n_inner=4096,
activation_function="gelu", # used to be "gelu_new" in earlier versions
resid_pdrop=0.1,
embd_pdrop=0.1,
attn_pdrop=0.1,
layer_norm_epsilon=1e-5,
initializer_range=0.02,
summary_type="cls_index",
summary_use_proj=True,
summary_activation=None,
summary_proj_to_labels=True,
summary_first_dropout=0.1,
scale_attn_weights=True,
gradient_checkpointing=False,
use_cache=True,
bos_token_id=50256,
eos_token_id=50256,
)
# Convert.
print("Converting to HF Checkpoint")
output_state_dict = convert_megatron_checkpoint(args, input_state_dict,
config)
basename = args.output_folder
os.makedirs(basename, exist_ok=True)
# Print the structure of converted state dict.
#if args.print_checkpoint_structure:
# recursive_print(None, output_state_dict)
# Store the config to file.
output_config_file = os.path.join(basename, "config.json")
output_config = config.to_dict()
output_config["architectures"] = ["GPT2LMHeadModel"]
output_config["model_type"] = "gpt2"
print(f'Saving config to "{output_config_file}"')
with open(output_config_file, "w") as f:
json.dump(output_config, f)
# Store the state_dict to file.
output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
print(f'Saving checkpoint to "{output_checkpoint_file}"')
torch.save(output_state_dict, output_checkpoint_file)
print("Now add tokenizer files and upload to the hub")
if __name__ == "__main__":
main()