Commit 8ec5d678 authored by hepj987

GPT2 based on Megatron-DeepSpeed
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model and data parallel groups."""
import torch
from .utils import ensure_divisibility
# Intra-layer model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Inter-layer model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# Model parallel group (both intra- and pipeline) that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Embedding group.
_EMBEDDING_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
# These values enable us to change the mpu sizes on the fly.
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_TENSOR_MODEL_PARALLEL_RANK = None
_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
# A list of global ranks for each pipeline group to ease calculation of the source
# rank when broadcasting from the first or last pipeline stage
_PIPELINE_GLOBAL_RANKS = None
def is_unitialized():
"""Useful for code segments that may be accessed with or without mpu initialization"""
return _DATA_PARALLEL_GROUP is None
def initialize_model_parallel(tensor_model_parallel_size_=1,
pipeline_model_parallel_size_=1,
virtual_pipeline_model_parallel_size_=None):
"""
Initialize model data parallel groups.
Arguments:
tensor_model_parallel_size: number of GPUs used to parallelize model tensor.
pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline.
Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
the model pipeline. The present function will
create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
and 8 data-parallel groups as:
8 data_parallel groups:
[g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
8 tensor model-parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
4 pipeline model-parallel groups:
[g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
Note that for efficiency, the caller should make sure adjacent ranks
are on the same DGX box. For example if we are using 2 DGX-1 boxes
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
ranks 8 to 15 belong to the second box.
"""
if torch.distributed.get_rank() == 0:
print('> initializing tensor model parallel with size {}'.format(
tensor_model_parallel_size_))
print('> initializing pipeline model parallel with size {}'.format(
pipeline_model_parallel_size_))
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size)
pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size)
ensure_divisibility(world_size,
tensor_model_parallel_size * pipeline_model_parallel_size)
data_parallel_size = world_size // (tensor_model_parallel_size *
pipeline_model_parallel_size)
num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
num_data_parallel_groups = world_size // data_parallel_size
if virtual_pipeline_model_parallel_size_ is not None:
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_
rank = torch.distributed.get_rank()
# Build the data-parallel groups.
global _DATA_PARALLEL_GROUP
assert _DATA_PARALLEL_GROUP is None, \
'data parallel group is already initialized'
all_data_parallel_group_ranks = []
for i in range(pipeline_model_parallel_size):
start_rank = i * num_pipeline_model_parallel_groups
end_rank = (i + 1) * num_pipeline_model_parallel_groups
for j in range(tensor_model_parallel_size):
ranks = range(start_rank + j, end_rank,
tensor_model_parallel_size)
all_data_parallel_group_ranks.append(list(ranks))
group = torch.distributed.new_group(ranks)
if rank in ranks:
_DATA_PARALLEL_GROUP = group
# Build the model-parallel groups.
global _MODEL_PARALLEL_GROUP
assert _MODEL_PARALLEL_GROUP is None, \
'model parallel group is already initialized'
for i in range(data_parallel_size):
ranks = [data_parallel_group_ranks[i]
for data_parallel_group_ranks in all_data_parallel_group_ranks]
group = torch.distributed.new_group(ranks)
if rank in ranks:
_MODEL_PARALLEL_GROUP = group
# Build the tensor model-parallel groups.
global _TENSOR_MODEL_PARALLEL_GROUP
assert _TENSOR_MODEL_PARALLEL_GROUP is None, \
'tensor model parallel group is already initialized'
for i in range(num_tensor_model_parallel_groups):
ranks = range(i * tensor_model_parallel_size,
(i + 1) * tensor_model_parallel_size)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_TENSOR_MODEL_PARALLEL_GROUP = group
# Build the pipeline model-parallel groups and embedding groups
# (first and last rank in each pipeline model-parallel group).
global _PIPELINE_MODEL_PARALLEL_GROUP
global _PIPELINE_GLOBAL_RANKS
assert _PIPELINE_MODEL_PARALLEL_GROUP is None, \
'pipeline model parallel group is already initialized'
global _EMBEDDING_GROUP
assert _EMBEDDING_GROUP is None, \
'embedding group is already initialized'
for i in range(num_pipeline_model_parallel_groups):
ranks = range(i, world_size,
num_pipeline_model_parallel_groups)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_PIPELINE_MODEL_PARALLEL_GROUP = group
_PIPELINE_GLOBAL_RANKS = ranks
# Setup embedding group (to exchange gradients between
# first and last stages).
if len(ranks) > 1:
embedding_ranks = [ranks[0], ranks[-1]]
else:
embedding_ranks = ranks
group = torch.distributed.new_group(embedding_ranks)
if rank in embedding_ranks:
_EMBEDDING_GROUP = group
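
# The helper below is an illustrative sketch only (not part of the original
# Megatron API): it mirrors the group-enumeration loops above without creating
# any torch.distributed groups, so the layout can be inspected on a single
# process. With world_size=16, tp=2, pp=4 it reproduces the groups listed in
# the docstring of initialize_model_parallel().
def _example_parallel_group_layout(world_size=16, tp=2, pp=4):
    num_tensor_model_parallel_groups = world_size // tp
    num_pipeline_model_parallel_groups = world_size // pp
    data_parallel_groups = []
    for i in range(pp):
        start_rank = i * num_pipeline_model_parallel_groups
        end_rank = (i + 1) * num_pipeline_model_parallel_groups
        for j in range(tp):
            data_parallel_groups.append(
                list(range(start_rank + j, end_rank, tp)))
    tensor_model_parallel_groups = [
        list(range(i * tp, (i + 1) * tp))
        for i in range(num_tensor_model_parallel_groups)]
    pipeline_model_parallel_groups = [
        list(range(i, world_size, num_pipeline_model_parallel_groups))
        for i in range(num_pipeline_model_parallel_groups)]
    return (data_parallel_groups,
            tensor_model_parallel_groups,
            pipeline_model_parallel_groups)
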
def model_parallel_is_initialized():
"""Check if model and data parallel groups are initialized."""
if _TENSOR_MODEL_PARALLEL_GROUP is None or \
_PIPELINE_MODEL_PARALLEL_GROUP is None or \
_DATA_PARALLEL_GROUP is None:
return False
return True
def get_model_parallel_group():
"""Get the model parallel group the caller rank belongs to."""
assert _MODEL_PARALLEL_GROUP is not None, \
'model parallel group is not initialized'
return _MODEL_PARALLEL_GROUP
def get_tensor_model_parallel_group():
"""Get the tensor model parallel group the caller rank belongs to."""
assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \
'intra_layer_model parallel group is not initialized'
return _TENSOR_MODEL_PARALLEL_GROUP
def get_pipeline_model_parallel_group():
"""Get the pipeline model parallel group the caller rank belongs to."""
assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, \
'pipeline_model parallel group is not initialized'
return _PIPELINE_MODEL_PARALLEL_GROUP
def get_data_parallel_group():
"""Get the data parallel group the caller rank belongs to."""
assert _DATA_PARALLEL_GROUP is not None, \
'data parallel group is not initialized'
return _DATA_PARALLEL_GROUP
def get_embedding_group():
"""Get the embedding group the caller rank belongs to."""
assert _EMBEDDING_GROUP is not None, \
'embedding group is not initialized'
return _EMBEDDING_GROUP
def set_tensor_model_parallel_world_size(world_size):
"""Set the tensor model parallel size"""
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
def set_pipeline_model_parallel_world_size(world_size):
"""Set the pipeline model parallel size"""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
def get_model_parallel_world_size():
assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_world_size is only supported if PP is disabled"
return get_tensor_model_parallel_world_size()
def get_pipeline_model_parallel_world_size():
"""Return world size for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
def set_tensor_model_parallel_rank(rank):
"""Set tensor model parallel rank."""
global _MPU_TENSOR_MODEL_PARALLEL_RANK
_MPU_TENSOR_MODEL_PARALLEL_RANK = rank
def set_pipeline_model_parallel_rank(rank):
"""Set pipeline model parallel rank."""
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
_MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
def get_tensor_model_parallel_rank():
"""Return my rank for the tensor model parallel group."""
global _MPU_TENSOR_MODEL_PARALLEL_RANK
if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None:
return _MPU_TENSOR_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
def get_model_parallel_rank():
assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_rank is only supported if PP is disabled"
return get_tensor_model_parallel_rank()
def get_pipeline_model_parallel_rank():
"""Return my rank for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
def is_pipeline_first_stage(ignore_virtual=False):
"""Return True if in the first pipeline model-parallel stage, False otherwise."""
if not ignore_virtual:
if get_virtual_pipeline_model_parallel_world_size() is not None and \
get_virtual_pipeline_model_parallel_rank() != 0:
return False
return get_pipeline_model_parallel_rank() == 0
def is_pipeline_last_stage(ignore_virtual=False):
"""Return True if in the last pipeline model-parallel stage, False otherwise."""
if not ignore_virtual:
virtual_pipeline_model_parallel_world_size = \
get_virtual_pipeline_model_parallel_world_size()
if virtual_pipeline_model_parallel_world_size is not None and \
get_virtual_pipeline_model_parallel_rank() != (
virtual_pipeline_model_parallel_world_size - 1):
return False
return get_pipeline_model_parallel_rank() == (
get_pipeline_model_parallel_world_size() - 1)
def get_virtual_pipeline_model_parallel_rank():
"""Return the virtual pipeline-parallel rank."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
def set_virtual_pipeline_model_parallel_rank(rank):
"""Set the virtual pipeline-parallel rank."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
def get_virtual_pipeline_model_parallel_world_size():
"""Return the virtual pipeline-parallel world size."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
def get_tensor_model_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the tensor model parallel group."""
global_rank = torch.distributed.get_rank()
local_world_size = get_tensor_model_parallel_world_size()
return (global_rank // local_world_size) * local_world_size
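# Example (assuming a tensor model parallel world size of 4): global ranks
# 4, 5, 6 and 7 form one tensor model parallel group, and this function
# returns 4 for each of them.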
def get_pipeline_model_parallel_first_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
return _PIPELINE_GLOBAL_RANKS[0]
def get_pipeline_model_parallel_last_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
last_rank_local = get_pipeline_model_parallel_world_size() - 1
return _PIPELINE_GLOBAL_RANKS[last_rank_local]
def get_pipeline_model_parallel_next_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
def get_pipeline_model_parallel_prev_rank():
assert _PIPELINE_GLOBAL_RANKS is not None, \
"Pipeline parallel group is not initialized"
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
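# Example: if _PIPELINE_GLOBAL_RANKS is [1, 5, 9, 13], the rank whose pipeline
# rank is 1 (global rank 5) has next rank 9 and previous rank 1; the modulo
# wrap-around makes the last stage's "next" rank the first stage.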
def get_data_parallel_world_size():
"""Return world size for the data parallel group."""
return torch.distributed.get_world_size(group=get_data_parallel_group())
def get_data_parallel_rank():
"""Return my rank for the data parallel group."""
return torch.distributed.get_rank(group=get_data_parallel_group())
def destroy_model_parallel():
"""Set the groups to none."""
global _TENSOR_MODEL_PARALLEL_GROUP
_TENSOR_MODEL_PARALLEL_GROUP = None
global _PIPELINE_MODEL_PARALLEL_GROUP
_PIPELINE_MODEL_PARALLEL_GROUP = None
global _DATA_PARALLEL_GROUP
_DATA_PARALLEL_GROUP = None
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import math
import torch
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter
from functools import partial
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size
from .mappings import copy_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region
from .random import get_cuda_rng_tracker
from .utils import divide
from .utils import split_tensor_along_last_dim
from .utils import VocabUtility
from ..model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from megatron import get_args, mpu
import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing
_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
'partition_dim': -1,
'partition_stride': 1}
def param_is_not_tensor_parallel_duplicate(param):
return (hasattr(param, 'tensor_model_parallel') and
param.tensor_model_parallel) or (
get_tensor_model_parallel_rank() == 0)
def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
# Make sure the attributes are not set.
for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
assert not hasattr(tensor, attribute)
# Set the attributes.
setattr(tensor, 'tensor_model_parallel', is_parallel)
setattr(tensor, 'partition_dim', dim)
setattr(tensor, 'partition_stride', stride)
def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
def maybe_set(attribute, value):
if not hasattr(tensor, attribute):
setattr(tensor, attribute, value)
for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])
def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
def maybe_copy(attribute):
if hasattr(source_tensor, attribute):
setattr(destination_tensor, attribute,
getattr(source_tensor, attribute))
for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_copy(attribute)
def _initialize_affine_weight_gpu(weight, init_method,
partition_dim, stride=1):
"""Initialize affine weight for model parallel on GPU."""
set_tensor_model_parallel_attributes(tensor=weight,
is_parallel=True,
dim=partition_dim,
stride=stride)
if ds_checkpointing.is_configured():
global get_cuda_rng_tracker
get_cuda_rng_tracker = ds_checkpointing.get_cuda_rng_tracker
with get_cuda_rng_tracker().fork():
init_method(weight)
def _initialize_affine_weight_cpu(weight, output_size, input_size,
per_partition_size, partition_dim,
init_method, stride=1,
return_master_weight=False):
"""Initialize affine weight for model parallel.
Build the master weight on all processes and scatter
the relevant chunk."""
set_tensor_model_parallel_attributes(tensor=weight,
is_parallel=True,
dim=partition_dim,
stride=stride)
# Initialize master weight
master_weight = torch.empty(output_size, input_size,
dtype=torch.float,
requires_grad=False)
init_method(master_weight)
args = get_args()
master_weight = master_weight.to(dtype=args.params_dtype)
# Split and copy
per_partition_per_stride_size = divide(per_partition_size, stride)
weight_list = torch.split(master_weight, per_partition_per_stride_size,
dim=partition_dim)
rank = get_tensor_model_parallel_rank()
world_size = get_tensor_model_parallel_world_size()
my_weight_list = weight_list[rank::world_size]
with torch.no_grad():
torch.cat(my_weight_list, dim=partition_dim, out=weight)
if return_master_weight:
return master_weight
return None
def xavier_uniform_tensor_parallel_(tensor, gain=1., tp_degree=1):
r"""
This is a modified torch.nn.init.xavier_uniform_ with changes to support
partitioned on the vocab size dim embedding with tensor parallel.
Additional args:
- tp_degree: degree of tensor parallel
Note: the code assumes all partitions are equal in size
"""
# receptive_field_size=1 as dim==2, so we don't need init._calculate_fan_in_and_fan_out
fan_out, fan_in = tensor.shape
fan_out *= tp_degree # tp splits on num_embeddings dim
std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
a = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation
return torch.nn.init._no_grad_uniform_(tensor, -a, a)
class VocabParallelEmbedding(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
This is mainly adapted from torch.nn.Embedding and all the default
values are kept.
Arguments:
num_embeddings: vocabulary size.
embedding_dim: size of hidden state.
init_method: method to initialize weights.
"""
def __init__(self, num_embeddings, embedding_dim,
init_method=init.xavier_normal_):
super(VocabParallelEmbedding, self).__init__()
# Keep the input dimensions.
self.num_embeddings = num_embeddings
self.embedding_dim = embedding_dim
# Set the defaults for compatibility.
self.padding_idx = None
self.max_norm = None
self.norm_type = 2.
self.scale_grad_by_freq = False
self.sparse = False
self._weight = None
self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
# Divide the weight matrix along the vocabulary dimension.
self.vocab_start_index, self.vocab_end_index = \
VocabUtility.vocab_range_from_global_vocab_size(
self.num_embeddings, get_tensor_model_parallel_rank(),
self.tensor_model_parallel_size)
self.num_embeddings_per_partition = self.vocab_end_index - \
self.vocab_start_index
# Allocate weights and initialize.
args = get_args()
# only the first stage embedding runs this class' forward. The head's embedding does its own
# thing, so don't waste memory allocating LN weights.
if mpu.is_pipeline_first_stage() and (args.use_bnb_optimizer or args.embed_layernorm):
self.norm = LayerNorm(embedding_dim)
if args.use_bnb_optimizer:
# for BNB we ignore the passed init_method and use torch.nn.init.xavier_uniform_
# modified to calculate std on the unpartitioned embedding
init_method = partial(xavier_uniform_tensor_parallel_, tp_degree=self.tensor_model_parallel_size)
if args.use_cpu_initialization:
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
dtype=args.params_dtype))
_initialize_affine_weight_cpu(
self.weight, self.num_embeddings, self.embedding_dim,
self.num_embeddings_per_partition, 0, init_method)
else:
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
device=torch.cuda.current_device(), dtype=args.params_dtype))
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=1)
if args.use_bnb_optimizer:
from bitsandbytes.optim import GlobalOptimManager
GlobalOptimManager.get_instance().override_config(self.weight, 'optim_bits', 32)
GlobalOptimManager.get_instance().register_parameters(self.weight)
def forward(self, input_):
if torch.any(input_ >= self.num_embeddings):
raise ValueError(f"There is an input id in the input that is greater than the highest possible input id.\nInput: {input_}\nnum_embeddings: {self.num_embeddings}")
if self.tensor_model_parallel_size > 1:
# Build the mask.
input_mask = (input_ < self.vocab_start_index) | \
(input_ >= self.vocab_end_index)
# Mask the input.
masked_input = input_.clone() - self.vocab_start_index
masked_input[input_mask] = 0
else:
            # input_ is guaranteed to be in the range [0, self.vocab_end_index - self.vocab_start_index) thanks to the check above
masked_input = input_
# Get the embeddings.
output_parallel = F.embedding(masked_input, self.weight,
self.padding_idx, self.max_norm,
self.norm_type, self.scale_grad_by_freq,
self.sparse)
# Mask the output embedding.
if self.tensor_model_parallel_size > 1:
output_parallel[input_mask, :] = 0.0
# Reduce across all the model parallel GPUs.
output = reduce_from_tensor_model_parallel_region(output_parallel)
if hasattr(self, 'norm'):
output = self.norm(output)
return output
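
# Illustrative single-process sketch (hypothetical helper, not part of the
# original class): the vocab-range masking that forward() applies on one
# tensor-parallel rank, factored out so it can be checked without
# torch.distributed.
def _example_vocab_mask(input_, vocab_start_index, vocab_end_index):
    """Return (masked_input, input_mask) the way forward() builds them."""
    input_mask = (input_ < vocab_start_index) | (input_ >= vocab_end_index)
    masked_input = input_.clone() - vocab_start_index
    masked_input[input_mask] = 0
    return masked_input, input_mask
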
class ColumnParallelLinear(torch.nn.Module):
"""Linear layer with column parallelism.
The linear layer is defined as Y = XA + b. A is parallelized along
its second dimension as A = [A_1, ..., A_p].
Arguments:
input_size: first dimension of matrix A.
output_size: second dimension of matrix A.
bias: If true, add bias
        gather_output: If true, call all-gather on output and make Y available
to all GPUs, otherwise, every GPU will have its output
which is Y_i = XA_i
init_method: method to initialize weights. Note that bias is always set
to zero.
stride: For the strided linear layers.
keep_master_weight_for_test: This was added for testing and should be
set to False. It returns the master weights
used for initialization.
        skip_bias_add: This was added to enable performance optimizations where bias
                       can be fused with other elementwise operations. We skip
                       adding the bias here and return it instead.
"""
def __init__(self, input_size, output_size, bias=True, gather_output=True,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False):
super(ColumnParallelLinear, self).__init__()
# Keep input parameters
self.input_size = input_size
self.output_size = output_size
self.gather_output = gather_output
# Divide the weight matrix along the last dimension.
world_size = get_tensor_model_parallel_world_size()
self.output_size_per_partition = divide(output_size, world_size)
self.skip_bias_add = skip_bias_add
# Parameters.
# Note: torch.nn.functional.linear performs XA^T + b and as a result
# we allocate the transpose.
# Initialize weight.
args = get_args()
if args.use_cpu_initialization:
self.weight = Parameter(torch.empty(self.output_size_per_partition,
self.input_size,
dtype=args.params_dtype))
self.master_weight = _initialize_affine_weight_cpu(
self.weight, self.output_size, self.input_size,
self.output_size_per_partition, 0, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test)
else:
self.weight = Parameter(torch.empty(
self.output_size_per_partition, self.input_size,
device=torch.cuda.current_device(), dtype=args.params_dtype))
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=stride)
if bias:
if args.use_cpu_initialization:
self.bias = Parameter(torch.empty(
self.output_size_per_partition, dtype=args.params_dtype))
else:
self.bias = Parameter(torch.empty(
self.output_size_per_partition,
device=torch.cuda.current_device(),
dtype=args.params_dtype))
set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)
def forward(self, input_):
# Set up backprop all-reduce.
input_parallel = copy_to_tensor_model_parallel_region(input_)
# Matrix multiply.
bias = self.bias if not self.skip_bias_add else None
output_parallel = F.linear(input_parallel, self.weight, bias)
if self.gather_output:
# All-gather across the partitions.
output = gather_from_tensor_model_parallel_region(output_parallel)
else:
output = output_parallel
output_bias = self.bias if self.skip_bias_add else None
return output, output_bias
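
# Single-process sketch (illustrative only, not part of the original module) of
# the column-parallel identity the class relies on: if A is split column-wise
# into [A_1, ..., A_p], concatenating the partial products X @ A_i along the
# last dimension equals X @ A, which is what
# gather_from_tensor_model_parallel_region recovers.
def _example_column_parallel_identity(p=2, n=4, k=6, m=8):
    x = torch.randn(n, k)
    a = torch.randn(k, m)
    partials = [x @ a_i for a_i in torch.chunk(a, p, dim=1)]
    return torch.allclose(torch.cat(partials, dim=-1), x @ a, atol=1e-5)
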
class RowParallelLinear(torch.nn.Module):
"""Linear layer with row parallelism.
The linear layer is defined as Y = XA + b. A is parallelized along
its first dimension and X along its second dimension as:
- -
| A_1 |
| . |
A = | . | X = [X_1, ..., X_p]
| . |
| A_p |
- -
Arguments:
input_size: first dimension of matrix A.
output_size: second dimension of matrix A.
bias: If true, add bias. Note that bias is not parallelized.
input_is_parallel: If true, we assume that the input is already
split across the GPUs and we do not split
again.
init_method: method to initialize weights. Note that bias is always set
to zero.
stride: For the strided linear layers.
keep_master_weight_for_test: This was added for testing and should be
set to False. It returns the master weights
used for initialization.
        skip_bias_add: This was added to enable performance optimizations where bias
                       can be fused with other elementwise operations. We skip
                       adding the bias here and return it instead.
"""
def __init__(self, input_size, output_size, bias=True,
input_is_parallel=False,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False):
super(RowParallelLinear, self).__init__()
# Keep input parameters
self.input_size = input_size
self.output_size = output_size
self.input_is_parallel = input_is_parallel
# Divide the weight matrix along the last dimension.
world_size = get_tensor_model_parallel_world_size()
self.input_size_per_partition = divide(input_size, world_size)
self.skip_bias_add = skip_bias_add
# Parameters.
# Note: torch.nn.functional.linear performs XA^T + b and as a result
# we allocate the transpose.
# Initialize weight.
args = get_args()
if args.use_cpu_initialization:
self.weight = Parameter(torch.empty(self.output_size,
self.input_size_per_partition,
dtype=args.params_dtype))
self.master_weight = _initialize_affine_weight_cpu(
self.weight, self.output_size, self.input_size,
self.input_size_per_partition, 1, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test)
else:
self.weight = Parameter(torch.empty(
self.output_size, self.input_size_per_partition,
device=torch.cuda.current_device(), dtype=args.params_dtype))
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=1, stride=stride)
if bias:
if args.use_cpu_initialization:
self.bias = Parameter(torch.empty(self.output_size,
dtype=args.params_dtype))
else:
self.bias = Parameter(torch.empty(
self.output_size, device=torch.cuda.current_device(),
dtype=args.params_dtype))
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)
self.bias_tp_auto_sync = args.sync_tp_duplicated_parameters
def forward(self, input_):
# Set up backprop all-reduce.
if self.input_is_parallel:
input_parallel = input_
else:
input_parallel = scatter_to_tensor_model_parallel_region(input_)
# Matrix multiply.
output_parallel = F.linear(input_parallel, self.weight)
# All-reduce across all the partitions.
output_ = reduce_from_tensor_model_parallel_region(output_parallel)
if self.bias_tp_auto_sync:
torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
if not self.skip_bias_add:
output = output_ + self.bias if self.bias is not None else output_
output_bias = None
else:
output = output_
output_bias = self.bias
return output, output_bias
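
# Single-process sketch (illustrative only, not part of the original module) of
# the row-parallel identity the class relies on: if A is split row-wise and X
# column-wise, the sum of the partial products X_i @ A_i equals X @ A, which is
# the all-reduce performed by reduce_from_tensor_model_parallel_region in
# forward().
def _example_row_parallel_identity(p=2, n=4, k=6, m=8):
    x = torch.randn(n, k)
    a = torch.randn(k, m)
    partials = [x_i @ a_i
                for x_i, a_i in zip(torch.chunk(x, p, dim=1),
                                    torch.chunk(a, p, dim=0))]
    return torch.allclose(sum(partials), x @ a, atol=1e-5)
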
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .initialize import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank
from .utils import split_tensor_along_last_dim
def _reduce(input_):
"""All-reduce the the input tensor across model parallel group."""
# Bypass the function if we are using only 1 GPU.
if get_tensor_model_parallel_world_size()==1:
return input_
# All-reduce.
torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
return input_
def _split(input_):
"""Split the tensor along its last dimension and keep the
corresponding slice."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size==1:
return input_
# Split along last dimension.
input_list = split_tensor_along_last_dim(input_, world_size)
# Note: torch.split does not create contiguous tensors by default.
rank = get_tensor_model_parallel_rank()
output = input_list[rank].contiguous()
return output
def _gather(input_):
"""Gather tensors and concatinate along the last dimension."""
world_size = get_tensor_model_parallel_world_size()
# Bypass the function if we are using only 1 GPU.
if world_size==1:
return input_
# Size and dimension.
last_dim = input_.dim() - 1
rank = get_tensor_model_parallel_rank()
tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
tensor_list[rank] = input_
torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())
# Note: torch.cat already creates a contiguous tensor.
output = torch.cat(tensor_list, dim=last_dim).contiguous()
return output
class _CopyToModelParallelRegion(torch.autograd.Function):
"""Pass the input to the model parallel region."""
@staticmethod
def symbolic(graph, input_):
return input_
@staticmethod
def forward(ctx, input_):
return input_
@staticmethod
def backward(ctx, grad_output):
return _reduce(grad_output)
class _ReduceFromModelParallelRegion(torch.autograd.Function):
"""All-reduce the input from the model parallel region."""
@staticmethod
def symbolic(graph, input_):
return _reduce(input_)
@staticmethod
def forward(ctx, input_):
return _reduce(input_)
@staticmethod
def backward(ctx, grad_output):
return grad_output
class _ScatterToModelParallelRegion(torch.autograd.Function):
"""Split the input and keep only the corresponding chuck to the rank."""
@staticmethod
def symbolic(graph, input_):
return _split(input_)
@staticmethod
def forward(ctx, input_):
return _split(input_)
@staticmethod
def backward(ctx, grad_output):
return _gather(grad_output)
class _GatherFromModelParallelRegion(torch.autograd.Function):
"""Gather the input from model parallel region and concatinate."""
@staticmethod
def symbolic(graph, input_):
return _gather(input_)
@staticmethod
def forward(ctx, input_):
return _gather(input_)
@staticmethod
def backward(ctx, grad_output):
return _split(grad_output)
# -----------------
# Helper functions.
# -----------------
def copy_to_tensor_model_parallel_region(input_):
return _CopyToModelParallelRegion.apply(input_)
def reduce_from_tensor_model_parallel_region(input_):
return _ReduceFromModelParallelRegion.apply(input_)
def scatter_to_tensor_model_parallel_region(input_):
return _ScatterToModelParallelRegion.apply(input_)
def gather_from_tensor_model_parallel_region(input_):
return _GatherFromModelParallelRegion.apply(input_)
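
# Summary of the forward/backward duality implemented above (each helper's
# backward is its conjugate collective):
#   copy_to_tensor_model_parallel_region      forward: identity    backward: all-reduce
#   reduce_from_tensor_model_parallel_region  forward: all-reduce  backward: identity
#   scatter_to_tensor_model_parallel_region   forward: split       backward: gather
#   gather_from_tensor_model_parallel_region  forward: gather      backward: split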
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import contextlib
import torch
from torch import _C
from torch.cuda import _lazy_call, device as device_ctx_manager
from torch.utils.checkpoint import detach_variable
from megatron import get_args
from megatron.memory import allocate_mem_buff
from .initialize import get_data_parallel_rank
from .initialize import get_tensor_model_parallel_group
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size
# Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
# Whether to apply model parallelism to checkpointed hidden states.
_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None
def init_checkpointed_activations_memory_buffer():
"""Initialize the memory buffer for the checkpointed activations."""
args = get_args()
upper_bound_sequence_length = max(
args.seq_length if args.seq_length is not None else 0,
args.decoder_seq_length if args.decoder_seq_length is not None else 0
)
per_layer = args.micro_batch_size * upper_bound_sequence_length * \
args.hidden_size // args.tensor_model_parallel_size
assert args.num_layers % args.checkpoint_num_layers == 0, \
'number of layers is not divisible by checkpoint-num-layers'
num_checkpointer_layers = args.num_layers // args.checkpoint_num_layers
numel = per_layer * num_checkpointer_layers
dtype = torch.half
if not args.fp16:
dtype = torch.float
global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER
assert _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None, \
'checkpointed activations memory buffer is already allocated.'
_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = allocate_mem_buff(
'checkpointed activations', numel, dtype, track_usage=False)
def reset_checkpointed_activations_memory_buffer():
"""Reset the memory used for checkpointing."""
if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.reset()
def _set_cuda_rng_state(new_state, device=-1):
"""Sets the random number generator state of the current GPU.
    Arguments:
new_state (torch.ByteTensor): The desired state
This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
with a single change: the input state is not cloned. Cloning caused
    major performance issues in runs with 4 or more GPUs.
"""
if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
# older PyTorch
def cb():
with device_ctx_manager(device):
_C._cuda_setRNGState(new_state)
else:
# newer PyTorch
if device == -1:
device = torch.device('cuda')
elif isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device('cuda', device)
def cb():
idx = device.index
if idx is None:
idx = torch.cuda.current_device()
default_generator = torch.cuda.default_generators[idx]
default_generator.set_state(new_state)
_lazy_call(cb)
def split_tensor_into_1d_equal_chunks(tensor):
"""Break a tensor into equal 1D chunks."""
data = tensor.view(-1)
partition_size = torch.numel(data) // get_tensor_model_parallel_world_size()
start_index = partition_size * get_tensor_model_parallel_rank()
end_index = start_index + partition_size
return data[start_index:end_index]
def gather_split_1d_tensor(tensor):
"""Opposite of above function, gather values from model parallel ranks."""
world_size = get_tensor_model_parallel_world_size()
numel = torch.numel(tensor)
numel_gathered = world_size * numel
gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
device=torch.cuda.current_device(),
requires_grad=False)
chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)]
torch.distributed.all_gather(chunks, tensor,
group=get_tensor_model_parallel_group())
return gathered
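
# Note: assuming tensor.numel() is divisible by the tensor model parallel world
# size and every rank passes its own chunk from split_tensor_into_1d_equal_chunks,
# gather_split_1d_tensor returns the flattened contents of the original tensor
# on every rank (the caller reshapes it, as CheckpointFunction.backward does below).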
class CudaRNGStatesTracker:
"""Tracker for the cuda RNG states.
Using the `add` method, a cuda rng state is initialized based on
the input `seed` and is assigned to `name`. Later, by forking the
rng state, we can perform operations and return to our starting
cuda state.
"""
def __init__(self):
# Map from a string name to the cuda rng state.
self.states_ = {}
        # Seeds are just for bookkeeping and to ensure no seed is set twice.
self.seeds_ = set()
def reset(self):
"""Set to the initial state (no tracker)."""
self.states_ = {}
self.seeds_ = set()
def get_states(self):
"""Get rng states. Copy the dictionary so we have direct
pointers to the states, not just a pointer to the dictionary."""
states = {}
for name in self.states_:
states[name] = self.states_[name]
return states
def set_states(self, states):
"""Set the rng states. For efficiency purposes, we do not check
the size of seed for compatibility."""
self.states_ = states
def add(self, name, seed):
"""Track the rng state."""
# Check seed is not already used.
if seed in self.seeds_:
raise Exception('seed {} already exists'.format(seed))
self.seeds_.add(seed)
# Check that state is not already defined.
if name in self.states_:
raise Exception('cuda rng state {} already exists'.format(name))
# Get the current rng state.
orig_rng_state = torch.cuda.get_rng_state()
# Set the new state and store it.
torch.cuda.manual_seed(seed)
self.states_[name] = torch.cuda.get_rng_state()
# Reset rng state to what it was.
_set_cuda_rng_state(orig_rng_state)
@contextlib.contextmanager
def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
"""Fork the cuda rng state, perform operations, and exit with
the original state."""
# Check if we have added the state
if name not in self.states_:
print(name, self.states_)
raise Exception('cuda rng state {} is not added'.format(name))
# Store current rng state.
orig_cuda_rng_state = torch.cuda.get_rng_state()
# Set rng state to the desired one
_set_cuda_rng_state(self.states_[name])
# Do the stuff we wanted to do.
try:
yield
finally:
# Update the current rng state for later use.
self.states_[name] = torch.cuda.get_rng_state()
# And set the state to the original state we started with.
_set_cuda_rng_state(orig_cuda_rng_state)
# RNG tracker object.
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
def get_cuda_rng_tracker():
"""Get cuda rng tracker."""
return _CUDA_RNG_STATE_TRACKER
def model_parallel_cuda_manual_seed(seed):
"""Initialize model parallel cuda seed.
This function should be called after the model parallel is
initialized. Also, no torch.cuda.manual_seed should be called
    after this function. Basically, this is a replacement for that
    function.
    Two sets of RNG states are tracked:
        default state: This is for data parallelism and is the same among a
                       set of model parallel GPUs but different across
                       different model parallel groups. This is used, for
                       example, for dropout in the non-tensor-model-parallel regions.
        tensor-model-parallel state: This state is different among a set of model
                       parallel GPUs, but the same across data parallel
                       groups. This is used, for example, for dropout in
                       model parallel regions.
"""
# 2718 is just for fun and any POSITIVE value will work.
offset = seed + 2718
tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
# Data parallel gets the original seed.
data_parallel_seed = seed
if torch.distributed.get_rank() == 0:
print('> initializing model parallel cuda seeds on global rank {}, '
'model parallel rank {}, and data parallel rank {} with '
'model parallel seed: {} and data parallel seed: {}'.format(
torch.distributed.get_rank(), get_tensor_model_parallel_rank(),
get_data_parallel_rank(), tensor_model_parallel_seed,
data_parallel_seed), flush=True)
_CUDA_RNG_STATE_TRACKER.reset()
# Set the default state.
torch.cuda.manual_seed(data_parallel_seed)
# and model parallel state.
_CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
tensor_model_parallel_seed)
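
# Illustrative usage sketch (hypothetical helper, assuming model parallelism
# and seeds have already been initialized): operations that must draw from the
# tensor-model-parallel RNG stream, such as dropout inside model-parallel
# regions, are wrapped in the tracker's fork() context.
def _example_model_parallel_dropout(hidden_states, p=0.1):
    with get_cuda_rng_tracker().fork():
        return torch.nn.functional.dropout(hidden_states, p=p)
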
class CheckpointFunction(torch.autograd.Function):
"""This function is adapted from torch.utils.checkpoint with
two main changes:
1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
2) the states in the model parallel tracker are also properly
tracked/set/reset.
"""
@staticmethod
def forward(ctx, run_function, *args):
ctx.run_function = run_function
# Copy the rng states.
ctx.fwd_cpu_rng_state = torch.get_rng_state()
ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
with torch.no_grad():
outputs = run_function(*args)
# Divide hidden states across model parallel group and only keep
# the chunk corresponding to the current rank.
if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
ctx.input_0_shape = args[0].data.shape
args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
args[0].data)
# Store everything.
ctx.save_for_backward(*args)
return outputs
@staticmethod
def backward(ctx, *args):
if not torch.autograd._is_checkpoint_valid():
raise RuntimeError("Checkpointing is not compatible with .grad(), "
"please use .backward() if possible")
inputs = ctx.saved_tensors
if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
inputs[0].data = gather_split_1d_tensor(inputs[0].data)
inputs[0].data = inputs[0].data.view(ctx.input_0_shape)
# Store the current states.
bwd_cpu_rng_state = torch.get_rng_state()
bwd_cuda_rng_state = torch.cuda.get_rng_state()
bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
        # Set the states to what they were before the forward pass.
torch.set_rng_state(ctx.fwd_cpu_rng_state)
_set_cuda_rng_state(ctx.fwd_cuda_rng_state)
get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
# Compute the forward pass.
detached_inputs = detach_variable(inputs)
with torch.enable_grad():
outputs = ctx.run_function(*detached_inputs)
        # Set the states back to what they were at the start of this function.
torch.set_rng_state(bwd_cpu_rng_state)
_set_cuda_rng_state(bwd_cuda_rng_state)
get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
if isinstance(outputs, torch.Tensor):
outputs = (outputs,)
torch.autograd.backward(outputs, args)
grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
for inp in detached_inputs)
return (None,) + grads
def checkpoint(function, *args):
"""Checkpoint a model or part of the model.
This has been directly copied from torch.utils.checkpoint."""
return CheckpointFunction.apply(function, *args)
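
# Usage sketch (hypothetical function and arguments): the wrapped forward is
# re-run during the backward pass instead of storing its activations, e.g.
#
#   def custom_forward(hidden_states, attention_mask):
#       return transformer_layer(hidden_states, attention_mask)
#
#   output = checkpoint(custom_forward, hidden_states, attention_mask)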
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import numpy
import torch
import mpu
class IdentityLayer(torch.nn.Module):
def __init__(self, size, scale=1.0):
super(IdentityLayer, self).__init__()
self.weight = torch.nn.Parameter(scale * torch.randn(size))
def forward(self):
return self.weight
def set_random_seed(seed):
"""Set random seed for reproducability."""
random.seed(seed)
numpy.random.seed(seed)
torch.manual_seed(seed)
mpu.model_parallel_cuda_manual_seed(seed)
def initialize_distributed(backend='nccl'):
"""Initialize torch.distributed."""
# Get local rank in case it is provided.
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=None,
help='local rank passed from distributed launcher')
args = parser.parse_args()
local_rank = args.local_rank
# Get rank and world size.
rank = int(os.getenv('RANK', '0'))
world_size = int(os.getenv("WORLD_SIZE", '1'))
print('> initializing torch.distributed with local rank: {}, '
'rank: {}, world size: {}'.format(local_rank, rank, world_size))
# Set the device id.
device = rank % torch.cuda.device_count()
if local_rank is not None:
device = local_rank
torch.cuda.set_device(device)
# Call the init process.
init_method = 'tcp://'
master_ip = os.getenv('MASTER_ADDR', 'localhost')
master_port = os.getenv('MASTER_PORT', '6000')
init_method += master_ip + ':' + master_port
torch.distributed.init_process_group(
backend=backend,
world_size=world_size,
rank=rank,
init_method=init_method)
def print_separator(message):
torch.distributed.barrier()
filler_len = (78 - len(message)) // 2
filler = '-' * filler_len
string = '\n' + filler + ' {} '.format(message) + filler
if torch.distributed.get_rank() == 0:
print(string, flush=True)
torch.distributed.barrier()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../..")

from commons import set_random_seed
from commons import IdentityLayer
from commons import print_separator
from commons import initialize_distributed
from mpu.cross_entropy import vocab_parallel_cross_entropy
import mpu
import torch.nn.functional as F
import torch
import random
def torch_cross_entropy(batch_size, seq_length, vocab_size,
logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size),
scale=logits_scale).cuda()
logits = identity()
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
target.view(-1),
reduction='none').view_as(target).mean()
loss.backward()
return loss, identity.weight.grad
def mpu_cross_entropy(batch_size, seq_length, vocab_size,
logits_scale, seed):
set_random_seed(seed)
identity = IdentityLayer((batch_size, seq_length, vocab_size),
scale=logits_scale).cuda()
logits = identity()
logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
target = torch.cuda.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size)
loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
loss.backward()
return loss, identity.weight.grad
def test_cross_entropy(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing cross entropy with model parallel size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
batch_size = 13
seq_length = 17
vocab_size_per_partition = 11
logits_scale = 1000.0
vocab_size = vocab_size_per_partition * tensor_model_parallel_size
seed = 1234
loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
vocab_size, logits_scale,
seed)
loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
vocab_size, logits_scale,
seed)
error = loss_torch.sub_(loss_mpu).abs().max()
print(' max error in loss on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = grad_torch.sub_(grad_mpu).abs().max()
print(' max error in grad on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
    mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test cross entropy')
test_cross_entropy(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../..")

from commons import print_separator
from commons import initialize_distributed
from mpu import data as data_utils
import mpu
import torch
import functools
import operator
def test_broadcast_data(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing broadcast_data with model parallel size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
torch.manual_seed(1234 + mpu.get_data_parallel_rank())
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
key_size_t = {'key1': [7, 11],
'key2': [8, 2, 1],
'key3': [13],
'key4': [5, 1, 2],
'key5': [5, 12]}
keys = list(key_size_t.keys())
data = {}
data_t = {}
for key in key_size_t:
data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
data_t[key] = data[key].clone()
data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
data_t['keyX'] = data['keyX'].clone()
if mpu.get_tensor_model_parallel_rank() != 0:
data = None
data_utils._check_data_types(keys, data_t, torch.int64)
key_size, key_numel, \
total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
for key in keys:
assert key_size[key] == key_size_t[key]
total_numel_t = 0
for key in keys:
target_size = functools.reduce(operator.mul, key_size_t[key], 1)
assert key_numel[key] == target_size
total_numel_t += target_size
assert total_numel == total_numel_t
data_b = data_utils.broadcast_data(keys, data, torch.int64)
for key in keys:
tensor = data_t[key].cuda()
assert data_b[key].sub(tensor).abs().max() == 0
# Reset groups
    mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
        print_separator('test broadcast data')
test_broadcast_data(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../..")

from commons import print_separator
from commons import initialize_distributed
import mpu
import torch
def test_initialize_model_parallel(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing initialize_model_parallel with size {} ...'.format(
tensor_model_parallel_size))
tensor_model_parallel_size_ = min(tensor_model_parallel_size,
torch.distributed.get_world_size())
assert not mpu.model_parallel_is_initialized()
mpu.initialize_model_parallel(tensor_model_parallel_size_)
assert mpu.model_parallel_is_initialized()
# Checks.
def check(group, world_size, rank):
assert world_size == torch.distributed.get_world_size(group=group)
assert rank == torch.distributed.get_rank(group=group)
# Model parallel.
world_size = tensor_model_parallel_size_
rank = torch.distributed.get_rank() % tensor_model_parallel_size_
assert world_size == mpu.get_tensor_model_parallel_world_size()
assert rank == mpu.get_tensor_model_parallel_rank()
check(mpu.get_tensor_model_parallel_group(), world_size, rank)
# Data parallel.
world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
    rank = torch.distributed.get_rank() // tensor_model_parallel_size_
assert world_size == mpu.get_data_parallel_world_size()
assert rank == mpu.get_data_parallel_rank()
check(mpu.get_data_parallel_group(), world_size, rank)
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
if torch.distributed.get_rank() == 0:
print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
tensor_model_parallel_size_))
tensor_model_parallel_size = min(tensor_model_parallel_size_,
torch.distributed.get_world_size())
assert not mpu.model_parallel_is_initialized()
mpu.initialize_model_parallel(tensor_model_parallel_size)
assert mpu.model_parallel_is_initialized()
# Checks
src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
assert mpu.get_tensor_model_parallel_src_rank() == src_rank
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test initialize model parallel')
test_initialize_model_parallel(tensor_model_parallel_size)
print_separator('test model parallel source rank')
test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../..")

from mpu import layers
from commons import set_random_seed
from commons import print_separator
from commons import initialize_distributed
import mpu
from torch.nn.parameter import Parameter
import torch.nn.init as init
import torch
import random
def test_parallel_embedding(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing parallel embedding with model parallel size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
batch_size = 17
seq_length = 23
vocab_size = 48
hidden_size = 16
seed = 1236
set_random_seed(123)
input_data = torch.LongTensor(
size=(batch_size, seq_length)).random_(0, vocab_size).cuda()
loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()
set_random_seed(seed)
embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()
output = embedding_original(input_data)
loss_original = torch.mul(output, loss_weight).sum()
loss_original.backward()
set_random_seed(seed)
embedding_parallel = layers.ParallelEmbedding(
vocab_size, hidden_size, init_method=init.normal_).cuda()
output = embedding_parallel(input_data)
loss_parallel = torch.mul(output, loss_weight).sum()
loss_parallel.backward()
set_random_seed(seed)
embedding_vocab_parallel = layers.VocabParallelEmbedding(
vocab_size, hidden_size, init_method=init.normal_).cuda()
output = embedding_vocab_parallel(input_data)
loss_vocab_parallel = torch.mul(output, loss_weight).sum()
loss_vocab_parallel.backward()
torch.distributed.barrier()
error = loss_parallel.sub(loss_original).abs()
print(' error in loss (parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
torch.distributed.barrier()
error = loss_vocab_parallel.sub(loss_original).abs()
print(' error in loss (vocab parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
weight_grad_orig = torch.split(embedding_original.weight.grad,
hidden_size // tensor_model_parallel_size,
1)[mpu.get_tensor_model_parallel_rank()]
error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
print(' error in grad (parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
weight_grad_orig = torch.split(embedding_original.weight.grad,
vocab_size // tensor_model_parallel_size,
0)[mpu.get_tensor_model_parallel_rank()]
error = embedding_vocab_parallel.weight.grad.sub(
weight_grad_orig).abs().max()
print(' error in grad (vocab parallel) on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-12, 'error: {}'.format(error)
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
def test_initialize_affine_weight(tensor_model_parallel_size):
mpu.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing initialize_affine_weight with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
seed = 12345
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
# ---------------
# Column parallel
# ---------------
weight = torch.empty(output_size_coeff, input_size)
set_random_seed(seed)
layers._initialize_affine_weight(weight, output_size, input_size,
output_size_coeff, 0,
torch.nn.init.normal_)
# Target.
set_random_seed(seed)
master_weight = torch.empty(output_size, input_size)
torch.nn.init.normal_(master_weight)
rank = mpu.get_tensor_model_parallel_rank()
my_weight = torch.split(master_weight, output_size_coeff,
dim=0)[rank].contiguous().clone()
# Compare.
error = weight.sub(my_weight).abs().max()
torch.distributed.barrier()
print(' column parallel max error (should be zero) on global rank '
'{}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# ------------
# Row parallel
# ------------
weight = torch.empty(output_size, input_size_coeff)
set_random_seed(seed)
mpu.layers._initialize_affine_weight(weight, output_size, input_size,
input_size_coeff, 1,
torch.nn.init.normal_)
# Target.
set_random_seed(seed)
master_weight = torch.empty(output_size, input_size)
torch.nn.init.normal_(master_weight)
rank = mpu.get_tensor_model_parallel_rank()
my_weight = torch.split(master_weight, input_size_coeff,
dim=1)[rank].contiguous().clone()
# Compare.
error = weight.sub(my_weight).abs().max()
torch.distributed.barrier()
print(' row parallel max error (should be zero) on global rank '
'{}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
class IdentityLayer2D(torch.nn.Module):
def __init__(self, m, n):
super(IdentityLayer2D, self).__init__()
self.weight = Parameter(torch.Tensor(m, n))
torch.nn.init.xavier_normal_(self.weight)
def forward(self):
return self.weight
def test_column_parallel_linear(tensor_model_parallel_size):
mpu.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing ColumnParallelLinear with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
linear_layer = mpu.ColumnParallelLinear(
input_size, output_size, keep_master_weight_for_test=True).cuda()
loss_weight = torch.randn([batch_size, output_size]).cuda()
# Forward
input_ = identity_layer()
output = linear_layer(input_)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
# Values.
dLdY = loss_weight
X = identity_layer.weight
A = linear_layer.master_weight.cuda()
dLdA = torch.matmul(dLdY.t(), X)
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
dLdX = torch.matmul(dLdY, A)
rank = mpu.get_tensor_model_parallel_rank()
my_dLdA = torch.split(dLdA, output_size_coeff,
dim=0)[rank].contiguous().clone()
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdA on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
my_dLdb = torch.split(dLdb, output_size_coeff,
dim=0)[rank].contiguous().clone()
error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdb on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdX.sub(identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdX on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def test_row_parallel_linear(tensor_model_parallel_size):
mpu.initialize_model_parallel(tensor_model_parallel_size)
if torch.distributed.get_rank() == 0:
print('> testing RowParallelLinear with model parallel '
'size: {}'.format(tensor_model_parallel_size))
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
input_size_coeff = 13
input_size = input_size_coeff * tensor_model_parallel_size
output_size_coeff = 17
output_size = output_size_coeff * tensor_model_parallel_size
batch_size = 7
# Network
identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
linear_layer = mpu.RowParallelLinear(
input_size, output_size, keep_master_weight_for_test=True).cuda()
loss_weight = torch.randn([batch_size, output_size]).cuda()
# Forward
input_ = identity_layer()
output = linear_layer(input_)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
# Values.
dLdY = loss_weight
X = identity_layer.weight
A = linear_layer.master_weight.cuda()
dLdA = torch.matmul(dLdY.t(), X)
dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
dLdX = torch.matmul(dLdY, A)
rank = mpu.get_tensor_model_parallel_rank()
my_dLdA = torch.split(dLdA, input_size_coeff,
dim=1)[rank].contiguous().clone()
error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdA on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdb.sub(linear_layer.bias.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdb on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
error = dLdX.sub(identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' error in dLdX on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
class IdentityLayer3D(torch.nn.Module):
def __init__(self, m, n, k):
super(IdentityLayer3D, self).__init__()
self.weight = Parameter(torch.Tensor(m, n, k))
torch.nn.init.xavier_normal_(self.weight)
def forward(self):
return self.weight
def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size,
sequence_length):
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
num_att_heads = num_att_heads_per_partition * \
torch.distributed.get_world_size()
hidden_size = hidden_size_per_att_head * num_att_heads
# Network
identity_layer = IdentityLayer3D(batch_size, sequence_length,
hidden_size).cuda()
attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
dropout_prob).cuda()
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
# Forward
input_ = identity_layer()
output = attention_layer(input_, attention_mask)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
rank = mpu.get_tensor_model_parallel_rank()
mpu.destroy_model_parallel()
return rank, hidden_size, tensor_model_parallel_size, loss, \
attention_layer, identity_layer
def test_parallel_self_attention(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing ParallelSelfAttention with model parallel '
'size: {}'.format(tensor_model_parallel_size))
num_att_heads_per_partition = 3
hidden_size_per_att_head = 7
dropout_prob = 0.0 # has to be zero
batch_size = 5
sequence_length = 13
    rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
attention_layer_1, identity_layer_1 = parallel_self_attention(
1, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
rank, hidden_size, tensor_model_parallel_size, loss, \
attention_layer, identity_layer = parallel_self_attention(
tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
    assert hidden_size_1 == hidden_size
error = loss_1.sub(loss).abs().max()
torch.distributed.barrier()
print(' loss error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
my_lin_grad_list = torch.split(
attention_layer_1.query_key_value.weight.grad,
hidden_size // tensor_model_parallel_size, 0)[rank::tensor_model_parallel_size]
my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
error = my_lin_grad.sub(
attention_layer.query_key_value.weight.grad).abs().max()
torch.distributed.barrier()
print(' weight gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
error = identity_layer_1.weight.grad.sub(
identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' input gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-6
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length):
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
seed = 12345
set_random_seed(seed)
num_att_heads = num_att_heads_per_partition * \
torch.distributed.get_world_size()
hidden_size = hidden_size_per_att_head * num_att_heads
intermediate_size = 4 * hidden_size
# Network
identity_layer = IdentityLayer3D(batch_size, sequence_length,
hidden_size).cuda()
transformer_layer = mpu.BertParallelTransformerLayer(
hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
torch.nn.functional.relu, 1.0e-5).cuda()
loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
# Forward
input_ = identity_layer()
output = transformer_layer(input_, attention_mask)
loss = torch.mul(output, loss_weight).sum()
# Backward
loss.backward()
rank = mpu.get_tensor_model_parallel_rank()
mpu.destroy_model_parallel()
return rank, hidden_size, tensor_model_parallel_size, loss, \
transformer_layer, identity_layer
def test_parallel_transformer_layer(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing ParallelTransformerLayer with model parallel '
'size: {}'.format(tensor_model_parallel_size))
num_att_heads_per_partition = 3
hidden_size_per_att_head = 7
batch_size = 5
sequence_length = 13
rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
transformer_layer_1, identity_layer_1 = parallel_transformer(
1, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length)
rank, hidden_size, tensor_model_parallel_size, loss, \
transformer_layer, identity_layer = parallel_transformer(
tensor_model_parallel_size, num_att_heads_per_partition,
hidden_size_per_att_head, batch_size, sequence_length)
error = loss_1.sub(loss).abs().max()
torch.distributed.barrier()
print(' loss error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-5, 'error: {}'.format(error)
error = identity_layer_1.weight.grad.sub(
identity_layer.weight.grad).abs().max()
torch.distributed.barrier()
print(' input gradient error on global rank {}: {}'.format(
torch.distributed.get_rank(), error))
assert error < 5.0e-5, 'error: {}'.format(error)
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(' >> passed the test :-)')
if __name__ == '__main__':
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
initialize_distributed()
world_size = torch.distributed.get_world_size()
print_separator('test initialize affine weight')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
test_initialize_affine_weight(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test parallel embedding')
test_parallel_embedding(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
print_separator('test column-parallel linear')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
test_column_parallel_linear(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
print_separator('test row-parallel linear')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
test_row_parallel_linear(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
print_separator('test parallel self-attention')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
test_parallel_self_attention(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
print_separator('test parallel transformer')
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
test_parallel_transformer_layer(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from commons import print_separator
from commons import initialize_distributed
import mpu
import torch
import sys
sys.path.append("../..")
def test_set_cuda_rng_state(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing set_rng_state with size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
size = 123
seed = 1234
torch.cuda.manual_seed(1234)
tensor = torch.cuda.FloatTensor(size)
# Get the state
rng_state = torch.cuda.get_rng_state()
rng_state_copy = rng_state.clone()
# Do some stuff.
for _ in range(5):
torch.randn(size, out=tensor)
result_1 = tensor.clone()
assert rng_state.sub(rng_state_copy).max() == 0
assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
# State should be different.
new_rng_state = torch.cuda.get_rng_state()
max_diff = new_rng_state.sub(rng_state).max()
print(' max diff in rng state (should be non-zero) on global rank {}: {}'.
format(torch.distributed.get_rank(), max_diff))
assert max_diff > 0
# Reset the rng state and do the same stuff.
mpu.random._set_cuda_rng_state(rng_state)
for _ in range(5):
torch.randn(size, out=tensor)
mpu.random._set_cuda_rng_state(rng_state)
for _ in range(5):
torch.randn(size, out=tensor)
result_2 = tensor.clone()
# Results should be the same
error = result_2.sub(result_1).abs().max()
print(' max error in generated tensors (should be zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Input state should have remained intact.
error = rng_state.sub(rng_state_copy).max()
print(' max error in rng state (should be zero) on global rank {}: {}'.
format(torch.distributed.get_rank(), error))
assert error == 0
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
def test_cuda_rng_tracker(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing cuda rng tracker with size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
seed_1 = 1234
seed_2 = 4321
size = [12, 21]
tensor = torch.cuda.FloatTensor(size)
# Set to seed_1 and generate two tensors.
torch.cuda.manual_seed(seed_1)
torch.randn(size, out=tensor)
target_11 = tensor.clone()
torch.randn(size, out=tensor)
target_12 = tensor.clone()
# Set to seed_2 and generate two tensors.
torch.cuda.manual_seed(seed_2)
torch.randn(size, out=tensor)
target_21 = tensor.clone()
torch.randn(size, out=tensor)
target_22 = tensor.clone()
# Now if we interleave seed_1 and seed_2,
# we should still get the same tensors
torch.cuda.manual_seed(seed_1)
mpu.get_cuda_rng_tracker().add('test', seed_2)
torch.randn(size, out=tensor)
result_11 = tensor.clone()
with mpu.get_cuda_rng_tracker().fork('test'):
torch.randn(size, out=tensor)
result_21 = tensor.clone()
torch.randn(size, out=tensor)
result_12 = tensor.clone()
with mpu.get_cuda_rng_tracker().fork('test'):
torch.randn(size, out=tensor)
result_22 = tensor.clone()
diff = result_11.sub(result_21).abs().max()
diff = min(diff, result_12.sub(result_22).abs().max())
print(' max diff in generated tensors (should be non-zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
assert diff > 1.0e-6
error = max(result_11.sub(target_11).abs().max(),
result_12.sub(target_12).abs().max())
error = max(error, result_21.sub(target_21).abs().max())
error = max(error, result_22.sub(target_22).abs().max())
print(' max error in generated tensors (should be zero) on '
'global rank {}: {}'.format(torch.distributed.get_rank(), error))
assert error < 1.0e-6
# Reset the tracker
mpu.get_cuda_rng_tracker().reset()
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):
if torch.distributed.get_rank() == 0:
print('> testing model parallel cuda manual seed with size {} ...'.
format(tensor_model_parallel_size))
mpu.initialize_model_parallel(tensor_model_parallel_size)
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
mpu.model_parallel_cuda_manual_seed(12345)
assert torch.cuda.initial_seed() == 12345
with mpu.get_cuda_rng_tracker().fork():
assert torch.cuda.initial_seed() == (12345 + 2718 +
mpu.get_tensor_model_parallel_rank())
# Reset the tracker
mpu.get_cuda_rng_tracker().reset()
# Reset groups
mpu.destroy_model_parallel()
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print('>> passed the test :-)')
if __name__ == '__main__':
initialize_distributed()
world_size = torch.distributed.get_world_size()
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test set rng state')
test_set_cuda_rng_state(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test cuda rng tracker')
test_cuda_rng_tracker(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
tensor_model_parallel_size = 1
while tensor_model_parallel_size <= world_size:
print_separator('test model parallel cuda manual seed')
test_model_parallel_cuda_manual_seed(tensor_model_parallel_size)
tensor_model_parallel_size *= 2
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, '{} is not divisible by {}'.format(
numerator, denominator)
def divide(numerator, denominator):
"""Ensure that numerator is divisible by the denominator and return
the division value."""
ensure_divisibility(numerator, denominator)
return numerator // denominator
def split_tensor_along_last_dim(tensor, num_partitions,
contiguous_split_chunks=False):
"""Split a tensor along its last dimension.
Arguments:
tensor: input tensor.
num_partitions: number of partitions to split the tensor
contiguous_split_chunks: If True, make each chunk contiguous
in memory.
"""
# Get the size and dimension.
last_dim = tensor.dim() - 1
last_dim_size = divide(tensor.size()[last_dim], num_partitions)
# Split.
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
# Note: torch.split does not create contiguous tensors by default.
if contiguous_split_chunks:
return tuple(chunk.contiguous() for chunk in tensor_list)
return tensor_list
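# Illustrative sketch (not part of the original module): splitting a [2, 4]
# tensor into 2 partitions along the last dimension yields two [2, 2] views
# of the same storage; pass contiguous_split_chunks=True when the chunks
# will be handed to kernels that expect contiguous memory.
#
#   >>> t = torch.arange(8).view(2, 4)
#   >>> a, b = split_tensor_along_last_dim(t, 2)
#   >>> a.shape, b.shape
#   (torch.Size([2, 2]), torch.Size([2, 2]))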
class VocabUtility:
"""Split the vocabulary into `world_size` chunks amd return the
first and last index of the vocabulary belonging to the `rank`
partition: Note that indecies in [fist, last)"""
@staticmethod
def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
rank, world_size):
index_f = rank * per_partition_vocab_size
index_l = index_f + per_partition_vocab_size
return index_f, index_l
@staticmethod
def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
per_partition_vocab_size = divide(global_vocab_size, world_size)
return VocabUtility.vocab_range_from_per_partition_vocab_size(
per_partition_vocab_size, rank, world_size)
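# Illustrative sketch (not part of the original module): with a global
# vocabulary of 48 entries and a tensor-model-parallel world size of 4,
# each rank owns a contiguous half-open slice of 12 entries.
#
#   >>> VocabUtility.vocab_range_from_global_vocab_size(48, rank=1, world_size=4)
#   (12, 24)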
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD
from megatron import get_args
from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
def _get_params_for_weight_decay_optimization(modules):
"""Divide params into with-weight-decay and without-weight-decay groups.
    Layernorms and biases will have no weight decay but the rest will.
"""
weight_decay_params = {'params': []}
no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
for module in modules:
for module_ in module.modules():
if isinstance(module_, LayerNorm):
no_weight_decay_params['params'].extend(
[p for p in list(module_._parameters.values())
if p is not None])
else:
weight_decay_params['params'].extend(
[p for n, p in list(module_._parameters.items())
if p is not None and n != 'bias'])
no_weight_decay_params['params'].extend(
[p for n, p in list(module_._parameters.items())
if p is not None and n == 'bias'])
    # XXX: temporary hack to work around the crash in apex FusedAdam's multi_tensor_applier.
    #
    # It crashes when the parameter count grows beyond a certain size, which we hit at 200B
    # parameters over 80 A100 GPUs (roughly 2.7B parameters per GPU); halving the
    # weight-decay group works around the issue.
param_count = len(weight_decay_params['params'])
first_half = weight_decay_params['params'][:param_count // 2]
second_half = weight_decay_params['params'][param_count // 2:]
first_half = { 'params': first_half }
second_half = { 'params': second_half }
return first_half, second_half, no_weight_decay_params
#return weight_decay_params, no_weight_decay_params
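# Usage sketch (an assumption, not part of the original module): the groups
# returned above are plain param-group dicts, so they can be passed straight
# to any torch optimizer; the group carrying 'weight_decay': 0.0 overrides
# the optimizer-level default for layernorm and bias parameters. `my_model`
# below is a hypothetical torch.nn.Module.
#
#   >>> groups = _get_params_for_weight_decay_optimization([my_model])
#   >>> opt = torch.optim.Adam(groups, lr=1e-4, weight_decay=0.01)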
def get_megatron_optimizer(model):
args = get_args()
if args.cpu_optimizer:
raise NotImplementedError('need to add cpu adam')
# Base optimizer.
param_groups = _get_params_for_weight_decay_optimization(model)
if args.optimizer == 'adam':
if args.use_bnb_optimizer:
import bitsandbytes as bnb
adam_optimizer = bnb.optim.Adam8bit
else:
adam_optimizer = Adam
optimizer = adam_optimizer(param_groups,
lr=args.lr,
weight_decay=args.weight_decay,
betas=(args.adam_beta1, args.adam_beta2),
eps=args.adam_eps)
elif args.optimizer == 'sgd':
optimizer = SGD(param_groups,
lr=args.lr,
weight_decay=args.weight_decay,
momentum=args.sgd_momentum)
else:
raise Exception('{} optimizer is not supported.'.format(
args.optimizer))
if args.deepspeed:
return optimizer
# Determine whether the params have main-grad field.
params_have_main_grad = False
if args.DDP_impl == 'local':
params_have_main_grad = True
if args.fp16 or args.bf16:
# Grad scaler:
# if loss-scale is provided, instantiate the constant scaler.
# if we are using fp16 and loss-scale is not present, use a
# dynamic scaler.
# otherwise we are running in bf16 with no loss-scale so
# leave it as None.
grad_scaler = None
# Constant loss scale.
if args.loss_scale:
grad_scaler = ConstantGradScaler(args.loss_scale)
# Dynamic loss scale.
else:
if args.fp16:
grad_scaler = DynamicGradScaler(
initial_scale=args.initial_loss_scale,
min_scale=args.min_loss_scale,
growth_factor=2.0,
backoff_factor=0.5,
growth_interval=args.loss_scale_window,
hysteresis=args.hysteresis)
# Megatron optimizer.
return Float16OptimizerWithFloat16Params(optimizer,
args.clip_grad,
args.log_num_zeros_in_grad,
params_have_main_grad,
args.bf16,
grad_scaler)
# FP32.
return FP32Optimizer(optimizer, args.clip_grad,
args.log_num_zeros_in_grad,
params_have_main_grad)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gradient clipping."""
import torch
from torch._six import inf
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
from megatron.model.module import param_is_not_shared
from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
"""Clips gradient norm of an iterable of parameters whose gradients
are in fp32.
    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    adds functionality to handle model-parallel parameters. Note that
the gradients are modified in place.
Arguments:
parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
single Tensor that will have gradients normalized
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
Returns:
Total norm of the parameters (viewed as a single vector).
"""
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
# Filter parameters based on:
# - grad should not be none
# - parameter should not be shared
# - should not be a replica due to tensor model parallelism
grads = []
grads_for_norm = []
for param in parameters:
grad_not_none = param.grad is not None
is_not_shared = param_is_not_shared(param)
is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
grad = param.grad.detach()
if grad_not_none:
# Make sure the grads are in fp32
assert param.grad.type() == 'torch.cuda.FloatTensor'
grads.append(grad)
if grad_not_none and is_not_shared and is_not_tp_duplicate:
grads_for_norm.append(grad)
# Norm parameters.
max_norm = float(max_norm)
norm_type = float(norm_type)
total_norm = 0.0
# Calculate norm.
if norm_type == inf:
total_norm = max(grad.abs().max() for grad in grads_for_norm)
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
# Take max across all model-parallel GPUs.
torch.distributed.all_reduce(total_norm_cuda,
op=torch.distributed.ReduceOp.MAX,
group=mpu.get_model_parallel_group())
total_norm = total_norm_cuda[0].item()
else:
if norm_type == 2.0:
dummy_overflow_buf = torch.cuda.IntTensor([0])
# Use apex's multi-tensor applier for efficiency reasons.
# Multi-tensor applier takes a function and a list of list
# and performs the operation on that list all in one kernel.
grad_norm, _ = multi_tensor_applier(
amp_C.multi_tensor_l2norm,
dummy_overflow_buf,
[grads_for_norm],
False # no per-parameter norm
)
# Since we will be summing across data parallel groups,
# we need the pow(norm-type).
total_norm = grad_norm ** norm_type
else:
for grad in grads_for_norm:
grad_norm = torch.norm(grad, norm_type)
total_norm += grad_norm ** norm_type
# Sum across all model-parallel GPUs.
torch.distributed.all_reduce(total_norm,
op=torch.distributed.ReduceOp.SUM,
group=mpu.get_model_parallel_group())
total_norm = total_norm.item() ** (1.0 / norm_type)
# Scale.
clip_coeff = max_norm / (total_norm + 1.0e-6)
if clip_coeff < 1.0:
dummy_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
dummy_overflow_buf,
[grads, grads],
clip_coeff)
return total_norm
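# Worked example (a sketch, not part of the original module): if the global
# L2 norm of the gradients comes out to 8.0 and max_norm is 2.0, then
# clip_coeff = 2.0 / (8.0 + 1.0e-6) ~= 0.25 and every gradient is scaled by
# ~0.25 in place; when the norm is already below max_norm the coefficient
# exceeds 1.0 and the gradients are left untouched. Calling the function
# assumes torch.distributed and the mpu groups are initialized, and `model`
# below is a hypothetical module whose grads are already fp32 on the GPU.
#
#   >>> fp32_params = [p for p in model.parameters() if p.grad is not None]
#   >>> total_norm = clip_grad_norm_fp32(fp32_params, max_norm=2.0)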
def count_zeros_fp32(parameters):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
# Filter parameters based on:
# - grad should not be none
# - parameter should not be shared
# - should not be a replica due to tensor model parallelism
total_num_zeros = 0.0
for param in parameters:
grad_not_none = param.grad is not None
is_not_shared = param_is_not_shared(param)
is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
if grad_not_none and is_not_shared and is_not_tp_duplicate:
grad = param.grad.detach()
num_zeros = grad.numel() - torch.count_nonzero(grad)
total_num_zeros = num_zeros + total_num_zeros
# Sum across all model-parallel GPUs.
torch.distributed.all_reduce(total_num_zeros,
op=torch.distributed.ReduceOp.SUM,
group=mpu.get_model_parallel_group())
total_num_zeros = total_num_zeros.item()
return total_num_zeros
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron grad scaler."""
from abc import ABC
from abc import abstractmethod
import torch
class MegatronGradScaler(ABC):
def __init__(self, initial_scale):
"""Initialize scale value with the input initial scale."""
assert initial_scale > 0.0
self._scale = torch.cuda.FloatTensor([initial_scale])
@property
def scale(self):
return self._scale
@property
def inv_scale(self):
return self._scale.double().reciprocal().float()
@abstractmethod
def update(self, found_inf):
pass
@abstractmethod
def state_dict(self):
pass
@abstractmethod
def load_state_dict(self, state_dict):
pass
class ConstantGradScaler(MegatronGradScaler):
def update(self, found_inf):
pass
def state_dict(self):
return dict()
def load_state_dict(self, state_dict):
pass
class DynamicGradScaler(MegatronGradScaler):
def __init__(self, initial_scale, min_scale,
growth_factor, backoff_factor,
growth_interval, hysteresis):
""""Grad scaler with dynamic scale that gets adjusted
during training."""
super(DynamicGradScaler, self).__init__(initial_scale)
# Lower bound on the scale.
assert min_scale > 0.0
assert min_scale <= initial_scale
self.min_scale = torch.cuda.FloatTensor([min_scale])
# Growth and backoff factors for the scale.
assert growth_factor > 1.0
self.growth_factor = torch.cuda.FloatTensor([growth_factor])
assert backoff_factor < 1.0
assert backoff_factor > 0.0
self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
# Interval over which if we don't see any inf/nan,
# we will scale the grad scale by the growth factor.
assert growth_interval > 0
self.growth_interval = growth_interval
# Number of inf/nans we should see before scaling down
# the grad scale by the backoff factor.
assert hysteresis > 0
self.hysteresis = hysteresis
# Trackers.
self._growth_tracker = 0
self._hysteresis_tracker = self.hysteresis
def update(self, found_inf):
# If we have an inf/nan, growth tracker is set to 0
        # and hysteresis tracker is reduced by 1.
if found_inf:
self._growth_tracker = 0
self._hysteresis_tracker -= 1
# Now if we are out of hysteresis count, scale down the loss.
if self._hysteresis_tracker <= 0:
self._scale = torch.max(self._scale * self.backoff_factor,
self.min_scale)
else:
# If there is no nan/inf, increment the growth tracker.
self._growth_tracker += 1
            # If we have had enough consecutive intervals with no nan/inf:
if self._growth_tracker == self.growth_interval:
# Reset the tracker and hysteresis trackers,
self._growth_tracker = 0
self._hysteresis_tracker = self.hysteresis
# and scale up the loss scale.
self._scale = self._scale * self.growth_factor
def state_dict(self):
state_dict = {}
state_dict['scale'] = self._scale
state_dict['growth_tracker'] = self._growth_tracker
state_dict['hysteresis_tracker'] = self._hysteresis_tracker
return state_dict
def load_state_dict(self, state_dict):
self._scale = state_dict['scale'].cuda(torch.cuda.current_device())
self._growth_tracker = state_dict['growth_tracker']
self._hysteresis_tracker = state_dict['hysteresis_tracker']
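# Illustrative trace (a sketch, not part of the original module; requires a
# CUDA device because the scale lives on the GPU): with hysteresis=2 the
# first overflow only consumes hysteresis, the second halves the scale, and
# growth_interval consecutive clean steps double it again.
#
#   >>> scaler = DynamicGradScaler(initial_scale=2.**16, min_scale=1.0,
#   ...                            growth_factor=2.0, backoff_factor=0.5,
#   ...                            growth_interval=1000, hysteresis=2)
#   >>> scaler.update(found_inf=True)    # hysteresis 2 -> 1, scale stays 2**16
#   >>> scaler.update(found_inf=True)    # hysteresis hits 0, scale drops to 2**15
#   >>> for _ in range(1000):            # clean interval, scale grows back to 2**16
#   ...     scaler.update(found_inf=False)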
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron optimizer."""
from abc import ABC
from abc import abstractmethod
import torch
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import get_timers
from megatron import mpu
from megatron import print_rank_0
from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32
def _zero_grad_group_helper(group, set_to_none):
"""Zero out the gradient for a group of parameters.
Note: copied from torch.optim.optimizer."""
for param in group:
if param.grad is not None:
if set_to_none:
param.grad = None
else:
if param.grad.grad_fn is not None:
param.grad.detach_()
else:
param.grad.requires_grad_(False)
param.grad.zero_()
def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
"""Use multi-tensor-applier to copy values from one list to another.
    We don't have a bfloat16 implementation so for now if the overflow_buf
is not provided, we default back to simple loop copy to be compatible
with bfloat16."""
if overflow_buf:
overflow_buf.fill_(0)
# Scaling with factor `1.0` is equivalent to copy.
multi_tensor_applier(amp_C.multi_tensor_scale,
overflow_buf,
[this, that],
1.0)
else:
for this_, that_ in zip(this, that):
that_.copy_(this_)
class MegatronOptimizer(ABC):
def __init__(self, optimizer, clip_grad,
log_num_zeros_in_grad,
params_have_main_grad):
"""Input optimizer is the base optimizer for example Adam."""
self.optimizer = optimizer
assert self.optimizer, 'no optimizer is provided.'
# Set gradient clipping and logging params.
self.clip_grad = clip_grad
self.log_num_zeros_in_grad = log_num_zeros_in_grad
self.params_have_main_grad = params_have_main_grad
def get_parameters(self):
params = []
for param_group in self.optimizer.param_groups:
for param in param_group['params']:
params.append(param)
return params
def clip_grad_norm(self, clip_grad):
params = self.get_parameters()
return clip_grad_norm_fp32(params, clip_grad)
def count_zeros(self):
params = self.get_parameters()
return count_zeros_fp32(params)
@abstractmethod
def zero_grad(self, set_to_none=True):
pass
@abstractmethod
def get_loss_scale(self):
"""The output should be a cuda tensor of size 1."""
pass
def scale_loss(self, loss):
"""Simple scaling."""
return self.get_loss_scale() * loss
@abstractmethod
def step(self):
pass
@abstractmethod
def reload_model_params(self):
"""Refreshes any internal state from the current model parameters.
Call whenever the parameters are changed outside of the optimizer.
For example, when we load a model from a checkpoint without loading
the optimizer, the model parameters are updated but for fp16 optimizer
with main parameters, the main parameters need to also be updated."""
pass
@abstractmethod
def state_dict(self):
pass
@abstractmethod
def load_state_dict(self, state_dict):
pass
# Promote state so it can be retrieved or set via
# "optimizer_instance.state"
def _get_state(self):
return self.optimizer.state
def _set_state(self, value):
self.optimizer.state = value
state = property(_get_state, _set_state)
# Promote param_groups so it can be retrieved or set via
# "optimizer_instance.param_groups"
# (for example, to adjust the learning rate)
def _get_param_groups(self):
return self.optimizer.param_groups
def _set_param_groups(self, value):
self.optimizer.param_groups = value
param_groups = property(_get_param_groups, _set_param_groups)
class Float16OptimizerWithFloat16Params(MegatronOptimizer):
"""Float16 optimizer for fp16 and bf16 data types.
Arguments:
optimizer: base optimizer such as Adam or SGD
        clip_grad: clip gradients with this global L2 norm. Note
that clipping is ignored if clip_grad == 0
log_num_zeros_in_grad: return number of zeros in the gradients.
params_have_main_grad: flag indicating if parameters have
a `main_grad` field. If this is set, we are assuming
            that the model parameters are stored in the `main_grad`
field instead of the typical `grad` field. This happens
            for the DDP cases where there is a contiguous buffer
holding the gradients. For example for bfloat16, we want
to do gradient accumulation and all-reduces in float32
and as a result we store those gradients in the main_grad.
Note that main grad is not necessarily in float32.
bf16: if true, the model is running in bfloat16.
grad_scaler: used for scaling gradients. Note that this can be
None. This case happens when `bf16 = True` and we don't
use any loss scale. Note that for `bf16 = True`, we can have
            a constant gradient scaler. Also for `bf16 = False`, we
always require a grad scaler.
"""
def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad, bf16, grad_scaler):
super(Float16OptimizerWithFloat16Params, self).__init__(
optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad)
self.bf16 = bf16
self.grad_scaler = grad_scaler
# None grad scaler is only supported for bf16.
if self.grad_scaler is None:
assert self.bf16, 'fp16 expects a grad scaler.'
        # Tensor used to determine if a nan/inf has happened.
# Any non-zero value indicates inf/nan.
# Note that we keep this for the cases that grad scaler is none.
# We still record nan/inf if we have a bfloat16 with a grad scaler.
if self.grad_scaler:
self.found_inf = torch.cuda.FloatTensor([0.0])
# Dummy tensor needed for apex multi-apply tensor.
# For bfloat, we don't have multi-tensor apply and for now
# we set it to none so the multi-tensor apply gets ignored.
if bf16:
self._dummy_overflow_buf = None
else:
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
# In case grad scaler is not passed, define the unity scale.
if self.grad_scaler is None:
self._scale_one = torch.cuda.FloatTensor([1.0])
# ======================
# main parameter stuff
# ======================
# Three groups of parameters:
# float16_groups: original float16 parameters
# fp32_from_float16_groups: fp32 copy of float16 parameters
# fp32_from_fp32_groups: original fp32 parameters
self.float16_groups = []
self.fp32_from_float16_groups = []
self.fp32_from_fp32_groups = []
# For all the groups in the original optimizer:
for param_group in self.optimizer.param_groups:
float16_params_this_group = []
fp32_params_this_group = []
fp32_from_float16_params_this_group = []
# For all the parameters in this group:
for i, param in enumerate(param_group['params']):
if param.requires_grad:
# float16 params:
if param.type() in ['torch.cuda.HalfTensor',
'torch.cuda.BFloat16Tensor']:
float16_params_this_group.append(param)
# Create a copy
main_param = param.detach().clone().float()
# Copy tensor model parallel attributes.
mpu.copy_tensor_model_parallel_attributes(main_param,
param)
if hasattr(param, 'shared'):
main_param.shared = param.shared
# Replace the optimizer params with the new fp32 copy.
param_group['params'][i] = main_param
fp32_from_float16_params_this_group.append(main_param)
# Reset existing state dict key to the new main param.
if param in self.optimizer.state:
self.optimizer.state[main_param] \
= self.optimizer.state.pop(param)
# fp32 params.
elif param.type() == 'torch.cuda.FloatTensor':
fp32_params_this_group.append(param)
param_group['params'][i] = param
else:
raise TypeError('Wrapped parameters must be one of '
'torch.cuda.FloatTensor, '
'torch.cuda.HalfTensor, or '
'torch.cuda.BFloat16Tensor. '
'Received {}'.format(param.type()))
self.float16_groups.append(float16_params_this_group)
self.fp32_from_float16_groups.append(
fp32_from_float16_params_this_group)
self.fp32_from_fp32_groups.append(fp32_params_this_group)
# Leverage state_dict() and load_state_dict() to
# recast preexisting per-param state tensors
self.optimizer.load_state_dict(self.optimizer.state_dict())
def zero_grad(self, set_to_none=True):
"""We only need to zero the model related parameters, i.e.,
float16_groups & fp32_from_fp32_groups."""
for group in self.float16_groups:
_zero_grad_group_helper(group, set_to_none)
for group in self.fp32_from_fp32_groups:
_zero_grad_group_helper(group, set_to_none)
def get_loss_scale(self):
if self.grad_scaler is None:
return self._scale_one
return self.grad_scaler.scale
def _copy_model_grads_to_main_grads(self):
# This only needs to be done for the float16 group.
for model_group, main_group in zip(self.float16_groups,
self.fp32_from_float16_groups):
for model_param, main_param in zip(model_group, main_group):
if self.params_have_main_grad:
main_param.grad = model_param.main_grad.float()
else:
if model_param.grad is not None:
main_param.grad = model_param.grad.float()
# For fp32 grads, we need to reset the grads to main grad.
if self.params_have_main_grad:
for model_group in self.fp32_from_fp32_groups:
for model_param in model_group:
model_param.grad = model_param.main_grad
def _unscale_main_grads_and_check_for_nan(self):
main_grads = []
        # fp32 params from float16 ones.
for main_group in self.fp32_from_float16_groups:
for main_param in main_group:
if main_param.grad is not None:
main_grads.append(main_param.grad.data)
# Append fp32 parameters.
for main_group in self.fp32_from_fp32_groups:
for main_param in main_group:
if main_param.grad is not None:
main_grads.append(main_param.grad.data)
# Reset found inf.
self.found_inf.fill_(0.0)
# Unscale and set found inf/nan
torch._amp_foreach_non_finite_check_and_unscale_(
main_grads, self.found_inf, self.grad_scaler.inv_scale)
# Update across all model parallel instances.
torch.distributed.all_reduce(self.found_inf,
op=torch.distributed.ReduceOp.MAX,
group=mpu.get_model_parallel_group())
# Check for nan.
found_inf_flag = (self.found_inf.item() > 0)
return found_inf_flag
def _get_model_and_main_params_data_float16(self):
model_data = []
main_data = []
for model_group, main_group in zip(self.float16_groups,
self.fp32_from_float16_groups):
for model_param, main_param in zip(model_group, main_group):
model_data.append(model_param.data)
main_data.append(main_param.data)
return model_data, main_data
def _copy_main_params_to_model_params(self):
# Only needed for the float16 params.
model_data, main_data = self._get_model_and_main_params_data_float16()
_multi_tensor_copy_this_to_that(this=main_data, that=model_data,
overflow_buf=self._dummy_overflow_buf)
def _copy_model_params_to_main_params(self):
# Only needed for the float16 params.
model_data, main_data = self._get_model_and_main_params_data_float16()
_multi_tensor_copy_this_to_that(this=model_data, that=main_data,
overflow_buf=self._dummy_overflow_buf)
def reload_model_params(self):
self._copy_model_params_to_main_params()
@torch.no_grad()
def step(self):
timers = get_timers()
# Copy gradients from model params to main params.
timers('optimizer-copy-to-main-grad').start()
self._copy_model_grads_to_main_grads()
timers('optimizer-copy-to-main-grad').stop()
# Do unscale, check for inf, and update grad scaler only for
# the case that grad scaler is provided.
if self.grad_scaler:
# Unscale and check for inf/nan.
timers('optimizer-unscale-and-check-inf').start()
found_inf_flag = self._unscale_main_grads_and_check_for_nan()
timers('optimizer-unscale-and-check-inf').stop()
# We are done with scaling gradients
# so we can update the loss scale.
self.grad_scaler.update(found_inf_flag)
# If we found inf/nan, skip the update.
if found_inf_flag:
return False, None, None
# Clip the main gradients.
timers('optimizer-clip-main-grad').start()
grad_norm = None
if self.clip_grad > 0.0:
grad_norm = self.clip_grad_norm(self.clip_grad)
timers('optimizer-clip-main-grad').stop()
# count the zeros in the grads
num_zeros_in_grad = self.count_zeros() if \
self.log_num_zeros_in_grad else None
# Step the optimizer.
self.optimizer.step()
# Update params from main params.
timers('optimizer-copy-main-to-model-params').start()
self._copy_main_params_to_model_params()
timers('optimizer-copy-main-to-model-params').stop()
# Successful update.
return True, grad_norm, num_zeros_in_grad
def state_dict(self):
state_dict = {}
state_dict['optimizer'] = self.optimizer.state_dict()
if self.grad_scaler:
state_dict['grad_scaler'] = self.grad_scaler.state_dict()
state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups
return state_dict
def load_state_dict(self, state_dict):
# Optimizer.
optimizer_key = 'optimizer'
if optimizer_key not in state_dict:
optimizer_key = 'optimizer_state_dict'
print_rank_0('***WARNING*** loading optimizer from '
'an old checkpoint ...')
self.optimizer.load_state_dict(state_dict[optimizer_key])
# Grad scaler.
if 'grad_scaler' not in state_dict:
print_rank_0('***WARNING*** found an old checkpoint, will not '
'load grad scaler ...')
else:
if self.grad_scaler:
self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
else:
                print_rank_0('***WARNING*** found the grad scaler in the '
'checkpoint but it is None in the class. '
'Skipping loading grad scaler ...')
# Copy data for the main params.
fp32_from_float16_params_key = 'fp32_from_fp16_params'
if fp32_from_float16_params_key not in state_dict:
fp32_from_float16_params_key = 'fp32_from_fp16'
for current_group, saved_group in zip(
self.fp32_from_float16_groups,
state_dict[fp32_from_float16_params_key]):
for current_param, saved_param in zip(current_group, saved_group):
current_param.data.copy_(saved_param.data)
class FP32Optimizer(MegatronOptimizer):
def __init__(self, optimizer, clip_grad,
log_num_zeros_in_grad,
params_have_main_grad):
super(FP32Optimizer, self).__init__(
optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad)
self._scale = torch.cuda.FloatTensor([1.0])
def zero_grad(self, set_to_none=True):
"""Copied from torch.optim.optimizer"""
for group in self.optimizer.param_groups:
_zero_grad_group_helper(group['params'], set_to_none)
def get_loss_scale(self):
"""FP32 optimizer does not do any scaling."""
return self._scale
@torch.no_grad()
def step(self):
"""Clip gradients (if needed) and step the base optimizer.
Always return successful since there is no overflow."""
# Copy main_grads to grads.
if self.params_have_main_grad:
for param_group in self.optimizer.param_groups:
for param in param_group['params']:
param.grad = param.main_grad
# Clip gradients.
grad_norm = None
if self.clip_grad > 0.0:
grad_norm = self.clip_grad_norm(self.clip_grad)
# count the zeros in the grads
num_zeros_in_grad = self.count_zeros() if \
self.log_num_zeros_in_grad else None
# Update parameters.
self.optimizer.step()
# No overflow for FP32 optimizer.
return True, grad_norm, num_zeros_in_grad
def reload_model_params(self):
pass
def state_dict(self):
return self.optimizer.state_dict()
def load_state_dict(self, state_dict):
self.optimizer.load_state_dict(state_dict)
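# Usage sketch (an assumption, not part of the original module): both wrappers
# share the step() contract expected by the training loop -- a
# (update_successful, grad_norm, num_zeros_in_grad) triple, where a skipped
# fp16 update (inf/nan detected) comes back as (False, None, None). `raw_loss`
# below is a hypothetical loss tensor.
#
#   >>> optimizer.zero_grad()
#   >>> scaled_loss = optimizer.scale_loss(raw_loss)
#   >>> scaled_loss.backward()
#   >>> ok, grad_norm, num_zeros = optimizer.step()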
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import reduce
import operator
import torch
from megatron import get_args
from megatron import mpu
def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
use_ring_exchange=False):
"""Communicate tensors between stages. Used as helper method in other
communication methods that are used in megatron/schedules.py.
Takes the following arguments:
tensor_send_next: tensor to send to next rank (no tensor sent if
set to None).
tensor_send_prev: tensor to send to prev rank (no tensor sent if
set to None).
recv_prev: boolean for whether tensor should be received from
previous rank.
recv_next: boolean for whether tensor should be received from
next rank.
use_ring_exchange: boolean for whether torch.distributed.ring_exchange()
API should be used.
Returns:
(tensor_recv_prev, tensor_recv_next)
"""
args = get_args()
# Create placeholder tensors for receive in forward and backward directions
# if needed.
tensor_recv_prev = None
tensor_recv_next = None
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
if args.scatter_gather_tensors_in_pipeline:
tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1) // \
mpu.get_tensor_model_parallel_world_size()
else:
tensor_chunk_shape = tensor_shape
dtype = args.params_dtype
if args.fp32_residual_connection:
dtype = torch.float
if recv_prev:
tensor_recv_prev = torch.empty(tensor_chunk_shape,
requires_grad=True,
device=torch.cuda.current_device(),
dtype=dtype)
if recv_next:
tensor_recv_next = torch.empty(tensor_chunk_shape,
requires_grad=True,
device=torch.cuda.current_device(),
dtype=dtype)
# Split tensor into smaller chunks if using scatter-gather optimization.
if args.scatter_gather_tensors_in_pipeline:
if tensor_send_next is not None:
tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(tensor_send_next)
if tensor_send_prev is not None:
tensor_send_prev = mpu.split_tensor_into_1d_equal_chunks(tensor_send_prev)
# Send tensors in both the forward and backward directions as appropriate.
if use_ring_exchange:
torch.distributed.ring_exchange(tensor_send_prev=tensor_send_prev,
tensor_recv_prev=tensor_recv_prev,
tensor_send_next=tensor_send_next,
tensor_recv_next=tensor_recv_next,
group=mpu.get_pipeline_model_parallel_group())
else:
ops = []
if tensor_send_prev is not None:
send_prev_op = torch.distributed.P2POp(
torch.distributed.isend, tensor_send_prev,
mpu.get_pipeline_model_parallel_prev_rank())
ops.append(send_prev_op)
if tensor_recv_prev is not None:
recv_prev_op = torch.distributed.P2POp(
torch.distributed.irecv, tensor_recv_prev,
mpu.get_pipeline_model_parallel_prev_rank())
ops.append(recv_prev_op)
if tensor_send_next is not None:
send_next_op = torch.distributed.P2POp(
torch.distributed.isend, tensor_send_next,
mpu.get_pipeline_model_parallel_next_rank())
ops.append(send_next_op)
if tensor_recv_next is not None:
recv_next_op = torch.distributed.P2POp(
torch.distributed.irecv, tensor_recv_next,
mpu.get_pipeline_model_parallel_next_rank())
ops.append(recv_next_op)
if len(ops) > 0:
reqs = torch.distributed.batch_isend_irecv(ops)
for req in reqs:
req.wait()
# To protect against race condition when using batch_isend_irecv().
torch.cuda.synchronize()
# If using scatter-gather optimization, gather smaller chunks.
if args.scatter_gather_tensors_in_pipeline:
if recv_prev:
tensor_recv_prev = mpu.gather_split_1d_tensor(
tensor_recv_prev).view(tensor_shape).requires_grad_()
if recv_next:
tensor_recv_next = mpu.gather_split_1d_tensor(
tensor_recv_next).view(tensor_shape).requires_grad_()
return tensor_recv_prev, tensor_recv_next
def recv_forward(timers=None):
"""Receive tensor from previous rank in pipeline (forward receive)."""
if mpu.is_pipeline_first_stage():
input_tensor = None
else:
if timers is not None:
timers('forward-recv').start()
input_tensor, _ = _communicate(
tensor_send_next=None,
tensor_send_prev=None,
recv_prev=True,
recv_next=False)
if timers is not None:
timers('forward-recv').stop()
return input_tensor
def recv_backward(timers=None):
"""Receive tensor from next rank in pipeline (backward receive)."""
if mpu.is_pipeline_last_stage():
output_tensor_grad = None
else:
if timers is not None:
timers('backward-recv').start()
_, output_tensor_grad = _communicate(
tensor_send_next=None,
tensor_send_prev=None,
recv_prev=False,
recv_next=True)
if timers is not None:
timers('backward-recv').stop()
return output_tensor_grad
def send_forward(output_tensor, timers=None):
"""Send tensor to next rank in pipeline (forward send)."""
if not mpu.is_pipeline_last_stage():
if timers is not None:
timers('forward-send').start()
_communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=False,
recv_next=False)
if timers is not None:
timers('forward-send').stop()
def send_backward(input_tensor_grad, timers=None):
"""Send tensor to previous rank in pipeline (backward send)."""
if not mpu.is_pipeline_first_stage():
if timers is not None:
timers('backward-send').start()
_communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=False,
recv_next=False)
if timers is not None:
timers('backward-send').stop()
def send_forward_recv_backward(output_tensor, timers=None):
"""Batched send and recv with next rank in pipeline."""
if mpu.is_pipeline_last_stage():
output_tensor_grad = None
else:
if timers is not None:
timers('forward-send-backward-recv').start()
_, output_tensor_grad = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=False,
recv_next=True)
if timers is not None:
timers('forward-send-backward-recv').stop()
return output_tensor_grad
def send_backward_recv_forward(input_tensor_grad, timers=None):
"""Batched send and recv with previous rank in pipeline."""
if mpu.is_pipeline_first_stage():
input_tensor = None
else:
if timers is not None:
timers('backward-send-forward-recv').start()
input_tensor, _ = _communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=True,
recv_next=False)
if timers is not None:
timers('backward-send-forward-recv').stop()
return input_tensor
def send_forward_recv_forward(output_tensor, recv_prev, timers=None):
"""Batched recv from previous rank and send to next rank in pipeline."""
if timers is not None:
timers('forward-send-forward-recv').start()
input_tensor, _ = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=recv_prev,
recv_next=False)
if timers is not None:
timers('forward-send-forward-recv').stop()
return input_tensor
def send_backward_recv_backward(input_tensor_grad, recv_next, timers=None):
"""Batched recv from next rank and send to previous rank in pipeline."""
if timers is not None:
timers('backward-send-backward-recv').start()
_, output_tensor_grad = _communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=False,
recv_next=recv_next)
if timers is not None:
timers('backward-send-backward-recv').stop()
return output_tensor_grad
def send_forward_backward_recv_forward_backward(
output_tensor, input_tensor_grad, recv_prev,
recv_next, timers=None):
"""Batched send and recv with previous and next ranks in pipeline."""
if timers is not None:
timers('forward-backward-send-forward-backward-recv').start()
input_tensor, output_tensor_grad = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=input_tensor_grad,
recv_prev=recv_prev,
recv_next=recv_next)
if timers is not None:
timers('forward-backward-send-forward-backward-recv').stop()
return input_tensor, output_tensor_grad
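# Illustrative sketch (added commentary, not part of the original file): how an
# intermediate pipeline stage could drive the helpers above for one microbatch.
# The names `model`, `compute_input_grad` and `timers` are assumptions for the
# example, not definitions from this module.
#
#   input_tensor = recv_forward(timers)            # activation from previous stage
#   output_tensor = model(input_tensor)            # local forward compute
#   send_forward(output_tensor, timers)            # activation to next stage
#   output_tensor_grad = recv_backward(timers)     # gradient from next stage
#   input_tensor_grad = compute_input_grad(...)    # local backward compute
#   send_backward(input_tensor_grad, timers)       # gradient to previous stage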
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MAJOR = 1
MINOR = 1.5
# Use the following formatting: (major, minor)
VERSION = (MAJOR, MINOR)
__version__ = '.'.join(map(str, VERSION)) + '.bs'
__package_name__ = 'megatron-lm'
__contact_names__ = 'NVIDIA INC'
__url__ = 'https://github.com/NVIDIA/Megatron-LM'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
__license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE'
__keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language'
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import contextmanager
import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron import get_args
from megatron import get_num_microbatches
from megatron import get_timers
from megatron import mpu
from megatron import p2p_communication
from megatron.utils import unwrap_model
from megatron.model.distributed import DistributedDataParallel as LocalDDP
from megatron.model.module import Float16Module
def get_forward_backward_func():
args = get_args()
if mpu.get_pipeline_model_parallel_world_size() > 1:
if args.virtual_pipeline_model_parallel_size is not None:
forward_backward_func = forward_backward_pipelining_with_interleaving
else:
forward_backward_func = forward_backward_pipelining_without_interleaving
else:
forward_backward_func = forward_backward_no_pipelining
return forward_backward_func
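# Illustrative usage sketch (added commentary, not part of the original file):
# all three schedules returned above share the same signature, so a training
# step can call the selected one uniformly. `train_data_iterator`, `model` and
# `optimizer` are assumed to be set up elsewhere.
#
#   forward_backward_func = get_forward_backward_func()
#   losses_reduced = forward_backward_func(forward_step_func, train_data_iterator,
#                                          model, optimizer, get_timers(),
#                                          forward_only=False)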
def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
"""Forward step for passed-in model.
If first stage, input tensor is obtained from data_iterator, otherwise
passed-in input_tensor is used.
Returns output tensor."""
timers = get_timers()
args = get_args()
timers('forward-compute').start()
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
if not args.deepspeed:
unwrapped_model.set_input_tensor(input_tensor)
else:
unwrapped_model.module.set_input_tensor(input_tensor)
output_tensor, loss_func = forward_step_func(data_iterator, model)
if mpu.is_pipeline_last_stage():
output_tensor = loss_func(output_tensor)
loss, loss_reduced = output_tensor
output_tensor = loss / get_num_microbatches()
losses_reduced.append(loss_reduced)
timers('forward-compute').stop()
return output_tensor
def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad, model=None):
"""Backward step through passed-in output tensor.
If last stage, output_tensor_grad is None, otherwise gradient of loss
with respect to stage's output tensor.
Returns gradient of loss with respect to input tensor (None if first
stage)."""
args = get_args()
if args.deepspeed:
assert model is not None
timers = get_timers()
timers('backward-compute').start()
# Retain the grad on the input_tensor.
if input_tensor is not None:
input_tensor.retain_grad()
if args.deepspeed:
model.backward(output_tensor)
else:
# Backward pass.
if output_tensor_grad is None:
output_tensor = optimizer.scale_loss(output_tensor)
torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad)
# Collect the grad of the input_tensor.
input_tensor_grad = None
if input_tensor is not None:
input_tensor_grad = input_tensor.grad
timers('backward-compute').stop()
return input_tensor_grad
@contextmanager
def dummy_handler():
try:
yield
finally:
pass
def forward_backward_no_pipelining(forward_step_func, data_iterator, model,
optimizer, timers, forward_only):
"""Run forward and backward passes with no pipeline parallelism
(no inter-stage communication).
Returns dictionary with losses."""
assert len(model) == 1
model = model[0]
context_handler = dummy_handler
if isinstance(model, torchDDP):
context_handler = model.no_sync
losses_reduced = []
input_tensor, output_tensor_grad = None, None
with context_handler():
for i in range(get_num_microbatches() - 1):
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if not forward_only:
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad, model)
# Run computation for last microbatch out of context handler (want to
# synchronize gradients).
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if not forward_only:
backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad, model)
return losses_reduced
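# Note on the schedule above (added commentary, not part of the original file):
# when the model is a torch DDP instance, with e.g. get_num_microbatches() == 4
# the first 3 microbatches run inside model.no_sync(), so DDP skips its gradient
# all-reduce, and only the 4th microbatch (outside the context handler) triggers
# gradient synchronization, once per optimizer step.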
def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterator, model,
optimizer, timers, forward_only):
"""Run interleaved 1F1B schedule (model split into model chunks), with
communication between pipeline stages as needed.
Returns dictionary with losses if the last stage, empty dict otherwise."""
input_tensors = [[] for _ in range(len(model))]
output_tensors = [[] for _ in range(len(model))]
losses_reduced = []
if not forward_only:
output_tensor_grads = [[] for _ in range(len(model))]
pipeline_parallel_size = mpu.get_pipeline_model_parallel_world_size()
pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()
# Compute number of warmup and remaining microbatches.
num_model_chunks = len(model)
num_microbatches = get_num_microbatches() * num_model_chunks
all_warmup_microbatches = False
if forward_only:
num_warmup_microbatches = num_microbatches
else:
        # Run all forward passes and then all backward passes if the number of
        # microbatches equals the number of pipeline stages.
        # Otherwise, perform (num_model_chunks-1)*pipeline_parallel_size warmup
        # microbatches on all workers, followed by additional warmup microbatches
        # depending on stage ID (more forward passes for earlier stages; later
        # stages can start 1F1B immediately).
if get_num_microbatches() == pipeline_parallel_size:
num_warmup_microbatches = num_microbatches
all_warmup_microbatches = True
else:
num_warmup_microbatches = \
(pipeline_parallel_size - pipeline_parallel_rank - 1) * 2
num_warmup_microbatches += (
num_model_chunks - 1) * pipeline_parallel_size
num_warmup_microbatches = min(num_warmup_microbatches,
num_microbatches)
num_microbatches_remaining = \
num_microbatches - num_warmup_microbatches
def get_model_chunk_id(microbatch_id, forward):
"""Helper method to get the model chunk ID given the iteration number."""
microbatch_id_in_group = microbatch_id % (pipeline_parallel_size * num_model_chunks)
model_chunk_id = microbatch_id_in_group // pipeline_parallel_size
if not forward:
model_chunk_id = (num_model_chunks - model_chunk_id - 1)
return model_chunk_id
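    # Worked example (added commentary, not part of the original file): with
    # pipeline_parallel_size = 4 and num_model_chunks = 2, microbatches are
    # grouped in blocks of 8, so for forward=True:
    #   microbatch_id 0..3  -> model chunk 0
    #   microbatch_id 4..7  -> model chunk 1
    #   microbatch_id 8..11 -> model chunk 0 again, and so on.
    # For forward=False the mapping is mirrored (0..3 -> chunk 1, 4..7 -> chunk 0).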
def forward_step_helper(microbatch_id):
"""Helper method to run forward step with model split into chunks
(run set_virtual_pipeline_model_parallel_rank() before calling
forward_step())."""
model_chunk_id = get_model_chunk_id(microbatch_id, forward=True)
mpu.set_virtual_pipeline_model_parallel_rank(model_chunk_id)
if mpu.is_pipeline_first_stage():
if len(input_tensors[model_chunk_id]) == \
len(output_tensors[model_chunk_id]):
input_tensors[model_chunk_id].append(None)
input_tensor = input_tensors[model_chunk_id][-1]
output_tensor = forward_step(forward_step_func,
data_iterator[model_chunk_id],
model[model_chunk_id],
input_tensor, losses_reduced)
output_tensors[model_chunk_id].append(output_tensor)
return output_tensor
def backward_step_helper(microbatch_id):
"""Helper method to run backward step with model split into chunks
(run set_virtual_pipeline_model_parallel_rank() before calling
backward_step())."""
model_chunk_id = get_model_chunk_id(microbatch_id, forward=False)
mpu.set_virtual_pipeline_model_parallel_rank(model_chunk_id)
if mpu.is_pipeline_last_stage():
if len(output_tensor_grads[model_chunk_id]) == 0:
output_tensor_grads[model_chunk_id].append(None)
input_tensor = input_tensors[model_chunk_id].pop(0)
output_tensor = output_tensors[model_chunk_id].pop(0)
output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0)
input_tensor_grad = \
backward_step(optimizer,
input_tensor,
output_tensor,
output_tensor_grad)
return input_tensor_grad
# Run warmup forward passes.
mpu.set_virtual_pipeline_model_parallel_rank(0)
input_tensors[0].append(
p2p_communication.recv_forward(timers))
for k in range(num_warmup_microbatches):
output_tensor = forward_step_helper(k)
# Determine if tensor should be received from previous stage.
next_forward_model_chunk_id = get_model_chunk_id(k+1, forward=True)
recv_prev = True
if mpu.is_pipeline_first_stage(ignore_virtual=True):
if next_forward_model_chunk_id == 0:
recv_prev = False
if k == (num_microbatches - 1):
recv_prev = False
# Don't send tensor downstream if on last stage.
if mpu.is_pipeline_last_stage():
output_tensor = None
# Send and receive tensors as appropriate (send tensors computed
# in this iteration; receive tensors for next iteration).
if k == (num_warmup_microbatches - 1) and not forward_only and \
not all_warmup_microbatches:
input_tensor_grad = None
recv_next = True
if mpu.is_pipeline_last_stage(ignore_virtual=True):
recv_next = False
input_tensor, output_tensor_grad = \
p2p_communication.send_forward_backward_recv_forward_backward(
output_tensor, input_tensor_grad,
recv_prev=recv_prev, recv_next=recv_next,
timers=timers)
output_tensor_grads[num_model_chunks-1].append(output_tensor_grad)
else:
input_tensor = \
p2p_communication.send_forward_recv_forward(
output_tensor, recv_prev, timers)
input_tensors[next_forward_model_chunk_id].append(input_tensor)
# Run 1F1B in steady state.
for k in range(num_microbatches_remaining):
# Forward pass.
forward_k = k + num_warmup_microbatches
output_tensor = forward_step_helper(forward_k)
# Backward pass.
backward_k = k
input_tensor_grad = backward_step_helper(backward_k)
# Send output_tensor and input_tensor_grad, receive input_tensor
# and output_tensor_grad.
# Determine if current stage has anything to send in either direction,
# otherwise set tensor to None.
forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True)
mpu.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id)
if mpu.is_pipeline_last_stage():
output_tensor = None
backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False)
mpu.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id)
if mpu.is_pipeline_first_stage():
input_tensor_grad = None
# Determine if peers are sending, and where in data structure to put
# received tensors.
recv_prev = True
if mpu.is_pipeline_first_stage(ignore_virtual=True):
# First stage is ahead of last stage by (pipeline_parallel_size - 1).
next_forward_model_chunk_id = get_model_chunk_id(
forward_k - (pipeline_parallel_size - 1), forward=True)
if next_forward_model_chunk_id == (num_model_chunks - 1):
recv_prev = False
next_forward_model_chunk_id += 1
else:
next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1,
forward=True)
recv_next = True
if mpu.is_pipeline_last_stage(ignore_virtual=True):
# Last stage is ahead of first stage by (pipeline_parallel_size - 1).
next_backward_model_chunk_id = get_model_chunk_id(
backward_k - (pipeline_parallel_size - 1), forward=False)
if next_backward_model_chunk_id == 0:
recv_next = False
next_backward_model_chunk_id -= 1
else:
next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1,
forward=False)
# If last iteration, don't receive; we already received one extra
# before the start of the for loop.
if k == (num_microbatches_remaining - 1):
recv_prev = False
# Communicate tensors.
input_tensor, output_tensor_grad = \
p2p_communication.send_forward_backward_recv_forward_backward(
output_tensor, input_tensor_grad,
recv_prev=recv_prev, recv_next=recv_next,
timers=timers)
# Put input_tensor and output_tensor_grad in data structures in the
# right location.
if recv_prev:
input_tensors[next_forward_model_chunk_id].append(input_tensor)
if recv_next:
output_tensor_grads[next_backward_model_chunk_id].append(
output_tensor_grad)
# Run cooldown backward passes (flush out pipeline).
if not forward_only:
if all_warmup_microbatches:
output_tensor_grads[num_model_chunks-1].append(
p2p_communication.recv_backward(timers))
for k in range(num_microbatches_remaining, num_microbatches):
input_tensor_grad = backward_step_helper(k)
next_backward_model_chunk_id = get_model_chunk_id(k+1, forward=False)
recv_next = True
if mpu.is_pipeline_last_stage(ignore_virtual=True):
if next_backward_model_chunk_id == (num_model_chunks - 1):
recv_next = False
if k == (num_microbatches - 1):
recv_next = False
output_tensor_grads[next_backward_model_chunk_id].append(
p2p_communication.send_backward_recv_backward(
input_tensor_grad, recv_next, timers))
return losses_reduced
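# Worked example for the warmup arithmetic above (added commentary, not part of
# the original file): with pipeline_parallel_size = 4, num_model_chunks = 2,
# get_num_microbatches() = 8 and forward_only = False, a stage with
# pipeline_parallel_rank = 1 gets
#   num_microbatches           = 8 * 2 = 16
#   num_warmup_microbatches    = (4 - 1 - 1) * 2 + (2 - 1) * 4 = 8
#   num_microbatches_remaining = 16 - 8 = 8
# so it runs 8 warmup forward passes before entering the 1F1B steady state.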
def forward_backward_pipelining_without_interleaving(forward_step_func, data_iterator,
model, optimizer, timers,
forward_only):
"""Run non-interleaved 1F1B schedule, with communication between pipeline
stages.
Returns dictionary with losses if the last stage, empty dict otherwise."""
timers = get_timers()
assert len(model) == 1
model = model[0]
# Compute number of warmup microbatches.
num_microbatches = get_num_microbatches()
num_warmup_microbatches = \
(mpu.get_pipeline_model_parallel_world_size() -
mpu.get_pipeline_model_parallel_rank() - 1)
num_warmup_microbatches = min(
num_warmup_microbatches,
num_microbatches)
num_microbatches_remaining = \
num_microbatches - num_warmup_microbatches
input_tensors = []
output_tensors = []
losses_reduced = []
# Run warmup forward passes.
for i in range(num_warmup_microbatches):
input_tensor = p2p_communication.recv_forward(timers)
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
p2p_communication.send_forward(output_tensor, timers)
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
# Before running 1F1B, need to receive first forward tensor.
# If all microbatches are run in warmup / cooldown phase, then no need to
# receive this tensor here.
if num_microbatches_remaining > 0:
input_tensor = p2p_communication.recv_forward(timers)
# Run 1F1B in steady state.
for i in range(num_microbatches_remaining):
last_iteration = (i == (num_microbatches_remaining - 1))
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, losses_reduced)
if forward_only:
p2p_communication.send_forward(output_tensor, timers)
else:
output_tensor_grad = \
p2p_communication.send_forward_recv_backward(output_tensor,
timers)
# Add input_tensor and output_tensor to end of list, then pop from the
# start of the list for backward pass.
input_tensors.append(input_tensor)
output_tensors.append(output_tensor)
if forward_only:
if not last_iteration:
input_tensor = p2p_communication.recv_forward(timers)
else:
input_tensor, output_tensor = input_tensors.pop(0), output_tensors.pop(0)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad, model)
if last_iteration:
input_tensor = None
p2p_communication.send_backward(input_tensor_grad, timers)
else:
input_tensor = \
p2p_communication.send_backward_recv_forward(
input_tensor_grad, timers)
# Run cooldown backward passes.
if not forward_only:
for i in range(num_warmup_microbatches):
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
output_tensor_grad = p2p_communication.recv_backward(timers)
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad, model)
p2p_communication.send_backward(input_tensor_grad, timers)
return losses_reduced
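# Worked example for the non-interleaved schedule above (added commentary, not
# part of the original file): with a pipeline of 4 stages and 8 microbatches,
# stage rank 0 runs 4 - 0 - 1 = 3 warmup forwards while rank 3 runs 0; each stage
# then runs num_microbatches_remaining = 8 - warmup iterations of 1F1B, followed
# by `warmup` cooldown backward passes to drain the pipeline.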
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import importlib.util
import inspect
import logging
import numpy as np
import os
import random
import re
import shutil
import sys
import tempfile
import unittest
from distutils.util import strtobool
from io import StringIO
from packaging import version
from pathlib import Path
from typing import Iterator, Union
from unittest import mock
from unittest.case import SkipTest
try:
import torch
_torch_available = True
except ImportError:
_torch_available = False
def is_torch_available():
return _torch_available
def parse_flag_from_env(key, default=False):
try:
value = os.environ[key]
except KeyError:
# KEY isn't set, default to `default`.
_value = default
else:
# KEY is set, convert it to True or False.
try:
_value = strtobool(value)
except ValueError:
# More values are supported, but let's keep the message simple.
raise ValueError(f"If set, {key} must be yes or no.")
return _value
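# Example (added commentary, not part of the original file): with RUN_SLOW=yes in
# the environment, parse_flag_from_env("RUN_SLOW", default=False) returns a truthy
# value (strtobool maps "yes"/"true"/"1" to 1); if the variable is unset, the
# provided default is returned unchanged.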
def parse_int_from_env(key, default=None):
try:
value = os.environ[key]
except KeyError:
_value = default
else:
try:
_value = int(value)
except ValueError:
            raise ValueError(f"If set, {key} must be an int.")
return _value
def require_torch(test_case):
"""
Decorator marking a test that requires PyTorch.
These tests are skipped when PyTorch isn't installed.
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
else:
return test_case
def require_torch_multi_gpu(test_case):
"""
Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without
multiple GPUs.
To run *only* the multi_gpu tests, assuming all test names contain multi_gpu: $ pytest -sv ./tests -k "multi_gpu"
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
import torch
if torch.cuda.device_count() < 2:
return unittest.skip("test requires multiple GPUs")(test_case)
else:
return test_case
def require_torch_non_multi_gpu(test_case):
"""
Decorator marking a test that requires 0 or 1 GPU setup (in PyTorch).
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
import torch
if torch.cuda.device_count() > 1:
return unittest.skip("test requires 0 or 1 GPU")(test_case)
else:
return test_case
def require_torch_up_to_2_gpus(test_case):
"""
Decorator marking a test that requires 0 or 1 or 2 GPU setup (in PyTorch).
"""
if not is_torch_available():
return unittest.skip("test requires PyTorch")(test_case)
import torch
if torch.cuda.device_count() > 2:
return unittest.skip("test requires 0 or 1 or 2 GPUs")(test_case)
else:
return test_case
def is_torch_tpu_available():
    # minimal check (assumed here, mirroring the other is_*_available helpers in this file)
    return importlib.util.find_spec("torch_xla") is not None
def require_torch_tpu(test_case):
    """
    Decorator marking a test that requires a TPU (in PyTorch).
    """
    if not is_torch_tpu_available():
        return unittest.skip("test requires PyTorch TPU")(test_case)
    else:
        return test_case
if is_torch_available():
# Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
import torch
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
else:
torch_device = None
def require_torch_gpu(test_case):
"""Decorator marking a test that requires CUDA and PyTorch."""
if torch_device != "cuda":
return unittest.skip("test requires CUDA")(test_case)
else:
return test_case
def is_datasets_available():
    # minimal helper (assumed here, mirroring is_deepspeed_available below) so that
    # require_datasets() does not raise NameError
    return importlib.util.find_spec("datasets") is not None
def require_datasets(test_case):
    """Decorator marking a test that requires the `datasets` library."""
if not is_datasets_available():
return unittest.skip("test requires `datasets`")(test_case)
else:
return test_case
def is_deepspeed_available():
return importlib.util.find_spec("deepspeed") is not None
def require_deepspeed(test_case):
"""
Decorator marking a test that requires deepspeed
"""
if not is_deepspeed_available():
return unittest.skip("test requires deepspeed")(test_case)
else:
return test_case
def is_bnb_available():
return importlib.util.find_spec("bitsandbytes") is not None
def require_bnb(test_case):
"""
Decorator marking a test that requires bitsandbytes
"""
if not is_bnb_available():
return unittest.skip("test requires bitsandbytes from https://github.com/facebookresearch/bitsandbytes")(test_case)
else:
return test_case
def require_bnb_non_decorator():
"""
Non-Decorator function that would skip a test if bitsandbytes is missing
"""
if not is_bnb_available():
raise SkipTest("Test requires bitsandbytes from https://github.com/facebookresearch/bitsandbytes")
def set_seed(seed: int=42):
"""
Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch``
Args:
seed (:obj:`int`): The seed to set.
"""
random.seed(seed)
np.random.seed(seed)
if is_torch_available():
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# ^^ safe to call this function even if cuda is not available
def is_tf_available():
    # minimal helper (assumed here, mirroring the other is_*_available checks in this
    # file) so that get_gpu_count() does not raise NameError when torch is unavailable
    return importlib.util.find_spec("tensorflow") is not None
def get_gpu_count():
"""
Return the number of available gpus (regardless of whether torch or tf is used)
"""
if is_torch_available():
import torch
return torch.cuda.device_count()
elif is_tf_available():
import tensorflow as tf
return len(tf.config.list_physical_devices("GPU"))
else:
return 0
def torch_assert_equal(actual, expected, **kwargs):
    # assert_close was added around pt-1.9; it does better checks - e.g. it will check that dimensions match
    if hasattr(torch.testing, "assert_close"):
        return torch.testing.assert_close(actual, expected, rtol=0.0, atol=0.0, **kwargs)
    else:
        assert torch.allclose(actual, expected, rtol=0.0, atol=0.0), f"{actual} != {expected}"
def torch_assert_close(actual, expected, **kwargs):
    # assert_close was added around pt-1.9; it does better checks - e.g. it will check that dimensions match
    if hasattr(torch.testing, "assert_close"):
        return torch.testing.assert_close(actual, expected, **kwargs)
    else:
        kwargs.pop("msg", None)  # torch.allclose doesn't have a msg arg
        assert torch.allclose(actual, expected, **kwargs), f"{actual} != {expected}"
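# Example (added commentary, not part of the original file):
#
#   torch_assert_equal(torch.tensor([1.0, 2.0]), torch.tensor([1.0, 2.0]))  # passes (exact match required)
#   torch_assert_close(torch.tensor(1.0), torch.tensor(1.0 + 1e-7))         # passes within default tolerances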
def is_torch_bf16_available():
# from https://github.com/huggingface/transformers/blob/26eb566e43148c80d0ea098c76c3d128c0281c16/src/transformers/file_utils.py#L301
if is_torch_available():
import torch
if not torch.cuda.is_available() or torch.version.cuda is None:
return False
if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
return False
if int(torch.version.cuda.split(".")[0]) < 11:
return False
        if version.parse(torch.__version__) < version.parse("1.9"):
return False
return True
else:
return False
def require_torch_bf16(test_case):
"""Decorator marking a test that requires CUDA hardware supporting bf16 and PyTorch >= 1.9."""
if not is_torch_bf16_available():
return unittest.skip("test requires CUDA hardware supporting bf16 and PyTorch >= 1.9")(test_case)
else:
return test_case
def get_tests_dir(append_path=None):
"""
Args:
append_path: optional path to append to the tests dir path
Return:
        The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
        joined after the `tests` dir if the former is provided.
"""
# this function caller's __file__
caller__file__ = inspect.stack()[1][1]
tests_dir = os.path.abspath(os.path.dirname(caller__file__))
if append_path:
return os.path.join(tests_dir, append_path)
else:
return tests_dir
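# Example (added commentary, not part of the original file): called from a test
# module living under tests/, get_tests_dir() returns the absolute path of that
# tests/ directory, and get_tests_dir("data") returns "<tests dir>/data".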
#
# Helper functions for dealing with testing text outputs
# The original code came from:
# https://github.com/fastai/fastai/blob/master/tests/utils/text.py
# When any function contains print() calls that get overwritten, like progress bars,
# a special care needs to be applied, since under pytest -s captured output (capsys
# or contextlib.redirect_stdout) contains any temporary printed strings, followed by
# \r's. This helper function ensures that the buffer will contain the same output
# with and without -s in pytest, by turning:
# foo bar\r tar mar\r final message
# into:
# final message
# it can handle a single string or a multiline buffer
def apply_print_resets(buf):
return re.sub(r"^.*\r", "", buf, 0, re.M)
def assert_screenout(out, what):
out_pr = apply_print_resets(out).lower()
match_str = out_pr.find(what.lower())
assert match_str != -1, f"expecting to find {what} in output: f{out_pr}"
class CaptureStd:
"""
Context manager to capture:
- stdout: replay it, clean it up and make it available via ``obj.out``
- stderr: replay it and make it available via ``obj.err``
init arguments:
    - out - capture stdout: ``True``/``False``, default ``True``
    - err - capture stderr: ``True``/``False``, default ``True``
    - replay - whether to replay or not: ``True``/``False``, default ``True``. By default each
      captured stream gets replayed back on context's exit, so that one can see what the test was
      doing. If this behavior is not wanted and the captured data shouldn't be replayed, pass
      ``replay=False`` to disable this feature.
Examples::
# to capture stdout only with auto-replay
with CaptureStdout() as cs:
print("Secret message")
assert "message" in cs.out
# to capture stderr only with auto-replay
import sys
with CaptureStderr() as cs:
print("Warning: ", file=sys.stderr)
assert "Warning" in cs.err
# to capture both streams with auto-replay
with CaptureStd() as cs:
print("Secret message")
print("Warning: ", file=sys.stderr)
assert "message" in cs.out
assert "Warning" in cs.err
# to capture just one of the streams, and not the other, with auto-replay
with CaptureStd(err=False) as cs:
print("Secret message")
assert "message" in cs.out
# but best use the stream-specific subclasses
# to capture without auto-replay
with CaptureStd(replay=False) as cs:
print("Secret message")
assert "message" in cs.out
"""
def __init__(self, out=True, err=True, replay=True):
self.replay = replay
if out:
self.out_buf = StringIO()
self.out = "error: CaptureStd context is unfinished yet, called too early"
else:
self.out_buf = None
self.out = "not capturing stdout"
if err:
self.err_buf = StringIO()
self.err = "error: CaptureStd context is unfinished yet, called too early"
else:
self.err_buf = None
self.err = "not capturing stderr"
def __enter__(self):
if self.out_buf:
self.out_old = sys.stdout
sys.stdout = self.out_buf
if self.err_buf:
self.err_old = sys.stderr
sys.stderr = self.err_buf
return self
def __exit__(self, *exc):
if self.out_buf:
sys.stdout = self.out_old
captured = self.out_buf.getvalue()
if self.replay:
sys.stdout.write(captured)
self.out = apply_print_resets(captured)
if self.err_buf:
sys.stderr = self.err_old
captured = self.err_buf.getvalue()
if self.replay:
sys.stderr.write(captured)
self.err = captured
def __repr__(self):
msg = ""
if self.out_buf:
msg += f"stdout: {self.out}\n"
if self.err_buf:
msg += f"stderr: {self.err}\n"
return msg
# in tests it's the best to capture only the stream that's wanted, otherwise
# it's easy to miss things, so unless you need to capture both streams, use the
# subclasses below (less typing). Or alternatively, configure `CaptureStd` to
# disable the stream you don't need to test.
class CaptureStdout(CaptureStd):
"""Same as CaptureStd but captures only stdout"""
def __init__(self, replay=True):
super().__init__(err=False, replay=replay)
class CaptureStderr(CaptureStd):
"""Same as CaptureStd but captures only stderr"""
def __init__(self, replay=True):
super().__init__(out=False, replay=replay)
class CaptureLogger:
"""
Context manager to capture `logging` streams
Args:
- logger: 'logging` logger object
Results:
The captured output is available via `self.out`
Example::
>>> from transformers import logging
>>> from transformers.testing_utils import CaptureLogger
>>> msg = "Testing 1, 2, 3"
>>> logging.set_verbosity_info()
>>> logger = logging.get_logger("transformers.models.bart.tokenization_bart")
>>> with CaptureLogger(logger) as cl:
... logger.info(msg)
>>> assert cl.out, msg+"\n"
"""
def __init__(self, logger):
self.logger = logger
self.io = StringIO()
self.sh = logging.StreamHandler(self.io)
self.out = ""
def __enter__(self):
self.logger.addHandler(self.sh)
return self
def __exit__(self, *exc):
self.logger.removeHandler(self.sh)
self.out = self.io.getvalue()
def __repr__(self):
return f"captured: {self.out}\n"
@contextlib.contextmanager
# adapted from https://stackoverflow.com/a/64789046/9201239
def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:
"""
Temporary add given path to `sys.path`.
Usage ::
with ExtendSysPath('/path/to/dir'):
mymodule = importlib.import_module('mymodule')
"""
path = os.fspath(path)
try:
sys.path.insert(0, path)
yield
finally:
sys.path.remove(path)
class TestCasePlus(unittest.TestCase):
"""
This class extends `unittest.TestCase` with additional features.
Feature 1: A set of fully resolved important file and dir path accessors.
    In tests we often need to know where things are relative to the current test file, and it's not trivial since the
    test could be invoked from more than one directory or could reside in sub-directories with different depths. This
    class solves this problem by sorting out all the basic paths and providing easy accessors to them:
* ``pathlib`` objects (all fully resolved):
- ``test_file_path`` - the current test file path (=``__file__``)
- ``test_file_dir`` - the directory containing the current test file
- ``tests_dir`` - the directory of the ``tests`` test suite
- ``data_dir`` - the directory of the ``tests/data`` test suite
- ``repo_root_dir`` - the directory of the repository
- ``src_dir`` - the directory of ``src`` (i.e. where the ``transformers`` sub-dir resides)
* stringified paths---same as above but these return paths as strings, rather than ``pathlib`` objects:
- ``test_file_path_str``
- ``test_file_dir_str``
- ``tests_dir_str``
- ``data_dir_str``
- ``repo_root_dir_str``
- ``src_dir_str``
Feature 2: Flexible auto-removable temporary dirs which are guaranteed to get removed at the end of test.
1. Create a unique temporary dir:
::
def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir()
``tmp_dir`` will contain the path to the created temporary dir. It will be automatically removed at the end of the
test.
2. Create a temporary dir of my choice, ensure it's empty before the test starts and don't
empty it after the test.
::
def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir("./xxx")
This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests
didn't leave any data in there.
3. You can override the first two options by directly overriding the ``before`` and ``after`` args, leading to the
following behavior:
``before=True``: the temporary dir will always be cleared at the beginning of the test.
``before=False``: if the temporary dir already existed, any existing files will remain there.
``after=True``: the temporary dir will always be deleted at the end of the test.
``after=False``: the temporary dir will always be left intact at the end of the test.
Note 1: In order to run the equivalent of ``rm -r`` safely, only subdirs of the project repository checkout are
allowed if an explicit ``tmp_dir`` is used, so that by mistake no ``/tmp`` or similar important part of the
filesystem will get nuked. i.e. please always pass paths that start with ``./``
Note 2: Each test can register multiple temporary dirs and they all will get auto-removed, unless requested
otherwise.
Feature 3: Get a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` specific to the current test suite.
This is useful for invoking external programs from the test suite - e.g. distributed training.
::
def test_whatever(self):
env = self.get_env()
"""
def setUp(self):
# get_auto_remove_tmp_dir feature:
self.teardown_tmp_dirs = []
# figure out the resolved paths for repo_root, tests, etc.
self._test_file_path = inspect.getfile(self.__class__)
path = Path(self._test_file_path).resolve()
self._test_file_dir = path.parents[0]
        for up in [1, 2, 3]:
            tmp_dir = path.parents[up]
            if (tmp_dir / "megatron").is_dir() and (tmp_dir / "tests").is_dir():
                break
        else:
            # the loop fell through without finding the repo root
            raise ValueError(f"can't figure out the root of the repo from {self._test_file_path}")
        self._repo_root_dir = tmp_dir
self._tests_dir = self._repo_root_dir / "tests"
self._data_dir = self._repo_root_dir / "tests" / "data"
self._src_dir = self._repo_root_dir # megatron doesn't use "src/" prefix in the repo
@property
def test_file_path(self):
return self._test_file_path
@property
def test_file_path_str(self):
return str(self._test_file_path)
@property
def test_file_dir(self):
return self._test_file_dir
@property
def test_file_dir_str(self):
return str(self._test_file_dir)
@property
def tests_dir(self):
return self._tests_dir
@property
def tests_dir_str(self):
return str(self._tests_dir)
@property
def data_dir(self):
return self._data_dir
@property
def data_dir_str(self):
return str(self._data_dir)
@property
def repo_root_dir(self):
return self._repo_root_dir
@property
def repo_root_dir_str(self):
return str(self._repo_root_dir)
@property
def src_dir(self):
return self._src_dir
@property
def src_dir_str(self):
return str(self._src_dir)
def get_env(self):
"""
Return a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` correctly. This is useful
for invoking external programs from the test suite - e.g. distributed training.
It always inserts ``.`` first, then ``./tests`` depending on the test suite type and
finally the preset ``PYTHONPATH`` if any (all full resolved paths).
"""
env = os.environ.copy()
paths = [self.src_dir_str]
paths.append(self.tests_dir_str)
paths.append(env.get("PYTHONPATH", ""))
env["PYTHONPATH"] = ":".join(paths)
return env
def get_auto_remove_tmp_dir(self, tmp_dir=None, before=None, after=None):
"""
Args:
tmp_dir (:obj:`string`, `optional`):
if :obj:`None`:
- a unique temporary path will be created
- sets ``before=True`` if ``before`` is :obj:`None`
- sets ``after=True`` if ``after`` is :obj:`None`
else:
- :obj:`tmp_dir` will be created
- sets ``before=True`` if ``before`` is :obj:`None`
- sets ``after=False`` if ``after`` is :obj:`None`
            before (:obj:`bool`, `optional`):
                If :obj:`True` and the :obj:`tmp_dir` already exists, make sure to empty it right away. If :obj:`False`
                and the :obj:`tmp_dir` already exists, any existing files will remain there.
            after (:obj:`bool`, `optional`):
                If :obj:`True`, delete the :obj:`tmp_dir` at the end of the test. If :obj:`False`, leave the
                :obj:`tmp_dir` and its contents intact at the end of the test.
Returns:
tmp_dir(:obj:`string`): either the same value as passed via `tmp_dir` or the path to the auto-selected tmp
dir
"""
if tmp_dir is not None:
# defining the most likely desired behavior for when a custom path is provided.
# this most likely indicates the debug mode where we want an easily locatable dir that:
# 1. gets cleared out before the test (if it already exists)
# 2. is left intact after the test
if before is None:
before = True
if after is None:
after = False
# using provided path
path = Path(tmp_dir).resolve()
# to avoid nuking parts of the filesystem, only relative paths are allowed
if not tmp_dir.startswith("./"):
raise ValueError(
f"`tmp_dir` can only be a relative path, i.e. `./some/path`, but received `{tmp_dir}`"
)
# ensure the dir is empty to start with
if before is True and path.exists():
shutil.rmtree(tmp_dir, ignore_errors=True)
path.mkdir(parents=True, exist_ok=True)
else:
# defining the most likely desired behavior for when a unique tmp path is auto generated
# (not a debug mode), here we require a unique tmp dir that:
# 1. is empty before the test (it will be empty in this situation anyway)
# 2. gets fully removed after the test
if before is None:
before = True
if after is None:
after = True
# using unique tmp dir (always empty, regardless of `before`)
tmp_dir = tempfile.mkdtemp()
if after is True:
# register for deletion
self.teardown_tmp_dirs.append(tmp_dir)
return tmp_dir
def tearDown(self):
# get_auto_remove_tmp_dir feature: remove registered temp dirs
for path in self.teardown_tmp_dirs:
shutil.rmtree(path, ignore_errors=True)
self.teardown_tmp_dirs = []
def mockenv(**kwargs):
"""
    this is a convenience wrapper that allows this ::
@mockenv(RUN_SLOW=True, USE_TF=False)
def test_something():
run_slow = os.getenv("RUN_SLOW", False)
use_tf = os.getenv("USE_TF", False)
"""
return mock.patch.dict(os.environ, kwargs)
# from https://stackoverflow.com/a/34333710/9201239
@contextlib.contextmanager
def mockenv_context(*remove, **update):
"""
Temporarily updates the ``os.environ`` dictionary in-place. Similar to mockenv
The ``os.environ`` dictionary is updated in-place so that the modification is sure to work in all situations.
Args:
remove: Environment variables to remove.
update: Dictionary of environment variables and values to add/update.
"""
env = os.environ
update = update or {}
remove = remove or []
# List of environment variables being updated or removed.
stomped = (set(update.keys()) | set(remove)) & set(env.keys())
# Environment variables and values to restore on exit.
update_after = {k: env[k] for k in stomped}
# Environment variables and values to remove on exit.
remove_after = frozenset(k for k in update if k not in env)
try:
env.update(update)
[env.pop(k, None) for k in remove]
yield
finally:
env.update(update_after)
[env.pop(k) for k in remove_after]
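# Example (added commentary, not part of the original file); the variable names
# and values below are illustrative only.
#
#   with mockenv_context(MASTER_ADDR="localhost", MASTER_PORT="9994"):
#       ...  # code here sees the patched os.environ; the original is restored on exit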
# --- distributed testing functions --- #
# adapted from https://stackoverflow.com/a/59041913/9201239
import asyncio # noqa
class _RunOutput:
def __init__(self, returncode, stdout, stderr):
self.returncode = returncode
self.stdout = stdout
self.stderr = stderr
async def _read_stream(stream, callback):
while True:
line = await stream.readline()
if line:
callback(line)
else:
break
async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:
if echo:
print("\nRunning: ", " ".join(cmd))
p = await asyncio.create_subprocess_exec(
cmd[0],
*cmd[1:],
stdin=stdin,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
env=env,
)
# note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe
# https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait
#
# If it starts hanging, will need to switch to the following code. The problem is that no data
# will be seen until it's done and if it hangs for example there will be no debug info.
# out, err = await p.communicate()
# return _RunOutput(p.returncode, out, err)
out = []
err = []
def tee(line, sink, pipe, label=""):
line = line.decode("utf-8").rstrip()
sink.append(line)
if not quiet:
print(label, line, file=pipe)
# XXX: the timeout doesn't seem to make any difference here
await asyncio.wait(
[
_read_stream(p.stdout, lambda l: tee(l, out, sys.stdout, label="stdout:")),
_read_stream(p.stderr, lambda l: tee(l, err, sys.stderr, label="stderr:")),
],
timeout=timeout,
)
return _RunOutput(await p.wait(), out, err)
def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
loop = asyncio.get_event_loop()
result = loop.run_until_complete(
_stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo)
)
cmd_str = " ".join(cmd)
if result.returncode > 0:
stderr = "\n".join(result.stderr)
raise RuntimeError(
f"'{cmd_str}' failed with returncode {result.returncode}\n\n"
f"The combined stderr from workers follows:\n{stderr}"
)
# check that the subprocess actually did run and produced some output, should the test rely on
# the remote side to do the testing
if not result.stdout and not result.stderr:
raise RuntimeError(f"'{cmd_str}' produced no output.")
return result
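# Example (added commentary, not part of the original file): a typical call from a
# TestCasePlus test, launching an external program with the PYTHONPATH prepared by
# get_env(); the command shown is purely illustrative.
#
#   cmd = [sys.executable, "-c", "print('hello from a subprocess')"]
#   execute_subprocess_async(cmd, env=self.get_env())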
# --- Misc utils --- #
def flatten_arguments(args):
"""
Converts dictionary argument to a list.
Note: we add "IGNORED" at the beginning as this value is ignored by the argparser
Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"]
"""
return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]