# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import numbers
import torch
from torch.nn.parameter import Parameter
from torch.nn import init
from megatron.core.utils import make_viewless_tensor
try:
    from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
    HAVE_PERSIST_LAYER_NORM = True
except ImportError:
    HAVE_PERSIST_LAYER_NORM = False

try:
    from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
    HAVE_FUSED_LAYER_NORM = True
except ImportError:
    HAVE_FUSED_LAYER_NORM = False
class FusedLayerNorm(torch.nn.Module):
def __init__(self, hidden_size, eps=1e-5,
persist_layer_norm=True,
sequence_parallel=False,
zero_centered_gamma=False):
super().__init__()
self.zero_centered_gamma = zero_centered_gamma
        # List of hidden sizes supported in the persistent layer norm kernel.
# If the hidden size is not supported, fall back to the non-persistent
# kernel.
persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096,
5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
24576, 25600, 30720, 32768, 40960, 49152, 65536]
if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM:
persist_layer_norm = False
        if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM:
            # TODO: Add pytorch only layer norm
            raise ValueError('Apex must currently be installed to use megatron core.')
if isinstance(hidden_size, numbers.Integral):
hidden_size = (hidden_size,)
self.hidden_size = torch.Size(hidden_size)
self.eps = eps
self.weight = Parameter(torch.Tensor(*hidden_size))
self.bias = Parameter(torch.Tensor(*hidden_size))
self.reset_parameters()
self.persist_layer_norm = persist_layer_norm
self.sequence_parallel = sequence_parallel
# set sequence parallelism flag on weight and bias parameters
setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
def reset_parameters(self):
if self.zero_centered_gamma:
init.zeros_(self.weight)
init.zeros_(self.bias)
else:
init.ones_(self.weight)
init.zeros_(self.bias)
def forward(self, input):
weight = self.weight + 1 if self.zero_centered_gamma else self.weight
if self.persist_layer_norm:
output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
# Apex's fast layer norm function outputs a 'view' tensor (i.e., has
# a populated '_base' field). This will result in schedule.py's
# deallocate_output_tensor() throwing an error, so a viewless tensor is
# created to prevent this.
output = make_viewless_tensor(inp = output,
requires_grad = input.requires_grad,
keep_graph = True)
else:
output = FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.hidden_size, self.eps)
return output
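
# A minimal usage sketch (an illustration, not part of the module): it assumes
# a CUDA device and that at least one of the Apex kernels imported above is
# available; the shapes and dtypes below are arbitrary examples.
if __name__ == "__main__" and torch.cuda.is_available() and (
        HAVE_PERSIST_LAYER_NORM or HAVE_FUSED_LAYER_NORM):
    ln = FusedLayerNorm(hidden_size=1024).cuda().half()
    x = torch.randn(8, 4, 1024, device="cuda", dtype=torch.float16)
    y = ln(x)  # normalizes over the last dimension; output shape matches input
    assert y.shape == x.shape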
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import torch
import torch.nn as nn
from megatron.core.transformer.enums import AttnMaskType
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following three operations in sequence:
1. Scale the tensor.
2. Apply upper triangular mask (typically used in gpt models).
3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
import scaled_upper_triang_masked_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
inputs, scale_t[0]
)
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_upper_triang_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_upper_triang_masked_softmax_cuda.backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None
class ScaledMaskedSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following three operations in sequence:
1. Scale the tensor.
2. Apply the mask.
3. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, mask, scale):
import scaled_masked_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_masked_softmax_cuda.backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None, None
class ScaledSoftmax(torch.autograd.Function):
"""
    Fused operation which performs the following two operations in sequence:
1. Scale the tensor.
2. Perform softmax.
"""
@staticmethod
def forward(ctx, inputs, scale):
import scaled_softmax_cuda
scale_t = torch.tensor([scale])
softmax_results = scaled_softmax_cuda.forward(
inputs, scale_t[0]
)
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
import scaled_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_softmax_cuda.backward(
output_grads, softmax_results, scale_t[0]
)
return input_grads, None, None
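
# For readability, a non-fused reference for the three autograd functions
# above: each computes softmax(scale * x) along the last dimension, with
# ScaledUpperTriangMaskedSoftmax additionally masking future positions and
# ScaledMaskedSoftmax applying a user-provided mask. This is a sketch of the
# kernels' semantics (assuming the Megatron convention that mask entries equal
# to 1 mark positions to suppress), not their implementation.
def _reference_scaled_masked_softmax(inputs, mask, scale):
    scaled = inputs * scale
    if mask is not None:
        # illustrative additive masking; the fused kernels do this in-kernel
        scaled = scaled.masked_fill(mask.bool(), -10000.0)
    return torch.nn.Softmax(dim=-1)(scaled)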
class FusedScaleMaskSoftmax(nn.Module):
"""
fused operation: scaling + mask + softmax
Arguments:
        input_in_fp16: flag to indicate if input is in fp16 data format.
        input_in_bf16: flag to indicate if input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate whether to use the fused softmax kernels
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed in fp32 precision.
scale: scaling factor used in input tensor scaling.
"""
def __init__(
self,
input_in_fp16,
input_in_bf16,
attn_mask_type,
scaled_masked_softmax_fusion,
mask_func,
softmax_in_fp32,
scale,
):
super(FusedScaleMaskSoftmax, self).__init__()
self.input_in_fp16 = input_in_fp16
self.input_in_bf16 = input_in_bf16
assert not (
self.input_in_fp16 and self.input_in_bf16
), "both fp16 and bf16 flags cannot be active at the same time."
self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
self.attn_mask_type = attn_mask_type
self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
self.mask_func = mask_func
self.softmax_in_fp32 = softmax_in_fp32
self.scale = scale
assert (
self.scale is None or softmax_in_fp32
), "softmax should be in fp32 when scaled"
def forward(self, input, mask):
# [b, np, sq, sk]
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
return self.forward_fused_softmax(input, mask)
else:
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
attn_batches = b * np
if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
            and 16 < sk <= 4096  # sk must be in (16, 4096]
            and sq % 4 == 0  # sq must be divisible by 4
            and sk % 4 == 0  # sk must be divisible by 4
            and attn_batches % 4 == 0  # np * b must be divisible by 4
):
if 0 <= sk <= 4096:
batch_per_block = self.get_batch_per_block(sq, sk, b, np)
if self.attn_mask_type == AttnMaskType.causal:
if attn_batches % batch_per_block == 0:
return True
else:
if sq % batch_per_block == 0:
return True
return False
def forward_fused_softmax(self, input, mask):
b, np, sq, sk = input.size()
scale = self.scale if self.scale is not None else 1.0
if self.attn_mask_type == AttnMaskType.causal:
assert sq == sk, "causal mask is only for self attention"
# input is 3D tensor (attn_batches, sq, sk)
input = input.view(-1, sq, sk)
probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
return probs.view(b, np, sq, sk)
else:
# input is 4D tensor (b, np, sq, sk)
if mask is not None:
return ScaledMaskedSoftmax.apply(input, mask, scale)
else:
return ScaledSoftmax.apply(input, scale)
def forward_torch_softmax(self, input, mask):
if self.input_in_float16 and self.softmax_in_fp32:
input = input.float()
if self.scale is not None:
input = input * self.scale
mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
probs = probs.half()
else:
probs = probs.bfloat16()
return probs
@staticmethod
def get_batch_per_block(sq, sk, b, np):
import scaled_masked_softmax_cuda
return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
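
# A minimal usage sketch (illustrative only). With fusion disabled the module
# always takes the forward_torch_softmax path, so it runs without the custom
# CUDA extensions; the lambda mask_func below is a stand-in for illustration,
# not part of this file.
if __name__ == "__main__":
    softmax = FusedScaleMaskSoftmax(
        input_in_fp16=False,
        input_in_bf16=False,
        attn_mask_type=AttnMaskType.padding,
        scaled_masked_softmax_fusion=False,
        mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
        softmax_in_fp32=True,
        scale=None,
    )
    scores = torch.randn(2, 4, 8, 8)                  # [b, np, sq, sk]
    mask = torch.zeros(2, 1, 8, 8, dtype=torch.bool)  # nothing masked
    probs = softmax(scores, mask)                     # rows sum to 1 over sk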
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Callable
import torch
@dataclass
class ModelParallelConfig:
"""Base configuration for Megatron Core
Model Parallelism
-----------------
tensor_model_parallel_size (int): Intra-layer model parallelism. Splits tensors across GPU ranks. Defaults to 1.
pipeline_model_parallel_size (int): Inter-layer model parallelism. Splits transformer layers across GPU
ranks. Defaults to 1.
virtual_pipeline_model_parallel_size (int): Interleaved pipeline parallelism is used to improve performance by
reducing the pipeline bubble. Considers a transformer block as a list of smaller transformer (virtual) blocks.
The number of virtual blocks per pipeline model parallel rank is the virtual model parallel size. See Efficient
Large-Scale Language Model Training on GPU Clusters Using Megatron-LM: https://arxiv.org/pdf/2104.04473.pdf for
more details. Defaults to None.
sequence_parallel (bool): Makes tensor parallelism more memory efficient for LLMs (20B+) by
parallelizing layer norms and dropout sequentially. See Reducing Activation Recomputation in Large Transformer
Models: https://arxiv.org/abs/2205.05198 for more details. Defaults to False.
Initialization
--------------
perform_initialization (bool, default=True): If true, weights are initialized. This option can be useful when you
know you are going to load values from a checkpoint.
    use_cpu_initialization (bool, default=False): When set to False, we initialize the weights directly on the GPU.
        Transferring weights from CPU to GPU can take a significant amount of time for large models.
Training
--------
fp16 (bool): If true, train with fp16 mixed precision training. Defaults to False.
bf16 (bool): If true, train with bf16 mixed precision training. Defaults to False.
    params_dtype (torch.dtype): dtype used when initializing the weights. Defaults to torch.float32.
timers (optional, default=None): TODO
Optimizations
-------------
gradient_accumulation_fusion (bool): If true, fuses weight gradient accumulation to GEMMs. Requires the custom CUDA
extension fused_weight_gradient_mlp_cuda module. To use gradient_accumulation_fusion you must install APEX with
--cpp_ext and --cuda_ext. For example: "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\"
". Note that the extension requires CUDA>=11. Otherwise, you must turn off gradient accumulation fusion.
Defaults to False.
    async_tensor_model_parallel_allreduce (bool): If true, enables asynchronous execution of the
        tensor-model-parallel all-reduce with weight gradient computation of a column-linear layer.
        Defaults to False.
Pipeline Parallelism
--------------------
pipeline_dtype (required): dtype used in p2p communication, usually params_dtype
grad_scale_func (optional, default=None): If using loss scaling, this function should take the loss and return the
scaled loss. If None, no function is called on the loss.
enable_autocast (bool): If true runs the forward step function inside torch.autocast context. Default is False.
autocast_dtype (torch.dtype): dtype to pass to torch.amp.autocast when enabled. Default is pipeline_dtype.
    variable_seq_lengths (bool, default=False): Support for variable sequence lengths across microbatches. Setting
        this communicates the sizes of tensors during pipeline-parallel communication; because of this extra
        overhead, it should only be set when the sequence length varies by microbatch within a global batch.
num_microbatches_with_partial_activation_checkpoints (int, default=None): If int, set the number of microbatches
where not all of the layers will be checkpointed and recomputed. The rest of the microbatches within the window
of maximum outstanding microbatches will recompute all layers (either full recompute or selective recompute). If
None, the checkpoint and recompute will be left up to the forward_step function.
    overlap_p2p_comm (bool, optional, default=False): When True, some of the peer-to-peer communication for
        pipeline parallelism will overlap with computation. Must be False if batch_p2p_comm is true.
batch_p2p_comm (bool, default=True): Use batch_isend_irecv instead of individual isend/irecv calls. Must be False
if overlap_p2p_comm is True.
    batch_p2p_sync (bool, default=True): When using batch_isend_irecv, do a cuda.device.synchronize afterward to work
        around a bug in older versions of PyTorch.
use_ring_exchange_p2p (bool, default = False): Use custom ring_exchange kernel instead of
torch.distributed.batch_isend_irecv(). Requires custom built torch with torch.distributed.ring_exchange.
deallocate_pipeline_outputs (optional, default=False): If True, output data is deallocated after the tensor is sent
to the next pipeline stage. Helps with saving memory, does nothing when pipeline parallel is not used.
no_sync_func (optional): Function that creates a context that suppresses asynchronous data-parallel
communication. If the model is an instance of torch.nn.DistributedDataParallel, the default is to use
torch.nn.DistributedDataParallel.no_sync.
grad_sync_func (optional): Function that launches asynchronous gradient reductions (e.g. distributed optimizer
gradient reduce-scatters). The function should take one argument: an iterable of parameters whose gradients are
to be synchronized.
param_sync_func (optional): Function that launches asynchronous parameter synchronizations (e.g. distributed
optimizer parameter all-gathers). The function should take one argument: an iterable of parameters to be
synchronized.
"""
# Model parallelism
tensor_model_parallel_size: int = 1
pipeline_model_parallel_size: int = 1
virtual_pipeline_model_parallel_size: int = None
sequence_parallel: bool = False
# Initialization
perform_initialization: bool = True
use_cpu_initialization: bool = False
# Training
fp16: bool = False
bf16: bool = False
params_dtype: torch.dtype = torch.float32
timers: Callable = None
# Optimizations
gradient_accumulation_fusion: bool = False
async_tensor_model_parallel_allreduce: bool = False
# Pipeline Parallel
pipeline_dtype: torch.dtype = None
grad_scale_func: Callable = None
enable_autocast: bool = False
autocast_dtype: torch.dtype = None
variable_seq_lengths: bool = False
num_microbatches_with_partial_activation_checkpoints: int = None
overlap_p2p_comm: bool = False
batch_p2p_comm: bool = True
batch_p2p_sync: bool = True
use_ring_exchange_p2p: bool = False
deallocate_pipeline_outputs: bool = False
no_sync_func: Callable = None
grad_sync_func: Callable = None
param_sync_func: Callable = None
def __post_init__(self):
""" Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
"""
if self.sequence_parallel:
if self.tensor_model_parallel_size <= 1:
raise ValueError("Can not use sequence paralllelism without tensor parallelism")
if self.async_tensor_model_parallel_allreduce:
# sequence_parallelism already does this async
self.async_tensor_model_parallel_allreduce = False
if self.pipeline_model_parallel_size > 1:
if self.pipeline_dtype is None:
raise ValueError("When using pipeline parallelism, pipeline_dtype must be specified")
if self.autocast_dtype is None:
self.autocast_dtype = self.params_dtype
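
# A small construction sketch (illustrative values only): __post_init__
# enforces the cross-field rules documented above.
if __name__ == "__main__":
    cfg = ModelParallelConfig(
        tensor_model_parallel_size=2,
        pipeline_model_parallel_size=2,
        pipeline_dtype=torch.float16,   # required once pipeline size > 1
        sequence_parallel=True,
        async_tensor_model_parallel_allreduce=True,
    )
    # sequence parallelism already overlaps the all-reduce, so the flag is
    # forced back off, and autocast_dtype falls back to params_dtype.
    assert cfg.async_tensor_model_parallel_allreduce is False
    assert cfg.autocast_dtype == cfg.params_dtype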
from .gpt_model import GPTModel
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import torch
from megatron.core import tensor_parallel
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
class GPTEmbedding(MegatronModule):
"""Language model embeddings.
Arguments:
config (TransformerConfig): config object with all necessary configs for TransformerBlock
vocab_size (int): vocabulary size
max_sequence_length (int): maximum size of sequence. This
is used for positional embedding
        embedding_dropout_prob (float): dropout probability for embeddings
"""
def __init__(self, config: TransformerConfig, vocab_size: int, max_sequence_length: int):
super().__init__(config=config)
self.config: TransformerConfig = config
self.vocab_size: int = vocab_size
self.max_sequence_length: int = max_sequence_length
# Word embeddings (parallel).
self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
num_embeddings=self.vocab_size,
embedding_dim=self.config.hidden_size,
init_method=self.config.init_method,
config=self.config
)
# @jcasper are these keys needed?
self._word_embeddings_key = 'word_embeddings'
# Position embedding (serial).
self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, self.config.hidden_size)
self._position_embeddings_key = 'position_embeddings'
# Initialize the position embeddings.
if self.config.perform_initialization:
self.config.init_method(self.position_embeddings.weight)
# Embeddings dropout
self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout)
def zero_parameters(self):
"""Zero out all parameters in embedding."""
self.word_embeddings.weight.data.fill_(0)
self.word_embeddings.weight.shared = True
self.position_embeddings.weight.data.fill_(0)
self.position_embeddings.weight.shared = True
def forward(self, input_ids, position_ids):
# Embeddings.
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
embeddings = words_embeddings + position_embeddings
        # Data format change to avoid explicit transposes: [b s h] --> [s b h].
        embeddings = embeddings.transpose(0, 1).contiguous()
        # If the input flag for fp32 residual connection is set, convert to float.
if self.config.fp32_residual_connection:
embeddings = embeddings.float()
# Dropout.
if self.config.sequence_parallel:
embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
with tensor_parallel.get_cuda_rng_tracker().fork():
embeddings = self.embedding_dropout(embeddings)
else:
embeddings = self.embedding_dropout(embeddings)
return embeddings
def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
"""For easy load."""
state_dict_ = {}
state_dict_[self._word_embeddings_key] = self.word_embeddings.state_dict(prefix=prefix, keep_vars=keep_vars)
state_dict_[self._position_embeddings_key] = self.position_embeddings.state_dict(
prefix=prefix, keep_vars=keep_vars
)
return state_dict_
def load_state_dict(self, state_dict, strict=True):
"""Customized load."""
# Word embedding.
if self._word_embeddings_key in state_dict:
state_dict_ = state_dict[self._word_embeddings_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'word_embeddings' in key:
state_dict_[key.split('word_embeddings.')[1]] = state_dict[key]
self.word_embeddings.load_state_dict(state_dict_, strict=strict)
# Position embedding.
if self._position_embeddings_key in state_dict:
state_dict_ = state_dict[self._position_embeddings_key]
else:
# for backward compatibility.
state_dict_ = {}
for key in state_dict.keys():
if 'position_embeddings' in key:
state_dict_[key.split('position_embeddings.')[1]] = state_dict[key]
self.position_embeddings.load_state_dict(state_dict_, strict=strict)
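
# Shape sketch (a hedged illustration; running it requires an initialized
# model-parallel state and a real TransformerConfig, so it is left as comments):
#
#     embedding = GPTEmbedding(config, vocab_size=50304, max_sequence_length=2048)
#     # input_ids, position_ids: [b, s] integer tensors
#     out = embedding(input_ids, position_ids)   # [s, b, h] after the transpose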
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import logging

import torch
from torch import Tensor
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.transformer.enums import AttnMaskType, ModelType
from megatron.core.models.gpt.gpt_embedding import GPTEmbedding
class GPTModel(MegatronModule):
"""Transformer language model.
Arguments:
config (TransformerConfig): transformer config
vocab_size (int): vocabulary size
max_sequence_length (int): maximum size of sequence. This is used for positional embedding
pre_process (bool): Include embedding layer (used with pipeline parallelism)
post_process (bool): Include an output layer (used with pipeline parallelism)
parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks
share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are
shared. Defaults to False.
"""
def __init__(
self,
config: TransformerConfig,
vocab_size: int,
max_sequence_length: int,
pre_process: bool = True,
post_process: bool = True,
fp16_lm_cross_entropy: bool = False,
parallel_output: bool = True,
share_embeddings_and_output_weights: bool = False,
):
super(GPTModel, self).__init__(config=config)
self.config: TransformerConfig = config
self.vocab_size = vocab_size
self.max_sequence_length = max_sequence_length
self.pre_process = pre_process
self.post_process = post_process
self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
self.parallel_output = parallel_output
self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
# megatron core pipelining currently depends on model type
self.model_type = ModelType.encoder_or_decoder
# Embeddings.
if self.pre_process:
self.embedding = GPTEmbedding(
config=self.config, vocab_size=self.vocab_size, max_sequence_length=self.max_sequence_length,
)
# Transformer.
self.decoder = TransformerBlock(
config=self.config,
self_attn_mask_type=AttnMaskType.causal,
pre_process=self.pre_process,
post_process=self.post_process,
)
# Output
if post_process:
self.output_layer = tensor_parallel.ColumnParallelLinear(
config.hidden_size,
self.vocab_size,
config=config,
init_method=config.init_method,
bias=False,
skip_bias_add=False,
gather_output=not self.parallel_output,
skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights)
if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process):
self.initialize_last_stage_with_word_embeddings()
def set_input_tensor(self, input_tensor):
""" See megatron.model.transformer.set_input_tensor()"""
# This is usually handled in schedules.py but some inference code still
# gives us non-lists or None
if not isinstance(input_tensor, list):
input_tensor = [input_tensor]
assert len(input_tensor) == 1, 'input_tensor should only be length 1 for gpt'
self.decoder.set_input_tensor(input_tensor[0])
def forward(
self,
input_ids: Tensor,
position_ids: Tensor,
attention_mask: Tensor,
labels: Tensor = None,
inference_params=None,
):
# Encoder embedding.
if self.pre_process:
decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
else:
# intermediate stage of pipeline
# encoder will get hidden_states from encoder.input_tensor
decoder_input = None
# Run encoder.
hidden_states = self.decoder(
hidden_states=decoder_input, attention_mask=attention_mask, inference_params=inference_params
)
if not self.post_process:
return hidden_states
# logits and loss
output_weight = None
if self.share_embeddings_and_output_weights:
output_weight = self.shared_embedding_or_output_weight()
logits, _ = self.output_layer(hidden_states, weight=output_weight)
if labels is None:
# [s b h] => [b s h]
return logits.transpose(0, 1).contiguous()
# [b s] => [s b]
labels = labels.transpose(0, 1).contiguous()
loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels)
# [s b] => [b, s]
loss = loss.transpose(0, 1).contiguous()
return loss
def shared_embedding_or_output_weight(self):
if self.pre_process:
return self.embedding.word_embeddings.weight
elif self.post_process:
return self.output_layer.weight
return None
def initialize_last_stage_with_word_embeddings(self):
# This function just initializes the word embeddings in the final stage
# when we are using pipeline parallelism and sharing word
# embeddings. Nothing to do if we aren't sharing weights or aren't using
# pipeline parallelism.
if not self.share_embeddings_and_output_weights or (self.pre_process and self.post_process):
return
if self.post_process and not self.pre_process:
assert not parallel_state.is_pipeline_first_stage()
# set word_embeddings weights to 0 here, then copy first
# stage's weights using all_reduce below.
self.output_layer.weight.data.fill_(0)
self.output_layer.weight.shared = True
# Parameters are shared between the word embeddings layers, and the
# heads at the end of the model. In a pipelined setup with more than
# one stage, the initial embedding layer and the head are on different
# workers, so we do the following:
# 1. Create a second copy of word_embeddings on the last stage, with
# initial parameters of 0.0.
# 2. Do an all-reduce between the first and last stage to ensure that
# the two copies of word_embeddings start off with the same
# parameter values.
        # 3. In the training loop, all-reduce the grads of the two
        #    word_embeddings layers to ensure that every applied weight
        #    update is the same on both stages.
# Ensure that first and last stages have the same initial parameter
# values.
if torch.distributed.is_initialized():
if parallel_state.is_rank_in_embedding_group():
weight = self.shared_embedding_or_output_weight()
torch.distributed.all_reduce(weight.data, group=parallel_state.get_embedding_group())
elif not getattr(GPTModel, "embedding_warning_printed", False):
logging.getLogger(__name__).warning(
"Distributed processes aren't initialized, so the output layer "
"is not initialized with weights from the word embeddings. "
"If you are just manipulating a model this is fine, but "
"this needs to be handled manually. If you are training "
"something is definitely wrong."
)
GPTModel.embedding_warning_printed = True
# TODO: add distributed checkpointing
def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
pass
# """For easy load."""
# state_dict_ = {}
# if self.pre_process:
# state_dict_[self._embedding_key] = self.embedding.state_dict_for_save_checkpoint(
# prefix=prefix, keep_vars=keep_vars
# )
# state_dict_[self._encoder_key] = self.encoder.state_dict_for_save_checkpoint(
# prefix=prefix, keep_vars=keep_vars
# )
# return state_dict_
# TODO: add distributed checkpointing
def load_state_dict(self, state_dict, strict=True):
pass
# """Customized load."""
# # Embedding.
# if self.pre_process:
# if self._embedding_key in state_dict:
# state_dict_ = state_dict[self._embedding_key]
# else:
# # for backward compatibility.
# state_dict_ = {}
# for key in state_dict.keys():
# if '_embeddings' in key:
# state_dict_[key] = state_dict[key]
# self.embedding.load_state_dict(state_dict_, strict=strict)
# # Encoder.
# if self._encoder_key in state_dict:
# state_dict_ = state_dict[self._encoder_key]
# # For backward compatibility.
# elif 'transformer' in state_dict:
# state_dict_ = state_dict['transformer']
# else:
# # For backward compatibility.
# state_dict_ = {}
# for key in state_dict.keys():
# if 'transformer.' in key:
# state_dict_[key.split('transformer.')[1]] = state_dict[key]
# # For backward compatibility.
# state_dict_self_attention = {}
# for key in state_dict_.keys():
# if '.attention.' in key:
# state_dict_self_attention[key.replace(".attention.", ".self_attention.")] = state_dict_[key]
# else:
# state_dict_self_attention[key] = state_dict_[key]
# state_dict_ = state_dict_self_attention
# self.encoder.load_state_dict(state_dict_, strict=strict)
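
# Call sketch (a hedged illustration; it requires an initialized parallel
# state and a TransformerConfig, and assumes a single pipeline stage so that
# pre_process and post_process are both True):
#
#     model = GPTModel(config, vocab_size=50304, max_sequence_length=2048)
#     logits = model(input_ids, position_ids, attention_mask)
#     # logits: [b, s, vocab] (the vocab dimension is the per-rank shard when
#     # parallel_output=True)
#     loss = model(input_ids, position_ids, attention_mask, labels=labels)  # [b, s]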
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
MAJOR = 0
MINOR = 2
PATCH = 0
PRE_RELEASE = ''
# Use the following formatting: (major, minor, patch, pre-release)
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
__shortversion__ = '.'.join(map(str, VERSION[:3]))
__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])
__package_name__ = 'megatron_core'
__contact_names__ = 'NVIDIA'
__contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email
__homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage
__repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = 'Megatron Core - a library for efficient and scalable training of transformer based models'
__license__ = 'BSD-3'
__keywords__ = 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch'
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Model and data parallel groups."""
import torch
from typing import Optional
from .utils import GlobalMemoryBuffer
# Intra-layer model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Inter-layer model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# Model parallel group (both intra- and pipeline) that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Embedding group.
_EMBEDDING_GROUP = None
# Position embedding group.
_POSITION_EMBEDDING_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None
_DATA_PARALLEL_GROUP_GLOO = None
# FP8 amax reduction group.
_AMAX_REDUCTION_GROUP = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None
# These values enable us to change the mpu sizes on the fly.
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_TENSOR_MODEL_PARALLEL_RANK = None
_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
# A list of ranks that have a copy of the embedding.
_EMBEDDING_GLOBAL_RANKS = None
# A list of ranks that have a copy of the position embedding.
_POSITION_EMBEDDING_GLOBAL_RANKS = None
# A list of global ranks for each pipeline group to ease calculation of the source
# rank when broadcasting from the first or last pipeline stage.
_PIPELINE_GLOBAL_RANKS = None
# For DeepSpeed's sequence parallel
_SEQUENCE_PARALLEL_GROUP = None
_SEQUENCE_PARALLEL_WORLD_SIZE = None
_SEQUENCE_PARALLEL_RANK = None
# This group includes processes for both data and sequence parallelisms.
# We use this group to reduce gradients and shard parameters and optimizer states for ZeRO.
_SEQUENCE_DATA_PARALLEL_GROUP = None
_SEQUENCE_DATA_PARALLEL_WORLD_SIZE = None
_SEQUENCE_DATA_PARALLEL_RANK = None
# A list of global ranks for each data parallel group to ease calculation of the source
# rank when broadcasting weights from src to all other data parallel ranks
_DATA_PARALLEL_GLOBAL_RANKS = None
# Memory buffers to avoid dynamic memory allocation
_GLOBAL_MEMORY_BUFFER = None
def initialize_model_parallel(
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
sequence_parallel_size: int = 1,
virtual_pipeline_model_parallel_size: Optional[int] = None,
pipeline_model_parallel_split_rank: Optional[int] = None,
use_fp8: bool = False,
use_distributed_optimizer: bool = False,
) -> None:
"""Initialize model data parallel groups.
Arguments:
tensor_model_parallel_size (int, default = 1):
The number of GPUs to split individual tensors across.
pipeline_model_parallel_size (int, default = 1):
The number of tensor parallel GPU groups to split the
Transformer layers across. For example, if
tensor_model_parallel_size is 4 and
pipeline_model_parallel_size is 2, the model will be split
into 2 groups of 4 GPUs.
virtual_pipeline_model_parallel_size (int, optional):
The number of stages that each pipeline group will have,
interleaving as necessary. If None, no interleaving is
performed. For example, if tensor_model_parallel_size is 1,
pipeline_model_parallel_size is 4,
virtual_pipeline_model_parallel_size is 2, and there are
16 transformer layers in the model, the model will be
split into 8 stages with two layers each and each GPU
would get 2 stages as such (layer number starting with 1):
GPU 0: [1, 2] [9, 10]
GPU 1: [3, 4] [11, 12]
GPU 2: [5, 6] [13, 14]
GPU 3: [7, 8] [15, 16]
pipeline_model_parallel_split_rank (int, optional):
For models with both an encoder and decoder, the rank in
pipeline to switch between encoder and decoder (i.e. the
first rank of the decoder). This allows the user to set
the pipeline parallel size of the encoder and decoder
independently. For example, if
pipeline_model_parallel_size is 8 and
pipeline_model_parallel_split_rank is 3, then ranks 0-2
will be the encoder and ranks 3-7 will be the decoder.
use_fp8 (bool, default = False):
Construct GPU groups needed for FP8 training, namely for
amax reduction across the product of the data-parallel and
tensor-parallel groups.
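        sequence_parallel_size (int, default = 1):
            The number of GPUs over which DeepSpeed's sequence parallelism
            splits activations. Must be 1 when tensor or pipeline model
            parallelism is in use (see the assertion below).
        use_distributed_optimizer (bool, default = False):
            Also create Gloo-backed data-parallel groups, used by the
            distributed optimizer.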
Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
the model pipeline. The present function will
create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
and 8 data-parallel groups as:
8 data_parallel groups:
[g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
8 tensor model-parallel groups:
[g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
4 pipeline model-parallel groups:
[g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
Note that for efficiency, the caller should make sure adjacent ranks
are on the same DGX box. For example if we are using 2 DGX-1 boxes
with a total of 16 GPUs, rank 0 to 7 belong to the first box and
ranks 8 to 15 belong to the second box.
"""
# Get world size and rank. Ensure some consistencies.
assert torch.distributed.is_initialized()
world_size: int = torch.distributed.get_world_size()
if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
raise RuntimeError(
f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
)
enable_ds_sequence_parallel = sequence_parallel_size > 1
if enable_ds_sequence_parallel:
assert tensor_model_parallel_size == 1 and pipeline_model_parallel_size == 1, \
'DeepSpeed\'s sequence parallel does not work with tensor parallel or pipeline parallel'
if world_size % sequence_parallel_size != 0:
raise RuntimeError(
f"world_size ({world_size}) is not divisible by sequence_parallel_size {sequence_parallel_size})"
)
data_parallel_size: int = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size * sequence_parallel_size)
sequence_data_parallel_size: int = sequence_parallel_size * data_parallel_size
num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
num_data_parallel_groups: int = world_size // data_parallel_size
num_sequence_parallel_groups: int = world_size // sequence_parallel_size
num_sequence_data_parallel_groups: int = world_size // sequence_parallel_size // data_parallel_size
if virtual_pipeline_model_parallel_size is not None:
        if not pipeline_model_parallel_size > 2:
            raise RuntimeError("pipeline-model-parallel size should be greater than 2 with interleaved schedule")
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size
if pipeline_model_parallel_split_rank is not None:
global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank
rank = torch.distributed.get_rank()
# Build the data-parallel groups.
global _DATA_PARALLEL_GROUP
global _DATA_PARALLEL_GROUP_GLOO
global _DATA_PARALLEL_GLOBAL_RANKS
assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized'
all_data_parallel_group_ranks = []
for i in range(pipeline_model_parallel_size):
start_rank = i * num_pipeline_model_parallel_groups
end_rank = (i + 1) * num_pipeline_model_parallel_groups
if sequence_parallel_size > 1:
tp_or_sp_size = sequence_parallel_size
else:
tp_or_sp_size = tensor_model_parallel_size
for j in range(tp_or_sp_size):
ranks = range(start_rank + j, end_rank, tp_or_sp_size)
all_data_parallel_group_ranks.append(list(ranks))
group = torch.distributed.new_group(ranks)
if use_distributed_optimizer:
group_gloo = torch.distributed.new_group(ranks, backend="gloo")
else:
group_gloo = None
if rank in ranks:
_DATA_PARALLEL_GROUP = group
_DATA_PARALLEL_GROUP_GLOO = group_gloo
_DATA_PARALLEL_GLOBAL_RANKS = ranks
# Build the sequence parallel groups.
global _SEQUENCE_PARALLEL_GROUP
assert _SEQUENCE_PARALLEL_GROUP is None, \
'sequence parallel group is already initialized'
for i in range(num_sequence_parallel_groups):
ranks = range(i * sequence_parallel_size,
(i + 1) * sequence_parallel_size)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_SEQUENCE_PARALLEL_GROUP = group
# Build the sequence data parallel groups.
global _SEQUENCE_DATA_PARALLEL_GROUP
assert _SEQUENCE_DATA_PARALLEL_GROUP is None, \
'sequence data parallel group is already initialized'
all_data_sequence_parallel_group_ranks = []
if enable_ds_sequence_parallel:
for i in range(num_sequence_data_parallel_groups):
ranks = range(i * sequence_data_parallel_size,
(i + 1) * sequence_data_parallel_size)
group = torch.distributed.new_group(ranks)
all_data_sequence_parallel_group_ranks.append(list(ranks))
if rank in ranks:
_SEQUENCE_DATA_PARALLEL_GROUP = group
else:
_SEQUENCE_DATA_PARALLEL_GROUP = _DATA_PARALLEL_GROUP
# Build the model-parallel groups.
global _MODEL_PARALLEL_GROUP
assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
num_model_parallel_groups = sequence_data_parallel_size if enable_ds_sequence_parallel else data_parallel_size
model_parallel_group_ranks = all_data_sequence_parallel_group_ranks if enable_ds_sequence_parallel else all_data_parallel_group_ranks
for i in range(num_model_parallel_groups):
ranks = [parallel_group_ranks[i] for parallel_group_ranks in model_parallel_group_ranks]
group = torch.distributed.new_group(ranks)
if rank in ranks:
_MODEL_PARALLEL_GROUP = group
# Build the tensor model-parallel groups.
global _TENSOR_MODEL_PARALLEL_GROUP
assert _TENSOR_MODEL_PARALLEL_GROUP is None, 'tensor model parallel group is already initialized'
for i in range(num_tensor_model_parallel_groups):
ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_TENSOR_MODEL_PARALLEL_GROUP = group
# Build the pipeline model-parallel groups and embedding groups
# (first and last rank in each pipeline model-parallel group).
global _PIPELINE_MODEL_PARALLEL_GROUP
global _PIPELINE_GLOBAL_RANKS
assert _PIPELINE_MODEL_PARALLEL_GROUP is None, 'pipeline model parallel group is already initialized'
global _EMBEDDING_GROUP
global _EMBEDDING_GLOBAL_RANKS
assert _EMBEDDING_GROUP is None, 'embedding group is already initialized'
global _POSITION_EMBEDDING_GROUP
global _POSITION_EMBEDDING_GLOBAL_RANKS
assert _POSITION_EMBEDDING_GROUP is None, 'position embedding group is already initialized'
for i in range(num_pipeline_model_parallel_groups):
ranks = range(i, world_size, num_pipeline_model_parallel_groups)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_PIPELINE_MODEL_PARALLEL_GROUP = group
_PIPELINE_GLOBAL_RANKS = ranks
# Setup embedding group (to exchange gradients between
# first and last stages).
if len(ranks) > 1:
embedding_ranks = [ranks[0], ranks[-1]]
position_embedding_ranks = [ranks[0]]
if pipeline_model_parallel_split_rank is not None:
if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank], ranks[-1]]
if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
position_embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank]]
else:
embedding_ranks = ranks
position_embedding_ranks = ranks
group = torch.distributed.new_group(embedding_ranks)
if rank in embedding_ranks:
_EMBEDDING_GROUP = group
if rank in ranks:
_EMBEDDING_GLOBAL_RANKS = embedding_ranks
group = torch.distributed.new_group(position_embedding_ranks)
if rank in position_embedding_ranks:
_POSITION_EMBEDDING_GROUP = group
if rank in ranks:
_POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks
# Build the FP8 groups.
global _AMAX_REDUCTION_GROUP
assert _AMAX_REDUCTION_GROUP is None, \
'FP8 amax reduction group is already initialized'
if use_fp8:
amax_group_size: int = tensor_model_parallel_size * data_parallel_size
num_amax_groups: int = world_size // amax_group_size
for i in range(num_amax_groups):
start_rank = i * amax_group_size
end_rank = (i + 1) * amax_group_size
ranks = range(start_rank, end_rank)
group = torch.distributed.new_group(ranks)
if rank in ranks:
_AMAX_REDUCTION_GROUP = group
# Initialize global memory buffer
# This isn't really "parallel state" but there isn't another good place to
# put this. If we end up with a more generic initialization of megatron-core
# we could stick it there
_set_global_memory_buffer()
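
# A worked example of the grouping arithmetic described in the docstring of
# initialize_model_parallel(), in plain Python (no torch.distributed needed):
# 16 ranks, tensor_model_parallel_size=2, pipeline_model_parallel_size=4.
if __name__ == "__main__":
    world_size, tp, pp = 16, 2, 4
    num_pp_groups = world_size // pp
    tensor_groups = [list(range(i * tp, (i + 1) * tp))
                     for i in range(world_size // tp)]
    pipeline_groups = [list(range(i, world_size, num_pp_groups))
                       for i in range(num_pp_groups)]
    data_groups = []
    for i in range(pp):
        start, end = i * num_pp_groups, (i + 1) * num_pp_groups
        data_groups.extend(list(range(start + j, end, tp)) for j in range(tp))
    assert tensor_groups[0] == [0, 1]           # matches the docstring above
    assert pipeline_groups[0] == [0, 4, 8, 12]
    assert data_groups[0] == [0, 2]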
def is_unitialized():
"""Useful for code segments that may be accessed with or without mpu initialization"""
return _DATA_PARALLEL_GROUP is None
def model_parallel_is_initialized():
"""Check if model and data parallel groups are initialized."""
if _TENSOR_MODEL_PARALLEL_GROUP is None or _PIPELINE_MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None:
return False
return True
def sequence_parallel_is_initialized():
"""Check if sequence and data parallel groups are initialized."""
if _SEQUENCE_PARALLEL_GROUP is None or \
_DATA_PARALLEL_GROUP is None:
return False
return True
def sequence_data_parallel_is_initialized():
"""Check if sequence data parallel groups are initialized."""
if _SEQUENCE_DATA_PARALLEL_GROUP is None:
return False
return True
def get_model_parallel_group():
"""Get the model parallel group the caller rank belongs to."""
assert _MODEL_PARALLEL_GROUP is not None, 'model parallel group is not initialized'
return _MODEL_PARALLEL_GROUP
def get_tensor_model_parallel_group(check_initialized=True):
"""Get the tensor model parallel group the caller rank belongs to."""
if check_initialized:
assert _TENSOR_MODEL_PARALLEL_GROUP is not None, 'tensor model parallel group is not initialized'
return _TENSOR_MODEL_PARALLEL_GROUP
def get_pipeline_model_parallel_group():
"""Get the pipeline model parallel group the caller rank belongs to."""
assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, 'pipeline_model parallel group is not initialized'
return _PIPELINE_MODEL_PARALLEL_GROUP
def get_sequence_parallel_group():
"""Get the sequence parallel group the caller rank belongs to."""
assert _SEQUENCE_PARALLEL_GROUP is not None, \
'sequence parallel group is not initialized'
return _SEQUENCE_PARALLEL_GROUP
def get_sequence_data_parallel_group():
"""Get the sequence parallel group the caller rank belongs to."""
assert _SEQUENCE_DATA_PARALLEL_GROUP is not None, \
'sequence data parallel group is not initialized'
return _SEQUENCE_DATA_PARALLEL_GROUP
def get_data_parallel_group():
"""Get the data parallel group the caller rank belongs to."""
assert _DATA_PARALLEL_GROUP is not None, 'data parallel group is not initialized'
return _DATA_PARALLEL_GROUP
def get_data_parallel_group_gloo():
"""Get the data parallel group-gloo the caller rank belongs to."""
assert _DATA_PARALLEL_GROUP_GLOO is not None, \
'data parallel group-gloo is not initialized'
return _DATA_PARALLEL_GROUP_GLOO
def get_embedding_group():
"""Get the embedding group the caller rank belongs to."""
assert _EMBEDDING_GROUP is not None, 'embedding group is not initialized'
return _EMBEDDING_GROUP
def get_position_embedding_group():
"""Get the position embedding group the caller rank belongs to."""
assert _POSITION_EMBEDDING_GROUP is not None, 'position embedding group is not initialized'
return _POSITION_EMBEDDING_GROUP
def get_amax_reduction_group():
"""Get the FP8 amax reduction group the caller rank belongs to."""
assert _AMAX_REDUCTION_GROUP is not None, \
'FP8 amax reduction group is not initialized'
return _AMAX_REDUCTION_GROUP
def set_tensor_model_parallel_world_size(world_size):
"""Set the tensor model parallel size"""
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
def set_sequence_parallel_world_size(world_size):
"""Set the sequence parallel size"""
global _SEQUENCE_PARALLEL_WORLD_SIZE
_SEQUENCE_PARALLEL_WORLD_SIZE = world_size
def set_sequence_data_parallel_world_size(world_size):
"""Set the sequence parallel size"""
global _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
_SEQUENCE_DATA_PARALLEL_WORLD_SIZE = world_size
def set_pipeline_model_parallel_world_size(world_size):
"""Set the pipeline model parallel size"""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
def get_tensor_model_parallel_world_size():
"""Return world size for the tensor model parallel group."""
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
def get_model_parallel_world_size():
assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_world_size is only supported if PP is disabled"
return get_tensor_model_parallel_world_size()
def get_sequence_parallel_world_size():
"""Return world size for the sequence parallel group."""
global _SEQUENCE_PARALLEL_WORLD_SIZE
if _SEQUENCE_PARALLEL_WORLD_SIZE is not None:
return _SEQUENCE_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_sequence_parallel_group())
def get_sequence_data_parallel_world_size():
"""Return world size for the sequence parallel group."""
global _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
if _SEQUENCE_DATA_PARALLEL_WORLD_SIZE is not None:
return _SEQUENCE_DATA_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_sequence_data_parallel_group())
def get_pipeline_model_parallel_world_size():
"""Return world size for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
def set_tensor_model_parallel_rank(rank):
"""Set tensor model parallel rank."""
global _MPU_TENSOR_MODEL_PARALLEL_RANK
_MPU_TENSOR_MODEL_PARALLEL_RANK = rank
def get_model_parallel_rank():
assert get_pipeline_model_parallel_world_size() == 1, "legacy get_model_parallel_rank is only supported if PP is disabled"
return get_tensor_model_parallel_rank()
def set_sequence_parallel_rank(rank):
"""Set sequence parallel rank."""
global _SEQUENCE_PARALLEL_RANK
_SEQUENCE_PARALLEL_RANK = rank
def set_sequence_data_parallel_rank(rank):
"""Set sequence parallel rank."""
global _SEQUENCE_DATA_PARALLEL_RANK
_SEQUENCE_DATA_PARALLEL_RANK = rank
def set_pipeline_model_parallel_rank(rank):
"""Set pipeline model parallel rank."""
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
_MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
def set_pipeline_model_parallel_split_rank(rank):
"""Set pipeline model parallel split rank."""
global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank
def get_tensor_model_parallel_rank():
"""Return my rank for the tensor model parallel group."""
global _MPU_TENSOR_MODEL_PARALLEL_RANK
if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None:
return _MPU_TENSOR_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
def get_pipeline_model_parallel_rank():
"""Return my rank for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
def get_pipeline_model_parallel_split_rank():
"""Return pipeline model parallel split rank."""
global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
return _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
def get_sequence_parallel_rank():
"""Return my rank for the sequence parallel group."""
global _SEQUENCE_PARALLEL_RANK
if _SEQUENCE_PARALLEL_RANK is not None:
return _SEQUENCE_PARALLEL_RANK
return torch.distributed.get_rank(group=get_sequence_parallel_group())
def get_sequence_data_parallel_rank():
"""Return my rank for the sequence data parallel group."""
global _SEQUENCE_DATA_PARALLEL_RANK
if _SEQUENCE_DATA_PARALLEL_RANK is not None:
return _SEQUENCE_DATA_PARALLEL_RANK
return torch.distributed.get_rank(group=get_sequence_data_parallel_group())
def is_pipeline_first_stage(ignore_virtual=False):
"""Return True if in the first pipeline model-parallel stage, False otherwise."""
if not ignore_virtual:
if (
get_virtual_pipeline_model_parallel_world_size() is not None
and get_virtual_pipeline_model_parallel_rank() != 0
):
return False
return get_pipeline_model_parallel_rank() == 0
def is_pipeline_last_stage(ignore_virtual=False):
"""Return True if in the last pipeline model-parallel stage, False otherwise."""
if not ignore_virtual:
virtual_pipeline_model_parallel_world_size = get_virtual_pipeline_model_parallel_world_size()
if virtual_pipeline_model_parallel_world_size is not None and get_virtual_pipeline_model_parallel_rank() != (
virtual_pipeline_model_parallel_world_size - 1
):
return False
return get_pipeline_model_parallel_rank() == (get_pipeline_model_parallel_world_size() - 1)
def is_rank_in_embedding_group(ignore_virtual=False):
"""Return true if current rank is in embedding group, False otherwise."""
rank = torch.distributed.get_rank()
global _EMBEDDING_GLOBAL_RANKS
if ignore_virtual:
return rank in _EMBEDDING_GLOBAL_RANKS
if rank in _EMBEDDING_GLOBAL_RANKS:
if rank == _EMBEDDING_GLOBAL_RANKS[0]:
return is_pipeline_first_stage(ignore_virtual=False)
elif rank == _EMBEDDING_GLOBAL_RANKS[-1]:
return is_pipeline_last_stage(ignore_virtual=False)
else:
return True
return False
def is_rank_in_position_embedding_group():
"""Return true if current rank is in position embedding group, False otherwise."""
rank = torch.distributed.get_rank()
global _POSITION_EMBEDDING_GLOBAL_RANKS
return rank in _POSITION_EMBEDDING_GLOBAL_RANKS
def is_pipeline_stage_before_split(rank=None):
"""Return True if pipeline stage executes encoder block for a model
with both encoder and decoder."""
if get_pipeline_model_parallel_world_size() == 1:
return True
if rank is None:
rank = get_pipeline_model_parallel_rank()
global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None:
return True
if rank < _PIPELINE_MODEL_PARALLEL_SPLIT_RANK:
return True
return False
def is_pipeline_stage_after_split(rank=None):
"""Return True if pipeline stage executes decoder block for a model
with both encoder and decoder."""
if get_pipeline_model_parallel_world_size() == 1:
return True
if rank is None:
rank = get_pipeline_model_parallel_rank()
global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
if _PIPELINE_MODEL_PARALLEL_SPLIT_RANK is None:
return True
if rank >= _PIPELINE_MODEL_PARALLEL_SPLIT_RANK:
return True
return False
def is_pipeline_stage_at_split():
"""Return true if pipeline stage executes decoder block and next
stage executes encoder block for a model with both encoder and
decoder."""
rank = get_pipeline_model_parallel_rank()
return is_pipeline_stage_before_split(rank) and is_pipeline_stage_after_split(rank + 1)
def get_virtual_pipeline_model_parallel_rank():
"""Return the virtual pipeline-parallel rank."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
def set_virtual_pipeline_model_parallel_rank(rank):
"""Set the virtual pipeline-parallel rank."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
def get_virtual_pipeline_model_parallel_world_size():
"""Return the virtual pipeline-parallel world size."""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
def set_virtual_pipeline_model_parallel_world_size(world_size):
"""Set the virtual pipeline-parallel world size"""
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
def get_tensor_model_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the tensor model parallel group."""
global_rank = torch.distributed.get_rank()
local_world_size = get_tensor_model_parallel_world_size()
return (global_rank // local_world_size) * local_world_size
def get_sequence_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the sequence parallel group."""
global_rank = torch.distributed.get_rank()
local_world_size = get_sequence_parallel_world_size()
return (global_rank // local_world_size) * local_world_size
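
# Worked example of the src-rank arithmetic above (illustrative group size):
# integer division rounds a global rank down to the first rank of its group.
if __name__ == "__main__":
    local_world_size = 4
    for global_rank in (0, 3, 5, 11):
        src = (global_rank // local_world_size) * local_world_size
        print(global_rank, "->", src)   # 0 -> 0, 3 -> 0, 5 -> 4, 11 -> 8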
def get_data_parallel_src_rank():
"""Calculate the global rank corresponding to the first local rank
in the data parallel group."""
assert _DATA_PARALLEL_GLOBAL_RANKS is not None, "Data parallel group is not initialized"
return _DATA_PARALLEL_GLOBAL_RANKS[0]
def get_pipeline_model_parallel_first_rank():
"""Return the global rank of the first process in the pipeline for the
current tensor parallel group"""
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
return _PIPELINE_GLOBAL_RANKS[0]
def get_pipeline_model_parallel_last_rank():
"""Return the global rank of the last process in the pipeline for the
current tensor parallel group"""
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
last_rank_local = get_pipeline_model_parallel_world_size() - 1
return _PIPELINE_GLOBAL_RANKS[last_rank_local]
def get_pipeline_model_parallel_next_rank():
"""Return the global rank that follows the caller in the pipeline"""
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]
def get_pipeline_model_parallel_prev_rank():
"""Return the global rank that preceeds the caller in the pipeline"""
assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
rank_in_pipeline = get_pipeline_model_parallel_rank()
world_size = get_pipeline_model_parallel_world_size()
return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]
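
# The next/prev helpers above use modular indexing, so the pipeline is a ring:
# with a pipeline world size of 4, the stage after rank 3 is rank 0 and the
# stage before rank 0 is rank 3 (both as indices into _PIPELINE_GLOBAL_RANKS).
if __name__ == "__main__":
    world_size = 4
    assert (3 + 1) % world_size == 0
    assert (0 - 1) % world_size == 3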
def get_data_parallel_world_size():
"""Return world size for the data parallel group."""
return torch.distributed.get_world_size(group=get_data_parallel_group())
def get_data_parallel_rank():
"""Return my rank for the data parallel group."""
return torch.distributed.get_rank(group=get_data_parallel_group())
def _set_global_memory_buffer():
"""Initialize global buffer"""
global _GLOBAL_MEMORY_BUFFER
assert _GLOBAL_MEMORY_BUFFER is None, 'global memory buffer is already initialized'
_GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
def get_global_memory_buffer():
"""Return the global GlobalMemoryBuffer object"""
assert _GLOBAL_MEMORY_BUFFER is not None, 'global memory buffer is not initialized'
return _GLOBAL_MEMORY_BUFFER
def destroy_global_memory_buffer():
"""Sets the global memory buffer to None"""
global _GLOBAL_MEMORY_BUFFER
_GLOBAL_MEMORY_BUFFER = None
def destroy_model_parallel():
"""Set the groups to none."""
global _MODEL_PARALLEL_GROUP
_MODEL_PARALLEL_GROUP = None
global _TENSOR_MODEL_PARALLEL_GROUP
_TENSOR_MODEL_PARALLEL_GROUP = None
global _PIPELINE_MODEL_PARALLEL_GROUP
_PIPELINE_MODEL_PARALLEL_GROUP = None
global _DATA_PARALLEL_GROUP
_DATA_PARALLEL_GROUP = None
global _SEQUENCE_PARALLEL_GROUP
_SEQUENCE_PARALLEL_GROUP = None
global _SEQUENCE_DATA_PARALLEL_GROUP
_SEQUENCE_DATA_PARALLEL_GROUP = None
global _EMBEDDING_GROUP
_EMBEDDING_GROUP = None
global _POSITION_EMBEDDING_GROUP
_POSITION_EMBEDDING_GROUP = None
global _AMAX_REDUCTION_GROUP
_AMAX_REDUCTION_GROUP = None
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
global _MPU_TENSOR_MODEL_PARALLEL_RANK
_MPU_TENSOR_MODEL_PARALLEL_RANK = None
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
_MPU_PIPELINE_MODEL_PARALLEL_RANK = None
global _GLOBAL_MEMORY_BUFFER
_GLOBAL_MEMORY_BUFFER = None
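# Illustrative teardown/re-initialization cycle (an assumption, not part of
# the original file), e.g. between unit tests; initialize_model_parallel is
# defined earlier in this file.
#
#   destroy_model_parallel()
#   initialize_model_parallel(tensor_model_parallel_size=2,
#                             pipeline_model_parallel_size=2)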
from .schedules import get_forward_backward_func
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from functools import reduce
import operator
from typing import Optional, List, Union, Callable, Tuple
import torch
from megatron import core
from megatron.core.parallel_state import (
get_pipeline_model_parallel_group,
get_pipeline_model_parallel_rank,
get_pipeline_model_parallel_prev_rank,
get_pipeline_model_parallel_next_rank,
)
from megatron.core import ModelParallelConfig
from deepspeed.accelerator import get_accelerator
# Types
Shape = Union[List[int], torch.Size]
def _communicate_shapes(tensor_send_next, tensor_send_prev,
recv_prev, recv_next, config):
"""Communicate tensor shapes between stages. Used to communicate
tensor shapes before the actual tensor communication happens.
This is required when the sequence lengths across micro batches
are not uniform.
Takes the following arguments:
tensor_send_next: tensor to send to next rank (no tensor sent if
set to None).
tensor_send_prev: tensor to send to prev rank (no tensor sent if
set to None).
recv_prev: boolean for whether tensor should be received from
previous rank.
        recv_next: boolean for whether tensor should be received from
                   next rank.
        config: ModelParallelConfig object carrying the p2p communication
                   settings (e.g., use_ring_exchange_p2p).
    Returns:
        (recv_prev_shape, recv_next_shape)
    """
recv_prev_shape_tensor = None
recv_next_shape_tensor = None
send_prev_shape_tensor = None
send_next_shape_tensor = None
if recv_prev:
        recv_prev_shape_tensor = torch.empty((3,),
device=get_accelerator().current_device(),
dtype=torch.int64)
if recv_next:
        recv_next_shape_tensor = torch.empty((3,),
device=get_accelerator().current_device(),
dtype=torch.int64)
if tensor_send_prev is not None:
send_prev_shape_tensor = torch.tensor(tensor_send_prev.size(),
device=get_accelerator().current_device(),
dtype=torch.int64)
if tensor_send_next is not None:
send_next_shape_tensor = torch.tensor(tensor_send_next.size(),
device=get_accelerator().current_device(),
dtype=torch.int64)
if config.use_ring_exchange_p2p:
torch.distributed.ring_exchange(tensor_send_prev=send_prev_shape_tensor,
tensor_recv_prev=recv_prev_shape_tensor,
tensor_send_next=send_next_shape_tensor,
tensor_recv_next=recv_next_shape_tensor,
group=get_pipeline_model_parallel_group())
else:
ops = []
if send_prev_shape_tensor is not None:
send_prev_op = torch.distributed.P2POp(
torch.distributed.isend, send_prev_shape_tensor,
get_pipeline_model_parallel_prev_rank())
ops.append(send_prev_op)
if recv_prev_shape_tensor is not None:
recv_prev_op = torch.distributed.P2POp(
torch.distributed.irecv, recv_prev_shape_tensor,
get_pipeline_model_parallel_prev_rank())
ops.append(recv_prev_op)
if send_next_shape_tensor is not None:
send_next_op = torch.distributed.P2POp(
torch.distributed.isend, send_next_shape_tensor,
get_pipeline_model_parallel_next_rank())
ops.append(send_next_op)
if recv_next_shape_tensor is not None:
recv_next_op = torch.distributed.P2POp(
torch.distributed.irecv, recv_next_shape_tensor,
get_pipeline_model_parallel_next_rank())
ops.append(recv_next_op)
if len(ops) > 0:
reqs = torch.distributed.batch_isend_irecv(ops)
for req in reqs:
req.wait()
            # To protect against a race condition when using batch_isend_irecv();
            # should be removed once the bug with batch_isend_irecv is resolved.
get_accelerator().synchronize()
recv_prev_shape = [0, 0, 0]
if recv_prev_shape_tensor is not None:
recv_prev_shape = recv_prev_shape_tensor.tolist()
recv_next_shape = [0, 0, 0]
if recv_next_shape_tensor is not None:
recv_next_shape = recv_next_shape_tensor.tolist()
return recv_prev_shape, recv_next_shape
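# Illustrative call (an assumption, not in the original file): a stage that
# only receives from its predecessor first learns the incoming (s, b, h)
# shape, then allocates the actual receive buffer from it.
#
#   recv_prev_shape, _ = _communicate_shapes(
#       tensor_send_next=None, tensor_send_prev=None,
#       recv_prev=True, recv_next=False, config=config)
#   buf = torch.empty(recv_prev_shape,
#                     device=get_accelerator().current_device(),
#                     dtype=config.pipeline_dtype)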
def _batched_p2p_ops(*,
tensor_send_prev: Optional[torch.Tensor],
tensor_recv_prev: Optional[torch.Tensor],
tensor_send_next: Optional[torch.Tensor],
tensor_recv_next: Optional[torch.Tensor],
group: torch.distributed.ProcessGroup):
ops = []
if tensor_send_prev is not None:
send_prev_op = torch.distributed.P2POp(
torch.distributed.isend, tensor_send_prev,
get_pipeline_model_parallel_prev_rank(),
group)
ops.append(send_prev_op)
if tensor_recv_prev is not None:
recv_prev_op = torch.distributed.P2POp(
torch.distributed.irecv, tensor_recv_prev,
get_pipeline_model_parallel_prev_rank(),
group)
ops.append(recv_prev_op)
if tensor_send_next is not None:
send_next_op = torch.distributed.P2POp(
torch.distributed.isend, tensor_send_next,
get_pipeline_model_parallel_next_rank(),
group)
ops.append(send_next_op)
if tensor_recv_next is not None:
recv_next_op = torch.distributed.P2POp(
torch.distributed.irecv, tensor_recv_next,
get_pipeline_model_parallel_next_rank(),
group)
ops.append(recv_next_op)
if len(ops) > 0:
reqs = torch.distributed.batch_isend_irecv(ops)
else:
reqs = []
return reqs
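# Note (illustrative, not in the original file): the batched path only posts
# the operations; the caller is responsible for waiting on the returned
# requests, as _communicate below does.
#
#   reqs = _batched_p2p_ops(tensor_send_prev=None, tensor_recv_prev=buf,
#                           tensor_send_next=None, tensor_recv_next=None,
#                           group=get_pipeline_model_parallel_group())
#   for req in reqs:
#       req.wait()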
def _p2p_ops(*,
tensor_send_prev: Optional[torch.Tensor],
tensor_recv_prev: Optional[torch.Tensor],
tensor_send_next: Optional[torch.Tensor],
tensor_recv_next: Optional[torch.Tensor],
group: torch.distributed.ProcessGroup):
reqs = []
rank = get_pipeline_model_parallel_rank()
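    # Alternate the posting order by stage parity: even stages send before
    # they receive, odd stages receive before they send, so every isend has
    # a matching irecv already posted on the peer and the unbatched path
    # cannot deadlock.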
    if rank % 2 == 0:
if tensor_send_next is not None:
send_next_req = torch.distributed.isend(
tensor=tensor_send_next,
dst=get_pipeline_model_parallel_next_rank(),
group=group,
)
reqs.append(send_next_req)
if tensor_recv_prev is not None:
recv_prev_req = torch.distributed.irecv(
tensor=tensor_recv_prev,
src=get_pipeline_model_parallel_prev_rank(),
group=group,
)
reqs.append(recv_prev_req)
if tensor_send_prev is not None:
send_prev_req = torch.distributed.isend(
tensor=tensor_send_prev,
dst=get_pipeline_model_parallel_prev_rank(),
group=group,
)
reqs.append(send_prev_req)
if tensor_recv_next is not None:
recv_next_req = torch.distributed.irecv(
tensor=tensor_recv_next,
src=get_pipeline_model_parallel_next_rank(),
group=group,
)
reqs.append(recv_next_req)
else:
if tensor_recv_prev is not None:
recv_prev_req = torch.distributed.irecv(
tensor=tensor_recv_prev,
src=get_pipeline_model_parallel_prev_rank(),
group=group,
)
reqs.append(recv_prev_req)
if tensor_send_next is not None:
send_next_req = torch.distributed.isend(
tensor=tensor_send_next,
dst=get_pipeline_model_parallel_next_rank(),
group=group,
)
reqs.append(send_next_req)
if tensor_recv_next is not None:
recv_next_req = torch.distributed.irecv(
tensor=tensor_recv_next,
src=get_pipeline_model_parallel_next_rank(),
group=group,
)
reqs.append(recv_next_req)
if tensor_send_prev is not None:
send_prev_req = torch.distributed.isend(
tensor=tensor_send_prev,
dst=get_pipeline_model_parallel_prev_rank(),
group=group,
)
reqs.append(send_prev_req)
return reqs
def _communicate(*, tensor_send_next: Optional[torch.Tensor],
tensor_send_prev: Optional[torch.Tensor],
recv_prev: bool,
recv_next: bool,
tensor_shape: Shape,
config: ModelParallelConfig,
                 wait_on_reqs: bool = True) -> Tuple[torch.Tensor, torch.Tensor, List]:
"""Communicate tensors between stages. Used as helper method in other
communication methods that are used in megatron/schedules.py.
Arguments:
tensor_send_next (torch.Tensor, optional):
Tensor to send to next rank (no tensor sent if None)
tensor_send_prev (torch.Tensor, optional):
Tensor to send to prev rank (no tensor sent if None)
recv_prev (boolean, required):
whether tensor should be received from previous rank.
recv_next (boolean, required):
whether tensor should be received from next rank.
tensor_shape (List[int] or torch.Size, required):
shape of tensor to receive (this method assumes that all
tensors sent and received in a single function call are
the same shape).
        wait_on_reqs (boolean, optional, default=True):
            For non-batched p2p communication, wait on each request
            before returning.
    Returns:
        tuple containing
        - tensor_recv_prev: torch.Tensor if recv_prev is True, None otherwise.
        - tensor_recv_next: torch.Tensor if recv_next is True, None otherwise.
        - reqs: list of outstanding requests if wait_on_reqs is False,
          None otherwise.
    """
# Create placeholder tensors for receive in forward and backward directions
# if needed.
tensor_recv_prev = None
tensor_recv_next = None
if not config.variable_seq_lengths:
recv_prev_shape = tensor_shape
recv_next_shape = tensor_shape
else:
recv_prev_shape, recv_next_shape = \
_communicate_shapes(tensor_send_next, tensor_send_prev,
recv_prev, recv_next, config)
if recv_prev:
if config.pipeline_dtype is None:
raise RuntimeError("pipeline_dtype must be provided if recv_prev is True")
if tensor_shape is None:
raise RuntimeError(
"tensor_shape must be specified if recv_prev is True. "
"Common tensor_shape is (seq_length, micro_batch_size, hidden_size)"
)
tensor_recv_prev = torch.empty(recv_prev_shape,
requires_grad=True,
device=get_accelerator().current_device(),
dtype=config.pipeline_dtype)
if recv_next:
if config.pipeline_dtype is None:
raise RuntimeError("dtype must be provided if recv_next is True")
if tensor_shape is None:
raise RuntimeError(
"tensor_shape must be specified if recv_next is True. "
"Common tensor_shape is (seq_length, micro_batch_size, hidden_size)"
)
tensor_recv_next = torch.empty(recv_next_shape,
requires_grad=True,
device=get_accelerator().current_device(),
dtype=config.pipeline_dtype)
# Send tensors in both the forward and backward directions as appropriate.
if config.use_ring_exchange_p2p:
def _ring_exchange_wrapper(**kwargs):
torch.distributed.ring_exchange(**kwargs)
return []
p2p_func = _ring_exchange_wrapper
elif config.batch_p2p_comm:
assert wait_on_reqs
p2p_func = _batched_p2p_ops
else:
p2p_func = _p2p_ops
reqs = p2p_func(tensor_send_prev=tensor_send_prev,
tensor_recv_prev=tensor_recv_prev,
tensor_send_next=tensor_send_next,
tensor_recv_next=tensor_recv_next,
group=get_pipeline_model_parallel_group())
if wait_on_reqs and len(reqs) > 0:
for req in reqs:
req.wait()
reqs = None
if config.batch_p2p_comm and config.batch_p2p_sync:
# To protect against race condition when using batch_isend_irecv().
# User should assert that we have a modern enough PyTorch to not need this
get_accelerator().synchronize()
return tensor_recv_prev, tensor_recv_next, reqs
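# Illustrative direct call (an assumption; callers normally go through the
# wrappers below). `h` is a hypothetical activation held by this stage:
#
#   tensor_recv_prev, _, _ = _communicate(
#       tensor_send_next=h, tensor_send_prev=None,
#       recv_prev=True, recv_next=False,
#       tensor_shape=h.shape, config=config)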
def recv_forward(tensor_shape: Shape,
config: ModelParallelConfig) -> torch.Tensor:
""" Receive tensor from previous rank in pipeline (forward receive).
See _communicate for argument details.
"""
if core.parallel_state.is_pipeline_first_stage():
input_tensor = None
else:
if config.timers is not None:
config.timers('forward-recv', log_level=2).start()
input_tensor, _, _ = _communicate(
tensor_send_next=None,
tensor_send_prev=None,
recv_prev=True,
recv_next=False,
tensor_shape=tensor_shape,
config=config)
if config.timers is not None:
config.timers('forward-recv').stop()
return input_tensor
def recv_backward(tensor_shape: Shape,
config: ModelParallelConfig) -> torch.Tensor:
"""Receive tensor from next rank in pipeline (backward receive).
See _communicate for argument details.
"""
if core.parallel_state.is_pipeline_last_stage():
output_tensor_grad = None
else:
if config.timers is not None:
config.timers('backward-recv', log_level=2).start()
_, output_tensor_grad, _ = _communicate(
tensor_send_next=None,
tensor_send_prev=None,
recv_prev=False,
recv_next=True,
tensor_shape=tensor_shape,
config=config)
if config.timers is not None:
config.timers('backward-recv').stop()
return output_tensor_grad
def send_forward(output_tensor: torch.Tensor,
config: ModelParallelConfig) -> None:
"""Send tensor to next rank in pipeline (forward send).
See _communicate for argument details.
"""
if not core.parallel_state.is_pipeline_last_stage():
if config.timers is not None:
config.timers('forward-send', log_level=2).start()
_communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=False,
recv_next=False,
tensor_shape=None,
config=config)
if config.timers is not None:
config.timers('forward-send').stop()
def send_backward(input_tensor_grad: torch.Tensor,
config: ModelParallelConfig) -> None:
"""Send tensor to previous rank in pipeline (backward send).
See _communicate for argument details.
"""
if not core.parallel_state.is_pipeline_first_stage():
if config.timers is not None:
config.timers('backward-send', log_level=2).start()
_communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=False,
recv_next=False,
tensor_shape=None,
config=config)
if config.timers is not None:
config.timers('backward-send').stop()
def send_forward_recv_backward(output_tensor: torch.Tensor,
tensor_shape: Shape,
config: ModelParallelConfig) -> torch.Tensor:
"""Batched send and recv with next rank in pipeline.
See _communicate for argument details.
"""
if core.parallel_state.is_pipeline_last_stage():
output_tensor_grad = None
else:
if config.timers is not None:
config.timers('forward-send-backward-recv', log_level=2).start()
        _, output_tensor_grad, _ = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=False,
recv_next=True,
tensor_shape=tensor_shape,
config=config)
if config.timers is not None:
config.timers('forward-send-backward-recv').stop()
return output_tensor_grad
def send_backward_recv_forward(input_tensor_grad: torch.Tensor,
tensor_shape: Shape,
config: ModelParallelConfig) -> torch.Tensor:
"""Batched send and recv with previous rank in pipeline.
See _communicate for argument details.
"""
if core.parallel_state.is_pipeline_first_stage():
input_tensor = None
else:
if config.timers is not None:
config.timers('backward-send-forward-recv', log_level=2).start()
input_tensor, _, _ = _communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=True,
recv_next=False,
tensor_shape=tensor_shape,
config=config)
if config.timers is not None:
config.timers('backward-send-forward-recv').stop()
return input_tensor
def send_forward_recv_forward(output_tensor: torch.Tensor,
recv_prev: bool,
tensor_shape: Shape,
config: ModelParallelConfig,
                              overlap_p2p_comm: bool = False) -> Union[torch.Tensor, Tuple[torch.Tensor, List]]:
"""Batched recv from previous rank and send to next rank in pipeline.
See _communicate for argument details.
"""
if config.timers is not None:
config.timers('forward-send-forward-recv', log_level=2).start()
input_tensor, _, wait_handles = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
recv_prev=recv_prev,
recv_next=False,
tensor_shape=tensor_shape,
wait_on_reqs=(not overlap_p2p_comm),
config=config)
if config.timers is not None:
config.timers('forward-send-forward-recv').stop()
if overlap_p2p_comm:
return input_tensor, wait_handles
return input_tensor
def send_backward_recv_backward(input_tensor_grad: torch.Tensor,
recv_next: bool,
tensor_shape: Shape,
config: ModelParallelConfig,
                                overlap_p2p_comm: bool = False) -> Union[torch.Tensor, Tuple[torch.Tensor, List]]:
"""Batched recv from next rank and send to previous rank in pipeline.
See _communicate for argument details.
"""
if config.timers is not None:
config.timers('backward-send-backward-recv', log_level=2).start()
_, output_tensor_grad, wait_handles = _communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
recv_prev=False,
recv_next=recv_next,
tensor_shape=tensor_shape,
wait_on_reqs=(not overlap_p2p_comm),
config=config)
if config.timers is not None:
config.timers('backward-send-backward-recv').stop()
if overlap_p2p_comm:
return output_tensor_grad, wait_handles
return output_tensor_grad
def send_forward_backward_recv_forward_backward(
output_tensor: torch.Tensor,
input_tensor_grad: torch.Tensor,
recv_prev: bool,
recv_next: bool,
tensor_shape: Shape,
        config: ModelParallelConfig) -> Tuple[torch.Tensor, torch.Tensor]:
"""Batched send and recv with previous and next ranks in pipeline.
See _communicate for argument details.
"""
if config.timers is not None:
config.timers('forward-backward-send-forward-backward-recv',
log_level=2).start()
input_tensor, output_tensor_grad, _ = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=input_tensor_grad,
recv_prev=recv_prev,
recv_next=recv_next,
tensor_shape=tensor_shape,
config=config)
if config.timers is not None:
config.timers('forward-backward-send-forward-backward-recv').stop()
return input_tensor, output_tensor_grad
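# Illustrative steady-state 1F1B step (an assumption, not part of the
# original file) showing how the wrappers above compose in a pipeline
# schedule; forward_step/backward_step are hypothetical placeholders:
#
#   input_tensor = recv_forward(tensor_shape, config)
#   output_tensor = forward_step(input_tensor)
#   output_tensor_grad = send_forward_recv_backward(
#       output_tensor, tensor_shape, config)
#   input_tensor_grad = backward_step(
#       input_tensor, output_tensor, output_tensor_grad)
#   send_backward(input_tensor_grad, config)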