Commit e1354f9d authored by liangjing

update
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import enum
# can we get rid of this?
# it's being used in pipeline schedules
class ModelType(enum.Enum):
encoder_or_decoder = 1
encoder_and_decoder = 2
# class LayerType(enum.Enum):
# encoder = 1
# decoder = 2
class AttnType(enum.Enum):
self_attn = 1
cross_attn = 2
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import torch
import torch.nn.functional as F
from megatron.core import tensor_parallel
from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.custom_layers.transformer_engine import \
TERowParallelLinear, TEColumnParallelLinear
class MLP(MegatronModule):
"""
MLP takes an input of hidden size h, projects it to the 4*h ffn
hidden dimension, applies a nonlinear transformation, and projects the
result back to hidden size h.
Returns an output and a bias to be added to the output.
If config.add_bias_linear is False, the bias returned is None.
We use the following notation:
h: hidden size
p: number of tensor model parallel partitions
b: batch size
s: sequence length
"""
def __init__(self, config: TransformerConfig):
super().__init__(config=config)
self.config: TransformerConfig = config
# If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
ffn_hidden_size = self.config.ffn_hidden_size
if self.config.gated_linear_unit:
ffn_hidden_size *= 2
self.linear_fc1 = TEColumnParallelLinear(
self.config.hidden_size,
ffn_hidden_size,
config=self.config,
init_method=self.config.init_method,
bias=self.config.add_bias_linear,
skip_bias_add=True,
)
if self.config.gated_linear_unit:
def glu(x):
x = torch.chunk(x, 2, dim=-1)
return self.config.activation_func(x[0]) * x[1]
self.activation_func = glu
else:
self.activation_func = self.config.activation_func
self.linear_fc2 = TERowParallelLinear(
self.config.ffn_hidden_size,
self.config.hidden_size,
config=self.config,
init_method=self.config.output_layer_init_method,
bias=self.config.add_bias_linear,
skip_bias_add=True,
)
def forward(self, hidden_states):
# [s, b, 4 * h/p]
intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
if self.config.bias_gelu_fusion:
assert self.config.add_bias_linear is True
assert self.activation_func == F.gelu
intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
else:
if bias_parallel is not None:
intermediate_parallel = intermediate_parallel + bias_parallel
intermediate_parallel = self.activation_func(intermediate_parallel)
# [s, b, h]
output, output_bias = self.linear_fc2(intermediate_parallel)
return output, output_bias
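# Usage sketch (illustrative only, not part of the original file). Assuming tensor
# model parallel state and a TransformerConfig `cfg` have already been initialized:
#   mlp = MLP(cfg)
#   output, output_bias = mlp(hidden_states)   # hidden_states: [s, b, h]
#   if output_bias is not None:                # bias is returned unadded because skip_bias_add=True
#       output = output + output_bias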
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
"""Megatron Module"""
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.transformer_config import TransformerConfig
_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
def param_is_not_shared(param):
return not hasattr(param, 'shared') or not param.shared
class MegatronModule(torch.nn.Module):
"""Megatron specific extensions of torch Module with support
for pipelining."""
# def __init__(self, config: TransformerConfig, share_word_embeddings=True):
def __init__(self, config: TransformerConfig):
super().__init__()
self.config = config
def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
"""Use this function to override the state dict for
saving checkpoints."""
return self.state_dict(prefix=prefix, keep_vars=keep_vars)
def conversion_helper(val, conversion):
"""Apply conversion to val. Recursively apply conversion if `val`
#is a nested tuple/list structure."""
if not isinstance(val, (tuple, list)):
return conversion(val)
rtn = [conversion_helper(v, conversion) for v in val]
if isinstance(val, tuple):
rtn = tuple(rtn)
return rtn
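# Illustrative example (not part of the original file): conversion_helper applies
# `conversion` to every leaf of a nested tuple/list while preserving the container
# types, e.g.
#   conversion_helper((1, [2, 3]), lambda v: v * 2)  ->  (2, [4, 6])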
def fp32_to_float16(val, float16_convertor):
"""Convert fp32 `val` to fp16/bf16"""
def half_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, _FLOAT_TYPES):
val = float16_convertor(val)
return val
return conversion_helper(val, half_conversion)
def float16_to_fp32(val):
"""Convert fp16/bf16 `val` to fp32"""
def float_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
val = val.float()
return val
return conversion_helper(val, float_conversion)
class Float16Module(MegatronModule):
def __init__(self, config: TransformerConfig, module: torch.nn.Module):
super(Float16Module, self).__init__(config)
self.config = config
self.fp16 = config.fp16
self.bf16 = config.bf16
if self.fp16:
self.add_module('module', module.half())
def float16_convertor(val):
return val.half()
elif self.bf16:
self.add_module('module', module.bfloat16())
def float16_convertor(val):
return val.bfloat16()
else:
raise Exception('Either config.fp16 or config.bf16 should be True.')
self.float16_convertor = float16_convertor
def set_input_tensor(self, input_tensor):
return self.module.set_input_tensor(input_tensor)
def forward(self, *inputs, **kwargs):
if parallel_state.is_pipeline_first_stage():
inputs = fp32_to_float16(inputs, self.float16_convertor)
outputs = self.module(*inputs, **kwargs)
if parallel_state.is_pipeline_last_stage():
outputs = float16_to_fp32(outputs)
return outputs
def state_dict(self, destination=None, prefix='', keep_vars=False):
return self.module.state_dict(prefix=prefix, keep_vars=keep_vars)
def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars)
def load_state_dict(self, state_dict, strict=True):
self.module.load_state_dict(state_dict, strict=strict)
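# Usage sketch (illustrative only, not part of the original file). Wrapping a model
# keeps its parameters in fp16/bf16 while the pipeline boundaries stay in fp32:
#   wrapped = Float16Module(config, model)   # config.fp16 or config.bf16 must be True
#   out = wrapped(*fp32_inputs)              # inputs cast to fp16/bf16 on the first pipeline stage,
#                                            # outputs cast back to fp32 on the last pipeline stage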
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from contextlib import nullcontext
import torch
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.transformer_layer import TransformerLayer
from megatron.core.utils import make_viewless_tensor
class TransformerBlock(MegatronModule):
"""Transformer class."""
def __init__(
self,
config: TransformerConfig,
self_attn_mask_type=AttnMaskType.padding,
post_layer_norm=True,
pre_process=True,
post_process=True,
):
super().__init__(config=config)
self.config: TransformerConfig = config
self.self_attn_mask_type = self_attn_mask_type
self.post_layer_norm = post_layer_norm
self.pre_process = pre_process
self.post_process = post_process
# required for pipeline parallel schedules
self.input_tensor = None
self.checkpoint_core_attention = self.config.recompute_granularity == 'selective'
# TODO: Maybe we can create a build_transformer_block method here instead
self.num_layers_per_pipeline_rank = (
self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size()
)
self._build_layers()
def _build_layers(self):
# Transformer layers.
# @jcasper can we improve how we deal with layer_number?
# currently it's only used in CoreAttention?
# if self.apply_query_key_layer_scaling:
# coeff = self.layer_number
# self.norm_factor *= coeff
def build_layer(layer_number):
return TransformerLayer(
config=self.config, layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type,
)
pipeline_rank = parallel_state.get_pipeline_model_parallel_rank()
if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
# Number of layers in each model chunk is the number of layers in the stage,
# divided by the number of model chunks in a stage.
# With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
# layers to stages like (each list is a model chunk):
# Stage 0: [0] [2] [4] [6]
# Stage 1: [1] [3] [5] [7]
# With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
# layers to stages like (each list is a model chunk):
# Stage 0: [0, 1] [4, 5]
# Stage 1: [2, 3] [6, 7]
vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank()
vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
total_num_layers = self.config.num_layers
num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size
total_virtual_chunks = total_num_layers // vp_size
offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank)
self.layers = torch.nn.ModuleList(
[build_layer(i + 1 + offset) for i in range(num_layers_per_virtual_rank)]
)
else:
# Each stage gets a contiguous set of layers.
if parallel_state.get_pipeline_model_parallel_world_size() > 1:
offset = pipeline_rank * self.num_layers_per_pipeline_rank
else:
offset = 0
# @jcasper why is layer_number using 1 index?
self.layers = torch.nn.ModuleList(
[build_layer(i + 1 + offset) for i in range(self.num_layers_per_pipeline_rank)]
)
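# Worked example (illustrative, matching the interleaved assignment described in the
# comment above): with num_layers=8, a pipeline world size of 2 and vp_size=4,
# num_layers_per_pipeline_rank = 8 // 2 = 4, num_layers_per_virtual_rank = 4 // 4 = 1,
# and total_virtual_chunks = 8 // 4 = 2. For pipeline_rank=1, vp_rank=2:
# offset = 2 * 2 + 1 * 1 = 5, so this model chunk holds global layer index 5
# (layer_number 6 with 1-based numbering), matching "Stage 1: [1] [3] [5] [7]".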
# # TODO: add back standalone_embedding_stage
# if self.num_layers == 0:
# # When a standalone embedding stage is used (e.g.,
# # args.standalone_embedding_stage == True), virtual pipeline ranks
# # on pipeline rank 0 will have zero transformer layers assigned to
# # them. This results in the model's input and output tensors to be
# # the same, which will cause failure for certain output tensor
# # optimizations (e.g., pipeline output deallocation). To remedy
# # this, we assign a 'no-op' layer on these ranks, which will
# # disconnect the input tensor from the output tensor.
# self.num_layers = 1
# self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)])
# else:
# self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)])
if self.post_process and self.post_layer_norm:
# Final layer norm before output.
self.final_layernorm = FusedLayerNorm(
hidden_size=self.config.hidden_size,
eps=self.config.layernorm_epsilon,
persist_layer_norm=self.config.persist_layer_norm,
sequence_parallel=self.config.sequence_parallel,
zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
)
def _get_layer(self, layer_number):
return self.layers[layer_number]
def _checkpointed_forward(self, hidden_states, attention_mask):
"""Forward method with activation checkpointing."""
def custom(start, end):
def custom_forward(*args, **kwargs):
x_, *args = args
for index in range(start, end):
layer = self._get_layer(index)
x_ = layer(x_, *args, **kwargs)
return x_
return custom_forward
if self.config.recompute_method == 'uniform':
# Uniformly divide the total number of Transformer layers and checkpoint
# the input activation of each divided chunk.
# This further reduces memory usage by reducing the number of checkpointed activations.
l = 0
while l < self.num_layers_per_pipeline_rank:
hidden_states = tensor_parallel.checkpoint(
custom(l, l + self.config.recompute_num_layers),
self.config.distribute_saved_activations,
hidden_states,
attention_mask,
)
l += self.config.recompute_num_layers
elif self.config.recompute_method == 'block':
# Checkpoint the input activation of only a set number of individual
# Transformer layers and skip the rest.
# This makes fuller use of device memory by removing redundant recomputation.
for l in range(self.num_layers_per_pipeline_rank):
if l < self.config.recompute_num_layers:
hidden_states = tensor_parallel.checkpoint(
custom(l, l + 1), self.config.distribute_saved_activations, hidden_states, attention_mask,
)
else:
hidden_states = custom(l, l + 1)(hidden_states, attention_mask)
else:
raise ValueError("Invalid activation recompute method.")
return hidden_states
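# Worked example (illustrative): with num_layers_per_pipeline_rank = 4 and
# recompute_num_layers = 2, the 'uniform' method checkpoints the input of two chunks,
# layers [0, 1] and [2, 3]; the 'block' method checkpoints layers 0 and 1 individually
# and runs layers 2 and 3 without any recomputation.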
def set_input_tensor(self, input_tensor):
"""Set input tensor to be used instead of forward()'s input.
When doing pipeline parallelism the input from the previous
stage comes from communication, not from the input, so the
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
self.input_tensor = input_tensor
def forward(self, hidden_states, attention_mask, inference_params=None):
# hidden_states (float): [s, b, h]
# attention_mask (bool): [1, 1, s, s]
if not self.pre_process:
# See set_input_tensor()
hidden_states = self.input_tensor
# Viewless tensor.
# - We only need to create a viewless tensor in the case of micro batch
# size (mbs) == 1, since in this case, 'hidden_states.transpose()'
# above creates a view tensor, and '.contiguous()' is a pass-through.
# For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
# the need to make it viewless.
#
# However, we don't explicitly check mbs == 1 here because
# make_viewless_tensor() has negligible overhead when its input
# is already viewless.
#
# - For the 'else' case above, calling make_viewless_tensor() here is
# likely redundant, since p2p_communication.py (likely originator)
# already creates viewless tensors. That said, make_viewless_tensor()
# is called here to be future-proof and corner-case-proof.
hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,)
if self.config.sequence_parallel:
rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
else:
rng_context = nullcontext()
with rng_context:
# Forward pass.
if self.config.recompute_granularity == 'full':
hidden_states = self._checkpointed_forward(hidden_states=hidden_states, attention_mask=attention_mask)
else:
for layer in self.layers:
hidden_states = layer(hidden_states=hidden_states, attention_mask=attention_mask)
# Final layer norm.
if self.post_process and self.post_layer_norm:
hidden_states = self.final_layernorm(hidden_states)
return hidden_states
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Callable
import torch
import torch.nn.functional as F
from megatron.core import ModelParallelConfig
from megatron.core.utils import init_method_normal, scaled_init_method_normal
@dataclass
class TransformerConfig(ModelParallelConfig):
"""Configuration object for megatron-core transformers.
Attributes:
# model architecture
num_layers (int): Number of transformer layers in a transformer block.
hidden_size (int): Transformer hidden size.
ffn_hidden_size (int): Transformer Feed-Forward Network hidden size.
This is set to 4*hidden_size if not provided. Defaults to None.
num_attention_heads (int): Number of transformer attention heads.
num_key_value_heads (int): Number of key/value heads used to implement Grouped Query Attention (GQA). If
`num_key_value_heads=num_attention_heads`, the model uses Multi Head Attention (MHA); if
`num_key_value_heads=1`, the model uses Multi Query Attention (MQA); otherwise GQA is used.
For more details, see [this paper](https://arxiv.org/pdf/2305.13245.pdf).
If not specified, defaults to `num_attention_heads`.
kv_channels (int): Projection weights dimension in multi-head attention.
This is set to hidden_size // num_attention_heads if not provided.
Defaults to None.
hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1.
attention_dropout (float): Post attention dropout probability. Defaults to 0.1.
fp32_residual_connection (bool): If true, move residual connections to fp32.
apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residual connection ordering.
Defaults to False.
layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5.
layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values
around 0. This improves numerical stability. Defaults to False.
add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two
in MLP layer). Default is True.
gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False.
activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu.
# initialization
init_method (Callable): Method to initialize weights. Note that bias is always set to
zero. Should be a function that takes a single Tensor and
initializes it. Defaults to
megatron.core.utils.init_method_normal(init_method_std) which is
torch.nn.init.normal_ with mean=0.0 and std=init_method_std.
output_layer_init_method (Callable): Method to initialize weights of the output layer of
both attention and MLP blocks. Defaults to
megatron.core.utils.scaled_init_method_normal(init_method_std)
which is torch.nn.init.normal_ with mean=0.0 and
std=init_method_std / math.sqrt(2.0 * num_layers).
init_method_std (float): Standard deviation of the zero mean normal for the default
initialization method, not used if init_method and
output_layer_init_method are provided. Defaults to 0.02.
# mixed-precision
apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True.
attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32.
This should be true if apply_query_key_layer_scaling is true.
# fusion
bias_gelu_fusion (bool): If true, fuses bias and gelu. Defaults to False.
masked_softmax_fusion (bool): If true, uses softmax fusion.
persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel.
This kernel only supports a fixed set of hidden sizes.
Defaults to False.
bias_dropout_fusion (bool): If true, uses bias dropout fusion.
# activation recomputation
recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory
intensive part of attention is checkpointed. These memory intensive activations
are also less compute intensive which makes activation checkpointing more efficient
for LLMs (20B+). See Reducing Activation Recomputation in Large Transformer
Models: https://arxiv.org/abs/2205.05198 for more details. 'full' will checkpoint
the entire transformer layer. Must be 'selective' or 'full'. Defaults to None.
recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer
block and recompute the input activation of each divided chunk at the specified
granularity. block will recompute the input activations for only a set number of
transformer layers per pipeline stage. The rest of the layers in the pipeline stage
will not have any activations recomputed. Must be 'uniform' or 'block'. Defaults to
None.
recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer
layers in each uniformly divided recompute unit. When recompute_method is block,
recompute_num_layers is the number of transformer layers to recompute within each
pipeline stage. Defaults to None.
distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel
group. Defaults to None.
"""
# model architecture
num_layers: int = 0
hidden_size: int = 0
num_attention_heads: int = 0
num_key_value_heads: int = None
ffn_hidden_size: int = None
kv_channels: int = None
hidden_dropout: float = 0.1
attention_dropout: float = 0.1
fp32_residual_connection: bool = False
# @jcasper should we keep this option?
apply_residual_connection_post_layernorm: bool = False
layernorm_epsilon: float = 1e-5
layernorm_zero_centered_gamma: bool = False
add_bias_linear: bool = True
gated_linear_unit: bool = False
activation_func: Callable = F.gelu
# initialization
init_method: Callable = None
output_layer_init_method: Callable = None
init_method_std: float = 0.02
# mixed-precision
apply_query_key_layer_scaling: bool = True
attention_softmax_in_fp32: bool = True
# communication
# fusion
bias_gelu_fusion: bool = False # TODO: this should be bias_activation_fusion ?
masked_softmax_fusion: bool = False
persist_layer_norm: bool = False
bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion?
# activation recomputation
recompute_granularity: str = None
recompute_method: str = None
recompute_num_layers: int = None
distribute_saved_activations: bool = None
def __post_init__(self):
""" Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
"""
super().__post_init__()
if self.fp16 and self.bf16:
raise ValueError(f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.')
if self.ffn_hidden_size is None:
self.ffn_hidden_size = 4 * self.hidden_size
if self.num_key_value_heads is None:
self.num_key_value_heads = self.num_attention_heads
assert self.num_attention_heads % self.num_key_value_heads == 0
if self.kv_channels is None:
self.kv_channels = self.hidden_size // self.num_attention_heads
if self.apply_query_key_layer_scaling:
self.attention_softmax_in_fp32 = True
if self.recompute_granularity is not None:
if not self.recompute_granularity in ['full', 'selective']:
raise ValueError(
f'recompute_granularity: {self.recompute_granularity} must be "full" or "selective".'
)
if self.recompute_method is not None:
if not self.recompute_method in ['block', 'uniform']:
raise ValueError(f'recompute_method: {self.recompute_method} must be "block" or "uniform".')
elif self.recompute_granularity != 'selective':
raise ValueError(
f'When using recompute_granularity: {self.recompute_granularity}, recompute_method must be "block" or "uniform".'
)
if self.recompute_num_layers is None:
raise ValueError(
f'When using recompute_granularity: {self.recompute_granularity}, recompute_num_layers must be between '
f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}'
)
if self.distribute_saved_activations and self.sequence_parallel:
raise ValueError(
f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}'
)
if self.virtual_pipeline_model_parallel_size is not None:
if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0:
raise ValueError(
f'num_layers: {self.num_layers} must be divisible by virtual_pipeline_model_parallel_size {self.virtual_pipeline_model_parallel_size}'
)
if self.bias_gelu_fusion:
if not self.add_bias_linear:
raise ValueError("When bias_gelu_fusion is True, add_bias_linear must also be True.")
if self.activation_func != F.gelu:
raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.')
if self.init_method is None:
self.init_method = init_method_normal(self.init_method_std)
if self.output_layer_init_method is None:
self.output_layer_init_method = scaled_init_method_normal(self.init_method_std, self.num_layers)
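# Illustrative sketch (not part of the original file): a minimal config, assuming the
# ModelParallelConfig base class provides defaults for the parallelism-related fields.
#   config = TransformerConfig(num_layers=2, hidden_size=64, num_attention_heads=4)
#   config.ffn_hidden_size   # -> 256 (4 * hidden_size, filled in by __post_init__)
#   config.kv_channels       # -> 16  (hidden_size // num_attention_heads)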
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import torch
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.enums import AttnType, AttnMaskType
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.transformer.attention import SelfAttention
from megatron.core.transformer.mlp import MLP
from megatron.core.utils import make_viewless_tensor
from megatron.core.transformer.custom_layers.transformer_engine import TELayerNorm
class TransformerLayer(MegatronModule):
"""A single transformer layer.
Transformer layer takes input with size [s, b, h] and returns an
output of the same size.
"""
def __init__(
self, config: TransformerConfig, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding,
):
super().__init__(config=config)
self.config: TransformerConfig = config
self.layer_number = layer_number
self.self_attn_mask_type = self_attn_mask_type
# Layernorm on the input data.
# TODO: add pytorch only layernorm
self.input_layernorm = TELayerNorm(
hidden_size=self.config.hidden_size,
eps=self.config.layernorm_epsilon,
persist_layer_norm=self.config.persist_layer_norm,
sequence_parallel=self.config.sequence_parallel,
zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
)
# Self attention.
self.self_attention = SelfAttention(
config=self.config,
layer_number=layer_number,
attn_mask_type=self_attn_mask_type,
)
# Layernorm on the attention output
self.post_self_attn_layernorm = TELayerNorm(
hidden_size=self.config.hidden_size,
eps=self.config.layernorm_epsilon,
persist_layer_norm=self.config.persist_layer_norm,
sequence_parallel=self.config.sequence_parallel,
zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
)
# MLP
self.mlp = MLP(config=self.config)
# @jcasper how should we handle nvfuser?
# Set bias+dropout+add fusion grad_enable execution handler.
# TORCH_MAJOR = int(torch.__version__.split('.')[0])
# TORCH_MINOR = int(torch.__version__.split('.')[1])
# use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
# self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
self.bias_dropout_add_exec_handler = torch.enable_grad
self.bias_dropout_add_func = get_bias_dropout_add(
self.training,
self.config.bias_dropout_fusion
)
# TODO: decide how to do inference_params
def forward(
self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None
):
# hidden_states: [s, b, h]
# Layer norm at the beginning of the transformer layer.
layernorm_output = self.input_layernorm(hidden_states)
# Self attention.
attention_output_with_bias = self.self_attention(
layernorm_output, attention_mask, inference_params=inference_params
)
# Residual connection.
if self.config.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
# bias_dropout_add fusion returning fp32 instead of bf16
with self.bias_dropout_add_exec_handler():
layernorm_input = self.bias_dropout_add_func(
attention_output_with_bias, residual, self.config.hidden_dropout
)
# Layer norm post the self attention.
layernorm_output = self.post_self_attn_layernorm(layernorm_input)
# MLP.
mlp_output_with_bias = self.mlp(layernorm_output)
# Second residual connection.
if self.config.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = layernorm_input
with self.bias_dropout_add_exec_handler():
output = self.bias_dropout_add_func(
mlp_output_with_bias, residual, self.config.hidden_dropout
)
# Jit compiled function creates 'view' tensor. This tensor
# potentially gets saved in the MPU checkpoint function context,
# which rejects view tensors. While making a viewless tensor here
# won't result in memory savings (like the data loader, or
# p2p_communication), it serves to document the origin of this
# 'view' tensor.
output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True)
return output
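# Data-flow sketch (illustrative): pre-LN ordering as implemented above,
#   x -> input_layernorm -> self_attention -> bias_dropout_add(+residual)
#     -> post_self_attn_layernorm -> mlp -> bias_dropout_add(+residual) -> output
# with the residual taken from the layernorm output instead when
# config.apply_residual_connection_post_layernorm is True.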
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Utilities for transformer layers."""
import torch
from megatron import get_args
from deepspeed.runtime.zero import GatheredParameters
def attention_mask_func(attention_scores, attention_mask):
attention_scores.masked_fill_(attention_mask, -10000.0)
return attention_scores
def get_linear_layer(rows, columns, init_method, gather_params_on_init=False):
"""Simple linear layer with weight initialization."""
layer = torch.nn.Linear(rows, columns)
if get_args().perform_initialization:
with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
init_method(layer.weight)
with torch.no_grad():
with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
layer.bias.zero_()
return layer
@torch.jit.script
def gelu_impl(x):
"""OpenAI's gelu implementation."""
return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
def openai_gelu(x):
return gelu_impl(x)
# This is actually the Python equivalent of torch.nn.functional.gelu(), also with type hints for the ONNX exporter
@torch.jit.script
def erf_gelu(x):
return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype))
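# Illustrative check (not part of the original file): gelu_impl computes the tanh
# approximation 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))), since
# 0.7978845608028654 is approximately sqrt(2/pi). On recent PyTorch versions this
# should agree with the built-in approximation:
#   x = torch.randn(8)
#   torch.allclose(gelu_impl(x), torch.nn.functional.gelu(x, approximate='tanh'), atol=1e-6)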
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
"""Utility functions used throughout Megatron core"""
from functools import reduce
import math
import operator
import torch
from megatron.core import parallel_state
from megatron import get_args
from deepspeed import get_accelerator
def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, "{} is not divisible by {}".format(
numerator, denominator
)
def divide(numerator, denominator):
"""Ensure that numerator is divisible by the denominator and return
the division value."""
ensure_divisibility(numerator, denominator)
return numerator // denominator
def get_attr_wrapped_model(model, attr, allow_none=True):
"""Get an attribute from a wrapped model"""
if isinstance(model, list):
raise RuntimeError("_get_attr_wrapped_model given a list of models")
if allow_none:
def condition(model, attr):
return not hasattr(model, attr)
else:
def condition(model, attr):
return getattr(model, attr, None) is None
while condition(model, attr):
if not hasattr(model, "module"):
raise RuntimeError(f"_get_attr_wrapped_model couldn't find attribute {attr}")
model = model.module
return getattr(model, attr)
def get_model_type(model):
return get_attr_wrapped_model(model, 'model_type')
def get_model_config(model):
args = get_args()
if args.deepspeed:
return get_attr_wrapped_model(model.module, 'config', allow_none=False)
return get_attr_wrapped_model(model, 'config', allow_none=False)
class GlobalMemoryBuffer:
"""Global buffer to avoid dynamic memory allocations.
Caller should ensure that buffers of the same name
are not used concurrently."""
def __init__(self):
self.buffer = {}
def get_tensor(self, tensor_shape, dtype, name):
required_len = reduce(operator.mul, tensor_shape, 1)
if self.buffer.get((name, dtype), None) is None or \
self.buffer[(name, dtype)].numel() < required_len:
self.buffer[(name, dtype)] = \
torch.empty(required_len,
dtype=dtype,
device=get_accelerator().current_device_name(),
requires_grad=False)
return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
def _kernel_make_viewless_tensor(inp, requires_grad):
'''Make a viewless tensor.
View tensors have the undesirable side-effect of retaining a reference
to the originally-viewed tensor, even after manually setting the '.data'
field. This method creates a new tensor that links to the old tensor's
data, without linking the viewed tensor, referenced via the '._base'
field.
'''
out = torch.empty(
(1,),
dtype = inp.dtype,
device = inp.device,
requires_grad = requires_grad,
)
out.data = inp.data
return out
class MakeViewlessTensor(torch.autograd.Function):
'''
Autograd function to make a viewless tensor.
This function should be used in cases where the computation graph needs
to be propagated, but we only want a viewless tensor (e.g.,
ParallelTransformer's hidden_states). Call this function by passing
'keep_graph = True' to 'make_viewless_tensor()'.
'''
@staticmethod
def forward(ctx, inp, requires_grad):
return _kernel_make_viewless_tensor(inp, requires_grad)
@staticmethod
def backward(ctx, grad_output):
return grad_output, None
def make_viewless_tensor(inp, requires_grad, keep_graph):
'''
Entry-point for creating viewless tensors.
This method should be used, rather than calling 'MakeViewlessTensor'
or '_kernel_make_viewless_tensor' directly. This method acts as a
switch for determining if an autograd function or a regular method
should be used to create the tensor.
'''
# return tensor as-is, if not a 'view'
if inp._base is None:
return inp
# create viewless tensor
if keep_graph:
return MakeViewlessTensor.apply(inp, requires_grad)
else:
return _kernel_make_viewless_tensor(inp, requires_grad)
def assert_viewless_tensor(tensor, extra_msg = None):
'''Assert that a tensor is not a view (i.e., its '._base' field is
not set).'''
if isinstance(tensor, list):
[ assert_viewless_tensor(t) for t in tensor ]
return tensor
if not isinstance(tensor, torch.Tensor):
return tensor
assert tensor._base is None, (
"Ensure tensor._base is None before setting tensor.data or storing "
"tensor to memory buffer. Otherwise, a memory leak will occur (and "
"likely accumulate over iterations). %s"
) % extra_msg
return tensor
def safely_set_viewless_tensor_data(tensor, new_data_tensor):
'''Safely set tensor's '.data' field.
Check first that the tensor is viewless (i.e., '._base' not set). If not,
raise an exception.
'''
assert_viewless_tensor(tensor, extra_msg = "FYI, tensor._base has shape %s, and new_data_tensor has shape %s." % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape))
tensor.data = new_data_tensor
def init_method_normal(sigma):
"""Init method based on N(0, sigma)."""
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
return init_
def scaled_init_method_normal(sigma, num_layers):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
return init_
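# Worked example (illustrative): scaled_init_method_normal divides sigma by
# sqrt(2 * num_layers), so with sigma = 0.02 and num_layers = 8 the resulting
# std is 0.02 / sqrt(16) = 0.005.
#   init_fn = scaled_init_method_normal(0.02, 8)
#   w = torch.empty(4, 4); init_fn(w)   # fills w with samples from N(0, 0.005)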
CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
CPPFLAGS += $(shell python3 -m pybind11 --includes)
LIBNAME = helpers
LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)
%$(LIBEXT): %.cpp
$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
from . import indexed_dataset