Commit e1354f9d authored by liangjing

update
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import enum
# can we get rid of this?
# it's being used in pipeline schedules
class ModelType(enum.Enum):
encoder_or_decoder = 1
encoder_and_decoder = 2
# class LayerType(enum.Enum):
# encoder = 1
# decoder = 2
class AttnType(enum.Enum):
self_attn = 1
cross_attn = 2
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import torch
import torch.nn.functional as F
from megatron.core import tensor_parallel
from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.custom_layers.transformer_engine import \
TERowParallelLinear, TEColumnParallelLinear
class MLP(MegatronModule):
"""
MLP takes an input of hidden size h, projects it to the 4*h ffn
hidden dimension, applies a nonlinear transformation, and projects the
result back to hidden size h.
Returns an output and a bias to be added to the output.
If config.add_bias_linear is False, the bias returned is None.
We use the following notation:
h: hidden size
p: number of tensor model parallel partitions
b: batch size
s: sequence length
"""
def __init__(self, config: TransformerConfig):
super().__init__(config=config)
self.config: TransformerConfig = config
# If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
ffn_hidden_size = self.config.ffn_hidden_size
if self.config.gated_linear_unit:
ffn_hidden_size *= 2
self.linear_fc1 = TEColumnParallelLinear(
self.config.hidden_size,
ffn_hidden_size,
config=self.config,
init_method=self.config.init_method,
bias=self.config.add_bias_linear,
skip_bias_add=True,
)
if self.config.gated_linear_unit:
def glu(x):
x = torch.chunk(x, 2, dim=-1)
return self.config.activation_func(x[0]) * x[1]
self.activation_func = glu
else:
self.activation_func = self.config.activation_func
self.linear_fc2 = TERowParallelLinear(
self.config.ffn_hidden_size,
self.config.hidden_size,
config=self.config,
init_method=self.config.output_layer_init_method,
bias=self.config.add_bias_linear,
skip_bias_add=True,
)
def forward(self, hidden_states):
# [s, b, 4 * h/p]
intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
if self.config.bias_gelu_fusion:
assert self.config.add_bias_linear is True
assert self.activation_func == F.gelu
intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
else:
if bias_parallel is not None:
intermediate_parallel = intermediate_parallel + bias_parallel
intermediate_parallel = self.activation_func(intermediate_parallel)
# [s, b, h]
output, output_bias = self.linear_fc2(intermediate_parallel)
return output, output_bias
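# Usage sketch (illustrative only, not part of the original file). Assuming tensor
# model parallel state and a TransformerConfig `cfg` have already been initialized:
#   mlp = MLP(cfg)
#   output, output_bias = mlp(hidden_states)   # hidden_states: [s, b, h]
#   if output_bias is not None:                # bias is returned unadded because skip_bias_add=True
#       output = output + output_bias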
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
"""Megatron Module"""
import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.transformer_config import TransformerConfig
_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)
def param_is_not_shared(param):
return not hasattr(param, 'shared') or not param.shared
class MegatronModule(torch.nn.Module):
"""Megatron specific extensions of torch Module with support
for pipelining."""
# def __init__(self, config: TransformerConfig, share_word_embeddings=True):
def __init__(self, config: TransformerConfig):
super().__init__()
self.config = config
def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
"""Use this function to override the state dict for
saving checkpoints."""
return self.state_dict(prefix=prefix, keep_vars=keep_vars)
def conversion_helper(val, conversion):
"""Apply conversion to val. Recursively apply conversion if `val`
#is a nested tuple/list structure."""
if not isinstance(val, (tuple, list)):
return conversion(val)
rtn = [conversion_helper(v, conversion) for v in val]
if isinstance(val, tuple):
rtn = tuple(rtn)
return rtn
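# Illustrative example (not part of the original file): conversion_helper applies
# `conversion` to every leaf of a nested tuple/list while preserving the container
# types, e.g.
#   conversion_helper((1, [2, 3]), lambda v: v * 2)  ->  (2, [4, 6])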
def fp32_to_float16(val, float16_convertor):
"""Convert fp32 `val` to fp16/bf16"""
def half_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, _FLOAT_TYPES):
val = float16_convertor(val)
return val
return conversion_helper(val, half_conversion)
def float16_to_fp32(val):
"""Convert fp16/bf16 `val` to fp32"""
def float_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
val = val.float()
return val
return conversion_helper(val, float_conversion)
class Float16Module(MegatronModule):
def __init__(self, config: TransformerConfig, module: torch.nn.Module):
super(Float16Module, self).__init__(config)
self.config = config
self.fp16 = config.fp16
self.bf16 = config.bf16
if self.fp16:
self.add_module('module', module.half())
def float16_convertor(val):
return val.half()
elif self.bf16:
self.add_module('module', module.bfloat16())
def float16_convertor(val):
return val.bfloat16()
else:
raise Exception('Either config.fp16 or config.bf16 should be True.')
self.float16_convertor = float16_convertor
def set_input_tensor(self, input_tensor):
return self.module.set_input_tensor(input_tensor)
def forward(self, *inputs, **kwargs):
if parallel_state.is_pipeline_first_stage():
inputs = fp32_to_float16(inputs, self.float16_convertor)
outputs = self.module(*inputs, **kwargs)
if parallel_state.is_pipeline_last_stage():
outputs = float16_to_fp32(outputs)
return outputs
def state_dict(self, destination=None, prefix='', keep_vars=False):
return self.module.state_dict(prefix=prefix, keep_vars=keep_vars)
def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
return self.module.state_dict_for_save_checkpoint(prefix=prefix, keep_vars=keep_vars)
def load_state_dict(self, state_dict, strict=True):
self.module.load_state_dict(state_dict, strict=strict)
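# Usage sketch (illustrative only, not part of the original file). Wrapping a model
# keeps its parameters in fp16/bf16 while the pipeline boundaries stay in fp32:
#   wrapped = Float16Module(config, model)   # config.fp16 or config.bf16 must be True
#   out = wrapped(*fp32_inputs)              # inputs cast to fp16/bf16 on the first pipeline stage,
#                                            # outputs cast back to fp32 on the last pipeline stage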
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from contextlib import nullcontext
import torch
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.transformer_layer import TransformerLayer
from megatron.core.utils import make_viewless_tensor
class TransformerBlock(MegatronModule):
"""Transformer class."""
def __init__(
self,
config: TransformerConfig,
self_attn_mask_type=AttnMaskType.padding,
post_layer_norm=True,
pre_process=True,
post_process=True,
):
super().__init__(config=config)
self.config: TransformerConfig = config
self.self_attn_mask_type = self_attn_mask_type
self.post_layer_norm = post_layer_norm
self.pre_process = pre_process
self.post_process = post_process
# required for pipeline parallel schedules
self.input_tensor = None
self.checkpoint_core_attention = self.config.recompute_granularity == 'selective'
# TODO: Maybe we can create a build_transformer_block method here instead
self.num_layers_per_pipeline_rank = (
self.config.num_layers // parallel_state.get_pipeline_model_parallel_world_size()
)
self._build_layers()
def _build_layers(self):
# Transformer layers.
# @jcasper can we improve how we deal with layer_number?
# currently it's only used in CoreAttention?
# if self.apply_query_key_layer_scaling:
# coeff = self.layer_number
# self.norm_factor *= coeff
def build_layer(layer_number):
return TransformerLayer(
config=self.config, layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type,
)
pipeline_rank = parallel_state.get_pipeline_model_parallel_rank()
if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
# Number of layers in each model chunk is the number of layers in the stage,
# divided by the number of model chunks in a stage.
# With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
# layers to stages like (each list is a model chunk):
# Stage 0: [0] [2] [4] [6]
# Stage 1: [1] [3] [5] [7]
# With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
# layers to stages like (each list is a model chunk):
# Stage 0: [0, 1] [4, 5]
# Stage 1: [2, 3] [6, 7]
vp_rank = parallel_state.get_virtual_pipeline_model_parallel_rank()
vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
total_num_layers = self.config.num_layers
num_layers_per_virtual_rank = self.num_layers_per_pipeline_rank // vp_size
total_virtual_chunks = total_num_layers // vp_size
offset = vp_rank * total_virtual_chunks + (pipeline_rank * num_layers_per_virtual_rank)
self.layers = torch.nn.ModuleList(
[build_layer(i + 1 + offset) for i in range(num_layers_per_virtual_rank)]
)
else:
# Each stage gets a contiguous set of layers.
if parallel_state.get_pipeline_model_parallel_world_size() > 1:
offset = pipeline_rank * self.num_layers_per_pipeline_rank
else:
offset = 0
# @jcasper why is layer_number using 1 index?
self.layers = torch.nn.ModuleList(
[build_layer(i + 1 + offset) for i in range(self.num_layers_per_pipeline_rank)]
)
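# Worked example (illustrative, matching the interleaved assignment described in the
# comment above): with num_layers=8, a pipeline world size of 2 and vp_size=4,
# num_layers_per_pipeline_rank = 8 // 2 = 4, num_layers_per_virtual_rank = 4 // 4 = 1,
# and total_virtual_chunks = 8 // 4 = 2. For pipeline_rank=1, vp_rank=2:
# offset = 2 * 2 + 1 * 1 = 5, so this model chunk holds global layer index 5
# (layer_number 6 with 1-based numbering), matching "Stage 1: [1] [3] [5] [7]".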
# # TODO: add back standalone_embedding_stage
# if self.num_layers == 0:
# # When a standalone embedding stage is used (e.g.,
# # args.standalone_embedding_stage == True), virtual pipeline ranks
# # on pipeline rank 0 will have zero transformer layers assigned to
# # them. This results in the model's input and output tensors to be
# # the same, which will cause failure for certain output tensor
# # optimizations (e.g., pipeline output deallocation). To remedy
# # this, we assign a 'no-op' layer on these ranks, which will
# # disconnect the input tensor from the output tensor.
# self.num_layers = 1
# self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)])
# else:
# self.layers = torch.nn.ModuleList([build_layer(i + 1 + offset) for i in range(self.num_layers)])
if self.post_process and self.post_layer_norm:
# Final layer norm before output.
self.final_layernorm = FusedLayerNorm(
hidden_size=self.config.hidden_size,
eps=self.config.layernorm_epsilon,
persist_layer_norm=self.config.persist_layer_norm,
sequence_parallel=self.config.sequence_parallel,
zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
)
def _get_layer(self, layer_number):
return self.layers[layer_number]
def _checkpointed_forward(self, hidden_states, attention_mask):
"""Forward method with activation checkpointing."""
def custom(start, end):
def custom_forward(*args, **kwargs):
x_, *args = args
for index in range(start, end):
layer = self._get_layer(index)
x_ = layer(x_, *args, **kwargs)
return x_
return custom_forward
if self.config.recompute_method == 'uniform':
# Uniformly divide the total number of Transformer layers and checkpoint
# the input activation of each divided chunk.
# This further reduces memory usage by reducing the number of checkpointed activations.
l = 0
while l < self.num_layers_per_pipeline_rank:
hidden_states = tensor_parallel.checkpoint(
custom(l, l + self.config.recompute_num_layers),
self.config.distribute_saved_activations,
hidden_states,
attention_mask,
)
l += self.config.recompute_num_layers
elif self.config.recompute_method == 'block':
# Checkpoint the input activation of only a set number of individual
# Transformer layers and skip the rest.
# This makes fuller use of device memory by removing redundant recomputation.
for l in range(self.num_layers_per_pipeline_rank):
if l < self.config.recompute_num_layers:
hidden_states = tensor_parallel.checkpoint(
custom(l, l + 1), self.config.distribute_saved_activations, hidden_states, attention_mask,
)
else:
hidden_states = custom(l, l + 1)(hidden_states, attention_mask)
else:
raise ValueError("Invalid activation recompute method.")
return hidden_states
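# Worked example (illustrative): with num_layers_per_pipeline_rank = 4 and
# recompute_num_layers = 2, the 'uniform' method checkpoints the input of two chunks,
# layers [0, 1] and [2, 3]; the 'block' method checkpoints layers 0 and 1 individually
# and runs layers 2 and 3 without any recomputation.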
def set_input_tensor(self, input_tensor):
"""Set input tensor to be used instead of forward()'s input.
When doing pipeline parallelism the input from the previous
stage comes from communication, not from the input, so the
model's forward_step_func won't have it. This function is thus
used by internal code to bypass the input provided by the
forward_step_func"""
self.input_tensor = input_tensor
def forward(self, hidden_states, attention_mask, inference_params=None):
# hidden_states (float): [s, b, h]
# attention_mask (bool): [1, 1, s, s]
if not self.pre_process:
# See set_input_tensor()
hidden_states = self.input_tensor
# Viewless tensor.
# - We only need to create a viewless tensor in the case of micro batch
# size (mbs) == 1, since in this case, 'hidden_states.transpose()'
# above creates a view tensor, and '.contiguous()' is a pass-through.
# For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
# the need to make it viewless.
#
# However, we don't explicitly check mbs == 1 here because
# make_viewless_tensor() has negligible overhead when its input
# is already viewless.
#
# - For the 'else' case above, calling make_viewless_tensor() here is
# likely redundant, since p2p_communication.py (likely originator)
# already creates viewless tensors. That said, make_viewless_tensor()
# is called here to be future-proof and corner-case-proof.
hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True,)
if self.config.sequence_parallel:
rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
else:
rng_context = nullcontext()
with rng_context:
# Forward pass.
if self.config.recompute_granularity == 'full':
hidden_states = self._checkpointed_forward(hidden_states=hidden_states, attention_mask=attention_mask)
else:
for layer in self.layers:
hidden_states = layer(hidden_states=hidden_states, attention_mask=attention_mask)
# Final layer norm.
if self.post_process and self.post_layer_norm:
hidden_states = self.final_layernorm(hidden_states)
return hidden_states
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from dataclasses import dataclass
from typing import Callable
import torch
import torch.nn.functional as F
from megatron.core import ModelParallelConfig
from megatron.core.utils import init_method_normal, scaled_init_method_normal
@dataclass
class TransformerConfig(ModelParallelConfig):
"""Configuration object for megatron-core transformers.
Attributes:
# model architecture
num_layers (int): Number of transformer layers in a transformer block.
hidden_size (int): Transformer hidden size.
ffn_hidden_size (int): Transformer Feed-Forward Network hidden size.
This is set to 4*hidden_size if not provided. Defaults to None.
num_attention_heads (int): Number of transformer attention heads.
num_key_value_heads (int): Number of key/value heads used to implement Grouped Query Attention (GQA). If
`num_key_value_heads=num_attention_heads`, the model uses Multi Head Attention (MHA); if
`num_key_value_heads=1`, the model uses Multi Query Attention (MQA); otherwise GQA is used.
For more details, see [this paper](https://arxiv.org/pdf/2305.13245.pdf).
If not specified, defaults to `num_attention_heads`.
kv_channels (int): Projection weights dimension in multi-head attention.
This is set to hidden_size // num_attention_heads if not provided.
Defaults to None.
hidden_dropout (float): Dropout probability for transformer hidden state. Defaults to 0.1.
attention_dropout (float): Post attention dropout probability. Defaults to 0.1.
fp32_residual_connection (bool): If true, move residual connections to fp32.
apply_residual_connection_post_layernorm (bool): If true, uses the original BERT residual connection ordering.
Defaults to False.
layernorm_epsilon (float): Layernorm epsilon. Defaults to 1e-5.
layernorm_zero_centered_gamma (bool): if set to 'True', the LayerNorm is adjusted to center the gamma values
around 0. This improves numerical stability. Defaults to False.
add_bias_linear (bool): Include a bias term in all linear layers (QKV projections, after core attention, and two
in MLP layer). Default is True.
gated_linear_unit (bool): Use a gated linear unit for the first linear layer in the MLP. Defaults to False.
activation_func (Callable): Activation function to use for the non-linearity in the MLP. Defaults to F.gelu.
# initialization
init_method (Callable): Method to initialize weights. Note that bias is always set to
zero. Should be a function that takes a single Tensor and
initializes it. Defaults to
megatron.core.utils.init_method_normal(init_method_std) which is
torch.nn.init.normal_ with mean=0.0 and std=init_method_std.
output_layer_init_method (Callable): Method to initialize weights of the output layer of
both attention and MLP blocks. Defaults to
megatron.core.utils.scaled_init_method_normal(init_method_std)
which is torch.nn.init.normal_ with mean=0.0 and
std=init_method_std / math.sqrt(2.0 * num_layers).
init_method_std (float): Standard deviation of the zero mean normal for the default
initialization method, not used if init_method and
output_layer_init_method are provided. Defaults to 0.02.
# mixed-precision
apply_query_key_layer_scaling (bool): If true, scale Q * K^T by 1 / layer-number. Defaults to True.
attention_softmax_in_fp32 (bool): If true, run attention masking and softmax in fp32.
This should be true if apply_query_key_layer_scaling is true.
# fusion
bias_gelu_fusion (bool): If true, fuses bias and gelu. Defaults to False.
masked_softmax_fusion (bool): If true, uses softmax fusion.
persist_layer_norm (bool): If true, uses the persistent fused layer norm kernel.
This kernel only supports a fixed set of hidden sizes.
Defaults to False.
bias_dropout_fusion (bool): If true, uses bias dropout fusion.
# activation recomputation
recompute_granularity (str): megatron-core supports 'selective' activation checkpointing where only the memory
intensive part of attention is checkpointed. These memory intensive activations
are also less compute intensive which makes activation checkpointing more efficient
for LLMs (20B+). See Reducing Activation Recomputation in Large Transformer
Models: https://arxiv.org/abs/2205.05198 for more details. 'full' will checkpoint
the entire transformer layer. Must be 'selective' or 'full'. Defaults to None.
recompute_method (str): uniform will uniformly divide the total number of transformer layers in a transformer
block and recompute the input activation of each divided chunk at the specified
granularity. block will recompute the input activations for only a set number of
transformer layers per pipeline stage. The rest of the layers in the pipeline stage
will not have any activations recomputed. Must be 'uniform' or 'block'. Defaults to
None.
recompute_num_layers (int): When recompute_method is uniform, recompute_num_layers is the number of transformer
layers in each uniformly divided recompute unit. When recompute_method is block,
recompute_num_layers is the number of transformer layers to recompute within each
pipeline stage. Defaults to None.
distribute_saved_activations (bool): If true, distribute recomputed activations across the model parallel
group. Defaults to None.
"""
# model architecture
num_layers: int = 0
hidden_size: int = 0
num_attention_heads: int = 0
num_key_value_heads: int = None
ffn_hidden_size: int = None
kv_channels: int = None
hidden_dropout: float = 0.1
attention_dropout: float = 0.1
fp32_residual_connection: bool = False
# @jcasper should we keep this option?
apply_residual_connection_post_layernorm: bool = False
layernorm_epsilon: float = 1e-5
layernorm_zero_centered_gamma: bool = False
add_bias_linear: bool = True
gated_linear_unit: bool = False
activation_func: Callable = F.gelu
# initialization
init_method: Callable = None
output_layer_init_method: Callable = None
init_method_std: float = 0.02
# mixed-precision
apply_query_key_layer_scaling: bool = True
attention_softmax_in_fp32: bool = True
# communication
# fusion
bias_gelu_fusion: bool = False # TODO: this should be bias_activation_fusion ?
masked_softmax_fusion: bool = False
persist_layer_norm: bool = False
bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion?
# activation recomputation
recompute_granularity: str = None
recompute_method: str = None
recompute_num_layers: int = None
distribute_saved_activations: bool = None
def __post_init__(self):
""" Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
"""
super().__post_init__()
if self.fp16 and self.bf16:
raise ValueError(f'Only one of self.fp16: {self.fp16} and self.bf16 {self.bf16} should be True.')
if self.ffn_hidden_size is None:
self.ffn_hidden_size = 4 * self.hidden_size
if self.num_key_value_heads is None:
self.num_key_value_heads = self.num_attention_heads
assert self.num_attention_heads % self.num_key_value_heads == 0
if self.kv_channels is None:
self.kv_channels = self.hidden_size // self.num_attention_heads
if self.apply_query_key_layer_scaling:
self.attention_softmax_in_fp32 = True
if self.recompute_granularity is not None:
if not self.recompute_granularity in ['full', 'selective']:
raise ValueError(
f'recompute_granularity: {self.recompute_granularity} must be "full" or "selective".'
)
if self.recompute_method is not None:
if not self.recompute_method in ['block', 'uniform']:
raise ValueError(f'recompute_method: {self.recompute_method} must be "block" or "uniform".')
elif self.recompute_granularity != 'selective':
raise ValueError(
f'When using recompute_granularity: {self.recompute_granularity}, recompute_method must be "block" or "uniform".'
)
if self.recompute_num_layers is None:
raise ValueError(
f'When using recompute_granularity: {self.recompute_granularity}, recompute_num_layers must be between '
f'1 and num_layers_per_pipeline_rank: {self.num_layers // self.pipeline_model_parallel_size}'
)
if self.distribute_saved_activations and self.sequence_parallel:
raise ValueError(
f'distribute_saved_activations: {self.distribute_saved_activations} must be false when sequence parallel is enabled: {self.sequence_parallel}'
)
if self.virtual_pipeline_model_parallel_size is not None:
if not self.num_layers % self.virtual_pipeline_model_parallel_size == 0:
raise ValueError(
f'num_layers: {self.num_layers} must be divisible by virtual_pipeline_model_parallel_size {self.virtual_pipeline_model_parallel_size}'
)
if self.bias_gelu_fusion:
if not self.add_bias_linear:
raise ValueError("When bias_gelu_fusion is True, add_bias_linear must also be True.")
if self.activation_func != F.gelu:
raise ValueError(f'When bias_gelu_fusion is True, activation_func must be F.gelu.')
if self.init_method is None:
self.init_method = init_method_normal(self.init_method_std)
if self.output_layer_init_method is None:
self.output_layer_init_method = scaled_init_method_normal(self.init_method_std, self.num_layers)
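# Illustrative sketch (not part of the original file): a minimal config, assuming the
# ModelParallelConfig base class provides defaults for the parallelism-related fields.
#   config = TransformerConfig(num_layers=2, hidden_size=64, num_attention_heads=4)
#   config.ffn_hidden_size   # -> 256 (4 * hidden_size, filled in by __post_init__)
#   config.kv_channels       # -> 16  (hidden_size // num_attention_heads)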
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import torch
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.enums import AttnType, AttnMaskType
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.transformer.attention import SelfAttention
from megatron.core.transformer.mlp import MLP
from megatron.core.utils import make_viewless_tensor
from megatron.core.transformer.custom_layers.transformer_engine import TELayerNorm
class TransformerLayer(MegatronModule):
"""A single transformer layer.
Transformer layer takes input with size [s, b, h] and returns an
output of the same size.
"""
def __init__(
self, config: TransformerConfig, layer_number: int = 1, self_attn_mask_type=AttnMaskType.padding,
):
super().__init__(config=config)
self.config: TransformerConfig = config
self.layer_number = layer_number
self.self_attn_mask_type = self_attn_mask_type
# Layernorm on the input data.
# TODO: add pytorch only layernorm
self.input_layernorm = TELayerNorm(
hidden_size=self.config.hidden_size,
eps=self.config.layernorm_epsilon,
persist_layer_norm=self.config.persist_layer_norm,
sequence_parallel=self.config.sequence_parallel,
zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
)
# Self attention.
self.self_attention = SelfAttention(
config=self.config,
layer_number=layer_number,
attn_mask_type=self_attn_mask_type,
)
# Layernorm on the attention output
self.post_self_attn_layernorm = TELayerNorm(
hidden_size=self.config.hidden_size,
eps=self.config.layernorm_epsilon,
persist_layer_norm=self.config.persist_layer_norm,
sequence_parallel=self.config.sequence_parallel,
zero_centered_gamma=self.config.layernorm_zero_centered_gamma,
)
# MLP
self.mlp = MLP(config=self.config)
# @jcasper how should we handle nvfuser?
# Set bias+dropout+add fusion grad_enable execution handler.
# TORCH_MAJOR = int(torch.__version__.split('.')[0])
# TORCH_MINOR = int(torch.__version__.split('.')[1])
# use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
# self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
self.bias_dropout_add_exec_handler = torch.enable_grad
self.bias_dropout_add_func = get_bias_dropout_add(
self.training,
self.config.bias_dropout_fusion
)
# TODO: decide how to do inference_params
def forward(
self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None
):
# hidden_states: [s, b, h]
# Layer norm at the beginning of the transformer layer.
layernorm_output = self.input_layernorm(hidden_states)
# Self attention.
attention_output_with_bias = self.self_attention(
layernorm_output, attention_mask, inference_params=inference_params
)
# Residual connection.
if self.config.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = hidden_states
# bias_dropout_add fusion returning fp32 instead of bf16
with self.bias_dropout_add_exec_handler():
layernorm_input = self.bias_dropout_add_func(
attention_output_with_bias, residual, self.config.hidden_dropout
)
# Layer norm post the self attention.
layernorm_output = self.post_self_attn_layernorm(layernorm_input)
# MLP.
mlp_output_with_bias = self.mlp(layernorm_output)
# Second residual connection.
if self.config.apply_residual_connection_post_layernorm:
residual = layernorm_output
else:
residual = layernorm_input
with self.bias_dropout_add_exec_handler():
output = self.bias_dropout_add_func(
mlp_output_with_bias, residual, self.config.hidden_dropout
)
# Jit compiled function creates 'view' tensor. This tensor
# potentially gets saved in the MPU checkpoint function context,
# which rejects view tensors. While making a viewless tensor here
# won't result in memory savings (like the data loader, or
# p2p_communication), it serves to document the origin of this
# 'view' tensor.
output = make_viewless_tensor(inp=output, requires_grad=output.requires_grad, keep_graph=True)
return output
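# Data-flow sketch (illustrative): pre-LN ordering as implemented above,
#   x -> input_layernorm -> self_attention -> bias_dropout_add(+residual)
#     -> post_self_attn_layernorm -> mlp -> bias_dropout_add(+residual) -> output
# with the residual taken from the layernorm output instead when
# config.apply_residual_connection_post_layernorm is True.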
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Utilities for transformer layers."""
import torch
from megatron import get_args
from deepspeed.runtime.zero import GatheredParameters
def attention_mask_func(attention_scores, attention_mask):
attention_scores.masked_fill_(attention_mask, -10000.0)
return attention_scores
def get_linear_layer(rows, columns, init_method, gather_params_on_init=False):
"""Simple linear layer with weight initialization."""
layer = torch.nn.Linear(rows, columns)
if get_args().perform_initialization:
with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
init_method(layer.weight)
with torch.no_grad():
with GatheredParameters(layer.weight, modifier_rank=0, enable=gather_params_on_init):
layer.bias.zero_()
return layer
@torch.jit.script
def gelu_impl(x):
"""OpenAI's gelu implementation."""
return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
def openai_gelu(x):
return gelu_impl(x)
# This is actually the Python equivalent of torch.nn.functional.gelu(), also with type hints for the ONNX exporter
@torch.jit.script
def erf_gelu(x):
return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype))
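# Illustrative check (not part of the original file): gelu_impl computes the tanh
# approximation 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))), since
# 0.7978845608028654 is approximately sqrt(2/pi). On recent PyTorch versions this
# should agree with the built-in approximation:
#   x = torch.randn(8)
#   torch.allclose(gelu_impl(x), torch.nn.functional.gelu(x, approximate='tanh'), atol=1e-6)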
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
"""Utility functions used throughout Megatron core"""
from functools import reduce
import math
import operator
import torch
from megatron.core import parallel_state
from megatron import get_args
from deepspeed import get_accelerator
def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, "{} is not divisible by {}".format(
numerator, denominator
)
def divide(numerator, denominator):
"""Ensure that numerator is divisible by the denominator and return
the division value."""
ensure_divisibility(numerator, denominator)
return numerator // denominator
def get_attr_wrapped_model(model, attr, allow_none=True):
"""Get an attribute from a wrapped model"""
if isinstance(model, list):
raise RuntimeError("_get_attr_wrapped_model given a list of models")
if allow_none:
def condition(model, attr):
return not hasattr(model, attr)
else:
def condition(model, attr):
return getattr(model, attr, None) is None
while condition(model, attr):
if not hasattr(model, "module"):
raise RuntimeError(f"_get_attr_wrapped_model couldn't find attribute {attr}")
model = model.module
return getattr(model, attr)
def get_model_type(model):
return get_attr_wrapped_model(model, 'model_type')
def get_model_config(model):
args = get_args()
if args.deepspeed:
return get_attr_wrapped_model(model.module, 'config', allow_none=False)
return get_attr_wrapped_model(model, 'config', allow_none=False)
class GlobalMemoryBuffer:
"""Global buffer to avoid dynamic memory allocations.
Caller should ensure that buffers of the same name
are not used concurrently."""
def __init__(self):
self.buffer = {}
def get_tensor(self, tensor_shape, dtype, name):
required_len = reduce(operator.mul, tensor_shape, 1)
if self.buffer.get((name, dtype), None) is None or \
self.buffer[(name, dtype)].numel() < required_len:
self.buffer[(name, dtype)] = \
torch.empty(required_len,
dtype=dtype,
device=get_accelerator().current_device_name(),
requires_grad=False)
return self.buffer[(name, dtype)][0:required_len].view(*tensor_shape)
def _kernel_make_viewless_tensor(inp, requires_grad):
'''Make a viewless tensor.
View tensors have the undesirable side-effect of retaining a reference
to the originally-viewed tensor, even after manually setting the '.data'
field. This method creates a new tensor that links to the old tensor's
data, without linking the viewed tensor, referenced via the '._base'
field.
'''
out = torch.empty(
(1,),
dtype = inp.dtype,
device = inp.device,
requires_grad = requires_grad,
)
out.data = inp.data
return out
class MakeViewlessTensor(torch.autograd.Function):
'''
Autograd function to make a viewless tensor.
This function should be used in cases where the computation graph needs
to be propagated, but we only want a viewless tensor (e.g.,
ParallelTransformer's hidden_states). Call this function by passing
'keep_graph = True' to 'make_viewless_tensor()'.
'''
@staticmethod
def forward(ctx, inp, requires_grad):
return _kernel_make_viewless_tensor(inp, requires_grad)
@staticmethod
def backward(ctx, grad_output):
return grad_output, None
def make_viewless_tensor(inp, requires_grad, keep_graph):
'''
Entry-point for creating viewless tensors.
This method should be used, rather than calling 'MakeViewlessTensor'
or '_kernel_make_viewless_tensor' directly. This method acts as a
switch for determining if an autograd function or a regular method
should be used to create the tensor.
'''
# return tensor as-is, if not a 'view'
if inp._base is None:
return inp
# create viewless tensor
if keep_graph:
return MakeViewlessTensor.apply(inp, requires_grad)
else:
return _kernel_make_viewless_tensor(inp, requires_grad)
def assert_viewless_tensor(tensor, extra_msg = None):
'''Assert that a tensor is not a view (i.e., its '._base' field is
not set).'''
if isinstance(tensor, list):
[ assert_viewless_tensor(t) for t in tensor ]
return tensor
if not isinstance(tensor, torch.Tensor):
return tensor
assert tensor._base is None, (
"Ensure tensor._base is None before setting tensor.data or storing "
"tensor to memory buffer. Otherwise, a memory leak will occur (and "
"likely accumulate over iterations). %s"
) % extra_msg
return tensor
def safely_set_viewless_tensor_data(tensor, new_data_tensor):
'''Safely set tensor's '.data' field.
Check first that the tensor is viewless (i.e., '._base' not set). If not,
raise an exception.
'''
assert_viewless_tensor(tensor, extra_msg = "FYI, tensor._base has shape %s, and new_data_tensor has shape %s." % ("--" if tensor._base is None else tensor._base.shape, new_data_tensor.shape))
tensor.data = new_data_tensor
def init_method_normal(sigma):
"""Init method based on N(0, sigma)."""
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
return init_
def scaled_init_method_normal(sigma, num_layers):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
return init_
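# Worked example (illustrative): scaled_init_method_normal divides sigma by
# sqrt(2 * num_layers), so with sigma = 0.02 and num_layers = 8 the resulting
# std is 0.02 / sqrt(16) = 0.005.
#   init_fn = scaled_init_method_normal(0.02, 8)
#   w = torch.empty(4, 4); init_fn(w)   # fills w with samples from N(0, 0.005)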
CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
CPPFLAGS += $(shell python3 -m pybind11 --includes)
LIBNAME = helpers
LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)
%$(LIBEXT): %.cpp
$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
from . import indexed_dataset