Commit 051f58f1 authored by liangjing

v1

parent 0024a5c6
Pipeline #829 passed
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
MAJOR = 0
MINOR = 3
PATCH = 0
PRE_RELEASE = ''
# Use the following formatting: (major, minor, patch, pre-release)
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)
__shortversion__ = '.'.join(map(str, VERSION[:3]))
__version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:])
__package_name__ = 'megatron_core'
__contact_names__ = 'NVIDIA'
__contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email
__homepage__ = (
    'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/'  # use NeMo homepage
)
__repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = (
    'Megatron Core - a library for efficient and scalable training of transformer based models'
)
__license__ = 'BSD-3'
__keywords__ = (
    'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch'
)
torch
\ No newline at end of file
from .cross_entropy import vocab_parallel_cross_entropy
from .data import broadcast_data
from .layers import (
    ColumnParallelLinear,
    RowParallelLinear,
    VocabParallelEmbedding,
    copy_tensor_model_parallel_attributes,
    linear_with_grad_accumulation_and_async_allreduce,
    param_is_not_tensor_parallel_duplicate,
    set_defaults_if_not_set_tensor_model_parallel_attributes,
    set_tensor_model_parallel_attributes,
)
from .mappings import (
    copy_to_tensor_model_parallel_region,
    gather_from_sequence_parallel_region,
    gather_from_tensor_model_parallel_region,
    scatter_to_sequence_parallel_region,
    scatter_to_tensor_model_parallel_region,
)
from .random import (
    checkpoint,
    get_cuda_rng_tracker,
    model_parallel_cuda_manual_seed,
)
from .utils import (
    split_tensor_along_last_dim,
    split_tensor_into_1d_equal_chunks,
    gather_split_1d_tensor,
)

__all__ = [
@@ -38,7 +29,7 @@ __all__ = [
    "vocab_parallel_cross_entropy",
    # data.py
    "broadcast_data",
    # layers.py
    "ColumnParallelLinear",
    "RowParallelLinear",
    "VocabParallelEmbedding",
@@ -51,7 +42,7 @@ __all__ = [
    "copy_to_tensor_model_parallel_region",
    "gather_from_tensor_model_parallel_region",
    "gather_from_sequence_parallel_region",
    # "reduce_from_tensor_model_parallel_region",
    "scatter_to_tensor_model_parallel_region",
    "scatter_to_sequence_parallel_region",
    # random.py
...
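
A hypothetical usage sketch for one of the helpers re-exported above. The split_tensor_along_last_dim(tensor, num_partitions) signature is assumed from upstream megatron-core; this call needs no distributed setup.

import torch
from megatron.core.tensor_parallel import split_tensor_along_last_dim

x = torch.arange(24).reshape(2, 12)
chunks = split_tensor_along_last_dim(x, 3)   # split the last dim into 3 equal views
assert len(chunks) == 3 and chunks[0].shape == (2, 4)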
@@ -5,22 +5,21 @@ import torch

from megatron.core.parallel_state import (
    get_tensor_model_parallel_group,
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)

from .utils import VocabUtility


class _VocabParallelCrossEntropy(torch.autograd.Function):
    @staticmethod
    def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0):

        # Maximum value along vocab dimension across all GPUs.
        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
        torch.distributed.all_reduce(
            logits_max, op=torch.distributed.ReduceOp.MAX, group=get_tensor_model_parallel_group()
        )

        # Subtract the maximum value.
        vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1)
@@ -29,8 +28,7 @@ class _VocabParallelCrossEntropy(torch.autograd.Function):
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_tensor_model_parallel_rank()
        world_size = get_tensor_model_parallel_world_size()
        vocab_start_index, vocab_end_index = get_vocab_range(partition_vocab_size, rank, world_size)

        # Create a mask of valid vocab ids (1 means it needs to be masked).
        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
@@ -42,24 +40,27 @@ class _VocabParallelCrossEntropy(torch.autograd.Function):
        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
        masked_target_1d = masked_target.view(-1)
        arange_1d = torch.arange(start=0, end=logits_2d.size()[0], device=logits_2d.device)
        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
        predicted_logits = predicted_logits_1d.view_as(target)
        predicted_logits[target_mask] = 0.0
        # All reduce is needed to get the chunks from other GPUs.
        torch.distributed.all_reduce(
            predicted_logits,
            op=torch.distributed.ReduceOp.SUM,
            group=get_tensor_model_parallel_group(),
        )

        # Sum of exponential of logits along vocab dimension across all GPUs.
        exp_logits = vocab_parallel_logits
        torch.exp(vocab_parallel_logits, out=exp_logits)
        sum_exp_logits = exp_logits.sum(dim=-1)
        torch.distributed.all_reduce(
            sum_exp_logits,
            op=torch.distributed.ReduceOp.SUM,
            group=get_tensor_model_parallel_group(),
        )

        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits
@@ -87,7 +88,6 @@ class _VocabParallelCrossEntropy(torch.autograd.Function):
            loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs
            ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size

        # Store softmax, target-mask and masked-target for backward pass.
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
@@ -108,8 +108,7 @@ class _VocabParallelCrossEntropy(torch.autograd.Function):
        grad_2d = grad_input.view(-1, partition_vocab_size)

        # Add the gradient from matching classes.
        arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device)
        softmax_update = 1.0 - target_mask.view(-1).float()
...
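
For reference, the forward pass above is the numerically stable log-sum-exp form of cross entropy, with the MAX and SUM all-reduces stitching the vocab shards back together. Below is a single-device sketch of the same math (no tensor parallelism, no label smoothing); it is illustrative only, not the library API.

import torch
import torch.nn.functional as F

def reference_vocab_cross_entropy(logits, target):
    # Single-device version of the math in forward() above.
    logits_max = logits.max(dim=-1, keepdim=True).values             # all-reduce MAX in the parallel code
    shifted = logits - logits_max                                     # subtract the maximum value
    predicted = shifted.gather(-1, target.unsqueeze(-1)).squeeze(-1)  # logit of the target class
    sum_exp = shifted.exp().sum(dim=-1)                               # shard sums are SUM-reduced in the parallel code
    return torch.log(sum_exp) - predicted                             # loss = log(sum(exp(logits))) - predicted logit

logits = torch.randn(4, 8, 32)                                        # [batch, seq, vocab]
target = torch.randint(0, 32, (4, 8))
loss = reference_vocab_cross_entropy(logits, target)
expected = F.cross_entropy(logits.view(-1, 32), target.view(-1), reduction='none').view(4, 8)
assert torch.allclose(loss, expected, atol=1e-5)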
@@ -8,15 +8,16 @@ from megatron.core.parallel_state import (
    get_tensor_model_parallel_src_rank,
)

_MAX_DATA_DIM = 5


def _check_data_types(keys, data, target_dtype):
    """Check that all the keys have the same target data type."""
    for key in keys:
        assert data[key].dtype == target_dtype, (
            '{} has data type {} which '
            'is different than {}'.format(key, data[key].dtype, target_dtype)
        )


def _build_key_size_numel_dictionaries(keys, data):
@@ -36,8 +37,9 @@ def _build_key_size_numel_dictionaries(keys, data):
    # Move to GPU and broadcast.
    sizes_cuda = torch.cuda.LongTensor(sizes)
    torch.distributed.broadcast(
        sizes_cuda, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group()
    )

    # Move back to cpu and unpack.
    sizes_cpu = sizes_cuda.cpu()
@@ -74,24 +76,21 @@ def broadcast_data(keys, data, datatype):
    """

    # Build (key, size) and (key, number of elements) dictionaries along
    # with the total number of elements on all ranks.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data)

    # Pack on rank zero.
    if get_tensor_model_parallel_rank() == 0:
        # Check that all keys have the same data type.
        _check_data_types(keys, data, datatype)

        # Flatten the data associated with the keys
        flatten_data = torch.cat([data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
    else:
        flatten_data = torch.empty(total_numel, device=torch.cuda.current_device(), dtype=datatype)

    # Broadcast
    torch.distributed.broadcast(
        flatten_data, get_tensor_model_parallel_src_rank(), group=get_tensor_model_parallel_group()
    )

    # Unpack
    output = {}
...
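
A hypothetical usage sketch for broadcast_data: only the tensor-model-parallel source rank supplies the batch, and every rank in the group gets identical tensors back. It assumes torch.distributed and megatron.core.parallel_state are already initialized; the key names and shapes are made up.

import torch
from megatron.core import parallel_state, tensor_parallel

keys = ['tokens', 'labels']                      # hypothetical batch keys
if parallel_state.get_tensor_model_parallel_rank() == 0:
    data = {'tokens': torch.randint(0, 32000, (4, 1024)),
            'labels': torch.randint(0, 32000, (4, 1024))}
else:
    data = None                                  # non-source ranks only receive

# Packs on the source rank, broadcasts one flat buffer, then unpacks per key.
batch = tensor_parallel.broadcast_data(keys, data, torch.int64)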
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from .transformer_config import TransformerConfig
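
A hypothetical construction sketch for the re-exported TransformerConfig. The required fields shown (num_layers, hidden_size, num_attention_heads) match upstream megatron-core releases but may differ in this revision; treat the values as illustrative only.

from megatron.core.transformer import TransformerConfig

# Illustrative values only; field names assumed from upstream megatron-core.
config = TransformerConfig(
    num_layers=12,
    hidden_size=768,
    num_attention_heads=12,
)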
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

import enum


# can we get rid of this?
# it's being used in pipeline schedules
class ModelType(enum.Enum):
    encoder_or_decoder = 1
    encoder_and_decoder = 2


# class LayerType(enum.Enum):
#     encoder = 1
#     decoder = 2


class AttnType(enum.Enum):
    self_attn = 1
    cross_attn = 2


class AttnMaskType(enum.Enum):
    padding = 1
    causal = 2
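
Illustrative only (not part of the commit): these enums are plain tags that other components branch on, for example when deciding whether an attention block needs a causal mask.

# Hypothetical helper showing how the tags are typically consumed.
def needs_causal_mask(attn_mask_type: AttnMaskType) -> bool:
    # Decoder-style self-attention masks future positions; padding-only masks do not.
    return attn_mask_type == AttnMaskType.causal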
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import torch


class IdentityOp(torch.nn.Module):
    """
    This is a placeholder for IdentityOp (NoOp)
    """

    def __init__(self, *args, **kwargs):
        super(IdentityOp, self).__init__()

    def forward(self, x, *args, **kwargs):
        return x
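
A minimal usage sketch: IdentityOp can stand in wherever an optional submodule is expected, returning its input unchanged and ignoring any extra arguments.

import torch

op = IdentityOp()                                # stand-alone use, illustrative only
x = torch.randn(2, 4)
assert torch.equal(op(x, unused_flag=True), x)   # extra args/kwargs are ignored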