Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
Yuan2.0-M32_pytorch
Commits
d3dd8642
Commit
d3dd8642
authored
Jun 26, 2024
by
Rayyyyy
Browse files
First add
parents
Pipeline
#1259
failed with stages
in 0 seconds
Changes
315
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
5043 additions
and
0 deletions
+5043
-0
megatron/core/models/gpt/gpt_embedding.py
megatron/core/models/gpt/gpt_embedding.py
+92
-0
megatron/core/models/gpt/gpt_model.py
megatron/core/models/gpt/gpt_model.py
+284
-0
megatron/core/package_info.py
megatron/core/package_info.py
+23
-0
megatron/core/parallel_state.py
megatron/core/parallel_state.py
+605
-0
megatron/core/pipeline_parallel/__init__.py
megatron/core/pipeline_parallel/__init__.py
+1
-0
megatron/core/pipeline_parallel/p2p_communication.py
megatron/core/pipeline_parallel/p2p_communication.py
+543
-0
megatron/core/pipeline_parallel/schedules.py
megatron/core/pipeline_parallel/schedules.py
+1168
-0
megatron/core/requirements.txt
megatron/core/requirements.txt
+2
-0
megatron/core/tensor_parallel/__init__.py
megatron/core/tensor_parallel/__init__.py
+66
-0
megatron/core/tensor_parallel/cross_entropy.py
megatron/core/tensor_parallel/cross_entropy.py
+143
-0
megatron/core/tensor_parallel/data.py
megatron/core/tensor_parallel/data.py
+105
-0
megatron/core/tensor_parallel/layers.py
megatron/core/tensor_parallel/layers.py
+735
-0
megatron/core/tensor_parallel/mappings.py
megatron/core/tensor_parallel/mappings.py
+279
-0
megatron/core/tensor_parallel/random.py
megatron/core/tensor_parallel/random.py
+253
-0
megatron/core/tensor_parallel/utils.py
megatron/core/tensor_parallel/utils.py
+108
-0
megatron/core/transformer/__init__.py
megatron/core/transformer/__init__.py
+4
-0
megatron/core/transformer/attention.py
megatron/core/transformer/attention.py
+320
-0
megatron/core/transformer/core_attention.py
megatron/core/transformer/core_attention.py
+146
-0
megatron/core/transformer/custom_layers/transformer_engine.py
megatron/core/transformer/custom_layers/transformer_engine.py
+141
-0
megatron/core/transformer/enums.py
megatron/core/transformer/enums.py
+25
-0
No files found.
Too many changes to show.
To preserve performance only
315 of 315+
files are displayed.
Plain diff
Email patch
megatron/core/models/gpt/gpt_embedding.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import
torch
from
megatron.core
import
tensor_parallel
from
megatron.core.transformer.module
import
MegatronModule
from
megatron.core.transformer.transformer_config
import
TransformerConfig
class GPTEmbedding(MegatronModule):
    """Language model embeddings.

    Combines a vocab-parallel word embedding with an optional learned
    position embedding and applies dropout to the result.

    Arguments:
        config (TransformerConfig): config object with all necessary configs for TransformerBlock
        vocab_size (int): vocabulary size
        max_sequence_length (int): maximum size of sequence. This
                             is used for positional embedding
        add_position_embedding (bool): Add a position embedding.

    The dropout probability is read from ``config.hidden_dropout``.
    """

    def __init__(
        self, config: TransformerConfig, vocab_size: int, max_sequence_length: int, add_position_embedding: bool
    ):
        super().__init__(config=config)

        self.config: TransformerConfig = config
        self.vocab_size: int = vocab_size
        self.max_sequence_length: int = max_sequence_length
        self.add_position_embedding: bool = add_position_embedding

        # Word embeddings (parallel).
        self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.config.hidden_size,
            init_method=self.config.init_method,
            config=self.config,
        )

        # Position embedding (serial). Only created when requested, so every
        # other method must check `add_position_embedding` before using it.
        if self.add_position_embedding:
            self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, self.config.hidden_size)

            # Initialize the position embeddings.
            if self.config.perform_initialization:
                self.config.init_method(self.position_embeddings.weight)

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(self.config.hidden_dropout)

    def zero_parameters(self):
        """Zero out all parameters in embedding and mark them as shared."""
        self.word_embeddings.weight.data.fill_(0)
        self.word_embeddings.weight.shared = True
        # The position embedding does not exist when add_position_embedding is
        # False (e.g. rotary embeddings); touching it would raise AttributeError.
        if self.add_position_embedding:
            self.position_embeddings.weight.data.fill_(0)
            self.position_embeddings.weight.shared = True

    def forward(self, input_ids, position_ids):
        """Embed ``input_ids`` (plus positions when enabled) and apply dropout.

        ``position_ids`` is only read when ``add_position_embedding`` is True.
        The returned tensor has its first two dimensions swapped relative to
        the embedding lookup output.
        """
        # Embeddings.
        word_embeddings = self.word_embeddings(input_ids)
        if self.add_position_embedding:
            position_embeddings = self.position_embeddings(position_ids)
            embeddings = word_embeddings + position_embeddings
        else:
            embeddings = word_embeddings

        # Data format change to avoid explicit transposes : [b s h] --> [s b h].
        embeddings = embeddings.transpose(0, 1).contiguous()

        # If the input flag for fp32 residual connection is set, convert for float.
        if self.config.fp32_residual_connection:
            embeddings = embeddings.float()

        # Dropout.
        if self.config.sequence_parallel:
            # Scatter first, then run dropout under the forked CUDA RNG tracker.
            embeddings = tensor_parallel.scatter_to_sequence_parallel_region(embeddings)
            with tensor_parallel.get_cuda_rng_tracker().fork():
                embeddings = self.embedding_dropout(embeddings)
        else:
            embeddings = self.embedding_dropout(embeddings)

        return embeddings

    # TODO: add distributed checkpointing
    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
        # Placeholder: currently does nothing and returns None.
        pass

    def load_state_dict(self, state_dict, strict=True):
        # Placeholder: currently ignores `state_dict` and loads nothing.
        pass
megatron/core/models/gpt/gpt_model.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import
logging
from
typing
import
Literal
import
torch
from
torch
import
Tensor
from
megatron.core
import
parallel_state
,
tensor_parallel
from
megatron.core.transformer.module
import
MegatronModule
from
megatron.core.transformer.transformer_config
import
TransformerConfig
from
megatron.core.transformer.transformer_block
import
TransformerBlock
from
megatron.core.transformer.enums
import
AttnMaskType
,
ModelType
from
megatron.core.models.gpt.gpt_embedding
import
GPTEmbedding
from
megatron.core.models.common.rotary_pos_embedding
import
RotaryEmbedding
class GPTModel(MegatronModule):
    """Transformer language model.

    Arguments:
        config (TransformerConfig): transformer config

        vocab_size (int): vocabulary size

        max_sequence_length (int): maximum size of sequence. This is used for positional embedding

        pre_process (bool): Include embedding layer (used with pipeline parallelism)
        post_process (bool): Include an output layer (used with pipeline parallelism)

        fp16_lm_cross_entropy (bool): Stored on the instance; not read elsewhere in this class.

        parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks

        share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are
            shared. Defaults to False.

        position_embedding_type (string): Position embedding type. Options ['learned_absolute', 'rope'].
            Defaults is 'learned_absolute'.

        rotary_percent (float): Percent of rotary dimension to use for rotary position embeddings.
            Defaults to 1.0 (100%). Ignored unless position_embedding_type is 'rope'.
    """

    def __init__(
        self,
        config: TransformerConfig,
        vocab_size: int,
        max_sequence_length: int,
        pre_process: bool = True,
        post_process: bool = True,
        fp16_lm_cross_entropy: bool = False,
        parallel_output: bool = True,
        share_embeddings_and_output_weights: bool = False,
        position_embedding_type: Literal['learned_absolute', 'rope'] = 'learned_absolute',
        rotary_percent: float = 1.0,
    ):
        super().__init__(config=config)

        self.config: TransformerConfig = config
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length
        self.pre_process = pre_process
        self.post_process = post_process
        self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
        self.parallel_output = parallel_output
        self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
        self.position_embedding_type = position_embedding_type

        # megatron core pipelining currently depends on model type
        self.model_type = ModelType.encoder_or_decoder

        # Input embedding lives only on stages that do pre-processing.
        if self.pre_process:
            uses_learned_positions = self.position_embedding_type == 'learned_absolute'
            self.embedding = GPTEmbedding(
                config=self.config,
                vocab_size=self.vocab_size,
                max_sequence_length=self.max_sequence_length,
                add_position_embedding=uses_learned_positions,
            )

        # Rotary position embeddings (attribute is always defined; None when unused).
        if self.position_embedding_type != 'rope':
            self.rotary_pos_emb = None
        else:
            rotary_dim = self.config.kv_channels
            if rotary_percent < 1.0:
                rotary_dim = int(rotary_dim * rotary_percent)
            self.rotary_pos_emb = RotaryEmbedding(rotary_dim)

        # Transformer stack.
        self.decoder = TransformerBlock(
            config=self.config,
            self_attn_mask_type=AttnMaskType.causal,
            pre_process=self.pre_process,
            post_process=self.post_process,
        )

        # Output projection lives only on stages that do post-processing.
        if self.post_process:
            # When this stage also owns the embedding and weights are shared,
            # the output layer is told not to allocate its own weight.
            reuse_embedding_weight = self.pre_process and self.share_embeddings_and_output_weights
            self.output_layer = tensor_parallel.ColumnParallelLinear(
                self.config.hidden_size,
                self.vocab_size,
                config=self.config,
                init_method=self.config.init_method,
                bias=False,
                skip_bias_add=False,
                gather_output=not self.parallel_output,
                skip_weight_param_allocation=reuse_embedding_weight,
            )

        owns_embedding_or_head = self.pre_process or self.post_process
        if self.share_embeddings_and_output_weights and owns_embedding_or_head:
            self.initialize_last_stage_with_word_embeddings()

    def set_input_tensor(self, input_tensor):
        """ See megatron.model.transformer.set_input_tensor()"""
        # schedules.py normally passes a list, but some inference code still
        # hands over a bare tensor or None; normalise to a one-element list.
        tensors = input_tensor if isinstance(input_tensor, list) else [input_tensor]
        assert len(tensors) == 1, 'input_tensor should only be length 1 for gpt'
        self.decoder.set_input_tensor(tensors[0])

    def forward(
        self,
        input_ids: Tensor,
        position_ids: Tensor,
        attention_mask: Tensor,
        labels: Tensor = None,
        inference_params=None,
    ):
        """Run the model.

        Returns the decoder hidden states when this stage has no output layer,
        the transposed logits when ``labels`` is None, and otherwise the
        transposed vocab-parallel cross-entropy loss.
        """
        # Decoder input: embedded tokens on a pre-processing stage, otherwise
        # None (the decoder then uses the tensor given to set_input_tensor).
        decoder_input = (
            self.embedding(input_ids=input_ids, position_ids=position_ids) if self.pre_process else None
        )

        # Rotary positional embeddings, sized from inference_params when given.
        rotary_pos_emb = None
        if self.rotary_pos_emb is not None:
            rotary_seq_len = (
                self.max_sequence_length if inference_params is None else inference_params.max_sequence_length
            )
            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len)

        # Run decoder.
        hidden_states = self.decoder(
            hidden_states=decoder_input,
            attention_mask=attention_mask,
            inference_params=inference_params,
            rotary_pos_emb=rotary_pos_emb,
        )

        if not self.post_process:
            return hidden_states

        # Logits: pass the shared weight explicitly when embeddings are tied.
        output_weight = (
            self.shared_embedding_or_output_weight() if self.share_embeddings_and_output_weights else None
        )
        logits, _ = self.output_layer(hidden_states, weight=output_weight)

        if labels is None:
            # [s b h] => [b s h]
            return logits.transpose(0, 1).contiguous()

        # [b s] => [s b]
        labels_sb = labels.transpose(0, 1).contiguous()
        token_loss = tensor_parallel.vocab_parallel_cross_entropy(logits.float(), labels_sb)

        # [s b] => [b, s]
        return token_loss.transpose(0, 1).contiguous()

    def shared_embedding_or_output_weight(self):
        """Return the word-embedding weight, else the output-layer weight, else None."""
        if self.pre_process:
            return self.embedding.word_embeddings.weight
        if self.post_process:
            return self.output_layer.weight
        return None

    def initialize_last_stage_with_word_embeddings(self):
        """Synchronise the tied embedding / output weights across pipeline stages."""
        # Only relevant when weights are shared AND embedding and head sit on
        # different stages; a stage owning both has nothing to synchronise.
        single_stage = self.pre_process and self.post_process
        if single_stage or not self.share_embeddings_and_output_weights:
            return

        if self.post_process and not self.pre_process:
            assert not parallel_state.is_pipeline_first_stage()
            # Zero the head weight here; the all-reduce below then copies in
            # the first stage's values.
            self.output_layer.weight.data.fill_(0)
            self.output_layer.weight.shared = True

        # Parameters are shared between the word embedding layer and the head
        # at the end of the model. With more than one pipeline stage they live
        # on different workers, so:
        # 1. A second copy of word_embeddings is created on the last stage,
        #    initialised to 0.0.
        # 2. An all-reduce between first and last stage makes both copies
        #    start from the same values.
        # 3. In the training loop, an all-reduce between the grads of the two
        #    copies keeps every applied weight update identical on both stages.
        if torch.distributed.is_initialized():
            if parallel_state.is_rank_in_embedding_group():
                shared_weight = self.shared_embedding_or_output_weight()
                torch.distributed.all_reduce(shared_weight.data, group=parallel_state.get_embedding_group())
            return

        # Not running distributed: warn once per process (flag kept on the class).
        if not getattr(GPTModel, "embedding_warning_printed", False):
            logger = logging.getLogger(__name__)
            logger.warning(
                "Distributed processes aren't initialized, so the output layer "
                "is not initialized with weights from the word embeddings. "
                "If you are just manipulating a model this is fine, but "
                "this needs to be handled manually. If you are training "
                "something is definitely wrong."
            )
            GPTModel.embedding_warning_printed = True

    # TODO: add distributed checkpointing
    def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
        # Placeholder: does nothing and returns None. The legacy version built
        # a dict from the embedding and encoder
        # state_dict_for_save_checkpoint() results keyed by
        # self._embedding_key / self._encoder_key.
        pass

    # TODO: add distributed checkpointing
    def load_state_dict(self, state_dict, strict=True):
        # Placeholder: ignores `state_dict` and loads nothing. The legacy
        # version loaded the embedding and encoder sub-dicts, with
        # backward-compatible key handling ('_embeddings', 'transformer',
        # '.attention.' -> '.self_attention.').
        pass
megatron/core/package_info.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Version components.
MAJOR = 0
MINOR = 2
PATCH = 0
PRE_RELEASE = ''

# Use the following formatting: (major, minor, patch, pre-release)
VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE)

# "major.minor.patch"; the full version appends the pre-release tag.
__shortversion__ = '.'.join(str(component) for component in VERSION[:3])
__version__ = __shortversion__ + ''.join(VERSION[3:])

# Package metadata.
__package_name__ = 'megatron_core'
__contact_names__ = 'NVIDIA'
__contact_emails__ = 'nemo-toolkit@nvidia.com'  # use NeMo Email
__homepage__ = 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/'  # use NeMo homepage
__repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = 'Megatron Core - a library for efficient and scalable training of transformer based models'
__license__ = 'BSD-3'
__keywords__ = 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch'
megatron/core/parallel_state.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Model and data parallel groups."""
import
torch
from
typing
import
Optional
from
.utils
import
GlobalMemoryBuffer
# Module-level state. Everything starts as None and is filled in by
# initialize_model_parallel() or by the set_* helpers in this module.

# Intra-layer model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Inter-layer model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# Model parallel group (both intra- and pipeline) that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Embedding group.
_EMBEDDING_GROUP = None
# Position embedding group.
_POSITION_EMBEDDING_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None
# Same ranks as _DATA_PARALLEL_GROUP, created with backend="gloo".
_DATA_PARALLEL_GROUP_GLOO = None
# FP8 amax reduction group.
_AMAX_REDUCTION_GROUP = None

# Virtual (interleaved) pipeline rank and world size; stay None unless a
# virtual pipeline size is configured.
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
# Pipeline rank at which the encoder/decoder split happens (None = no split).
_PIPELINE_MODEL_PARALLEL_SPLIT_RANK = None

# These values enable us to change the mpu sizes on the fly.
# When not None they take precedence over the values queried from
# torch.distributed in the corresponding get_* functions.
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_TENSOR_MODEL_PARALLEL_RANK = None
_MPU_PIPELINE_MODEL_PARALLEL_RANK = None

# A list of ranks that have a copy of the embedding.
_EMBEDDING_GLOBAL_RANKS = None

# A list of ranks that have a copy of the position embedding.
_POSITION_EMBEDDING_GLOBAL_RANKS = None

# A list of global ranks for each pipeline group to ease calculation of the source
# rank when broadcasting from the first or last pipeline stage.
_PIPELINE_GLOBAL_RANKS = None

# A list of global ranks for each data parallel group to ease calculation of the source
# rank when broadcasting weights from src to all other data parallel ranks
_DATA_PARALLEL_GLOBAL_RANKS = None

# Memory buffers to avoid dynamic memory allocation
_GLOBAL_MEMORY_BUFFER = None
def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
    virtual_pipeline_model_parallel_size: Optional[int] = None,
    pipeline_model_parallel_split_rank: Optional[int] = None,
    use_fp8: bool = False,
) -> None:
    """Initialize model data parallel groups.

    Creates the data-, model-, tensor- and pipeline-parallel process groups
    (plus embedding, position-embedding and optional FP8 amax groups) and
    stores the ones containing the calling rank in this module's globals.

    Arguments:
        tensor_model_parallel_size (int, default = 1):
            The number of GPUs to split individual tensors across.

        pipeline_model_parallel_size (int, default = 1):
            The number of tensor parallel GPU groups to split the
            Transformer layers across. For example, if
            tensor_model_parallel_size is 4 and
            pipeline_model_parallel_size is 2, the model will be split
            into 2 groups of 4 GPUs.

        virtual_pipeline_model_parallel_size (int, optional):
            The number of stages that each pipeline group will have,
            interleaving as necessary. If None, no interleaving is
            performed. For example, if tensor_model_parallel_size is 1,
            pipeline_model_parallel_size is 4,
            virtual_pipeline_model_parallel_size is 2, and there are
            16 transformer layers in the model, the model will be
            split into 8 stages with two layers each and each GPU
            would get 2 stages as such (layer number starting with 1):

            GPU 0: [1, 2] [9, 10]
            GPU 1: [3, 4] [11, 12]
            GPU 2: [5, 6] [13, 14]
            GPU 3: [7, 8] [15, 16]

        pipeline_model_parallel_split_rank (int, optional):
            For models with both an encoder and decoder, the rank in
            pipeline to switch between encoder and decoder (i.e. the
            first rank of the decoder). This allows the user to set
            the pipeline parallel size of the encoder and decoder
            independently. For example, if
            pipeline_model_parallel_size is 8 and
            pipeline_model_parallel_split_rank is 3, then ranks 0-2
            will be the encoder and ranks 3-7 will be the decoder.

        use_fp8 (bool, default = False):
            Construct GPU groups needed for FP8 training, namely for
            amax reduction across the product of the data-parallel and
            tensor-parallel groups.

    Raises:
        RuntimeError: if the world size is not divisible by
            tensor_model_parallel_size * pipeline_model_parallel_size, or if
            a virtual pipeline size is given while
            pipeline_model_parallel_size is not greater than 2.
        AssertionError: if torch.distributed is not initialized or any of the
            groups has already been created.

    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
    and 8 data-parallel groups as:
        8 data_parallel groups:
            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
        8 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
        4 pipeline model-parallel groups:
            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # NOTE(review): every rank executes every new_group() call below, even for
    # groups it does not belong to - presumably because group creation is a
    # collective operation; keep the creation order identical on all ranks.

    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()

    if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
        raise RuntimeError(
            f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
            f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
        )

    # Whatever is left after tensor and pipeline parallelism is data parallel.
    data_parallel_size: int = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size)

    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
    # Computed but not referenced again in this function.
    num_data_parallel_groups: int = world_size // data_parallel_size

    if virtual_pipeline_model_parallel_size is not None:
        if not pipeline_model_parallel_size > 2:
            raise RuntimeError("pipeline-model-parallel size should be greater than 2 with " "interleaved schedule")
        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
        _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
        _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size

    if pipeline_model_parallel_split_rank is not None:
        global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
        _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = pipeline_model_parallel_split_rank

    rank = torch.distributed.get_rank()

    # Build the data-parallel groups.
    global _DATA_PARALLEL_GROUP
    global _DATA_PARALLEL_GROUP_GLOO
    global _DATA_PARALLEL_GLOBAL_RANKS
    assert _DATA_PARALLEL_GROUP is None, 'data parallel group is already initialized'
    # Collected so the model-parallel groups below can be derived from them.
    all_data_parallel_group_ranks = []
    for i in range(pipeline_model_parallel_size):
        # Contiguous block of global ranks covered by pipeline stage i.
        start_rank = i * num_pipeline_model_parallel_groups
        end_rank = (i + 1) * num_pipeline_model_parallel_groups
        for j in range(tensor_model_parallel_size):
            # Ranks within the stage sharing tensor-parallel offset j.
            ranks = range(start_rank + j, end_rank, tensor_model_parallel_size)
            all_data_parallel_group_ranks.append(list(ranks))
            group = torch.distributed.new_group(ranks)
            group_gloo = torch.distributed.new_group(ranks, backend="gloo")
            if rank in ranks:
                _DATA_PARALLEL_GROUP = group
                _DATA_PARALLEL_GROUP_GLOO = group_gloo
                _DATA_PARALLEL_GLOBAL_RANKS = ranks

    # Build the model-parallel groups.
    global _MODEL_PARALLEL_GROUP
    assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
    for i in range(data_parallel_size):
        # The i-th member of every data-parallel group forms one model-parallel group.
        ranks = [data_parallel_group_ranks[i] for data_parallel_group_ranks in all_data_parallel_group_ranks]
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _MODEL_PARALLEL_GROUP = group

    # Build the tensor model-parallel groups.
    global _TENSOR_MODEL_PARALLEL_GROUP
    assert _TENSOR_MODEL_PARALLEL_GROUP is None, 'tensor model parallel group is already initialized'
    for i in range(num_tensor_model_parallel_groups):
        # Consecutive global ranks form a tensor-parallel group.
        ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _TENSOR_MODEL_PARALLEL_GROUP = group

    # Build the pipeline model-parallel groups and embedding groups
    # (first and last rank in each pipeline model-parallel group).
    global _PIPELINE_MODEL_PARALLEL_GROUP
    global _PIPELINE_GLOBAL_RANKS
    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, 'pipeline model parallel group is already initialized'
    global _EMBEDDING_GROUP
    global _EMBEDDING_GLOBAL_RANKS
    assert _EMBEDDING_GROUP is None, 'embedding group is already initialized'
    global _POSITION_EMBEDDING_GROUP
    global _POSITION_EMBEDDING_GLOBAL_RANKS
    assert _POSITION_EMBEDDING_GROUP is None, 'position embedding group is already initialized'
    for i in range(num_pipeline_model_parallel_groups):
        # Ranks strided by the number of pipeline groups form one pipeline.
        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _PIPELINE_MODEL_PARALLEL_GROUP = group
            _PIPELINE_GLOBAL_RANKS = ranks
        # Setup embedding group (to exchange gradients between
        # first and last stages).
        if len(ranks) > 1:
            embedding_ranks = [ranks[0], ranks[-1]]
            position_embedding_ranks = [ranks[0]]
            if pipeline_model_parallel_split_rank is not None:
                # Add the split-rank stage as well, unless already listed.
                if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
                    embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank], ranks[-1]]
                if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
                    position_embedding_ranks = [ranks[0], ranks[pipeline_model_parallel_split_rank]]
        else:
            # Single-stage pipeline: the only rank is its own embedding group.
            embedding_ranks = ranks
            position_embedding_ranks = ranks

        group = torch.distributed.new_group(embedding_ranks)
        if rank in embedding_ranks:
            _EMBEDDING_GROUP = group
        # The rank lists are recorded for every rank of the pipeline, not only
        # for the group members.
        if rank in ranks:
            _EMBEDDING_GLOBAL_RANKS = embedding_ranks

        group = torch.distributed.new_group(position_embedding_ranks)
        if rank in position_embedding_ranks:
            _POSITION_EMBEDDING_GROUP = group
        if rank in ranks:
            _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks

    # Build the FP8 groups.
    global _AMAX_REDUCTION_GROUP
    assert _AMAX_REDUCTION_GROUP is None, \
        'FP8 amax reduction group is already initialized'
    if use_fp8:
        amax_group_size: int = tensor_model_parallel_size * data_parallel_size
        num_amax_groups: int = world_size // amax_group_size
        for i in range(num_amax_groups):
            # Each amax group is a contiguous block of amax_group_size ranks.
            start_rank = i * amax_group_size
            end_rank = (i + 1) * amax_group_size
            ranks = range(start_rank, end_rank)
            group = torch.distributed.new_group(ranks)
            if rank in ranks:
                _AMAX_REDUCTION_GROUP = group

    # Initialize global memory buffer
    # This isn't really "parallel state" but there isn't another good place to
    # put this. If we end up with a more generic initialization of megatron-core
    # we could stick it there
    _set_global_memory_buffer()
def is_unitialized():
    """Return True while no data parallel group has been created.

    Useful for code segments that may be accessed with or without mpu
    initialization.
    """
    data_group_missing = _DATA_PARALLEL_GROUP is None
    return data_group_missing
def model_parallel_is_initialized():
    """Return True only when the tensor, pipeline and data parallel groups all exist."""
    required_groups = (
        _TENSOR_MODEL_PARALLEL_GROUP,
        _PIPELINE_MODEL_PARALLEL_GROUP,
        _DATA_PARALLEL_GROUP,
    )
    return all(group is not None for group in required_groups)
def get_model_parallel_group():
    """Return the model parallel group of the calling rank; asserts it exists."""
    group = _MODEL_PARALLEL_GROUP
    assert group is not None, 'model parallel group is not initialized'
    return group
def get_tensor_model_parallel_group(check_initialized=True):
    """Return the tensor model parallel group of the calling rank.

    With ``check_initialized=False`` the stored value is returned as is,
    which may be None.
    """
    group = _TENSOR_MODEL_PARALLEL_GROUP
    if check_initialized:
        assert group is not None, 'tensor model parallel group is not initialized'
    return group
def get_pipeline_model_parallel_group():
    """Return the pipeline model parallel group of the calling rank; asserts it exists."""
    group = _PIPELINE_MODEL_PARALLEL_GROUP
    assert group is not None, 'pipeline_model parallel group is not initialized'
    return group
def get_data_parallel_group():
    """Return the data parallel group of the calling rank; asserts it exists."""
    group = _DATA_PARALLEL_GROUP
    assert group is not None, 'data parallel group is not initialized'
    return group
def get_data_parallel_group_gloo():
    """Return the gloo data parallel group of the calling rank; asserts it exists."""
    group = _DATA_PARALLEL_GROUP_GLOO
    assert group is not None, 'data parallel group-gloo is not initialized'
    return group
def get_embedding_group():
    """Return the embedding group of the calling rank; asserts it exists."""
    group = _EMBEDDING_GROUP
    assert group is not None, 'embedding group is not initialized'
    return group
def get_position_embedding_group():
    """Return the position embedding group of the calling rank; asserts it exists."""
    group = _POSITION_EMBEDDING_GROUP
    assert group is not None, 'position embedding group is not initialized'
    return group
def get_amax_reduction_group():
    """Return the FP8 amax reduction group of the calling rank; asserts it exists."""
    group = _AMAX_REDUCTION_GROUP
    assert group is not None, 'FP8 amax reduction group is not initialized'
    return group
def set_tensor_model_parallel_world_size(world_size):
    """Store an override for the tensor model parallel world size.

    A non-None value is returned by get_tensor_model_parallel_world_size()
    instead of the size of the process group.
    """
    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE

    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size
def set_pipeline_model_parallel_world_size(world_size):
    """Store an override for the pipeline model parallel world size.

    A non-None value is returned by get_pipeline_model_parallel_world_size()
    instead of the size of the process group.
    """
    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE

    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
def set_virtual_pipeline_model_parallel_world_size(world_size):
    """Set the virtual pipeline model parallel size.

    This function used to be defined twice back to back with identical
    bodies (the first copy carrying a docstring copied from the non-virtual
    setter); the duplicate is folded into this single definition.

    Arguments:
        world_size: value stored in _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE;
            None marks the virtual pipeline as unset.
    """
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group.

    A value set through set_tensor_model_parallel_world_size() wins;
    otherwise the size of the tensor model parallel group is queried.
    """
    override = _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
    if override is None:
        return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
    return override
def get_pipeline_model_parallel_world_size():
    """Return world size for the pipeline model parallel group.

    A value set through set_pipeline_model_parallel_world_size() wins;
    otherwise the size of the pipeline model parallel group is queried.
    """
    override = _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    if override is None:
        return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())
    return override
def set_tensor_model_parallel_rank(rank):
    """Store an override for this process's tensor model parallel rank.

    A non-None value is returned by get_tensor_model_parallel_rank()
    instead of the rank within the process group.
    """
    global _MPU_TENSOR_MODEL_PARALLEL_RANK

    _MPU_TENSOR_MODEL_PARALLEL_RANK = rank
def set_pipeline_model_parallel_rank(rank):
    """Store an override for this process's pipeline model parallel rank.

    A non-None value is returned by get_pipeline_model_parallel_rank()
    instead of the rank within the process group.
    """
    global _MPU_PIPELINE_MODEL_PARALLEL_RANK

    _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank
def set_pipeline_model_parallel_split_rank(rank):
    """Store the pipeline rank at which the encoder/decoder split occurs."""
    global _PIPELINE_MODEL_PARALLEL_SPLIT_RANK

    _PIPELINE_MODEL_PARALLEL_SPLIT_RANK = rank
def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group.

    A value set through set_tensor_model_parallel_rank() wins; otherwise
    the rank within the tensor model parallel group is queried.
    """
    override = _MPU_TENSOR_MODEL_PARALLEL_RANK
    if override is None:
        return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
    return override
def get_pipeline_model_parallel_rank():
    """Return my rank for the pipeline model parallel group.

    A value set through set_pipeline_model_parallel_rank() wins; otherwise
    the rank within the pipeline model parallel group is queried.
    """
    override = _MPU_PIPELINE_MODEL_PARALLEL_RANK
    if override is None:
        return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())
    return override
def get_pipeline_model_parallel_split_rank():
    """Return the stored pipeline model parallel split rank (None when unset)."""
    split_rank = _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
    return split_rank
def is_pipeline_first_stage(ignore_virtual=False):
    """Return True if in the first pipeline model-parallel stage, False otherwise.

    Unless ``ignore_virtual`` is set, a configured virtual pipeline must also
    be at virtual rank 0.
    """
    on_later_virtual_stage = (
        not ignore_virtual
        and get_virtual_pipeline_model_parallel_world_size() is not None
        and get_virtual_pipeline_model_parallel_rank() != 0
    )
    if on_later_virtual_stage:
        return False
    return get_pipeline_model_parallel_rank() == 0
def is_pipeline_last_stage(ignore_virtual=False):
    """Return True if in the last pipeline model-parallel stage, False otherwise.

    Unless ``ignore_virtual`` is set, a configured virtual pipeline must also
    be at its last virtual rank.
    """
    if not ignore_virtual:
        virtual_size = get_virtual_pipeline_model_parallel_world_size()
        if virtual_size is not None:
            if get_virtual_pipeline_model_parallel_rank() != (virtual_size - 1):
                return False
    stage = get_pipeline_model_parallel_rank()
    return stage == (get_pipeline_model_parallel_world_size() - 1)
def is_rank_in_embedding_group(ignore_virtual=False):
    """Return true if current rank is in embedding group, False otherwise.

    With ``ignore_virtual`` the answer is plain membership in the embedding
    rank list. Otherwise the first and last entries of that list additionally
    have to be on the first / last pipeline stage, taking the virtual
    pipeline rank into account.
    """
    this_rank = torch.distributed.get_rank()
    members = _EMBEDDING_GLOBAL_RANKS
    if this_rank not in members:
        return False
    if ignore_virtual:
        return True
    if this_rank == members[0]:
        return is_pipeline_first_stage(ignore_virtual=False)
    if this_rank == members[-1]:
        return is_pipeline_last_stage(ignore_virtual=False)
    # Listed but neither first nor last entry.
    return True
def is_rank_in_position_embedding_group():
    """Return true if current rank is in position embedding group, False otherwise."""
    this_rank = torch.distributed.get_rank()
    members = _POSITION_EMBEDDING_GLOBAL_RANKS
    return this_rank in members
def is_pipeline_stage_before_split(rank=None):
    """Return True if pipeline stage executes encoder block for a model
    with both encoder and decoder.

    Always True when the pipeline has a single stage or no split rank is
    set. ``rank`` defaults to the caller's pipeline model parallel rank.
    """
    if get_pipeline_model_parallel_world_size() == 1:
        return True
    if rank is None:
        rank = get_pipeline_model_parallel_rank()
    split_rank = _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
    return split_rank is None or bool(rank < split_rank)
def is_pipeline_stage_after_split(rank=None):
    """Return True if pipeline stage executes decoder block for a model
    with both encoder and decoder.

    Always True when the pipeline has a single stage or no split rank is
    set. ``rank`` defaults to the caller's pipeline model parallel rank.
    """
    if get_pipeline_model_parallel_world_size() == 1:
        return True
    if rank is None:
        rank = get_pipeline_model_parallel_rank()
    split_rank = _PIPELINE_MODEL_PARALLEL_SPLIT_RANK
    return split_rank is None or bool(rank >= split_rank)
def is_pipeline_stage_at_split():
    """Return true if this pipeline stage is before the split and the next
    stage is after the split, i.e. this stage executes the encoder block and
    the next one the decoder block for a model with both encoder and
    decoder."""
    stage = get_pipeline_model_parallel_rank()
    if not is_pipeline_stage_before_split(stage):
        return False
    return is_pipeline_stage_after_split(stage + 1)
def get_virtual_pipeline_model_parallel_rank():
    """Return the stored virtual pipeline-parallel rank (None when unset)."""
    virtual_rank = _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
    return virtual_rank
def set_virtual_pipeline_model_parallel_rank(rank):
    """Store the virtual pipeline-parallel rank."""
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK

    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank
def get_virtual_pipeline_model_parallel_world_size():
    """Return the stored virtual pipeline-parallel world size (None when unset)."""
    virtual_size = _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    return virtual_size
def set_virtual_pipeline_model_parallel_world_size(world_size):
    """Store the virtual pipeline-parallel world size."""
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE

    _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size
def get_tensor_model_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the tensor model parallel group.

    The global rank is rounded down to a multiple of the tensor model
    parallel world size.
    """
    global_rank = torch.distributed.get_rank()
    group_size = get_tensor_model_parallel_world_size()
    return global_rank - (global_rank % group_size)
def get_data_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the data parallel group.

    Asserts that the module-level list of data-parallel global ranks has
    been populated, then returns its first entry.
    """
    dp_ranks = _DATA_PARALLEL_GLOBAL_RANKS
    assert dp_ranks is not None, "Data parallel group is not initialized"
    return dp_ranks[0]
def get_pipeline_model_parallel_first_rank():
    """Return the global rank of the first process in the pipeline for the
    current tensor parallel group.

    Asserts that the module-level list of pipeline global ranks has been
    populated, then returns its first entry.
    """
    pipeline_ranks = _PIPELINE_GLOBAL_RANKS
    assert pipeline_ranks is not None, "Pipeline parallel group is not initialized"
    return pipeline_ranks[0]
def get_pipeline_model_parallel_last_rank():
    """Return the global rank of the last process in the pipeline for the
    current tensor parallel group.

    The entry is looked up at index (pipeline world size - 1) in the
    module-level list of pipeline global ranks.
    """
    pipeline_ranks = _PIPELINE_GLOBAL_RANKS
    assert pipeline_ranks is not None, "Pipeline parallel group is not initialized"
    final_index = get_pipeline_model_parallel_world_size() - 1
    return pipeline_ranks[final_index]
def get_pipeline_model_parallel_next_rank():
    """Return the global rank that follows the caller in the pipeline.

    The lookup wraps around: the stage after the last one is the first.
    """
    pipeline_ranks = _PIPELINE_GLOBAL_RANKS
    assert pipeline_ranks is not None, "Pipeline parallel group is not initialized"
    # Rank is queried before world size, matching the evaluation order of the modulo.
    next_index = (get_pipeline_model_parallel_rank() + 1) % get_pipeline_model_parallel_world_size()
    return pipeline_ranks[next_index]
def get_pipeline_model_parallel_prev_rank():
    """Return the global rank that precedes the caller in the pipeline.

    The lookup wraps around: the stage before the first one is the last.
    """
    pipeline_ranks = _PIPELINE_GLOBAL_RANKS
    assert pipeline_ranks is not None, "Pipeline parallel group is not initialized"
    # Python's modulo is non-negative for a positive divisor, so rank 0 maps to the last index.
    prev_index = (get_pipeline_model_parallel_rank() - 1) % get_pipeline_model_parallel_world_size()
    return pipeline_ranks[prev_index]
def get_data_parallel_world_size():
    """Return world size for the data parallel group."""
    dp_group = get_data_parallel_group()
    return torch.distributed.get_world_size(group=dp_group)
def get_data_parallel_rank():
    """Return my rank for the data parallel group."""
    dp_group = get_data_parallel_group()
    return torch.distributed.get_rank(group=dp_group)
def _set_global_memory_buffer():
    """Create the module-level GlobalMemoryBuffer.

    Asserts that no buffer has been created yet, so calling this twice
    without an intervening reset fails.
    """
    global _GLOBAL_MEMORY_BUFFER
    already_initialized = _GLOBAL_MEMORY_BUFFER is not None
    assert not already_initialized, 'global memory buffer is already initialized'
    _GLOBAL_MEMORY_BUFFER = GlobalMemoryBuffer()
def get_global_memory_buffer():
    """Return the module-level GlobalMemoryBuffer object.

    Asserts that the buffer has been initialized.
    """
    buffer = _GLOBAL_MEMORY_BUFFER
    assert buffer is not None, 'global memory buffer is not initialized'
    return buffer
def destroy_global_memory_buffer():
    """Reset the module-level global memory buffer reference to None."""
    global _GLOBAL_MEMORY_BUFFER
    _GLOBAL_MEMORY_BUFFER = None
def destroy_model_parallel():
    """Reset the module-level parallel-state globals to None.

    Covers the process-group handles, the virtual pipeline rank and world
    size, the _MPU_* rank/world-size values, and the global memory buffer.
    Only the references are cleared; nothing else is called on the old
    values.
    """
    global _MODEL_PARALLEL_GROUP, _TENSOR_MODEL_PARALLEL_GROUP
    global _PIPELINE_MODEL_PARALLEL_GROUP, _DATA_PARALLEL_GROUP
    global _EMBEDDING_GROUP, _POSITION_EMBEDDING_GROUP, _AMAX_REDUCTION_GROUP
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    global _MPU_TENSOR_MODEL_PARALLEL_RANK, _MPU_PIPELINE_MODEL_PARALLEL_RANK
    global _GLOBAL_MEMORY_BUFFER

    # Process-group handles.
    _MODEL_PARALLEL_GROUP = _TENSOR_MODEL_PARALLEL_GROUP = None
    _PIPELINE_MODEL_PARALLEL_GROUP = _DATA_PARALLEL_GROUP = None
    _EMBEDDING_GROUP = _POSITION_EMBEDDING_GROUP = _AMAX_REDUCTION_GROUP = None

    # Virtual pipeline bookkeeping.
    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
    _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None

    # _MPU_* world sizes and ranks.
    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
    _MPU_TENSOR_MODEL_PARALLEL_RANK = None
    _MPU_PIPELINE_MODEL_PARALLEL_RANK = None

    # Shared scratch buffer.
    _GLOBAL_MEMORY_BUFFER = None
megatron/core/pipeline_parallel/__init__.py
0 → 100644
View file @
d3dd8642
from
.schedules
import
get_forward_backward_func
megatron/core/pipeline_parallel/p2p_communication.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from
functools
import
reduce
import
operator
from
typing
import
Optional
,
List
,
Union
,
Callable
,
Tuple
import
torch
from
megatron
import
core
from
megatron.core.parallel_state
import
(
get_pipeline_model_parallel_group
,
get_pipeline_model_parallel_rank
,
get_pipeline_model_parallel_prev_rank
,
get_pipeline_model_parallel_next_rank
,
)
from
megatron.core
import
ModelParallelConfig
# Types
Shape
=
Union
[
List
[
int
],
torch
.
Size
]
def _communicate_shapes(tensor_send_next, tensor_send_prev, recv_prev, recv_next, config):
    """Exchange tensor shapes with the neighbouring pipeline stages.

    Run before the actual tensor communication when sequence lengths are
    not uniform across micro batches, so each side knows what size buffer
    to allocate.

    Takes the following arguments:
        tensor_send_next: tensor to send to next rank (no tensor sent if
                          set to None).
        tensor_send_prev: tensor to send to prev rank (no tensor sent if
                          set to None).
        recv_prev: boolean for whether tensor should be received from
                   previous rank.
        recv_next: boolean for whether tensor should be received from
                   next rank.
        config: object providing the use_ring_exchange_p2p flag.
    Returns:
        (recv_prev_shape, recv_next_shape), each a list of ints. A side
        that receives nothing reports [0, 0, 0].
    """

    def _recv_buffer():
        # Receive buffers hold exactly three int64 entries.
        return torch.empty((3), device=torch.cuda.current_device(), dtype=torch.int64)

    def _shape_as_tensor(tensor):
        return torch.tensor(tensor.size(), device=torch.cuda.current_device(), dtype=torch.int64)

    recv_prev_shape_tensor = _recv_buffer() if recv_prev else None
    recv_next_shape_tensor = _recv_buffer() if recv_next else None
    send_prev_shape_tensor = None if tensor_send_prev is None else _shape_as_tensor(tensor_send_prev)
    send_next_shape_tensor = None if tensor_send_next is None else _shape_as_tensor(tensor_send_next)

    if config.use_ring_exchange_p2p:
        torch.distributed.ring_exchange(
            tensor_send_prev=send_prev_shape_tensor,
            tensor_recv_prev=recv_prev_shape_tensor,
            tensor_send_next=send_next_shape_tensor,
            tensor_recv_next=recv_next_shape_tensor,
            group=get_pipeline_model_parallel_group(),
        )
    else:
        # The order (send-prev, recv-prev, send-next, recv-next) is kept as-is;
        # the peer rank is only queried for operations that actually happen.
        planned = (
            (torch.distributed.isend, send_prev_shape_tensor, get_pipeline_model_parallel_prev_rank),
            (torch.distributed.irecv, recv_prev_shape_tensor, get_pipeline_model_parallel_prev_rank),
            (torch.distributed.isend, send_next_shape_tensor, get_pipeline_model_parallel_next_rank),
            (torch.distributed.irecv, recv_next_shape_tensor, get_pipeline_model_parallel_next_rank),
        )
        ops = [
            torch.distributed.P2POp(p2p_fn, shape_tensor, peer_rank_fn())
            for p2p_fn, shape_tensor, peer_rank_fn in planned
            if shape_tensor is not None
        ]
        if ops:
            for req in torch.distributed.batch_isend_irecv(ops):
                req.wait()

        # To protect against race condition when using batch_isend_irecv().
        # should take this out once the bug with batch_isend_irecv is resolved.
        torch.cuda.synchronize()

    recv_prev_shape = [0, 0, 0] if recv_prev_shape_tensor is None else recv_prev_shape_tensor.tolist()
    recv_next_shape = [0, 0, 0] if recv_next_shape_tensor is None else recv_next_shape_tensor.tolist()
    return recv_prev_shape, recv_next_shape
def _batched_p2p_ops(*,
                     tensor_send_prev: Optional[torch.Tensor],
                     tensor_recv_prev: Optional[torch.Tensor],
                     tensor_send_next: Optional[torch.Tensor],
                     tensor_recv_next: Optional[torch.Tensor],
                     group: torch.distributed.ProcessGroup):
    """Issue all requested sends/receives as one batch_isend_irecv call.

    Arguments:
        tensor_send_prev: tensor to send to the previous rank, or None.
        tensor_recv_prev: buffer to receive into from the previous rank, or None.
        tensor_send_next: tensor to send to the next rank, or None.
        tensor_recv_next: buffer to receive into from the next rank, or None.
        group: process group passed to every P2POp.

    Returns:
        The request list returned by torch.distributed.batch_isend_irecv,
        or an empty list when every tensor argument is None.
    """
    # Order matches the sequence in which the operations are batched:
    # send-prev, recv-prev, send-next, recv-next. The peer rank is only
    # queried for operations that are actually requested.
    planned = (
        (torch.distributed.isend, tensor_send_prev, get_pipeline_model_parallel_prev_rank),
        (torch.distributed.irecv, tensor_recv_prev, get_pipeline_model_parallel_prev_rank),
        (torch.distributed.isend, tensor_send_next, get_pipeline_model_parallel_next_rank),
        (torch.distributed.irecv, tensor_recv_next, get_pipeline_model_parallel_next_rank),
    )
    ops = [
        torch.distributed.P2POp(p2p_fn, tensor, peer_rank_fn(), group)
        for p2p_fn, tensor, peer_rank_fn in planned
        if tensor is not None
    ]
    if not ops:
        return []
    return torch.distributed.batch_isend_irecv(ops)
def _p2p_ops(*,
             tensor_send_prev: Optional[torch.Tensor],
             tensor_recv_prev: Optional[torch.Tensor],
             tensor_send_next: Optional[torch.Tensor],
             tensor_recv_next: Optional[torch.Tensor],
             group: torch.distributed.ProcessGroup):
    """Issue the requested sends/receives as individual isend/irecv calls.

    Even and odd pipeline ranks issue their operations in different
    orders (even: send-next, recv-prev, send-prev, recv-next; odd:
    recv-prev, send-next, recv-next, send-prev), so that a send on one
    stage is paired with the matching receive on its neighbour.

    Arguments:
        tensor_send_prev: tensor to send to the previous rank, or None.
        tensor_recv_prev: buffer to receive into from the previous rank, or None.
        tensor_send_next: tensor to send to the next rank, or None.
        tensor_recv_next: buffer to receive into from the next rank, or None.
        group: process group passed to every isend/irecv.

    Returns:
        List of the request objects returned by isend/irecv, in issue
        order; empty when every tensor argument is None.
    """
    # The rank is queried once and reused for the parity test.
    rank = get_pipeline_model_parallel_rank()

    # (p2p function, tensor, peer-rank getter, name of the peer keyword).
    send_next = (torch.distributed.isend, tensor_send_next,
                 get_pipeline_model_parallel_next_rank, "dst")
    recv_prev = (torch.distributed.irecv, tensor_recv_prev,
                 get_pipeline_model_parallel_prev_rank, "src")
    send_prev = (torch.distributed.isend, tensor_send_prev,
                 get_pipeline_model_parallel_prev_rank, "dst")
    recv_next = (torch.distributed.irecv, tensor_recv_next,
                 get_pipeline_model_parallel_next_rank, "src")

    if rank % 2 == 0:
        plan = (send_next, recv_prev, send_prev, recv_next)
    else:
        plan = (recv_prev, send_next, recv_next, send_prev)

    reqs = []
    for p2p_fn, tensor, peer_rank_fn, peer_kwarg in plan:
        if tensor is None:
            continue
        # The peer rank is only looked up for operations that actually run.
        reqs.append(p2p_fn(tensor=tensor, group=group, **{peer_kwarg: peer_rank_fn()}))
    return reqs
def _communicate(*,
                 tensor_send_next: Optional[torch.Tensor],
                 tensor_send_prev: Optional[torch.Tensor],
                 recv_prev: bool,
                 recv_next: bool,
                 tensor_shape: Shape,
                 config: ModelParallelConfig,
                 wait_on_reqs: bool = True
                 ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[list]]:
    """Communicate tensors between stages. Used as helper method in other
    communication methods that are used in megatron/schedules.py.

    Arguments:
        tensor_send_next (torch.Tensor, optional):
            Tensor to send to next rank (no tensor sent if None)

        tensor_send_prev (torch.Tensor, optional):
            Tensor to send to prev rank (no tensor sent if None)

        recv_prev (boolean, required):
            whether tensor should be received from previous rank.

        recv_next (boolean, required):
            whether tensor should be received from next rank.

        tensor_shape (List[int] or torch.Size, required):
            shape of tensor to receive (this method assumes that all
            tensors sent and received in a single function call are
            the same shape). When config.variable_seq_lengths is set,
            the receive shapes are obtained from _communicate_shapes
            instead, but tensor_shape must still be non-None if
            anything is received.

        config (ModelParallelConfig, required):
            read for variable_seq_lengths, pipeline_dtype,
            use_ring_exchange_p2p, batch_p2p_comm and batch_p2p_sync.

        wait_on_reqs (boolean, optional, default=True):
            For non-batched p2p communication, wait on each request
            before returning. Must be True when config.batch_p2p_comm
            is used.

    Returns:
        tuple containing

        - tensor_recv_prev: torch.Tensor if recv_prev is True, None otherwise.
        - tensor_recv_next: torch.Tensor if recv_next is True, None otherwise.
        - reqs: the outstanding request list when requests were not waited
          on here; None once a non-empty request list has been waited on.

    Raises:
        RuntimeError: if a receive is requested while config.pipeline_dtype
            or tensor_shape is None.
    """

    # Create placeholder tensors for receive in forward and backward directions
    # if needed.
    tensor_recv_prev = None
    tensor_recv_next = None

    if not config.variable_seq_lengths:
        recv_prev_shape = tensor_shape
        recv_next_shape = tensor_shape
    else:
        recv_prev_shape, recv_next_shape = _communicate_shapes(
            tensor_send_next, tensor_send_prev, recv_prev, recv_next, config
        )

    if recv_prev:
        if config.pipeline_dtype is None:
            raise RuntimeError("pipeline_dtype must be provided if recv_prev is True")
        if tensor_shape is None:
            raise RuntimeError(
                "tensor_shape must be specified if recv_prev is True. "
                "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)"
            )
        tensor_recv_prev = torch.empty(recv_prev_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
                                       dtype=config.pipeline_dtype)
    if recv_next:
        if config.pipeline_dtype is None:
            # Message names the actual config attribute, matching the recv_prev case.
            raise RuntimeError("pipeline_dtype must be provided if recv_next is True")
        if tensor_shape is None:
            raise RuntimeError(
                "tensor_shape must be specified if recv_next is True. "
                "Common tensor_shape is (seq_length, micro_batch_size, hidden_size)"
            )
        tensor_recv_next = torch.empty(recv_next_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
                                       dtype=config.pipeline_dtype)

    # Send tensors in both the forward and backward directions as appropriate.
    if config.use_ring_exchange_p2p:
        def _ring_exchange_wrapper(**kwargs):
            # ring_exchange completes synchronously here, so there are no
            # requests to hand back.
            torch.distributed.ring_exchange(**kwargs)
            return []
        p2p_func = _ring_exchange_wrapper
    elif config.batch_p2p_comm:
        assert wait_on_reqs
        p2p_func = _batched_p2p_ops
    else:
        p2p_func = _p2p_ops

    reqs = p2p_func(tensor_send_prev=tensor_send_prev,
                    tensor_recv_prev=tensor_recv_prev,
                    tensor_send_next=tensor_send_next,
                    tensor_recv_next=tensor_recv_next,
                    group=get_pipeline_model_parallel_group())

    if wait_on_reqs and len(reqs) > 0:
        for req in reqs:
            req.wait()
        reqs = None

    if config.batch_p2p_comm and config.batch_p2p_sync:
        # To protect against race condition when using batch_isend_irecv().
        # User should assert that we have a modern enough PyTorch to not need this
        torch.cuda.synchronize()

    return tensor_recv_prev, tensor_recv_next, reqs
def recv_forward(tensor_shape: Shape,
                 config: ModelParallelConfig) -> torch.Tensor:
    """Receive tensor from previous rank in pipeline (forward receive).

    Returns None on the first pipeline stage, which has nothing to
    receive. When config.timers is set, the receive is timed under
    'forward-recv'.

    See _communicate for argument details.
    """
    if core.parallel_state.is_pipeline_first_stage():
        return None

    if config.timers is not None:
        config.timers('forward-recv', log_level=2).start()
    received = _communicate(
        tensor_send_next=None,
        tensor_send_prev=None,
        recv_prev=True,
        recv_next=False,
        tensor_shape=tensor_shape,
        config=config,
    )
    input_tensor, _, _ = received
    if config.timers is not None:
        config.timers('forward-recv').stop()
    return input_tensor
def recv_backward(tensor_shape: Shape,
                  config: ModelParallelConfig) -> torch.Tensor:
    """Receive tensor from next rank in pipeline (backward receive).

    Returns None on the last pipeline stage, which has nothing to
    receive. When config.timers is set, the receive is timed under
    'backward-recv'.

    See _communicate for argument details.
    """
    if core.parallel_state.is_pipeline_last_stage():
        return None

    if config.timers is not None:
        config.timers('backward-recv', log_level=2).start()
    received = _communicate(
        tensor_send_next=None,
        tensor_send_prev=None,
        recv_prev=False,
        recv_next=True,
        tensor_shape=tensor_shape,
        config=config,
    )
    _, output_tensor_grad, _ = received
    if config.timers is not None:
        config.timers('backward-recv').stop()
    return output_tensor_grad
def send_forward(output_tensor: torch.Tensor,
                 config: ModelParallelConfig) -> None:
    """Send tensor to next rank in pipeline (forward send).

    Does nothing on the last pipeline stage. When config.timers is set,
    the send is timed under 'forward-send'.

    See _communicate for argument details.
    """
    if core.parallel_state.is_pipeline_last_stage():
        return

    if config.timers is not None:
        config.timers('forward-send', log_level=2).start()
    _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=None,
        recv_prev=False,
        recv_next=False,
        tensor_shape=None,
        config=config,
    )
    if config.timers is not None:
        config.timers('forward-send').stop()
def send_backward(input_tensor_grad: torch.Tensor,
                  config: ModelParallelConfig) -> None:
    """Send tensor to previous rank in pipeline (backward send).

    Does nothing on the first pipeline stage. When config.timers is set,
    the send is timed under 'backward-send'.

    See _communicate for argument details.
    """
    if core.parallel_state.is_pipeline_first_stage():
        return

    if config.timers is not None:
        config.timers('backward-send', log_level=2).start()
    _communicate(
        tensor_send_next=None,
        tensor_send_prev=input_tensor_grad,
        recv_prev=False,
        recv_next=False,
        tensor_shape=None,
        config=config,
    )
    if config.timers is not None:
        config.timers('backward-send').stop()
def send_forward_recv_backward(output_tensor: torch.Tensor,
                               tensor_shape: Shape,
                               config: ModelParallelConfig) -> torch.Tensor:
    """Batched send and recv with next rank in pipeline.

    Sends output_tensor to the next rank and receives a tensor back from
    it. Returns None on the last pipeline stage without communicating.
    When config.timers is set, the exchange is timed under
    'forward-send-backward-recv'.

    See _communicate for argument details.
    """
    if core.parallel_state.is_pipeline_last_stage():
        return None

    if config.timers is not None:
        config.timers('forward-send-backward-recv', log_level=2).start()
    received = _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=None,
        recv_prev=False,
        recv_next=True,
        tensor_shape=tensor_shape,
        config=config,
    )
    _, output_tensor_grad, _ = received
    if config.timers is not None:
        config.timers('forward-send-backward-recv').stop()
    return output_tensor_grad
def send_backward_recv_forward(input_tensor_grad: torch.Tensor,
                               tensor_shape: Shape,
                               config: ModelParallelConfig) -> torch.Tensor:
    """Batched send and recv with previous rank in pipeline.

    Sends input_tensor_grad to the previous rank and receives a tensor
    back from it. Returns None on the first pipeline stage without
    communicating. When config.timers is set, the exchange is timed
    under 'backward-send-forward-recv'.

    See _communicate for argument details.
    """
    if core.parallel_state.is_pipeline_first_stage():
        return None

    if config.timers is not None:
        config.timers('backward-send-forward-recv', log_level=2).start()
    received = _communicate(
        tensor_send_next=None,
        tensor_send_prev=input_tensor_grad,
        recv_prev=True,
        recv_next=False,
        tensor_shape=tensor_shape,
        config=config,
    )
    input_tensor, _, _ = received
    if config.timers is not None:
        config.timers('backward-send-forward-recv').stop()
    return input_tensor
def send_forward_recv_forward(output_tensor: torch.Tensor,
                              recv_prev: bool,
                              tensor_shape: Shape,
                              config: ModelParallelConfig,
                              overlap_p2p_comm: bool = False) -> torch.Tensor:
    """Batched recv from previous rank and send to next rank in pipeline.

    When overlap_p2p_comm is True the requests are not waited on here and
    the return value is the pair (input_tensor, wait_handles); otherwise
    only input_tensor is returned. When config.timers is set, the call is
    timed under 'forward-send-forward-recv'.

    See _communicate for argument details.
    """
    if config.timers is not None:
        config.timers('forward-send-forward-recv', log_level=2).start()
    received = _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=None,
        recv_prev=recv_prev,
        recv_next=False,
        tensor_shape=tensor_shape,
        wait_on_reqs=(not overlap_p2p_comm),
        config=config,
    )
    input_tensor, _, wait_handles = received
    if config.timers is not None:
        config.timers('forward-send-forward-recv').stop()
    return (input_tensor, wait_handles) if overlap_p2p_comm else input_tensor
def send_backward_recv_backward(input_tensor_grad: torch.Tensor,
                                recv_next: bool,
                                tensor_shape: Shape,
                                config: ModelParallelConfig,
                                overlap_p2p_comm: bool = False) -> torch.Tensor:
    """Batched recv from next rank and send to previous rank in pipeline.

    When overlap_p2p_comm is True the requests are not waited on here and
    the return value is the pair (output_tensor_grad, wait_handles);
    otherwise only output_tensor_grad is returned. When config.timers is
    set, the call is timed under 'backward-send-backward-recv'.

    See _communicate for argument details.
    """
    if config.timers is not None:
        config.timers('backward-send-backward-recv', log_level=2).start()
    received = _communicate(
        tensor_send_next=None,
        tensor_send_prev=input_tensor_grad,
        recv_prev=False,
        recv_next=recv_next,
        tensor_shape=tensor_shape,
        wait_on_reqs=(not overlap_p2p_comm),
        config=config,
    )
    _, output_tensor_grad, wait_handles = received
    if config.timers is not None:
        config.timers('backward-send-backward-recv').stop()
    return (output_tensor_grad, wait_handles) if overlap_p2p_comm else output_tensor_grad
def send_forward_backward_recv_forward_backward(
        output_tensor: torch.Tensor,
        input_tensor_grad: torch.Tensor,
        recv_prev: bool,
        recv_next: bool,
        tensor_shape: Shape,
        config: ModelParallelConfig) -> torch.Tensor:
    """Batched send and recv with previous and next ranks in pipeline.

    Sends output_tensor to the next rank and input_tensor_grad to the
    previous rank, and returns the pair (input_tensor, output_tensor_grad)
    received from the previous and next rank respectively. When
    config.timers is set, the call is timed under
    'forward-backward-send-forward-backward-recv'.

    See _communicate for argument details.
    """
    if config.timers is not None:
        config.timers('forward-backward-send-forward-backward-recv',
                      log_level=2).start()
    received = _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=input_tensor_grad,
        recv_prev=recv_prev,
        recv_next=recv_next,
        tensor_shape=tensor_shape,
        config=config,
    )
    input_tensor, output_tensor_grad, _ = received
    if config.timers is not None:
        config.timers('forward-backward-send-forward-backward-recv').stop()
    return input_tensor, output_tensor_grad
megatron/core/pipeline_parallel/schedules.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
contextlib
from
typing
import
Callable
,
Iterator
,
List
,
Optional
,
Union
import
torch
from
torch.autograd.variable
import
Variable
from
torch.nn.parallel.distributed
import
DistributedDataParallel
as
torchDDP
from
megatron
import
core
from
megatron.core
import
parallel_state
from
megatron.core.pipeline_parallel
import
p2p_communication
from
megatron.core.enums
import
ModelType
from
megatron.core.utils
import
get_attr_wrapped_model
,
get_model_type
,
get_model_config
# Types
Shape
=
Union
[
List
[
int
],
torch
.
Size
]
def get_forward_backward_func():
    """Retrieves the appropriate forward_backward function given the
    configuration of parallel_state.

    Returns a function that will perform all of the forward and
    backward passes of the model given the pipeline model parallel
    world size and virtual pipeline model parallel world size in the
    global parallel_state:

    - pipeline world size not greater than 1:
      forward_backward_no_pipelining
    - pipeline world size > 1, no virtual pipeline world size set:
      forward_backward_pipelining_without_interleaving
    - pipeline world size > 1, virtual pipeline world size set:
      forward_backward_pipelining_with_interleaving

    Note that if using sequence parallelism, the sequence length component of
    the tensor shape is updated to original_sequence_length /
    tensor_model_parallel_world_size.

    The function returned takes the following arguments:

    forward_step_func (required): A function that takes a data
        iterator and a model as its arguments and return the model's
        forward output and the loss function. The loss function should
        take one torch.Tensor and return a torch.Tensor of loss and a
        dictionary of string -> torch.Tensor.

        A third argument, checkpoint_activations_microbatch, indicates
        that the activations for this microbatch should be
        checkpointed. A None value for this argument indicates that
        the default from the configuration should be used. This is
        used when the
        num_microbatches_with_partial_activation_checkpoints is used.

        For example:

        def loss_func(loss_mask, output_tensor):
            losses = output_tensor.float()
            loss_mask = loss_mask.view(-1).float()
            loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

            # Reduce loss for logging.
            averaged_loss = average_losses_across_data_parallel_group([loss])

            return loss, {'lm loss': averaged_loss[0]}

        def forward_step(data_iterator, model):
            data, loss_mask = next(data_iterator)
            output = model(data)
            return output, partial(loss_func, loss_mask)


        forward_backward_func(forward_step_func=forward_step, ...)


    data_iterator (required): an iterator over the data, will be
        passed as is to forward_step_func. Expected to be a list of
        iterators in the case of interleaved pipeline parallelism.

    model (required): the actual model. Expected to be a list of modules in the case of interleaved
        pipeline parallelism. Must be a (potentially wrapped) megatron.core.models.MegatronModule.

    num_microbatches (int, required):
        The number of microbatches to go through

    seq_length (int, required): Sequence length of the current global batch. If this is a dual-stack
        transformer, this is the encoder's sequence length. This is ignored if variable_seq_lengths
        in the config is True. Otherwise, each microbatch in the current global batch size must use
        this sequence length.

    micro_batch_size (int, required): The number of sequences in a microbatch.

    decoder_seq_length (int, optional): The sequence length for the decoder in a dual-stack
        transformer. This is ignored for a single-stack transformer.

    forward_only (optional, default = False): Perform only the forward step

    collect_non_loss_data (optional, bool, default=False): TODO

    """
    if not parallel_state.get_pipeline_model_parallel_world_size() > 1:
        return forward_backward_no_pipelining
    if parallel_state.get_virtual_pipeline_model_parallel_world_size() is None:
        return forward_backward_pipelining_without_interleaving
    return forward_backward_pipelining_with_interleaving
def deallocate_output_tensor(out, deallocate_pipeline_outputs=False):
    '''Pseudo-deallocate the output tensor by shrinking its '.data' field.

    Intended to be called right after the output tensor has been sent to
    the next pipeline stage, when it is only needed for its '.grad_fn'
    and no longer for its '.data'. The '.data' field is replaced in place
    by an uninitialized one-element tensor of the same device and dtype.

    Nothing happens when 'out' is None or 'deallocate_pipeline_outputs'
    is falsy. Otherwise 'out' must be a Tensor that is not a view of
    another tensor (both enforced with assertions).
    '''
    if not (out is not None and deallocate_pipeline_outputs):
        return
    assert isinstance(out, torch.Tensor), \
        "expected Tensor, found %s." % type(out).__name__
    assert out._base is None, \
        "counter-productive to free a view of another tensor."
    placeholder = torch.empty((1,), device=out.device, dtype=out.dtype)
    out.data = placeholder
def custom_backward(output, grad_output):
    '''Directly call C++ autograd engine.

    To make the 'deallocate_output_tensor' optimization work, the C++
    autograd engine must be called directly, bypassing Pytorch's
    torch.autograd.backward. Pytorch's 'backward' checks that the output and
    grad have the same shape, while C++'s 'backward' does not.

    Arguments:
        output: Tensor whose graph is back-propagated; it must hold
            exactly one element (i.e. already pseudo-freed, or a scalar).
        grad_output: gradient Tensor for 'output', or None to use an
            implicit gradient of ones shaped like 'output'.

    The argument checks are assertions. Types are checked before
    'output.numel()' is touched, so a non-Tensor argument produces the
    descriptive assertion message instead of an AttributeError.
    '''
    assert isinstance(output, torch.Tensor), \
        "output == '%s'." % type(output).__name__
    assert isinstance(grad_output, (torch.Tensor, type(None))), \
        "grad_output == '%s'." % type(grad_output).__name__
    assert output.numel() == 1, \
        "output should be pseudo-'freed' in schedule, to optimize memory"

    # Handle scalar output: the numel() == 1 check above already
    # guarantees an implicit gradient of ones is well defined.
    if grad_output is None:
        grad_output = torch.ones_like(
            output,
            memory_format=torch.preserve_format,
        )

    # Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ]
    Variable._execution_engine.run_backward(
        tensors=(output,),
        grad_tensors=(grad_output,),
        keep_graph=False,
        create_graph=False,
        inputs=tuple(),
        allow_unreachable=True,
        accumulate_grad=True,
    )
def forward_step(forward_step_func,
                 data_iterator,
                 model,
                 num_microbatches,
                 input_tensor,
                 forward_data_store,
                 config,
                 collect_non_loss_data=False,
                 checkpoint_activations_microbatch=None):
    """Forward step for passed-in model.

    The input tensor(s) are handed to the model through its
    set_input_tensor hook, then forward_step_func is called with the data
    iterator and the model (plus checkpoint_activations_microbatch when
    it is not None), optionally under torch.autocast.

    On the last pipeline stage the returned loss function is applied:
    normally the loss is divided by num_microbatches and becomes the
    output tensor while the reduced loss is appended to
    forward_data_store; with collect_non_loss_data the loss function is
    called with non_loss_data=True and its result is appended instead.

    Returns the output tensor, unwrapped if input_tensor was not passed
    as a list, otherwise as a one-element list. For an
    encoder-and-decoder model on a stage after the split, returns
    [output_tensor, input_tensor[-1]]."""
    if config.timers is not None:
        config.timers('forward-compute', log_level=2).start()

    # Remember whether the caller passed a bare tensor so the result can
    # be returned in the same form.
    unwrap_output_tensor = not isinstance(input_tensor, list)
    if unwrap_output_tensor:
        input_tensor = [input_tensor]

    get_attr_wrapped_model(model, "set_input_tensor")(input_tensor)

    if config.enable_autocast:
        context_manager = torch.autocast("cuda", dtype=config.autocast_dtype)
    else:
        context_manager = contextlib.nullcontext()

    step_args = (data_iterator, model)
    if checkpoint_activations_microbatch is not None:
        step_args = step_args + (checkpoint_activations_microbatch,)
    with context_manager:
        output_tensor, loss_func = forward_step_func(*step_args)

    if parallel_state.is_pipeline_last_stage():
        if collect_non_loss_data:
            forward_data_store.append(loss_func(output_tensor, non_loss_data=True))
        else:
            loss, loss_reduced = loss_func(output_tensor)
            output_tensor = loss / num_microbatches
            forward_data_store.append(loss_reduced)

    if config.timers is not None:
        config.timers('forward-compute').stop()

    # If T5 model (or other model with encoder and decoder)
    # and in decoder stack, then send encoder_hidden_state
    # downstream as well.
    model_type = get_model_type(model)
    in_decoder_stack = (parallel_state.is_pipeline_stage_after_split()
                        and model_type == ModelType.encoder_and_decoder)
    if in_decoder_stack:
        return [output_tensor, input_tensor[-1]]
    return output_tensor if unwrap_output_tensor else [output_tensor]
def backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config):
    """Backward step through passed-in output tensor.

    If last stage, output_tensor_grad is None, otherwise gradient of loss
    with respect to stage's output tensor. When output_tensor_grad is None
    and config.grad_scale_func is set, the output tensor is passed through
    that function first. The backward pass itself goes through
    custom_backward when config.deallocate_pipeline_outputs is set, and
    through torch.autograd.backward otherwise.

    Returns gradient of loss with respect to input tensor (None if first
    stage). The result is a list unless input_tensor was passed as a bare
    (non-list) value, in which case a single entry is returned."""

    # NOTE: This code currently can handle at most one skip connection. It
    # needs to be modified slightly to support arbitrary numbers of skip
    # connections.

    if config.timers is not None:
        config.timers('backward-compute', log_level=2).start()

    # Normalize all three arguments to lists, remembering whether the
    # input was a bare value so the result can be unwrapped again.
    unwrap_input_tensor_grad = not isinstance(input_tensor, list)
    if unwrap_input_tensor_grad:
        input_tensor = [input_tensor]
    # Retain the grad on the input_tensor.
    for tensor in input_tensor:
        if tensor is not None:
            tensor.retain_grad()

    if not isinstance(output_tensor, list):
        output_tensor = [output_tensor]
    if not isinstance(output_tensor_grad, list):
        output_tensor_grad = [output_tensor_grad]

    # Backward pass.
    if output_tensor_grad[0] is None and config.grad_scale_func is not None:
        output_tensor[0] = config.grad_scale_func(output_tensor[0])

    if config.deallocate_pipeline_outputs:
        custom_backward(output_tensor[0], output_tensor_grad[0])
    else:
        torch.autograd.backward(output_tensor[0], grad_tensors=output_tensor_grad[0])

    # Collect the grad of the input_tensor (input_tensor is always a list
    # at this point).
    input_tensor_grad = [None if tensor is None else tensor.grad
                         for tensor in input_tensor]

    # Handle single skip connection if it exists (encoder_hidden_state in
    # model with encoder and decoder).
    has_skip_connection = (
        parallel_state.get_pipeline_model_parallel_world_size() > 1
        and parallel_state.is_pipeline_stage_after_split()
        and model_type == ModelType.encoder_and_decoder
    )
    if has_skip_connection and output_tensor_grad[1] is not None:
        input_tensor_grad[-1].add_(output_tensor_grad[1])

    if unwrap_input_tensor_grad:
        input_tensor_grad = input_tensor_grad[0]

    if config.timers is not None:
        config.timers('backward-compute').stop()

    return input_tensor_grad
def forward_backward_no_pipelining(*,
                                   forward_step_func,
                                   data_iterator: Union[Iterator, List[Iterator]],
                                   model: Union[torch.nn.Module, List[torch.nn.Module]],
                                   num_microbatches: int,
                                   seq_length: int,  # unused
                                   micro_batch_size: int,  # unused
                                   decoder_seq_length: int = None,  # unused
                                   forward_only: bool = False,
                                   collect_non_loss_data: bool = False,
                                   ):
    """Run forward and backward passes with no pipeline parallelism
    (no inter-stage communication).

    All microbatches except the last run inside the no-sync context
    (config.no_sync_func, else the model's own no_sync when it is a torch
    DistributedDataParallel, else a null context); the last microbatch
    runs outside it so that gradients get synchronized.

    Returns forward_data_store, the list that forward_step appends to.

    See get_forward_backward_func() for argument details
    """
    if isinstance(model, list):
        assert len(model) == 1, \
            "non-pipeline-parallel schedule does not support model chunking"
        model = model[0]
    if isinstance(data_iterator, list):
        assert len(data_iterator) == 1, \
            "non-pipeline-parallel schedule does not support model chunking"
        data_iterator = data_iterator[0]

    config = get_model_config(model)

    no_sync_func = config.no_sync_func
    if no_sync_func is None and isinstance(model, torchDDP):
        no_sync_func = model.no_sync
    if no_sync_func is None:
        no_sync_func = contextlib.nullcontext

    model_type = get_model_type(model)

    forward_data_store = []
    # Without pipelining there is no upstream input and no downstream grad.
    input_tensor = None
    output_tensor_grad = None

    def _run_microbatch():
        # One forward pass, followed by its backward pass unless forward_only.
        output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches,
                                     input_tensor, forward_data_store, config,
                                     collect_non_loss_data)
        if not forward_only:
            backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)

    with no_sync_func():
        for _ in range(num_microbatches - 1):
            _run_microbatch()

    # Run computation for last microbatch out of context handler (want to
    # synchronize gradients).
    _run_microbatch()

    return forward_data_store
def
forward_backward_pipelining_with_interleaving
(
*
,
forward_step_func
,
data_iterator
:
Union
[
Iterator
,
List
[
Iterator
]],
model
:
Union
[
torch
.
nn
.
Module
,
List
[
torch
.
nn
.
Module
]],
num_microbatches
:
int
,
seq_length
:
int
,
micro_batch_size
:
int
,
decoder_seq_length
:
int
=
None
,
forward_only
:
bool
=
False
,
collect_non_loss_data
:
bool
=
False
,
):
"""Run interleaved 1F1B schedule (model split into model chunks), with
communication between pipeline stages as needed.
Returns dictionary with losses if the last stage, empty dict otherwise."""
assert
isinstance
(
model
,
list
),
\
"interleaved pipeline parallelism expected model chunking"
assert
all
(
isinstance
(
chunk
,
torch
.
nn
.
Module
)
for
chunk
in
model
),
\
"invalid model chunking"
assert
isinstance
(
data_iterator
,
list
),
\
"interleaved pipeline parallelism expected each model chunk to have a data iterator"
config
=
get_model_config
(
model
[
0
])
if
config
.
overlap_p2p_comm
and
config
.
batch_p2p_comm
:
raise
ValueError
(
"Can not use both overlap_p2p_comm and batch_p2p_comm"
)
# Disable async grad reductions
no_sync_func
=
config
.
no_sync_func
if
no_sync_func
is
None
and
all
(
isinstance
(
chunk
,
torchDDP
)
for
chunk
in
model
):
def
multi_no_sync
():
stack
=
contextlib
.
ExitStack
()
for
chunk
in
model
:
stack
.
enter_context
(
chunk
.
no_sync
())
return
stack
no_sync_func
=
multi_no_sync
if
no_sync_func
is
None
:
no_sync_func
=
contextlib
.
nullcontext
no_sync_context
=
None
def
disable_grad_sync
():
"""Disable asynchronous grad reductions"""
nonlocal
no_sync_context
if
no_sync_context
is
None
:
no_sync_context
=
no_sync_func
()
no_sync_context
.
__enter__
()
def
enable_grad_sync
():
"""Enable asynchronous grad reductions"""
nonlocal
no_sync_context
if
no_sync_context
is
not
None
:
no_sync_context
.
__exit__
(
None
,
None
,
None
)
no_sync_context
=
None
disable_grad_sync
()
# Model chunk IDs with synchronized grads
synchronized_model_chunks
=
set
()
input_tensors
=
[[]
for
_
in
range
(
len
(
model
))]
output_tensors
=
[[]
for
_
in
range
(
len
(
model
))]
forward_data_store
=
[]
if
not
forward_only
:
output_tensor_grads
=
[[]
for
_
in
range
(
len
(
model
))]
pipeline_parallel_size
=
parallel_state
.
get_pipeline_model_parallel_world_size
()
pipeline_parallel_rank
=
parallel_state
.
get_pipeline_model_parallel_rank
()
if
num_microbatches
%
pipeline_parallel_size
!=
0
:
msg
=
f
'number of microbatches (
{
num_microbatches
}
) is not divisible by '
msg
+=
f
'pipeline-model-parallel-size (
{
pipeline_parallel_size
}
) '
msg
+=
'when using interleaved schedule'
raise
RuntimeError
(
msg
)
model_type
=
get_model_type
(
model
[
0
])
if
model_type
==
ModelType
.
encoder_and_decoder
:
raise
RuntimeError
(
"Interleaving is not supported with an encoder and decoder model."
)
tensor_shape
=
(
seq_length
,
micro_batch_size
,
config
.
hidden_size
)
if
decoder_seq_length
is
not
None
and
decoder_seq_length
!=
tensor_shape
[
0
]:
raise
RuntimeError
(
"Interleaving is not supported with a different decoder sequence length."
)
if
config
.
sequence_parallel
:
tensor_shape
[
0
]
=
tensor_shape
[
0
]
//
parallel_state
.
get_tensor_model_parallel_world_size
()
# Compute number of warmup and remaining microbatches.
num_model_chunks
=
len
(
model
)
total_num_microbatches
=
num_microbatches
*
num_model_chunks
all_warmup_microbatches
=
False
if
forward_only
:
num_warmup_microbatches
=
total_num_microbatches
else
:
# Run all forward passes and then all backward passes if number of
# microbatches is just the number of pipeline stages.
# Otherwise, perform (num_model_chunks-1)*pipeline_parallel_size on
# all workers, followed by more microbatches after depending on
# stage ID (more forward passes for earlier stages, later stages can
# immediately start with 1F1B).
if
num_microbatches
==
pipeline_parallel_size
:
num_warmup_microbatches
=
total_num_microbatches
all_warmup_microbatches
=
True
else
:
num_warmup_microbatches
=
(
pipeline_parallel_size
-
pipeline_parallel_rank
-
1
)
*
2
num_warmup_microbatches
+=
(
num_model_chunks
-
1
)
*
pipeline_parallel_size
num_warmup_microbatches
=
min
(
num_warmup_microbatches
,
total_num_microbatches
)
num_microbatches_remaining
=
total_num_microbatches
-
num_warmup_microbatches
# Checkpoint the activations of partial Transformer layers in a number of micro-batches
# within the maximum outstanding micro-batch backpropagations.
# Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints'
# checkpoint partial Transformer layers (or skip checkpointing) and
# the rest of micro-batches within a window of micro-batches checkpoint
# all Transformer layers. The window of micro-batches is set by the maximum
# outstanding backpropagations and becomes smaller at later pipeline stages.
# Please refer to Appendix C of https://arxiv.org/pdf/2205.05198.pdf
max_outstanding_backprops
=
None
if
config
.
num_microbatches_with_partial_activation_checkpoints
is
not
None
:
max_outstanding_backprops
=
num_warmup_microbatches
+
1
# Synchronize params for first two model chunks
if
config
.
param_sync_func
is
not
None
:
config
.
param_sync_func
(
model
[
0
].
parameters
())
config
.
param_sync_func
(
model
[
1
].
parameters
())
def
get_model_chunk_id
(
microbatch_id
,
forward
):
"""Helper method to get the model chunk ID given the iteration number."""
microbatch_id_in_group
=
microbatch_id
%
(
pipeline_parallel_size
*
num_model_chunks
)
model_chunk_id
=
microbatch_id_in_group
//
pipeline_parallel_size
if
not
forward
:
model_chunk_id
=
(
num_model_chunks
-
model_chunk_id
-
1
)
return
model_chunk_id
def
is_first_microbatch_for_model_chunk
(
microbatch_id
:
int
)
->
bool
:
"""Check if an iteration is the first for a model chunk."""
microbatch_group_size
=
pipeline_parallel_size
*
num_model_chunks
num_microbatch_groups
=
total_num_microbatches
//
microbatch_group_size
microbatch_group_id
=
microbatch_id
//
microbatch_group_size
microbatch_id_in_group
=
microbatch_id
%
microbatch_group_size
if
microbatch_group_id
==
0
:
return
microbatch_id_in_group
%
pipeline_parallel_size
==
0
else
:
return
False
def
is_last_microbatch_for_model_chunk
(
microbatch_id
:
int
)
->
bool
:
"""Check if an iteration is the last for a model chunk."""
microbatch_group_size
=
pipeline_parallel_size
*
num_model_chunks
num_microbatch_groups
=
total_num_microbatches
//
microbatch_group_size
microbatch_group_id
=
microbatch_id
//
microbatch_group_size
microbatch_id_in_group
=
microbatch_id
%
microbatch_group_size
if
microbatch_group_id
==
num_microbatch_groups
-
1
:
return
microbatch_id_in_group
%
pipeline_parallel_size
==
pipeline_parallel_size
-
1
else
:
return
False
def
forward_step_helper
(
microbatch_id
,
checkpoint_activations_microbatch
):
"""Helper method to run forward step with model split into chunks
(run set_virtual_pipeline_model_parallel_rank() before calling
forward_step())."""
model_chunk_id
=
get_model_chunk_id
(
microbatch_id
,
forward
=
True
)
parallel_state
.
set_virtual_pipeline_model_parallel_rank
(
model_chunk_id
)
# launch param synchronization for next model chunk
# Note: Asynchronous communication tends to slow down compute.
# To reduce idling from mismatched microbatch times, we launch
# asynchronous communication at the same time across the
# pipeline-parallel group.
if
config
.
param_sync_func
is
not
None
:
param_sync_microbatch_id
=
microbatch_id
+
pipeline_parallel_rank
if
param_sync_microbatch_id
<
num_microbatches
and
is_first_microbatch_for_model_chunk
(
param_sync_microbatch_id
):
param_sync_chunk_id
=
get_model_chunk_id
(
param_sync_microbatch_id
,
forward
=
True
)
+
1
if
1
<
param_sync_chunk_id
<
num_model_chunks
:
config
.
param_sync_func
(
model
[
param_sync_chunk_id
].
parameters
())
# forward step
if
parallel_state
.
is_pipeline_first_stage
():
if
len
(
input_tensors
[
model_chunk_id
])
==
\
len
(
output_tensors
[
model_chunk_id
]):
input_tensors
[
model_chunk_id
].
append
(
None
)
input_tensor
=
input_tensors
[
model_chunk_id
][
-
1
]
output_tensor
=
forward_step
(
forward_step_func
,
data_iterator
[
model_chunk_id
],
model
[
model_chunk_id
],
num_microbatches
,
input_tensor
,
forward_data_store
,
config
,
collect_non_loss_data
,
checkpoint_activations_microbatch
)
output_tensors
[
model_chunk_id
].
append
(
output_tensor
)
# if forward-only, no need to save tensors for a backward pass
if
forward_only
:
input_tensors
[
model_chunk_id
].
pop
()
output_tensors
[
model_chunk_id
].
pop
()
return
output_tensor
def
backward_step_helper
(
microbatch_id
):
"""Helper method to run backward step with model split into chunks
(run set_virtual_pipeline_model_parallel_rank() before calling
backward_step())."""
model_chunk_id
=
get_model_chunk_id
(
microbatch_id
,
forward
=
False
)
parallel_state
.
set_virtual_pipeline_model_parallel_rank
(
model_chunk_id
)
# launch grad synchronization (default)
if
config
.
grad_sync_func
is
None
and
is_last_microbatch_for_model_chunk
(
microbatch_id
):
enable_grad_sync
()
synchronized_model_chunks
.
add
(
model_chunk_id
)
if
parallel_state
.
is_pipeline_last_stage
():
if
len
(
output_tensor_grads
[
model_chunk_id
])
==
0
:
output_tensor_grads
[
model_chunk_id
].
append
(
None
)
input_tensor
=
input_tensors
[
model_chunk_id
].
pop
(
0
)
output_tensor
=
output_tensors
[
model_chunk_id
].
pop
(
0
)
output_tensor_grad
=
output_tensor_grads
[
model_chunk_id
].
pop
(
0
)
input_tensor_grad
=
\
backward_step
(
input_tensor
,
output_tensor
,
output_tensor_grad
,
model_type
,
config
)
# launch grad synchronization (custom grad sync)
# Note: Asynchronous communication tends to slow down compute.
# To reduce idling from mismatched microbatch times, we launch
# asynchronous communication at the same time across the
# pipeline-parallel group.
if
config
.
grad_sync_func
is
not
None
:
grad_sync_microbatch_id
=
microbatch_id
-
pipeline_parallel_rank
if
grad_sync_microbatch_id
>=
0
and
is_last_microbatch_for_model_chunk
(
grad_sync_microbatch_id
):
grad_sync_chunk_id
=
get_model_chunk_id
(
grad_sync_microbatch_id
,
forward
=
False
)
enable_grad_sync
()
config
.
grad_sync_func
(
model
[
grad_sync_chunk_id
].
parameters
())
synchronized_model_chunks
.
add
(
grad_sync_chunk_id
)
disable_grad_sync
()
return
input_tensor_grad
# Run warmup forward passes.
parallel_state
.
set_virtual_pipeline_model_parallel_rank
(
0
)
input_tensors
[
0
].
append
(
p2p_communication
.
recv_forward
(
tensor_shape
,
config
))
fwd_wait_handles
=
None
bwd_wait_handles
=
None
for
k
in
range
(
num_warmup_microbatches
):
if
fwd_wait_handles
is
not
None
:
for
req
in
fwd_wait_handles
:
req
.
wait
()
# Decide to checkpoint all layers' activations of the current micro-batch
if
max_outstanding_backprops
is
not
None
:
checkpoint_activations_microbatch
=
k
%
max_outstanding_backprops
>=
\
config
.
num_microbatches_with_partial_activation_checkpoints
else
:
checkpoint_activations_microbatch
=
None
output_tensor
=
forward_step_helper
(
k
,
checkpoint_activations_microbatch
)
# Determine if tensor should be received from previous stage.
next_forward_model_chunk_id
=
get_model_chunk_id
(
k
+
1
,
forward
=
True
)
recv_prev
=
True
if
parallel_state
.
is_pipeline_first_stage
(
ignore_virtual
=
True
):
if
next_forward_model_chunk_id
==
0
:
recv_prev
=
False
if
k
==
(
total_num_microbatches
-
1
):
recv_prev
=
False
# Don't send tensor downstream if on last stage.
if
parallel_state
.
is_pipeline_last_stage
():
output_tensor
=
None
# Send and receive tensors as appropriate (send tensors computed
# in this iteration; receive tensors for next iteration).
if
not
config
.
overlap_p2p_comm
:
if
k
==
(
num_warmup_microbatches
-
1
)
and
not
forward_only
and
\
not
all_warmup_microbatches
:
input_tensor_grad
=
None
recv_next
=
True
if
parallel_state
.
is_pipeline_last_stage
(
ignore_virtual
=
True
):
recv_next
=
False
input_tensor
,
output_tensor_grad
=
\
p2p_communication
.
send_forward_backward_recv_forward_backward
(
output_tensor
,
input_tensor_grad
,
recv_prev
=
recv_prev
,
recv_next
=
recv_next
,
tensor_shape
=
tensor_shape
,
config
=
config
)
output_tensor_grads
[
num_model_chunks
-
1
].
append
(
output_tensor_grad
)
else
:
input_tensor
=
\
p2p_communication
.
send_forward_recv_forward
(
output_tensor
,
recv_prev
=
recv_prev
,
tensor_shape
=
tensor_shape
,
config
=
config
)
input_tensors
[
next_forward_model_chunk_id
].
append
(
input_tensor
)
else
:
input_tensor
,
fwd_wait_handles
=
\
p2p_communication
.
send_forward_recv_forward
(
output_tensor
,
recv_prev
=
recv_prev
,
tensor_shape
=
tensor_shape
,
config
=
config
,
overlap_p2p_comm
=
True
)
if
k
==
(
num_warmup_microbatches
-
1
)
and
not
forward_only
and
\
not
all_warmup_microbatches
:
input_tensor_grad
=
None
recv_next
=
True
if
parallel_state
.
is_pipeline_last_stage
(
ignore_virtual
=
True
):
recv_next
=
False
output_tensor_grad
,
bwd_wait_handles
=
p2p_communication
.
send_backward_recv_backward
(
input_tensor_grad
,
recv_next
=
recv_next
,
tensor_shape
=
tensor_shape
,
config
=
config
,
overlap_p2p_comm
=
True
)
output_tensor_grads
[
num_model_chunks
-
1
].
append
(
output_tensor_grad
)
input_tensors
[
next_forward_model_chunk_id
].
append
(
input_tensor
)
deallocate_output_tensor
(
output_tensor
,
config
.
deallocate_pipeline_outputs
)
# Run 1F1B in steady state.
for
k
in
range
(
num_microbatches_remaining
):
# Forward pass.
forward_k
=
k
+
num_warmup_microbatches
# Decide to checkpoint all layers' activations of the current micro-batch
if
max_outstanding_backprops
is
not
None
:
checkpoint_activations_microbatch
=
(
forward_k
%
max_outstanding_backprops
>=
\
config
.
num_microbatches_with_partial_activation_checkpoints
)
else
:
checkpoint_activations_microbatch
=
None
if
config
.
overlap_p2p_comm
:
if
fwd_wait_handles
is
not
None
:
for
req
in
fwd_wait_handles
:
req
.
wait
()
deallocate_output_tensor
(
output_tensor
,
config
.
deallocate_pipeline_outputs
)
output_tensor
=
forward_step_helper
(
forward_k
,
checkpoint_activations_microbatch
)
# Determine if current stage has anything to send in either direction,
# otherwise set tensor to None.
forward_model_chunk_id
=
get_model_chunk_id
(
forward_k
,
forward
=
True
)
parallel_state
.
set_virtual_pipeline_model_parallel_rank
(
forward_model_chunk_id
)
# The last virtual stage has no activation tensor to send.
if
parallel_state
.
is_pipeline_last_stage
():
output_tensor
=
None
# Determine if peers are sending, and where in data structure to put
# received tensors.
recv_prev
=
True
if
parallel_state
.
is_pipeline_first_stage
(
ignore_virtual
=
True
):
# First stage is ahead of last stage by (pipeline_parallel_size - 1).
next_forward_model_chunk_id
=
get_model_chunk_id
(
forward_k
-
(
pipeline_parallel_size
-
1
),
forward
=
True
)
if
next_forward_model_chunk_id
==
(
num_model_chunks
-
1
):
recv_prev
=
False
next_forward_model_chunk_id
+=
1
else
:
next_forward_model_chunk_id
=
get_model_chunk_id
(
forward_k
+
1
,
forward
=
True
)
# If last iteration, don't receive; we already received one extra
# before the start of the for loop.
if
k
==
(
num_microbatches_remaining
-
1
):
recv_prev
=
False
# Send activation tensor to the next stage and receive activation tensor from the
# previous stage
input_tensor
,
fwd_wait_handles
=
\
p2p_communication
.
send_forward_recv_forward
(
output_tensor
,
recv_prev
=
recv_prev
,
tensor_shape
=
tensor_shape
,
dtype
=
dtype
,
batch_p2p_comm
=
batch_p2p_comm
,
timers
=
timers
,
overlap_p2p_comm
=
True
)
# assert fwd_wait_handles is not None
if
bwd_wait_handles
is
not
None
:
for
req
in
bwd_wait_handles
:
req
.
wait
()
# Backward pass.
backward_k
=
k
input_tensor_grad
=
backward_step_helper
(
backward_k
)
backward_model_chunk_id
=
get_model_chunk_id
(
backward_k
,
forward
=
False
)
parallel_state
.
set_virtual_pipeline_model_parallel_rank
(
backward_model_chunk_id
)
# The first virtual stage has no activation gradient tensor to send.
if
parallel_state
.
is_pipeline_first_stage
():
input_tensor_grad
=
None
# Determine if the current virtual stage has an activation gradient tensor to receive
recv_next
=
True
if
parallel_state
.
is_pipeline_last_stage
(
ignore_virtual
=
True
):
# Last stage is ahead of first stage by (pipeline_parallel_size - 1).
next_backward_model_chunk_id
=
get_model_chunk_id
(
backward_k
-
(
pipeline_parallel_size
-
1
),
forward
=
False
)
if
next_backward_model_chunk_id
==
0
:
recv_next
=
False
next_backward_model_chunk_id
-=
1
else
:
next_backward_model_chunk_id
=
get_model_chunk_id
(
backward_k
+
1
,
forward
=
False
)
output_tensor_grad
,
bwd_wait_handles
=
p2p_communication
.
send_backward_recv_backward
(
input_tensor_grad
,
recv_next
=
recv_next
,
tensor_shape
=
tensor_shape
,
config
=
config
,
overlap_p2p_comm
=
True
)
else
:
# no p2p overlap
output_tensor
=
forward_step_helper
(
forward_k
,
checkpoint_activations_microbatch
)
# Backward pass.
backward_k
=
k
input_tensor_grad
=
backward_step_helper
(
backward_k
)
# Send output_tensor and input_tensor_grad, receive input_tensor
# and output_tensor_grad.
# Determine if current stage has anything to send in either direction,
# otherwise set tensor to None.
forward_model_chunk_id
=
get_model_chunk_id
(
forward_k
,
forward
=
True
)
parallel_state
.
set_virtual_pipeline_model_parallel_rank
(
forward_model_chunk_id
)
if
parallel_state
.
is_pipeline_last_stage
():
output_tensor
=
None
backward_model_chunk_id
=
get_model_chunk_id
(
backward_k
,
forward
=
False
)
parallel_state
.
set_virtual_pipeline_model_parallel_rank
(
backward_model_chunk_id
)
if
parallel_state
.
is_pipeline_first_stage
():
input_tensor_grad
=
None
# Determine if peers are sending, and where in data structure to put
# received tensors.
recv_prev
=
True
if
parallel_state
.
is_pipeline_first_stage
(
ignore_virtual
=
True
):
# First stage is ahead of last stage by (pipeline_parallel_size - 1).
next_forward_model_chunk_id
=
get_model_chunk_id
(
forward_k
-
(
pipeline_parallel_size
-
1
),
forward
=
True
)
if
next_forward_model_chunk_id
==
(
num_model_chunks
-
1
):
recv_prev
=
False
next_forward_model_chunk_id
+=
1
else
:
next_forward_model_chunk_id
=
get_model_chunk_id
(
forward_k
+
1
,
forward
=
True
)
recv_next
=
True
if
parallel_state
.
is_pipeline_last_stage
(
ignore_virtual
=
True
):
# Last stage is ahead of first stage by (pipeline_parallel_size - 1).
next_backward_model_chunk_id
=
get_model_chunk_id
(
backward_k
-
(
pipeline_parallel_size
-
1
),
forward
=
False
)
if
next_backward_model_chunk_id
==
0
:
recv_next
=
False
next_backward_model_chunk_id
-=
1
else
:
next_backward_model_chunk_id
=
get_model_chunk_id
(
backward_k
+
1
,
forward
=
False
)
# If last iteration, don't receive; we already received one extra
# before the start of the for loop.
if
k
==
(
num_microbatches_remaining
-
1
):
recv_prev
=
False
# Communicate tensors.
input_tensor
,
output_tensor_grad
=
\
p2p_communication
.
send_forward_backward_recv_forward_backward
(
output_tensor
,
input_tensor_grad
,
recv_prev
=
recv_prev
,
recv_next
=
recv_next
,
tensor_shape
=
tensor_shape
,
config
=
config
)
deallocate_output_tensor
(
output_tensor
,
config
.
deallocate_pipeline_outputs
)
# Put input_tensor and output_tensor_grad in data structures in the
# right location.
if
recv_prev
:
input_tensors
[
next_forward_model_chunk_id
].
append
(
input_tensor
)
if
recv_next
:
output_tensor_grads
[
next_backward_model_chunk_id
].
append
(
output_tensor_grad
)
deallocate_output_tensor
(
output_tensor
,
config
.
deallocate_pipeline_outputs
)
# Run cooldown backward passes (flush out pipeline).
if
not
forward_only
:
if
config
.
overlap_p2p_comm
and
bwd_wait_handles
is
not
None
:
for
wait_handle
in
bwd_wait_handles
:
wait_handle
.
wait
()
if
all_warmup_microbatches
:
output_tensor_grads
[
num_model_chunks
-
1
].
append
(
p2p_communication
.
recv_backward
(
tensor_shape
,
config
=
config
))
for
k
in
range
(
num_microbatches_remaining
,
total_num_microbatches
):
input_tensor_grad
=
backward_step_helper
(
k
)
next_backward_model_chunk_id
=
get_model_chunk_id
(
k
+
1
,
forward
=
False
)
recv_next
=
True
if
parallel_state
.
is_pipeline_last_stage
(
ignore_virtual
=
True
):
if
next_backward_model_chunk_id
==
(
num_model_chunks
-
1
):
recv_next
=
False
if
k
==
(
total_num_microbatches
-
1
):
recv_next
=
False
output_tensor_grads
[
next_backward_model_chunk_id
].
append
(
p2p_communication
.
send_backward_recv_backward
(
input_tensor_grad
,
recv_next
=
recv_next
,
tensor_shape
=
tensor_shape
,
config
=
config
))
# Launch any remaining grad reductions
enable_grad_sync
()
if
config
.
grad_sync_func
is
not
None
:
params
=
[]
for
model_chunk_id
in
range
(
num_model_chunks
):
if
model_chunk_id
not
in
synchronized_model_chunks
:
params
.
extend
(
model
[
model_chunk_id
].
parameters
())
synchronized_model_chunks
.
add
(
model_chunk_id
)
if
params
:
config
.
grad_sync_func
(
params
)
return
forward_data_store
def get_tensor_shapes(*,
                      rank: int,
                      model_type: ModelType,
                      seq_length: int,
                      micro_batch_size: int,
                      decoder_seq_length: int,
                      config):
    """Return the list of tensor shapes exchanged by pipeline stage ``rank``.

    Each shape is a ``(sequence, micro_batch_size, config.hidden_size)`` tuple.

    * Encoder-and-decoder (e.g. T5) model, ``rank`` after the split (decoder
      stage): two shapes, the decoder tensor (pre-transpose) first and the
      encoder tensor (post-transpose) second.
    * Encoder-and-decoder model, ``rank`` before the split: one shape, using
      the encoder sequence length.
    * Any other model type: one shape, using ``seq_length``.

    When ``config.sequence_parallel`` is set, the sequence lengths are first
    floor-divided by the tensor-model-parallel world size.
    """
    has_encoder_and_decoder = model_type == ModelType.encoder_and_decoder

    if config.sequence_parallel:
        tp_world_size = parallel_state.get_tensor_model_parallel_world_size()
        seq_length = seq_length // tp_world_size
        if has_encoder_and_decoder:
            decoder_seq_length = decoder_seq_length // tp_world_size

    encoder_shape = (seq_length, micro_batch_size, config.hidden_size)

    # Single-tensor cases: plain models and the encoder side of the split.
    if not has_encoder_and_decoder:
        return [encoder_shape]
    if parallel_state.is_pipeline_stage_before_split(rank):
        return [encoder_shape]

    # Decoder side of the split: decoder tensor first, then encoder tensor.
    decoder_shape = (decoder_seq_length, micro_batch_size, config.hidden_size)
    return [decoder_shape, encoder_shape]
def recv_forward(tensor_shapes, config):
    """Receive one forward input tensor per entry of ``tensor_shapes``.

    A ``None`` shape yields a ``None`` placeholder without any communication;
    every other shape is handed to ``p2p_communication.recv_forward``. The
    result has the same length and order as ``tensor_shapes``.
    """
    return [
        None if shape is None else p2p_communication.recv_forward(shape, config)
        for shape in tensor_shapes
    ]
def recv_backward(tensor_shapes, config):
    """Receive one output-tensor gradient per entry of ``tensor_shapes``.

    A ``None`` shape yields a ``None`` placeholder without any communication;
    every other shape is handed to ``p2p_communication.recv_backward``. The
    result has the same length and order as ``tensor_shapes``.
    """
    return [
        None if shape is None else p2p_communication.recv_backward(shape, config)
        for shape in tensor_shapes
    ]
def send_forward(output_tensors, tensor_shapes, config):
    """Send each output tensor whose matching shape is not ``None``.

    ``output_tensors`` may be a single object or a list; anything that is not
    a ``list`` is wrapped into a one-element list. Tensors are paired with
    ``tensor_shapes`` positionally and sent through
    ``p2p_communication.send_forward``. Returns ``None``.
    """
    tensors = output_tensors if isinstance(output_tensors, list) else [output_tensors]
    for tensor, shape in zip(tensors, tensor_shapes):
        if shape is not None:
            p2p_communication.send_forward(tensor, config)
def send_backward(input_tensor_grads, tensor_shapes, config):
    """Send each input-tensor gradient whose matching shape is not ``None``.

    ``input_tensor_grads`` may be a single object or a list; anything that is
    not a ``list`` is wrapped into a one-element list. Gradients are paired
    with ``tensor_shapes`` positionally and sent through
    ``p2p_communication.send_backward``. Returns ``None``.
    """
    grads = input_tensor_grads if isinstance(input_tensor_grads, list) else [input_tensor_grads]
    for grad, shape in zip(grads, tensor_shapes):
        if shape is not None:
            p2p_communication.send_backward(grad, config)
def send_forward_recv_backward(output_tensors, tensor_shapes, config):
    """Send output tensors and collect the gradients received in exchange.

    ``output_tensors`` may be a single object or a list; anything that is not
    a ``list`` is wrapped into a one-element list. For every
    ``(tensor, shape)`` pair, a ``None`` shape contributes ``None`` to the
    result without communicating; otherwise the pair goes through
    ``p2p_communication.send_forward_recv_backward`` and its return value is
    collected. The returned list follows the pairing order.
    """
    tensors = output_tensors if isinstance(output_tensors, list) else [output_tensors]
    return [
        None if shape is None
        else p2p_communication.send_forward_recv_backward(tensor, shape, config)
        for tensor, shape in zip(tensors, tensor_shapes)
    ]
def send_backward_recv_forward(input_tensor_grads, tensor_shapes, config):
    """Send input-tensor gradients and collect the inputs received in exchange.

    ``input_tensor_grads`` may be a single object or a list; anything that is
    not a ``list`` is wrapped into a one-element list. For every
    ``(grad, shape)`` pair, a ``None`` shape contributes ``None`` to the
    result without communicating; otherwise the pair goes through
    ``p2p_communication.send_backward_recv_forward`` and its return value is
    collected. The returned list follows the pairing order.
    """
    grads = input_tensor_grads if isinstance(input_tensor_grads, list) else [input_tensor_grads]
    return [
        None if shape is None
        else p2p_communication.send_backward_recv_forward(grad, shape, config)
        for grad, shape in zip(grads, tensor_shapes)
    ]
def forward_backward_pipelining_without_interleaving(*,
                                                     forward_step_func,
                                                     data_iterator: Union[Iterator, List[Iterator]],
                                                     model: Union[torch.nn.Module, List[torch.nn.Module]],
                                                     num_microbatches: int,
                                                     seq_length: int,
                                                     micro_batch_size: int,
                                                     decoder_seq_length: int = None,
                                                     forward_only: bool = False,
                                                     collect_non_loss_data: bool = False,
                                                     ):
    """Run the non-interleaved 1F1B schedule, with communication between
    pipeline stages.

    The schedule has three phases: warmup forward passes, a steady state that
    alternates one forward and one backward pass, and cooldown backward passes
    that drain the microbatches started during warmup.

    Args:
        forward_step_func: Callable forwarded unchanged to ``forward_step``.
        data_iterator: Data iterator, or a list holding exactly one iterator.
        model: Model, or a list holding exactly one model (no model chunking).
        num_microbatches: Total number of microbatches to process.
        seq_length: Sequence length used to size the communicated tensors.
        micro_batch_size: Microbatch size used to size the communicated tensors.
        decoder_seq_length: Decoder sequence length, passed to
            ``get_tensor_shapes`` (only read there for encoder-and-decoder
            models).
        forward_only: If True, run forward passes only; no activations are
            kept and no backward pass is executed.
        collect_non_loss_data: Forwarded unchanged to ``forward_step``.

    Returns:
        The ``forward_data_store`` list that is handed to every
        ``forward_step`` call (presumably populated there with per-microbatch
        results on the last stage -- confirm against ``forward_step``).

    Raises:
        AssertionError: If ``model`` or ``data_iterator`` is a list whose
            length is not 1.
        ValueError: If ``config.overlap_p2p_comm`` is set.
    """

    if isinstance(model, list):
        assert len(model) == 1, \
            "non-interleaved pipeline parallelism does not support model chunking"
        model = model[0]
    if isinstance(data_iterator, list):
        # The message names this schedule (it previously mentioned the
        # non-pipeline-parallel schedule, which is a different code path).
        assert len(data_iterator) == 1, \
            "non-interleaved pipeline parallelism does not support model chunking"
        data_iterator = data_iterator[0]

    config = get_model_config(model)
    if config.overlap_p2p_comm:
        raise ValueError("Non-interleaved pipeline parallelism does not support overlapping p2p communication")

    # Disable async grad reductions
    no_sync_func = config.no_sync_func
    if no_sync_func is None and isinstance(model, torchDDP):
        no_sync_func = model.no_sync
    if no_sync_func is None:
        no_sync_func = contextlib.nullcontext
    no_sync_context = None

    def disable_grad_sync():
        """Disable asynchronous grad reductions"""
        nonlocal no_sync_context
        if no_sync_context is None:
            no_sync_context = no_sync_func()
            no_sync_context.__enter__()

    def enable_grad_sync():
        """Enable asynchronous grad reductions"""
        nonlocal no_sync_context
        if no_sync_context is not None:
            no_sync_context.__exit__(None, None, None)
            no_sync_context = None

    disable_grad_sync()

    # Compute number of warmup microbatches.
    num_warmup_microbatches = \
        (parallel_state.get_pipeline_model_parallel_world_size() -
         parallel_state.get_pipeline_model_parallel_rank() - 1)
    num_warmup_microbatches = min(
        num_warmup_microbatches,
        num_microbatches)
    num_microbatches_remaining = \
        num_microbatches - num_warmup_microbatches

    # Checkpoint the activations of partial Transformer layers in a number of micro-batches
    # within the maximum outstanding micro-batch backpropagations.
    # Micro-batches with the ids less than 'num_microbatches_with_partial_activation_checkpoints'
    # checkpoint partial Transformer layers (or skip checkpointing) and
    # the rest of micro-batches within a window of micro-batches checkpoint
    # all Transformer layers. The window of micro-batches is set by the maximum
    # outstanding backpropagations and becomes smaller at later pipeline stages.
    # Please refer to Appendix C of https://arxiv.org/pdf/2205.05198.pdf
    max_outstanding_backprops = None
    if config.num_microbatches_with_partial_activation_checkpoints is not None:
        max_outstanding_backprops = num_warmup_microbatches + 1

    def checkpoint_all_layers(microbatch_id):
        """Decide whether to checkpoint all layers' activations of a micro-batch.

        Returns None when partial activation checkpointing is not configured.
        """
        if max_outstanding_backprops is None:
            return None
        return (microbatch_id % max_outstanding_backprops
                >= config.num_microbatches_with_partial_activation_checkpoints)

    model_type = get_model_type(model)

    rank = parallel_state.get_pipeline_model_parallel_rank()
    # Tensors received from the previous stage have that stage's send shapes.
    recv_tensor_shapes = get_tensor_shapes(rank=rank - 1,
                                           model_type=model_type,
                                           seq_length=seq_length,
                                           micro_batch_size=micro_batch_size,
                                           decoder_seq_length=decoder_seq_length,
                                           config=config)
    send_tensor_shapes = get_tensor_shapes(rank=rank,
                                           model_type=model_type,
                                           seq_length=seq_length,
                                           micro_batch_size=micro_batch_size,
                                           decoder_seq_length=decoder_seq_length,
                                           config=config)

    # Input, output tensors only need to be saved when doing backward passes
    input_tensors = None
    output_tensors = None
    if not forward_only:
        input_tensors = []
        output_tensors = []
    forward_data_store = []

    # Run warmup forward passes.
    for i in range(num_warmup_microbatches):
        checkpoint_activations_microbatch = checkpoint_all_layers(i)

        input_tensor = recv_forward(recv_tensor_shapes, config)
        output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches,
                                     input_tensor, forward_data_store, config, collect_non_loss_data,
                                     checkpoint_activations_microbatch)
        send_forward(output_tensor, send_tensor_shapes, config)

        if not forward_only:
            input_tensors.append(input_tensor)
            output_tensors.append(output_tensor)
            deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs)

    # Before running 1F1B, need to receive first forward tensor.
    # If all microbatches are run in warmup / cooldown phase, then no need to
    # receive this tensor here.
    if num_microbatches_remaining > 0:
        input_tensor = recv_forward(recv_tensor_shapes, config)

    # Run 1F1B in steady state.
    for i in range(num_microbatches_remaining):
        last_iteration = (i == (num_microbatches_remaining - 1))

        # Microbatch ids continue from where the warmup phase stopped.
        checkpoint_activations_microbatch = checkpoint_all_layers(i + num_warmup_microbatches)

        output_tensor = forward_step(forward_step_func, data_iterator, model, num_microbatches,
                                     input_tensor, forward_data_store, config, collect_non_loss_data,
                                     checkpoint_activations_microbatch)

        if forward_only:
            send_forward(output_tensor, send_tensor_shapes, config)

            if not last_iteration:
                input_tensor = recv_forward(recv_tensor_shapes, config)

        else:
            output_tensor_grad = \
                send_forward_recv_backward(output_tensor, send_tensor_shapes, config)

            # Add input_tensor and output_tensor to end of list.
            input_tensors.append(input_tensor)
            output_tensors.append(output_tensor)
            deallocate_output_tensor(output_tensor[0], config.deallocate_pipeline_outputs)

            # Pop input_tensor and output_tensor from the start of the list for
            # the backward pass.
            input_tensor = input_tensors.pop(0)
            output_tensor = output_tensors.pop(0)

            input_tensor_grad = \
                backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)

            if last_iteration:
                # Nothing more to receive: only send the gradient upstream.
                input_tensor = None
                send_backward(input_tensor_grad, recv_tensor_shapes, config)
            else:
                input_tensor = \
                    send_backward_recv_forward(input_tensor_grad, recv_tensor_shapes, config)

    # Run cooldown backward passes.
    if not forward_only:
        for i in range(num_warmup_microbatches):

            # Enable async grad reduction in the last backward pass
            # Note: If grad sync function is provided, only enable
            # async grad reduction in first pipeline stage. Other
            # pipeline stages do grad reduction during pipeline
            # bubble.
            if i == num_warmup_microbatches - 1:
                if config.grad_sync_func is None or rank == 0:
                    enable_grad_sync()

            input_tensor = input_tensors.pop(0)
            output_tensor = output_tensors.pop(0)

            output_tensor_grad = recv_backward(send_tensor_shapes, config)

            input_tensor_grad = \
                backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)

            send_backward(input_tensor_grad, recv_tensor_shapes, config)

    # Launch any remaining grad reductions
    if no_sync_context is not None:
        enable_grad_sync()
        if config.grad_sync_func is not None:
            config.grad_sync_func(model.parameters())

    return forward_data_store
megatron/core/requirements.txt
0 → 100644
View file @
d3dd8642
torch
\ No newline at end of file
megatron/core/tensor_parallel/__init__.py
0 → 100644
View file @
d3dd8642
from
.cross_entropy
import
vocab_parallel_cross_entropy
from
.data
import
broadcast_data
from
.layers
import
(
ColumnParallelLinear
,
RowParallelLinear
,
VocabParallelEmbedding
,
set_tensor_model_parallel_attributes
,
set_defaults_if_not_set_tensor_model_parallel_attributes
,
copy_tensor_model_parallel_attributes
,
param_is_not_tensor_parallel_duplicate
,
linear_with_grad_accumulation_and_async_allreduce
)
from
.mappings
import
(
copy_to_tensor_model_parallel_region
,
gather_from_tensor_model_parallel_region
,
gather_from_sequence_parallel_region
,
scatter_to_tensor_model_parallel_region
,
scatter_to_sequence_parallel_region
,
)
from
.random
import
(
checkpoint
,
get_cuda_rng_tracker
,
model_parallel_cuda_manual_seed
,
)
from
.utils
import
(
split_tensor_along_last_dim
,
split_tensor_into_1d_equal_chunks
,
gather_split_1d_tensor
,
)
# Public API re-exported by ``megatron.core.tensor_parallel``.
# Every entry must be bound in this module: ``from <package> import *``
# raises AttributeError for an ``__all__`` entry that the module lacks.
__all__ = [
    # cross_entropy.py
    "vocab_parallel_cross_entropy",
    # data.py
    "broadcast_data",
    # layers.py
    "ColumnParallelLinear",
    "RowParallelLinear",
    "VocabParallelEmbedding",
    "set_tensor_model_parallel_attributes",
    "set_defaults_if_not_set_tensor_model_parallel_attributes",
    "copy_tensor_model_parallel_attributes",
    "param_is_not_tensor_parallel_duplicate",
    "linear_with_grad_accumulation_and_async_allreduce",
    # NOTE: "_initialize_affine_weight_gpu" was listed here although this
    # module never imports it, which made star-imports of the package fail;
    # it is intentionally omitted.
    # mappings.py
    "copy_to_tensor_model_parallel_region",
    "gather_from_tensor_model_parallel_region",
    "gather_from_sequence_parallel_region",
    # "reduce_from_tensor_model_parallel_region",
    "scatter_to_tensor_model_parallel_region",
    "scatter_to_sequence_parallel_region",
    # random.py
    "checkpoint",
    "get_cuda_rng_tracker",
    "model_parallel_cuda_manual_seed",
    # utils.py
    "split_tensor_along_last_dim",
    "split_tensor_into_1d_equal_chunks",
    "gather_split_1d_tensor",
]
megatron/core/tensor_parallel/cross_entropy.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
torch
from
megatron.core.parallel_state
import
(
get_tensor_model_parallel_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
from
.utils
import
VocabUtility
class _VocabParallelCrossEntropy(torch.autograd.Function):
    """Cross entropy over logits whose vocabulary dimension is sharded.

    Each tensor-parallel rank holds one slice of the vocabulary (the last
    dimension of the logits). The forward pass combines the per-rank pieces
    with three all-reduces (max, predicted logit, sum of exponentials); the
    backward pass is purely local and reuses the softmax saved in forward.
    """

    @staticmethod
    def forward(ctx, vocab_parallel_logits, target, label_smoothing=0.0):
        """Compute the per-token loss.

        Arguments:
            ctx: autograd context used to stash tensors for backward.
            vocab_parallel_logits: this rank's slice of the logits; the last
                dimension is the local vocabulary partition.
            target: vocabulary ids; the loss is returned in the same shape
                (via `view_as(target)`).
            label_smoothing: smoothing factor; when > 0 it must be < 1.

        Returns:
            Loss tensor shaped like `target`.
        """
        # Maximum value along vocab dimension across all GPUs.
        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
        torch.distributed.all_reduce(logits_max,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=get_tensor_model_parallel_group())
        # Subtract the maximum value. This is out-of-place, so the in-place
        # exp()/div_() further down operate on a fresh tensor and not on the
        # caller's logits.
        vocab_parallel_logits = vocab_parallel_logits - logits_max.unsqueeze(dim=-1)

        # Get the partition's vocab indices.
        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_tensor_model_parallel_rank()
        world_size = get_tensor_model_parallel_world_size()
        vocab_start_index, vocab_end_index = get_vocab_range(
            partition_vocab_size, rank, world_size)

        # Create a mask of valid vocab ids (1 means it needs to be masked).
        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
        masked_target = target.clone() - vocab_start_index
        # Out-of-partition targets are pointed at index 0 so the gather below
        # stays in bounds; their contribution is zeroed afterwards.
        masked_target[target_mask] = 0

        # Get predicted-logits = logits[target].
        # For simplicity, we convert logits to a 2-D tensor with size
        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
        masked_target_1d = masked_target.view(-1)
        arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
                                 device=logits_2d.device)
        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
        predicted_logits = predicted_logits_1d.view_as(target)
        predicted_logits[target_mask] = 0.0
        # All reduce is needed to get the chunks from other GPUs.
        torch.distributed.all_reduce(predicted_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Sum of exponential of logits along vocab dimension across all GPUs.
        # exp() is written in place to avoid a second logits-sized buffer.
        exp_logits = vocab_parallel_logits
        torch.exp(vocab_parallel_logits, out=exp_logits)
        sum_exp_logits = exp_logits.sum(dim=-1)
        torch.distributed.all_reduce(sum_exp_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits

        # Normalize and optionally smooth logits.
        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))

        # Size of the last dimension of this rank's (now normalized) logits.
        vocab_size = exp_logits.size(-1)
        if label_smoothing > 0:
            # We'd like to assign 1 / (K - 1) probability mass to every index
            # that is not the ground truth.
            # = (1 - alpha) * y_gt + alpha * mean(y_{i for i != gt})
            # = (1 - alpha) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i
            # = ((K - 1) * (1 - alpha) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i != gt} y_i
            # = (K * (1 - alpha) - 1) / (K - 1)) * y_gt + (alpha / (K - 1)) * \sum_{i} y_i
            # = (1 - (alpha * K) / (K - 1)) * y_gt + ( (alpha * K) / (K - 1) ) * \sum_{i} y_i / K
            # From: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/losses/smoothed_cross_entropy.py
            assert 1.0 > label_smoothing > 0.0
            smoothing = label_smoothing * vocab_size / (vocab_size - 1)

            # Exp logits at this point are normalized probabilities. So we can
            # just take the log to get log-probs.
            # NOTE(review): the mean below is taken over this rank's partition
            # only and is not all-reduced -- confirm this is intended when the
            # tensor-parallel size is > 1.
            log_probs = torch.log(exp_logits)
            mean_log_probs = log_probs.mean(dim=-1)
            loss = (1.0 - smoothing) * loss - smoothing * mean_log_probs

        ctx.label_smoothing, ctx.vocab_size = label_smoothing, vocab_size

        # Store softmax, target-mask and masked-target for backward pass.
        # (Saved exactly once; a second identical call would be redundant.)
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)

        return loss

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. the logits (None for the other inputs).

        The saved softmax tensor is modified in place and returned as the
        gradient, so no additional logits-sized buffer is allocated.
        """
        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors
        label_smoothing, vocab_size = ctx.label_smoothing, ctx.vocab_size

        # All the inputs have softmax as their gradient.
        grad_input = softmax
        # For simplicity, work with the 2D gradient.
        partition_vocab_size = softmax.size()[-1]
        grad_2d = grad_input.view(-1, partition_vocab_size)

        # Add the gradient from matching classes.
        arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
                                 device=grad_2d.device)

        # 1.0 where the target lives on this rank, 0.0 where it was masked.
        softmax_update = 1.0 - target_mask.view(-1).float()

        if label_smoothing > 0:
            smoothing = label_smoothing * vocab_size / (vocab_size - 1)
            grad_2d[arange_1d, masked_target_1d] -= (1.0 - smoothing) * softmax_update
            average_grad = 1 / vocab_size
            grad_2d[arange_1d, :] -= smoothing * average_grad
        else:
            grad_2d[arange_1d, masked_target_1d] -= softmax_update

        # Finally elementwise multiplication with the output gradients.
        grad_input.mul_(grad_output.unsqueeze(dim=-1))

        return grad_input, None, None
def vocab_parallel_cross_entropy(vocab_parallel_logits, target, label_smoothing=0.0):
    """
    Performs cross entropy loss when logits are split across tensor parallel ranks

    Arguments:
        vocab_parallel_logits: logits split across tensor parallel ranks.
                               The last dimension is this rank's vocabulary
                               partition (not the hidden size); the leading
                               dimensions are presumably
                               [sequence_length, batch_size] -- confirm
                               against callers.

        target: correct vocab ids of dimension [sequence_length, micro_batch_size]

        label_smoothing: smoothing factor, must be in range [0.0, 1.0)
                         default is no smoothing (=0.0)

    Returns:
        The result of `_VocabParallelCrossEntropy.apply` on the three
        arguments (the per-token loss).
    """
    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target, label_smoothing)
megatron/core/tensor_parallel/data.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
torch
from
megatron.core.parallel_state
import
(
get_tensor_model_parallel_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_src_rank
,
)
# Number of size slots reserved per key when tensor shapes are packed for
# broadcast in _build_key_size_numel_dictionaries. A tensor must have strictly
# fewer dimensions than this, because a trailing 0 slot marks the end of its
# shape when unpacking.
_MAX_DATA_DIM = 5
def
_check_data_types
(
keys
,
data
,
target_dtype
):
"""Check that all the keys have the same target data type."""
for
key
in
keys
:
assert
data
[
key
].
dtype
==
target_dtype
,
'{} has data type {} which '
\
'is different than {}'
.
format
(
key
,
data
[
key
].
dtype
,
target_dtype
)
def _build_key_size_numel_dictionaries(keys, data):
    """Share the shapes of `data[key]` from tensor-parallel rank 0 with the group.

    Every key gets `_MAX_DATA_DIM` integer slots in one flat list; rank 0
    fills in the tensor shapes, the list is broadcast, and each rank decodes
    it again. A slot value of 0 terminates a key's shape.

    Arguments:
        keys: keys whose tensors will be broadcast.
        data: mapping from key to tensor; only read on tensor-parallel rank 0.

    Returns:
        (key_size, key_numel, total_numel): per-key shape lists, per-key
        element counts and the overall element count.
    """
    slots_per_key = _MAX_DATA_DIM
    packed = [0 for _ in range(slots_per_key) for _ in keys]

    # Only rank zero knows the real shapes; everyone else keeps the zeros.
    if get_tensor_model_parallel_rank() == 0:
        for position, key in enumerate(keys):
            base = position * slots_per_key
            assert data[key].dim() < slots_per_key, 'you should increase MAX_DATA_DIM'
            for axis, extent in enumerate(data[key].size()):
                packed[base + axis] = extent

    # Ship the packed shapes through the GPU to the rest of the group.
    packed_gpu = torch.cuda.LongTensor(packed)
    torch.distributed.broadcast(packed_gpu,
                                get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Decode on the host.
    packed_host = packed_gpu.cpu()
    key_size, key_numel = {}, {}
    total_numel = 0
    for position, key in enumerate(keys):
        base = position * slots_per_key
        shape = []
        count = 1
        axis = 0
        # Read slots until the 0 terminator is reached.
        while packed_host[base + axis] > 0:
            extent = packed_host[base + axis]
            shape.append(extent)
            count *= extent
            axis += 1
        key_size[key] = shape
        key_numel[key] = count
        total_numel += count

    return key_size, key_numel, total_numel
def broadcast_data(keys, data, datatype):
    """Broadcast data from rank zero of each model parallel group to the
    members of the same model parallel group.

    Arguments:
        keys: list of keys in the data dictionary to be broadcast
        data: data dictionary of string keys and cpu tensor values.
        datatype: torch data type of all tensors in data associated
                  with keys.

    Returns:
        Dictionary mapping each key to a view into the single flat buffer
        that was broadcast, reshaped to that key's size.
    """
    # Every rank first learns the per-key shapes and element counts.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys, data)

    is_source = get_tensor_model_parallel_rank() == 0
    if not is_source:
        # Receivers only need an empty buffer of the right length and dtype.
        flatten_data = torch.empty(total_numel,
                                   device=torch.cuda.current_device(),
                                   dtype=datatype)
    else:
        # The source validates dtypes, then flattens everything into one
        # 1-D tensor and moves it to the GPU.
        _check_data_types(keys, data, datatype)
        pieces = [data[key].contiguous().view(-1) for key in keys]
        flatten_data = torch.cat(pieces, dim=0).cuda()

    torch.distributed.broadcast(flatten_data,
                                get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Carve the flat buffer back into one tensor per key.
    output = {}
    cursor = 0
    for key in keys:
        count = key_numel[key]
        output[key] = flatten_data.narrow(0, cursor, count).view(key_size[key])
        cursor += count

    return output
megatron/core/tensor_parallel/layers.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import
math
import
os
from
typing
import
Optional
,
Callable
import
warnings
import
torch
import
torch.nn.functional
as
F
import
torch.nn.init
as
init
from
torch.nn.parameter
import
Parameter
from
torch.cuda.amp
import
custom_fwd
,
custom_bwd
from
megatron.core.model_parallel_config
import
ModelParallelConfig
from
megatron.core.parallel_state
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_group
,
get_global_memory_buffer
,
)
from
.mappings
import
(
copy_to_tensor_model_parallel_region
,
gather_from_tensor_model_parallel_region
,
gather_from_sequence_parallel_region
,
reduce_from_tensor_model_parallel_region
,
scatter_to_tensor_model_parallel_region
,
reduce_scatter_to_sequence_parallel_region
,
)
from
.random
import
get_cuda_rng_tracker
from
.utils
import
(
divide
,
split_tensor_along_last_dim
,
VocabUtility
,
)
# Probe for the optional fused weight-gradient CUDA extension. The flag is
# consulted by ColumnParallelLinear before it enables gradient accumulation
# fusion; the module itself is called from the backward pass of
# LinearWithGradAccumulationAndAsyncCommunication.
_grad_accum_fusion_available = True
try:
    import fused_weight_gradient_mlp_cuda
except ImportError:
    _grad_accum_fusion_available = False
# Attributes attached to tensors to describe their tensor-parallel layout,
# with the values applied by
# set_defaults_if_not_set_tensor_model_parallel_attributes when a tensor does
# not carry them yet.
_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                                      'partition_dim': -1,
                                      'partition_stride': 1}
def param_is_not_tensor_parallel_duplicate(param):
    """Tell whether `param` should be counted on this rank.

    The result is truthy when the parameter is flagged as tensor-parallel
    (its `tensor_model_parallel` attribute is set and truthy), or otherwise
    when this process is tensor-parallel rank 0.
    """
    is_partitioned = getattr(param, 'tensor_model_parallel', False)
    return is_partitioned or (get_tensor_model_parallel_rank() == 0)
def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
    """Tag `tensor` with its tensor-parallel layout.

    Arguments:
        tensor: object to annotate; must not carry any of the attributes yet.
        is_parallel: stored as `tensor_model_parallel`.
        dim: stored as `partition_dim`.
        stride: stored as `partition_stride`.

    Raises:
        AssertionError: if one of the attributes already exists on `tensor`.
    """
    # Refuse to overwrite attributes that were set earlier.
    for name in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        already_present = hasattr(tensor, name)
        assert not already_present
    tensor.tensor_model_parallel = is_parallel
    tensor.partition_dim = dim
    tensor.partition_stride = stride
def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
    """Give `tensor` the default value for every layout attribute it lacks.

    Attributes that already exist on `tensor` are left untouched.
    """
    for name, default in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS.items():
        if not hasattr(tensor, name):
            setattr(tensor, name, default)
def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
    """Copy the layout attributes present on `source_tensor` to `destination_tensor`.

    Attributes missing on the source are skipped, so whatever the destination
    already has for them stays as it is.
    """
    for name in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        if not hasattr(source_tensor, name):
            continue
        setattr(destination_tensor, name, getattr(source_tensor, name))
def _initialize_affine_weight_gpu(weight, init_method,
                                  partition_dim, stride=1,
                                  expert_parallel=False):
    """Tag `weight` as tensor-parallel and initialize it in place.

    Arguments:
        weight: the local weight shard to initialize.
        init_method: callable invoked as `init_method(weight)`.
        partition_dim: dimension along which the full weight is partitioned.
        stride: partition stride recorded on the tensor.
        expert_parallel: accepted but not used by this function.
    """
    set_tensor_model_parallel_attributes(weight, True, partition_dim, stride)

    # Run the initializer inside the forked RNG state of the CUDA RNG tracker.
    rng_tracker = get_cuda_rng_tracker()
    with rng_tracker.fork():
        init_method(weight)
def _initialize_affine_weight_cpu(weight, output_size, input_size,
                                  per_partition_size, partition_dim,
                                  init_method, stride=1,
                                  return_master_weight=False,
                                  *, params_dtype=torch.float32):
    """Initialize affine weight for model parallel.

    The full (master) weight is built on every process and each process
    keeps only its own chunks of it.

    Arguments:
        weight: local shard that receives this rank's chunks.
        output_size, input_size: shape of the full master weight.
        per_partition_size: size of one rank's share along `partition_dim`.
        partition_dim: dimension along which the master weight is split.
        init_method: callable invoked as `init_method(master_weight)`.
        stride: number of interleaved chunks each rank owns.
        return_master_weight: when true, return the master weight.
        params_dtype: dtype the master weight is cast to after initialization.

    Returns:
        The master weight if `return_master_weight` is true, else None.
    """
    set_tensor_model_parallel_attributes(weight, True, partition_dim, stride)

    # The master weight is created and initialized in float, then cast.
    full_weight = torch.empty(output_size, input_size,
                              dtype=torch.float,
                              requires_grad=False)
    init_method(full_weight)
    full_weight = full_weight.to(dtype=params_dtype)

    # Cut the master weight into stride-sized chunks; this rank owns every
    # world_size-th chunk starting at its own rank.
    chunk_size = divide(per_partition_size, stride)
    all_chunks = torch.split(full_weight, chunk_size, dim=partition_dim)
    tp_rank = get_tensor_model_parallel_rank()
    tp_size = get_tensor_model_parallel_world_size()
    local_chunks = all_chunks[tp_rank::tp_size]

    with torch.no_grad():
        torch.cat(local_chunks, dim=partition_dim, out=weight)

    return full_weight if return_master_weight else None
class VocabParallelEmbedding(torch.nn.Module):
    """Embedding table sharded along the vocabulary dimension.

    Mirrors torch.nn.Embedding with all of its default options; every
    tensor-parallel rank stores only a contiguous range of vocabulary rows.

    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.

    Keyword Arguments:
        init_method: callable used to initialize the weight.
        config: A megatron.core.ModelParallelConfig object
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, *,
                 init_method: Callable,
                 config: ModelParallelConfig):
        super().__init__()
        # Remember the global dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Options forwarded to F.embedding, fixed at torch.nn.Embedding defaults.
        self.padding_idx = None
        self.max_norm = None
        self.norm_type = 2.
        self.scale_grad_by_freq = False
        self.sparse = False
        self._weight = None
        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
        # Range of vocabulary ids [start, end) stored on this rank.
        vocab_range = VocabUtility.vocab_range_from_global_vocab_size(
            self.num_embeddings, get_tensor_model_parallel_rank(),
            self.tensor_model_parallel_size)
        self.vocab_start_index, self.vocab_end_index = vocab_range
        self.num_embeddings_per_partition = (
            self.vocab_end_index - self.vocab_start_index)

        # Allocate the local shard, then optionally initialize it.
        shard_shape = (self.num_embeddings_per_partition, self.embedding_dim)
        if config.use_cpu_initialization:
            self.weight = Parameter(torch.empty(
                *shard_shape, dtype=config.params_dtype))
            if config.perform_initialization:
                _initialize_affine_weight_cpu(
                    self.weight, self.num_embeddings, self.embedding_dim,
                    self.num_embeddings_per_partition, 0, init_method,
                    params_dtype=config.params_dtype)
        else:
            self.weight = Parameter(torch.empty(
                *shard_shape,
                device=torch.cuda.current_device(),
                dtype=config.params_dtype))
            if config.perform_initialization:
                _initialize_affine_weight_gpu(self.weight, init_method,
                                              partition_dim=0, stride=1)

    def forward(self, input_):
        """Look up `input_` in the local shard and sum the result across ranks."""
        embedding_options = (self.padding_idx, self.max_norm,
                             self.norm_type, self.scale_grad_by_freq,
                             self.sparse)
        if self.tensor_model_parallel_size <= 1:
            # Single rank: the shard is the whole table.
            output_parallel = F.embedding(input_, self.weight, *embedding_options)
        else:
            # Ids outside this rank's range are redirected to row 0 for the
            # lookup and their embeddings are zeroed afterwards.
            outside = (input_ < self.vocab_start_index) | \
                      (input_ >= self.vocab_end_index)
            local_ids = input_.clone() - self.vocab_start_index
            local_ids[outside] = 0
            output_parallel = F.embedding(local_ids, self.weight, *embedding_options)
            output_parallel[outside, :] = 0.0
        # Reduce across all the model parallel GPUs.
        return reduce_from_tensor_model_parallel_region(output_parallel)
class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
    """See linear_with_grad_accumulation_and_async_allreduce

    Custom autograd function computing `input @ weight.t() (+ bias)`. With
    sequence parallelism the input is all-gathered along dim 0 before the
    matmul. In backward, collectives are launched asynchronously and only
    waited on after the weight gradient has been issued, so the statement
    order in `backward` is deliberate.
    """

    @staticmethod
    @custom_fwd
    def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
                async_grad_allreduce, sequence_parallel):
        # Stash what backward needs: the (un-gathered) input, the weight and
        # the three mode flags.
        ctx.save_for_backward(input, weight)
        ctx.use_bias = bias is not None
        ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
        ctx.async_grad_allreduce = async_grad_allreduce
        ctx.sequence_parallel = sequence_parallel

        if sequence_parallel:
            # Gather the input shards of all tensor-parallel ranks along
            # dim 0 into a buffer obtained from the global memory buffer
            # under the name "mpu".
            world_size = get_tensor_model_parallel_world_size()
            dim_size = list(input.size())
            dim_size[0] = dim_size[0] * world_size

            all_gather_buffer = \
                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
            torch.distributed._all_gather_base(
                all_gather_buffer,
                input,
                group=get_tensor_model_parallel_group())
            total_input = all_gather_buffer
        else:
            total_input = input

        output = torch.matmul(total_input, weight.t())
        if bias is not None:
            output = output + bias
        return output

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output):
        # Returns one gradient per forward argument: input, weight, bias and
        # None for the three flags.
        input, weight = ctx.saved_tensors
        use_bias = ctx.use_bias

        if ctx.sequence_parallel:
            # Only the local shard was saved in forward, so the full input is
            # gathered again here (asynchronously).
            world_size = get_tensor_model_parallel_world_size()
            dim_size = list(input.size())
            dim_size[0] = dim_size[0] * world_size

            all_gather_buffer = \
                get_global_memory_buffer().get_tensor(dim_size, input.dtype, "mpu")
            handle = torch.distributed._all_gather_base(
                all_gather_buffer,
                input,
                group=get_tensor_model_parallel_group(), async_op=True)

            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
            # gather is scheduled before the input gradient computation
            total_input = all_gather_buffer
        else:
            total_input = input
        grad_input = grad_output.matmul(weight)

        if ctx.sequence_parallel:
            # The gathered input is needed from here on.
            handle.wait()

        # Doing gather + slicing during the NeMo forward pass can make this tensor
        # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
        # clones it if it's not contiguous:
        # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
        grad_output = grad_output.contiguous()
        # Convert the tensor shapes to 2D for execution compatibility
        # (the indexing below reads shape[0..2], i.e. it expects 3-D tensors).
        grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                                       grad_output.shape[2])
        total_input = total_input.view(total_input.shape[0] * total_input.shape[1],
                                       total_input.shape[2])

        if ctx.async_grad_allreduce:
            # Asynchronous all-reduce
            handle = torch.distributed.all_reduce(
                grad_input, group=get_tensor_model_parallel_group(), async_op=True)
            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
            # all-reduce is scheduled before the weight gradient computation

        if ctx.sequence_parallel:
            # The two communication modes are mutually exclusive.
            assert not ctx.async_grad_allreduce
            dim_size = list(input.size())
            sub_grad_input = torch.empty(dim_size, dtype=input.dtype,
                                         device=torch.cuda.current_device(),
                                         requires_grad=False)
            # reduce_scatter
            handle = torch.distributed._reduce_scatter_base(sub_grad_input, grad_input,
                                                            group=get_tensor_model_parallel_group(),
                                                            async_op=True)
            # Here we rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that the
            # reduce scatter is scheduled before the weight gradient computation

        if ctx.gradient_accumulation_fusion:
            # The fused kernels are handed weight.main_grad directly, so no
            # separate grad_weight is returned in this mode.
            if weight.main_grad.dtype == torch.float32:
                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
            elif weight.main_grad.dtype in (torch.float16, torch.bfloat16):
                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, weight.main_grad)
            else:
                raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
            grad_weight = None
        else:
            grad_weight = grad_output.t().matmul(total_input)
        grad_bias = grad_output.sum(dim=0) if use_bias else None

        if ctx.sequence_parallel:
            # Wait for the reduce-scatter and return the local input-gradient shard.
            handle.wait()
            return sub_grad_input, grad_weight, grad_bias, None, None, None

        if ctx.async_grad_allreduce:
            # Wait for the all-reduce of grad_input launched above.
            handle.wait()

        return grad_input, grad_weight, grad_bias, None, None, None
def linear_with_grad_accumulation_and_async_allreduce(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: Optional[torch.Tensor],
    gradient_accumulation_fusion: bool,
    async_grad_allreduce: bool,
    sequence_parallel: bool,
) -> torch.Tensor:
    """Run a linear layer whose backward pass overlaps communication with compute.

    Backprop can accumulate the weight gradient straight into an existing
    gradient buffer (gradient accumulation fusion), which saves a separate
    addition kernel after the gradient calculation.

    The tensor-parallel all-reduce of the input gradients can additionally
    run asynchronously with the weight-gradient computation. With sequence
    parallelism, the reduce-scatter of the input gradients is the operation
    that runs asynchronously instead.

    This function expects the environment variable
    CUDA_DEVICE_MAX_CONNECTIONS=1. A few collectives, noted in the code,
    should be scheduled before compute kernels so that communication and
    computation overlap; this matters for speed, not for correctness, so the
    scheduler does not impose that order by itself. Setting
    CUDA_DEVICE_MAX_CONNECTIONS=1 forces kernels to be scheduled in the order
    they are called. A warning is emitted when the variable is not "1" and
    one of the two overlapping modes is requested; once a warning has been
    emitted the check is skipped on later calls.

    Arguments:

    input (torch.Tensor required): input like torch.nn.functional.linear

    weight (torch.Tensor required): weight like torch.nn.functional.linear

    bias (torch.Tensor optional): bias like torch.nn.functional.linear

    gradient_accumulation_fusion (bool required): Perform the gradient
        accumulation fusion; requires the custom CUDA extension module
        fused_weight_gradient_mlp_cuda, which is built by installing APEX
        with --cpp_ext and --cuda_ext, for example:
        pip install --global-option="--cpp_ext" --global-option="--cuda_ext" .
        The extension requires CUDA>=11; otherwise gradient accumulation
        fusion must be turned off.

    async_grad_allreduce (bool required): Do the allreduce of input
        gradients asynchronously with the computation of weight
        gradients. If sequence_parallel is True, this must be
        False, as no all reduce is performed.

    sequence_parallel (bool required): Indicates that sequence
        parallelism is used and thus in the forward pass the input is
        all gathered, and the backward pass the input gradients are
        reduce scattered.
    """
    this_fn = linear_with_grad_accumulation_and_async_allreduce

    # Evaluated once up front: both warnings below may fire in the same call.
    should_warn = (
        not this_fn.warned
        and os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1"
    )

    if should_warn and sequence_parallel:
        warnings.warn(
            "When using sequence parallelism it is recommended to set the "
            "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
            "maximum speedup")
        this_fn.warned = True

    if should_warn and async_grad_allreduce:
        warnings.warn(
            "When using async grad allreduce it is recommended to set the "
            "environment variable CUDA_DEVICE_MAX_CONNECTIONS to 1 for "
            "maximum speedup")
        this_fn.warned = True

    return LinearWithGradAccumulationAndAsyncCommunication.apply(
        input,
        weight,
        bias,
        gradient_accumulation_fusion,
        async_grad_allreduce,
        sequence_parallel,
    )


# Function attribute that remembers whether a warning was already issued.
linear_with_grad_accumulation_and_async_allreduce.warned = False
class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.

    Keyword Arguments
        bias: If true, add bias
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
        skip_bias_add: If True, do not add the bias term, instead
                       return it to be added by the caller. This
                       enables performance optimizations where bias can
                       be fused with other elementwise operations.

        skip_weight_param_allocation: If True, weight parameter is not allocated and must be passed
                                      as a keyword argument `weight` during the forward pass. Note
                                      that this does not affect bias, which will be allocated if
                                      bias is True. Defaults to False.

        config: ModelParallelConfig object

    Raises:
        RuntimeError: if gradient accumulation fusion is requested but the
            fused CUDA extension could not be imported, or if both
            `async_tensor_model_parallel_allreduce` and `sequence_parallel`
            end up enabled.
    """

    def __init__(self, input_size, output_size, *,
                 config: ModelParallelConfig,
                 init_method: Callable,
                 bias=True, gather_output=False, stride=1,
                 keep_master_weight_for_test=False,
                 skip_bias_add=False,
                 skip_weight_param_allocation: bool=False):
        super(ColumnParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
        world_size = get_tensor_model_parallel_world_size()
        self.output_size_per_partition = divide(output_size, world_size)
        self.skip_bias_add = skip_bias_add
        self.config = config

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        # Initialize weight.
        if not skip_weight_param_allocation:
            if config.use_cpu_initialization:
                self.weight = Parameter(torch.empty(self.output_size_per_partition,
                                                    self.input_size,
                                                    dtype=config.params_dtype))
                if config.perform_initialization:
                    # Forward params_dtype so the master weight is cast to the
                    # same dtype as the parameter, consistent with
                    # RowParallelLinear and VocabParallelEmbedding, instead of
                    # relying on an implicit cast from the float32 default.
                    self.master_weight = _initialize_affine_weight_cpu(
                        self.weight, self.output_size, self.input_size,
                        self.output_size_per_partition, 0, init_method,
                        stride=stride, return_master_weight=keep_master_weight_for_test,
                        params_dtype=config.params_dtype)
            else:
                self.weight = Parameter(torch.empty(
                    self.output_size_per_partition, self.input_size,
                    device=torch.cuda.current_device(), dtype=config.params_dtype))
                if config.perform_initialization:
                    _initialize_affine_weight_gpu(self.weight, init_method,
                                                  partition_dim=0, stride=stride)
        else:
            # The weight must then be supplied on every forward() call.
            self.weight = None

        if bias:
            if config.use_cpu_initialization:
                self.bias = Parameter(torch.empty(
                    self.output_size_per_partition, dtype=config.params_dtype))
            else:
                self.bias = Parameter(torch.empty(
                    self.output_size_per_partition,
                    device=torch.cuda.current_device(),
                    dtype=config.params_dtype))
            # The bias is partitioned along its only dimension, like the
            # output features of the weight.
            set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
            if config.perform_initialization:
                # Always initialize bias to zero.
                with torch.no_grad():
                    self.bias.zero_()
        else:
            self.register_parameter('bias', None)

        # Async all-reduce only makes sense with more than one rank.
        self.async_tensor_model_parallel_allreduce = (
                config.async_tensor_model_parallel_allreduce and
                world_size > 1)

        self.sequence_parallel = config.sequence_parallel
        if self.sequence_parallel and world_size <= 1:
            warnings.warn(
                f"`sequence_parallel` is set to `True`, but tensor model parallel size is {world_size}. "
                f"Disabling sequence parallel."
            )
            self.sequence_parallel = False

        if config.gradient_accumulation_fusion and not _grad_accum_fusion_available:
            raise RuntimeError(
                "ColumnParallelLinear was called with gradient_accumulation_fusion set "
                "to True but the custom CUDA extension fused_weight_gradient_mlp_cuda "
                "module is not found. To use gradient_accumulation_fusion you must "
                "install APEX with --cpp_ext and --cuda_ext. For example: "
                "pip install --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext .\" "
                "Note that the extension requires CUDA>=11. Otherwise, you must turn off "
                "gradient accumulation fusion."
            )
        self.gradient_accumulation_fusion = config.gradient_accumulation_fusion

        if self.async_tensor_model_parallel_allreduce and self.sequence_parallel:
            raise RuntimeError(
                "`async_tensor_model_parallel_allreduce` and `sequence_parallel` "
                "cannot be enabled at the same time."
            )

        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce

    def forward(self,
                input_: torch.Tensor,
                weight: Optional[torch.Tensor] = None):
        """Forward of ColumnParallelLinear

        Args:
            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]

            weight (optional): weight tensor to use, compulsory when
                skip_weight_param_allocation is True.

        Returns:
            - output
            - bias (the bias parameter when skip_bias_add is True, else None)

        Raises:
            RuntimeError: if no weight is available, or a supplied weight
                does not have shape (output_size_per_partition, input_size).
        """
        if weight is None:
            if self.weight is None:
                raise RuntimeError("weight was not supplied to ColumnParallelLinear forward pass "
                                   "and skip_weight_param_allocation is True.")
            weight = self.weight
        else:
            # Check the weight passed in is the correct shape
            expected_shape = (self.output_size_per_partition, self.input_size)
            if weight.shape != expected_shape:
                raise RuntimeError(f"supplied weight's shape is {tuple(weight.shape)}, "
                                   f"not {expected_shape} as expected")

        # When the bias add is skipped, the bias is handed back to the caller
        # instead of being applied in the matmul.
        bias = self.bias if not self.skip_bias_add else None

        if self.async_tensor_model_parallel_allreduce or \
                self.sequence_parallel:
            # In these modes the backward communication is done by
            # _forward_impl itself, so the input is passed through unchanged.
            input_parallel = input_
        else:
            input_parallel = copy_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
        output_parallel = self._forward_impl(
            input=input_parallel,
            weight=weight,
            bias=bias,
            gradient_accumulation_fusion=self.gradient_accumulation_fusion,
            async_grad_allreduce=self.async_tensor_model_parallel_allreduce,
            sequence_parallel=self.sequence_parallel
        )
        if self.gather_output:
            # All-gather across the partitions.
            assert not self.sequence_parallel
            output = gather_from_tensor_model_parallel_region(output_parallel)
        else:
            output = output_parallel
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias
class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The layer computes Y = XA + b, with A split along its first dimension
    and X along its second dimension:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.

    Keyword Arguments:
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
        skip_bias_add: If True, do not add the bias term, instead
                       return it to be added by the caller. This
                       enables performance optimations where bias can
                       be fused with other elementwise operations.
        config: ModelParallelConfig object

    Raises:
        RuntimeError: if sequence parallelism is enabled in `config` while
            `input_is_parallel` is False.
    """

    def __init__(self, input_size: int, output_size: int, *,
                 config: ModelParallelConfig,
                 init_method: Callable,
                 bias: bool = True,
                 input_is_parallel: bool = False,
                 stride: int = 1,
                 keep_master_weight_for_test: bool = False,
                 skip_bias_add: bool = False):
        super().__init__()

        # Record the constructor arguments.
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        # Each rank owns an equal share of the input features.
        tp_size = get_tensor_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, tp_size)
        self.skip_bias_add = skip_bias_add
        self.config = config
        self.gradient_accumulation_fusion = config.gradient_accumulation_fusion
        self.sequence_parallel = config.sequence_parallel
        if self.sequence_parallel and not self.input_is_parallel:
            raise RuntimeError("To enable `sequence_parallel`, `input_is_parallel` must be `True`")

        # Parameters.
        # torch.nn.functional.linear computes XA^T + b, hence the weight is
        # stored transposed: (output_size, input_size_per_partition).
        cpu_init = config.use_cpu_initialization
        weight_shape = (self.output_size, self.input_size_per_partition)
        if cpu_init:
            self.weight = Parameter(torch.empty(
                *weight_shape, dtype=config.params_dtype))
        else:
            self.weight = Parameter(torch.empty(
                *weight_shape,
                device=torch.cuda.current_device(),
                dtype=config.params_dtype))

        if config.perform_initialization:
            if cpu_init:
                self.master_weight = _initialize_affine_weight_cpu(
                    self.weight, self.output_size, self.input_size,
                    self.input_size_per_partition, 1, init_method,
                    stride=stride,
                    return_master_weight=keep_master_weight_for_test,
                    params_dtype=config.params_dtype)
            else:
                _initialize_affine_weight_gpu(self.weight, init_method,
                                              partition_dim=1, stride=stride)

        if not bias:
            self.register_parameter('bias', None)
        else:
            if cpu_init:
                self.bias = Parameter(torch.empty(
                    self.output_size, dtype=config.params_dtype))
            else:
                self.bias = Parameter(torch.empty(
                    self.output_size,
                    device=torch.cuda.current_device(),
                    dtype=config.params_dtype))
            # Record on the bias whether sequence parallelism is in use.
            self.bias.sequence_parallel = self.sequence_parallel

            if config.perform_initialization:
                # The bias always starts at zero.
                with torch.no_grad():
                    self.bias.zero_()

        self._forward_impl = linear_with_grad_accumulation_and_async_allreduce

    def forward(self, input_):
        """Forward of RowParallelLinear

        Args:
            input_: 3D tensor whose order of dimension is [sequence, batch, hidden]

        Returns:
            - output
            - bias (the bias parameter when skip_bias_add is True, else None)
        """
        # Split the input across ranks unless the caller already did.
        if not self.input_is_parallel:
            assert not self.sequence_parallel
            local_input = scatter_to_tensor_model_parallel_region(input_)
        else:
            local_input = input_

        # Local matmul; bias and communication are handled below, so the
        # helper is called without bias and with both comm modes off.
        partial_output = self._forward_impl(
            input=local_input,
            weight=self.weight,
            bias=None,
            gradient_accumulation_fusion=self.gradient_accumulation_fusion,
            async_grad_allreduce=False,
            sequence_parallel=False,
        )

        # Combine the partial results of all partitions.
        if self.sequence_parallel:
            reduced = reduce_scatter_to_sequence_parallel_region(partial_output)
        else:
            reduced = reduce_from_tensor_model_parallel_region(partial_output)

        if self.skip_bias_add:
            # Hand the bias back for the caller to apply.
            return reduced, self.bias

        if self.bias is not None:
            output = reduced + self.bias
        else:
            output = reduced
        return output, None
megatron/core/tensor_parallel/mappings.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
torch
from
megatron.core.parallel_state
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_group
,
)
from
.utils
import
split_tensor_along_last_dim
def _reduce(input_):
    """All-reduce the input tensor across model parallel group.

    The reduction happens in place and the same tensor object is returned.
    """
    # With more than one rank, all-reduce in place; a single rank has
    # nothing to combine.
    if get_tensor_model_parallel_world_size() != 1:
        torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
    return input_
def _split_along_last_dim(input_):
    """Return this rank's contiguous slice of `input_` along the last dimension."""
    tp_size = get_tensor_model_parallel_world_size()
    # A single rank keeps the whole tensor.
    if tp_size == 1:
        return input_

    pieces = split_tensor_along_last_dim(input_, tp_size)

    # The selected piece is made contiguous explicitly; torch.split does not
    # create contiguous tensors by default.
    return pieces[get_tensor_model_parallel_rank()].contiguous()
def _split_along_first_dim(input_):
    """Return this rank's contiguous slice of `input_` along the first dimension.

    Raises:
        AssertionError: if the first dimension is not divisible by the
            tensor-parallel world size.
    """
    tp_size = get_tensor_model_parallel_world_size()
    # A single rank keeps the whole tensor.
    if tp_size == 1:
        return input_

    total_rows = input_.size()[0]
    assert total_rows % tp_size == 0, \
        "First dimension of the tensor should be divisible by tensor parallel size"
    rows_per_rank = total_rows // tp_size

    # Rank r owns rows [r * rows_per_rank, (r + 1) * rows_per_rank).
    first_row = get_tensor_model_parallel_rank() * rows_per_rank
    return input_[first_row:first_row + rows_per_rank].contiguous()
def _gather_along_last_dim(input_):
    """All-gather ``input_`` from every rank and concatenate along the last dimension."""
    tp_size = get_tensor_model_parallel_world_size()
    if tp_size == 1:
        return input_

    concat_dim = input_.dim() - 1
    my_rank = get_tensor_model_parallel_rank()

    # One receive buffer per rank; this rank's slot is then pointed at the
    # input tensor itself.
    buffers = []
    for _ in range(tp_size):
        buffers.append(torch.empty_like(input_))
    buffers[my_rank] = input_
    torch.distributed.all_gather(buffers, input_, group=get_tensor_model_parallel_group())

    gathered = torch.cat(buffers, dim=concat_dim)
    return gathered.contiguous()
def _gather_along_first_dim(input_):
    """All-gather ``input_`` from every rank and concatenate along the first dimension.

    The result is written into a freshly allocated tensor on the current CUDA
    device whose first dimension is ``world_size`` times that of ``input_``.
    """
    tp_size = get_tensor_model_parallel_world_size()
    if tp_size == 1:
        return input_

    out_shape = list(input_.size())
    out_shape[0] *= tp_size

    gathered = torch.empty(
        out_shape,
        dtype=input_.dtype,
        device=torch.cuda.current_device(),
    )
    torch.distributed._all_gather_base(
        gathered,
        input_.contiguous(),
        group=get_tensor_model_parallel_group(),
    )
    return gathered
def _reduce_scatter_along_first_dim(input_):
    """Reduce-scatter ``input_`` across the tensor model parallel group.

    The first dimension must be divisible by the world size; the returned
    tensor is newly allocated on the current CUDA device and has a first
    dimension ``world_size`` times smaller than the input's.
    """
    tp_size = get_tensor_model_parallel_world_size()
    if tp_size == 1:
        return input_

    out_shape = list(input_.size())
    assert out_shape[0] % tp_size == 0, \
        "First dimension of the tensor should be divisible by tensor parallel size"
    out_shape[0] //= tp_size

    scattered = torch.empty(
        out_shape,
        dtype=input_.dtype,
        device=torch.cuda.current_device(),
    )
    torch.distributed._reduce_scatter_base(
        scattered,
        input_.contiguous(),
        group=get_tensor_model_parallel_group(),
    )
    return scattered
class
_CopyToModelParallelRegion
(
torch
.
autograd
.
Function
):
"""Pass the input to the model parallel region."""
@
staticmethod
def
symbolic
(
graph
,
input_
):
return
input_
@
staticmethod
def
forward
(
ctx
,
input_
):
return
input_
@
staticmethod
def
backward
(
ctx
,
grad_output
):
return
_reduce
(
grad_output
)
class
_ReduceFromModelParallelRegion
(
torch
.
autograd
.
Function
):
"""All-reduce the input from the model parallel region."""
@
staticmethod
def
symbolic
(
graph
,
input_
):
return
_reduce
(
input_
)
@
staticmethod
def
forward
(
ctx
,
input_
):
return
_reduce
(
input_
)
@
staticmethod
def
backward
(
ctx
,
grad_output
):
return
grad_output
class _ScatterToModelParallelRegion(torch.autograd.Function):
    """Keep only this rank's chunk along the last dimension.

    The backward pass gathers the gradient along the last dimension.
    """

    @staticmethod
    def symbolic(graph, input_):
        # Same operation as forward.
        local_chunk = _split_along_last_dim(input_)
        return local_chunk

    @staticmethod
    def forward(ctx, input_):
        local_chunk = _split_along_last_dim(input_)
        return local_chunk

    @staticmethod
    def backward(ctx, grad_output):
        full_grad = _gather_along_last_dim(grad_output)
        return full_grad
class _GatherFromModelParallelRegion(torch.autograd.Function):
    """Gather the input from all ranks and concatenate along the last dimension.

    The backward pass keeps only this rank's slice of the gradient.
    """

    @staticmethod
    def symbolic(graph, input_):
        # Same operation as forward.
        gathered = _gather_along_last_dim(input_)
        return gathered

    @staticmethod
    def forward(ctx, input_):
        gathered = _gather_along_last_dim(input_)
        return gathered

    @staticmethod
    def backward(ctx, grad_output):
        local_grad = _split_along_last_dim(grad_output)
        return local_grad
class _ScatterToSequenceParallelRegion(torch.autograd.Function):
    """Keep only this rank's chunk along the first dimension.

    The backward pass gathers the gradient along the first dimension.
    """

    @staticmethod
    def symbolic(graph, input_):
        # Same operation as forward.
        local_chunk = _split_along_first_dim(input_)
        return local_chunk

    @staticmethod
    def forward(ctx, input_):
        local_chunk = _split_along_first_dim(input_)
        return local_chunk

    @staticmethod
    def backward(ctx, grad_output):
        full_grad = _gather_along_first_dim(grad_output)
        return full_grad
class _GatherFromSequenceParallelRegion(torch.autograd.Function):
    """Gather the input from the sequence parallel region along the first dimension.

    ``tensor_parallel_output_grad`` selects how the gradient is returned to
    the sequence parallel layout in the backward pass.
    """

    @staticmethod
    def symbolic(graph, input_, tensor_parallel_output_grad=True):
        gathered = _gather_along_first_dim(input_)
        return gathered

    @staticmethod
    def forward(ctx, input_, tensor_parallel_output_grad=True):
        # Remember the flag; backward needs it to pick the right collective.
        ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
        gathered = _gather_along_first_dim(input_)
        return gathered

    @staticmethod
    def backward(ctx, grad_output):
        # If the computation graph after the gather operation is in the
        # tensor parallel mode, output gradients need to be reduce-scattered;
        # if the computation is duplicated, output gradients only need to be
        # scattered.
        if ctx.tensor_parallel_output_grad:
            grad_input = _reduce_scatter_along_first_dim(grad_output)
        else:
            grad_input = _split_along_first_dim(grad_output)
        # The second slot is the (non-existent) gradient of the boolean flag.
        return grad_input, None
class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
    """Reduce-scatter the input along the first dimension.

    The backward pass gathers the gradient along the first dimension.
    """

    @staticmethod
    def symbolic(graph, input_):
        # Same operation as forward.
        scattered = _reduce_scatter_along_first_dim(input_)
        return scattered

    @staticmethod
    def forward(ctx, input_):
        scattered = _reduce_scatter_along_first_dim(input_)
        return scattered

    @staticmethod
    def backward(ctx, grad_output):
        full_grad = _gather_along_first_dim(grad_output)
        return full_grad
# -----------------
# Helper functions.
# -----------------
def copy_to_tensor_model_parallel_region(input_):
    """Apply ``_CopyToModelParallelRegion`` to ``input_`` and return the result."""
    output = _CopyToModelParallelRegion.apply(input_)
    return output
def reduce_from_tensor_model_parallel_region(input_):
    """Apply ``_ReduceFromModelParallelRegion`` to ``input_`` and return the result."""
    output = _ReduceFromModelParallelRegion.apply(input_)
    return output
def scatter_to_tensor_model_parallel_region(input_):
    """Apply ``_ScatterToModelParallelRegion`` to ``input_`` and return the result."""
    output = _ScatterToModelParallelRegion.apply(input_)
    return output
def gather_from_tensor_model_parallel_region(input_):
    """Apply ``_GatherFromModelParallelRegion`` to ``input_`` and return the result."""
    output = _GatherFromModelParallelRegion.apply(input_)
    return output
def scatter_to_sequence_parallel_region(input_):
    """Apply ``_ScatterToSequenceParallelRegion`` to ``input_`` and return the result."""
    output = _ScatterToSequenceParallelRegion.apply(input_)
    return output
def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
    """Apply ``_GatherFromSequenceParallelRegion`` to ``input_``.

    ``tensor_parallel_output_grad`` is forwarded positionally and selects the
    backward behaviour: reduce-scatter when True, plain split when False.
    """
    output = _GatherFromSequenceParallelRegion.apply(
        input_,
        tensor_parallel_output_grad,
    )
    return output
def reduce_scatter_to_sequence_parallel_region(input_):
    """Apply ``_ReduceScatterToSequenceParallelRegion`` to ``input_`` and return the result."""
    output = _ReduceScatterToSequenceParallelRegion.apply(input_)
    return output
megatron/core/tensor_parallel/random.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch
import
contextlib
import
torch
from
torch
import
_C
from
torch.cuda
import
_lazy_call
,
device
as
device_ctx_manager
from
torch.utils.checkpoint
import
detach_variable
from
megatron.core.parallel_state
import
(
get_data_parallel_rank
,
get_tensor_model_parallel_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
)
from
.utils
import
(
split_tensor_into_1d_equal_chunks
,
gather_split_1d_tensor
,
)
from
megatron.core.utils
import
safely_set_viewless_tensor_data
# Default name for the model parallel rng tracker. It is the default `name`
# of `CudaRNGStatesTracker.fork` and the name under which
# `model_parallel_cuda_manual_seed` registers the tensor-model-parallel state.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
def _set_cuda_rng_state(new_state, device=-1):
    """Set the CUDA random number generator state without cloning it.

    Adapted from PyTorch's ``torch.cuda.set_rng_state`` with a single change:
    the input state is not cloned. Cloning caused major performance issues
    for +4 GPU cases.

    Arguments:
        new_state (torch.ByteTensor): The desired state.
        device: Target device; ``-1`` (the default), a device string, an int
            index, or an object exposing ``.index``.

    The actual state update is handed to ``torch.cuda._lazy_call``.
    """
    if callable(getattr(_C, '_cuda_setRNGState', None)):
        # older PyTorch
        def apply_state():
            with device_ctx_manager(device):
                _C._cuda_setRNGState(new_state)
    else:
        # newer PyTorch: normalise `device` to a torch.device first.
        if device == -1:
            device = torch.device('cuda')
        elif isinstance(device, str):
            device = torch.device(device)
        elif isinstance(device, int):
            device = torch.device('cuda', device)

        def apply_state():
            device_index = device.index
            if device_index is None:
                device_index = torch.cuda.current_device()
            torch.cuda.default_generators[device_index].set_state(new_state)

    _lazy_call(apply_state)
class CudaRNGStatesTracker:
    """Tracker for the cuda RNG states.

    Using the `add` method, a cuda rng state is initialized based on
    the input `seed` and is assigned to `name`. Later, by forking the
    rng state, we can perform operations and return to our starting
    cuda state.
    """

    def __init__(self):
        # Map from a string name to the cuda rng state.
        self.states_ = {}
        # Seeds are just for book keeping and ensure no seed is set twice.
        self.seeds_ = set()

    def reset(self):
        """Set to the initial state (no tracker)."""
        self.states_ = {}
        self.seeds_ = set()

    def get_states(self):
        """Return a shallow copy of the name -> rng state mapping.

        The copy holds direct references to the state tensors, so later
        re-assignments inside the tracker do not affect the returned dict.
        """
        return dict(self.states_)

    def set_states(self, states):
        """Set the rng states. For efficiency purposes, we do not check
        the size of seed for compatibility."""
        self.states_ = states

    def add(self, name, seed):
        """Create an rng state from `seed` and track it under `name`.

        Raises:
            Exception: if `seed` was already used or `name` is already tracked.
        """
        # Validate everything before mutating any bookkeeping, so a rejected
        # call (e.g. duplicate name) does not leave the seed marked as used.
        if seed in self.seeds_:
            raise Exception('seed {} already exists'.format(seed))
        if name in self.states_:
            raise Exception('cuda rng state {} already exists'.format(name))
        self.seeds_.add(seed)
        # Get the current rng state.
        orig_rng_state = torch.cuda.get_rng_state()
        # Set the new state and store it.
        torch.cuda.manual_seed(seed)
        self.states_[name] = torch.cuda.get_rng_state()
        # Reset rng state to what it was.
        _set_cuda_rng_state(orig_rng_state)

    @contextlib.contextmanager
    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
        """Fork the cuda rng state, perform operations, and exit with
        the original state.

        Raises:
            Exception: if no state has been added under `name`.
        """
        # Check if we have added the state
        if name not in self.states_:
            raise Exception('cuda rng state {} is not added'.format(name))
        # Store current rng state.
        orig_cuda_rng_state = torch.cuda.get_rng_state()
        # Set rng state to the desired one
        _set_cuda_rng_state(self.states_[name])
        # Do the stuff we wanted to do.
        try:
            yield
        finally:
            # Update the current rng state for later use.
            self.states_[name] = torch.cuda.get_rng_state()
            # And set the state to the original state we started with.
            _set_cuda_rng_state(orig_cuda_rng_state)
# RNG tracker object: the module-level tracker instance returned by
# `get_cuda_rng_tracker` and (re)initialised by
# `model_parallel_cuda_manual_seed`.
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
def get_cuda_rng_tracker():
    """Return the module-level cuda rng tracker instance."""
    tracker = _CUDA_RNG_STATE_TRACKER
    return tracker
def model_parallel_cuda_manual_seed(seed):
    """Initialize model parallel cuda seed.

    This function should be called after the model parallel is
    initialized. Also, no torch.cuda.manual_seed should be called
    after this function. Basically, this is replacement for that
    function.

    Two sets of RNG states are tracked:
        default state: This is for data parallelism and is the same among a
                       set of model parallel GPUs but different across
                       different model parallel groups. This is used for
                       example for dropout in the non-tensor-model-parallel
                       regions.
        tensor-model-parallel state: This state is different among a set of
                       model parallel GPUs, but the same across data parallel
                       groups. This is used for example for dropout in
                       model parallel regions.
    """
    # 2718 is just for fun and any POSITIVE value will work.
    tensor_model_parallel_seed = (seed + 2718) + get_tensor_model_parallel_rank()

    _CUDA_RNG_STATE_TRACKER.reset()
    # Set the default state: data parallel gets the original seed.
    torch.cuda.manual_seed(seed)
    # and model parallel state.
    _CUDA_RNG_STATE_TRACKER.add(
        _MODEL_PARALLEL_RNG_TRACKER_NAME,
        tensor_model_parallel_seed,
    )
class CheckpointFunction(torch.autograd.Function):
    """Activation-checkpointing autograd function.

    This function is adapted from torch.utils.checkpoint with
    two main changes:
        1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
        2) the states in the model parallel tracker are also properly
           tracked/set/reset.

    The forward pass runs `run_function` under `torch.no_grad()` and saves
    only the inputs; the backward pass re-runs `run_function` with the RNG
    states captured in forward and back-propagates through that recomputation.
    """
    @staticmethod
    def forward(ctx, run_function, distribute_saved_activations, *args):
        """Run `run_function(*args)` without building a graph and save `args`.

        If `distribute_saved_activations` is true, the data of `args[0]` is
        replaced (in place, on the caller's tensor) by this rank's 1D chunk
        before the inputs are saved; its original shape is kept on `ctx`.
        """
        ctx.run_function = run_function
        ctx.distribute_saved_activations \
            = distribute_saved_activations

        # Copy the rng states.
        ctx.fwd_cpu_rng_state = torch.get_rng_state()
        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        with torch.no_grad():
            outputs = run_function(*args)

        # Divide hidden states across model parallel group and only keep
        # the chunk corresponding to the current rank.
        if distribute_saved_activations:
            ctx.input_0_shape = args[0].data.shape
            safely_set_viewless_tensor_data(
                args[0],
                split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True))

        # Store everything.
        ctx.save_for_backward(*args)

        return outputs

    @staticmethod
    def backward(ctx, *args):
        """Recompute the forward pass and back-propagate `args` through it.

        `args` are the gradients w.r.t. the forward outputs. Returns
        `(None, None)` for `run_function` and `distribute_saved_activations`,
        followed by one entry per forward input (its `.grad` for tensors,
        the input itself otherwise).

        Raises:
            RuntimeError: if `torch.autograd._is_checkpoint_valid()` is false.
        """
        if not torch.autograd._is_checkpoint_valid():
            raise RuntimeError("Checkpointing is not compatible with .grad(), "
                               "please use .backward() if possible")
        inputs = ctx.saved_tensors
        if ctx.distribute_saved_activations:
            # Undo the chunking done in forward: gather the full 1D buffer
            # and view it with the original shape of the first input.
            safely_set_viewless_tensor_data(
                inputs[0],
                gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))

        # Store the current states.
        bwd_cpu_rng_state = torch.get_rng_state()
        bwd_cuda_rng_state = torch.cuda.get_rng_state()
        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        # Set the states to what it used to be before the forward pass.
        torch.set_rng_state(ctx.fwd_cpu_rng_state)
        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)

        # Compute the forward pass.
        detached_inputs = detach_variable(inputs)
        with torch.enable_grad():
            outputs = ctx.run_function(*detached_inputs)

        # Set the states back to what it was at the start of this function.
        torch.set_rng_state(bwd_cpu_rng_state)
        _set_cuda_rng_state(bwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)

        # torch.autograd.backward is called with a sequence of outputs, so a
        # single tensor result is wrapped in a tuple.
        if isinstance(outputs, torch.Tensor):
            outputs = (outputs,)
        torch.autograd.backward(outputs, args)
        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
                      for inp in detached_inputs)
        return (None, None) + grads
def checkpoint(function, distribute_saved_activations, *args):
    """Checkpoint a model or part of the model.

    Thin wrapper that forwards `function`, `distribute_saved_activations` and
    `args` to `CheckpointFunction.apply`.
    This has been directly copied from torch.utils.checkpoint.
    """
    outputs = CheckpointFunction.apply(
        function,
        distribute_saved_activations,
        *args,
    )
    return outputs
megatron/core/tensor_parallel/utils.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
torch
from
typing
import
List
,
Sequence
from
megatron.core.utils
import
divide
from
megatron.core
import
parallel_state
def split_tensor_along_last_dim(
    tensor: torch.Tensor,
    num_partitions: int,
    contiguous_split_chunks: bool = False,
) -> List[torch.Tensor]:
    """Split a tensor along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor into; the
                        chunk size is computed with `divide`.
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A sequence of Tensors, one per partition.
    """
    split_dim = tensor.dim() - 1
    chunk_width = divide(tensor.size()[split_dim], num_partitions)
    pieces = torch.split(tensor, chunk_width, dim=split_dim)

    # Note: torch.split does not create contiguous tensors by default.
    if not contiguous_split_chunks:
        return pieces
    return tuple(piece.contiguous() for piece in pieces)
def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
    """Break a tensor into equal 1D chunks across tensor parallel ranks.

    Returns a Tensor or View with this rank's portion of the data.

    Arguments:
        tensor: The tensor to split

    Keyword Arguments:
        new_buffer (bool): If True, returns a new Tensor (allocated on the
                           current CUDA device and filled by copy).
                           If False, returns a view into the existing Tensor.
                           Default is False
    """
    chunk_numel = torch.numel(tensor) // \
        parallel_state.get_tensor_model_parallel_world_size()
    first = chunk_numel * parallel_state.get_tensor_model_parallel_rank()
    last = first + chunk_numel

    if not new_buffer:
        return tensor.view(-1)[first:last]

    chunk = torch.empty(
        chunk_numel,
        dtype=tensor.dtype,
        device=torch.cuda.current_device(),
        requires_grad=False,
    )
    chunk.copy_(tensor.view(-1)[first:last])
    return chunk
def gather_split_1d_tensor(tensor):
    """Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor
    model parallel ranks.

    Returns a new 1D Tensor with the gathered data.

    Arguments:
        tensor: A Tensor or view of this rank's portion of the data.
    """
    tp_size = parallel_state.get_tensor_model_parallel_world_size()
    total_numel = torch.numel(tensor) * tp_size
    result = torch.empty(
        total_numel,
        dtype=tensor.dtype,
        device=torch.cuda.current_device(),
        requires_grad=False,
    )
    # TODO: This API is experimental in pytorch (as of Feb 2022) and
    # this might break in future pytorch releases. We chose this API
    # as opposed to torch.distributed.all_gather for efficiency reasons.
    # This API calls directly NCCL all-gather versus the former does
    # internal copies and can potentially cause slow down.
    torch.distributed._all_gather_base(
        result,
        tensor,
        group=parallel_state.get_tensor_model_parallel_group(),
    )
    return result
class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the first
    and last index of the vocabulary belonging to the `rank`
    partition. Note that the indices form the half-open range [first, last).
    """

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(
        per_partition_vocab_size: int, rank, world_size: int
    ) -> Sequence[int]:
        """Return `(first, last)` for `rank` given the size of one partition."""
        first = rank * per_partition_vocab_size
        return first, first + per_partition_vocab_size

    @staticmethod
    def vocab_range_from_global_vocab_size(
        global_vocab_size: int, rank: int, world_size: int
    ) -> Sequence[int]:
        """Return `(first, last)` for `rank` given the full vocabulary size.

        The per-partition size is `divide(global_vocab_size, world_size)`.
        """
        partition_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            partition_size, rank, world_size
        )
megatron/core/transformer/__init__.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from
.transformer_config
import
TransformerConfig
from
.core_attention
import
CoreAttention
megatron/core/transformer/attention.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from
abc
import
ABC
,
abstractmethod
from
.enums
import
AttnMaskType
from
.transformer_config
import
TransformerConfig
import
torch
from
megatron.core
import
parallel_state
,
tensor_parallel
from
megatron.core.transformer.core_attention
import
CoreAttention
from
megatron.core.utils
import
divide
from
megatron.core.transformer.module
import
MegatronModule
from
megatron.core.transformer.enums
import
AttnType
,
AttnMaskType
from
megatron.core.transformer.transformer_config
import
TransformerConfig
from
megatron.core.transformer.custom_layers.transformer_engine
import
\
TECoreAttention
,
TEColumnParallelLinear
,
TERowParallelLinear
from
megatron.core.models.common.rotary_pos_embedding
import
apply_rotary_pos_emb
class Attention(MegatronModule, ABC):
    """Attention layer abstract class.

    This layer only contains common modules required for the "self attn" and
    "cross attn" specializations. Subclasses provide
    `get_query_key_value_tensors`.
    """

    def __init__(
        self,
        config: TransformerConfig,
        layer_number: int = 1,
        attn_mask_type=AttnMaskType.padding,
    ):
        super().__init__(config=config)

        self.config = config
        self.layer_number = layer_number
        self.attn_mask_type = attn_mask_type

        self.projection_size = self.config.kv_channels * self.config.num_attention_heads

        # Per attention head and per partition values.
        world_size = parallel_state.get_tensor_model_parallel_world_size()
        self.hidden_size_per_attention_head = divide(
            self.projection_size, self.config.num_attention_heads
        )
        self.num_attention_heads_per_partition = divide(
            self.config.num_attention_heads, world_size
        )

        self.core_attention = TECoreAttention(
            config=self.config,
            layer_number=self.layer_number,
            attn_mask_type=self.attn_mask_type,
        )

        self.checkpoint_core_attention = self.config.recompute_granularity == 'selective'

        # Output.
        self.linear_proj = TERowParallelLinear(
            self.projection_size,
            self.config.hidden_size,
            config=self.config,
            init_method=self.config.output_layer_init_method,
            bias=self.config.add_bias_linear,
            skip_bias_add=True,
        )

    def _checkpointed_attention_forward(
        self, query, key, value, attention_mask, rotary_pos_emb=None
    ):
        """Forward method with selective activation checkpointing.

        `rotary_pos_emb` is handed to the checkpoint call as an extra input
        but is not consumed by the recomputed function.
        """

        def custom_forward(*inputs):
            query = inputs[0]
            key = inputs[1]
            value = inputs[2]
            attention_mask = inputs[3]
            output_ = self.core_attention(query, key, value, attention_mask)
            return output_

        hidden_states = tensor_parallel.checkpoint(
            custom_forward, False, query, key, value, attention_mask, rotary_pos_emb
        )

        return hidden_states

    def _allocate_memory(self, inference_max_sequence_len, batch_size, dtype=None):
        """Allocate an uninitialised key or value cache buffer for inference.

        The buffer has shape
        `[inference_max_sequence_len, batch_size,
          num_attention_heads_per_partition, hidden_size_per_attention_head]`
        and lives on the current CUDA device.

        Arguments:
            inference_max_sequence_len: size of the sequence dimension.
            batch_size: size of the batch dimension.
            dtype: dtype of the buffer. When omitted, `self.params_dtype` is
                used, which keeps the previous behaviour for callers that do
                not pass it.
        """
        if dtype is None:
            # NOTE(review): nothing in this class assigns `params_dtype`; this
            # fallback only works if the base class or a subclass sets it.
            dtype = self.params_dtype
        return torch.empty(
            inference_max_sequence_len,
            batch_size,
            self.num_attention_heads_per_partition,
            self.hidden_size_per_attention_head,
            dtype=dtype,
            device=torch.cuda.current_device(),
        )

    def _adjust_key_value_for_inference(self, inference_params, key, value, rotary_pos_emb):
        """
        Saves the generated key and value tensors to the end of the buffers in inference_params.
        Returns the full size keys and values from the provided inference_params, as well as
        adjusted rotary_pos_emb.

        Returns a tuple: (key, value, rotary_pos_emb)
        """
        if inference_params is None:
            return key, value, rotary_pos_emb

        # =================================================
        # Pre-allocate memory for key-values for inference.
        # =================================================
        is_first_step = False
        if self.layer_number not in inference_params.key_value_memory_dict:
            inf_max_seq_len = inference_params.max_sequence_len
            inf_max_batch_size = inference_params.max_batch_size
            # Allocate the caches with the dtype of the tensors that will be
            # copied into them, so allocation does not depend on an attribute
            # (`params_dtype`) that this class never sets.
            inference_key_memory = self._allocate_memory(
                inf_max_seq_len, inf_max_batch_size, dtype=key.dtype
            )
            inference_value_memory = self._allocate_memory(
                inf_max_seq_len, inf_max_batch_size, dtype=value.dtype
            )
            inference_params.key_value_memory_dict[self.layer_number] = (
                inference_key_memory,
                inference_value_memory,
            )
            is_first_step = True
        else:
            # Get the pre-allocated buffers for this layer
            inference_key_memory, inference_value_memory = inference_params.key_value_memory_dict[
                self.layer_number
            ]

        batch_start = inference_params.batch_size_offset
        batch_end = batch_start + key.size(1)
        assert batch_end <= inference_key_memory.size(1)
        sequence_start = inference_params.sequence_len_offset
        sequence_end = sequence_start + key.size(0)
        assert sequence_end <= inference_key_memory.size(0)
        # Copy key and values.
        inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = key
        inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = value
        key = inference_key_memory[:sequence_end, batch_start:batch_end, ...]
        value = inference_value_memory[:sequence_end, batch_start:batch_end, ...]

        # adjust the key rotary positional embedding
        if rotary_pos_emb is not None:
            q_pos_emb, k_pos_emb = rotary_pos_emb
            # need to cross check this condition during inference
            # if not set_inference_key_value_memory:
            if not is_first_step:
                # In inference, we compute one token at a time.
                # Select the correct positional embedding
                # (only the last token in the sequence)
                q_pos_emb = q_pos_emb[sequence_end - 1 : sequence_end]
            else:
                # In the first forward pass of inference,
                # we use the entire provided prefix.
                # q_pos_emb here has the rope embeddings of the entire
                # prefix + to-be-generated output so
                # we slice to just the prefix.
                q_pos_emb = q_pos_emb[:sequence_end, :, :, :]
            k_pos_emb = k_pos_emb[:sequence_end, :, :, :]
            rotary_pos_emb = (q_pos_emb, k_pos_emb)

        return key, value, rotary_pos_emb

    @abstractmethod
    def get_query_key_value_tensors(self, hidden_states, key_value_states):
        """
        This method needs to be implemented based on whether the derived class
        is "self-attn" or "cross-attn".
        """

    def forward(
        self,
        hidden_states,
        attention_mask,
        key_value_states=None,
        inference_params=None,
        rotary_pos_emb=None,
    ):
        """Compute attention and the output projection.

        Returns the `(output, bias)` pair produced by `self.linear_proj`.
        """
        # hidden_states: [sq, b, h]

        # For self attention we just duplicate the rotary_pos_emb if it isn't already
        if rotary_pos_emb is not None and not isinstance(rotary_pos_emb, tuple):
            rotary_pos_emb = (rotary_pos_emb,) * 2

        # =====================
        # Query, Key, and Value
        # =====================
        # Get the query, key and value tensors based on the type of attention -
        # self or cross attn.
        query, key, value = self.get_query_key_value_tensors(hidden_states, key_value_states)

        # ===================================================
        # Adjust key, value, and rotary_pos_emb for inference
        # ===================================================
        key, value, rotary_pos_emb = self._adjust_key_value_for_inference(
            inference_params, key, value, rotary_pos_emb
        )

        # ================================================
        # relative positional embedding (rotary embedding)
        # ================================================
        if rotary_pos_emb is not None:
            q_pos_emb, k_pos_emb = rotary_pos_emb
            query = apply_rotary_pos_emb(query, q_pos_emb)
            key = apply_rotary_pos_emb(key, k_pos_emb)
            # TODO, can apply positional embedding to value_layer so it has
            # absolute positional embedding.
            # otherwise, only relative positional embedding takes effect
            # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb)

        # ==================================
        # core attention computation
        # ==================================
        if self.checkpoint_core_attention:
            core_attn_out = self._checkpointed_attention_forward(
                query, key, value, attention_mask
            )
        else:
            core_attn_out = self.core_attention(query, key, value, attention_mask)

        # =================
        # Output. [sq, b, h]
        # =================
        output, bias = self.linear_proj(core_attn_out)

        return output, bias
class SelfAttention(Attention):
    """Self-attention layer class

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    """

    def __init__(
        self,
        config: TransformerConfig,
        layer_number: int = 1,
        attn_mask_type=AttnMaskType.padding
    ):
        super().__init__(
            config=config,
            layer_number=layer_number,
            attn_mask_type=attn_mask_type,
        )

        # Single projection whose output width is 3 * projection_size
        # (query, key and value together).
        cfg = self.config
        self.linear_qkv = TEColumnParallelLinear(
            cfg.hidden_size,
            3 * self.projection_size,
            config=cfg,
            init_method=cfg.init_method,
            bias=cfg.add_bias_linear,
            skip_bias_add=False,
        )

    def get_query_key_value_tensors(self, hidden_states, key_value_states=None):
        """
        Derives `query`, `key` and `value` tensors from `hidden_states`.
        `key_value_states` is accepted for interface compatibility and unused.
        """
        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
        fused, _ = self.linear_qkv(hidden_states)

        # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
        per_head_shape = fused.size()[:-1] + (
            self.num_attention_heads_per_partition,
            3 * self.hidden_size_per_attention_head,
        )
        fused = fused.view(*per_head_shape)

        # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
        query, key, value = tensor_parallel.split_tensor_along_last_dim(fused, 3)
        return query, key, value
class CrossAttention(Attention):
    """Cross-attention layer class

    Cross-attention layer takes input with size [s, b, h] and context with size
    [s, b, h] and returns output of the same size.
    """

    def __init__(
        self,
        config: TransformerConfig,
        layer_number: int = 1,
        attn_mask_type=AttnMaskType.padding
    ):
        super().__init__(
            config=config,
            layer_number=layer_number,
            attn_mask_type=attn_mask_type,
        )

        cfg = self.config

        # Query projection, applied to `hidden_states`.
        self.linear_q = TEColumnParallelLinear(
            cfg.hidden_size,
            self.projection_size,
            config=cfg,
            init_method=cfg.init_method,
            bias=cfg.add_bias_linear,
            skip_bias_add=False,
        )

        # Joint key/value projection, applied to `key_value_states`.
        self.linear_kv = TEColumnParallelLinear(
            cfg.hidden_size,
            2 * self.projection_size,
            config=cfg,
            init_method=cfg.init_method,
            bias=cfg.add_bias_linear,
            skip_bias_add=False,
        )

    def get_query_key_value_tensors(self, hidden_states, key_value_states):
        """
        Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
        from `key_value_states`.
        """
        heads = self.num_attention_heads_per_partition
        head_dim = self.hidden_size_per_attention_head

        # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
        fused_kv, _ = self.linear_kv(key_value_states)

        # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
        kv_shape = fused_kv.size()[:-1] + (heads, 2 * head_dim,)
        fused_kv = fused_kv.view(*kv_shape)

        # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
        key, value = tensor_parallel.split_tensor_along_last_dim(fused_kv, 2)

        # Attention head [sq, b, h] --> [sq, b, hp]
        query, _ = self.linear_q(hidden_states)

        # [sq, b, hp] --> [sq, b, np, hn]
        q_shape = query.size()[:-1] + (heads, head_dim,)
        query = query.view(*q_shape)

        return query, key, value
megatron/core/transformer/core_attention.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import
math
import
torch
from
torch
import
Tensor
from
megatron.core
import
parallel_state
,
tensor_parallel
from
megatron.core.utils
import
divide
from
megatron.core.transformer.module
import
MegatronModule
from
megatron.core.transformer.transformer_config
import
TransformerConfig
from
megatron.core.transformer.enums
import
AttnMaskType
from
megatron.core.transformer.utils
import
attention_mask_func
from
megatron.core.fusions.fused_softmax
import
FusedScaleMaskSoftmax
class CoreAttention(MegatronModule):
    """
    Region where selective activation recomputation is applied.
    This region is memory intensive but less compute intensive which
    makes activation checkpointing more efficient for LLMs (20B+).
    See Reducing Activation Recomputation in Large Transformer Models:
    https://arxiv.org/abs/2205.05198 for more details.

    We use the following notation:
     h: hidden size
     n: number of attention heads
     p: number of tensor model parallel partitions
     b: batch size
     s: sequence length
    """

    def __init__(
        self,
        config: TransformerConfig,
        layer_number: int = 1,
        attn_mask_type=AttnMaskType.padding,
    ):
        """Derive the per-partition sizes and build the softmax and dropout modules.

        Args:
            config: transformer configuration read throughout this module.
            layer_number: layer index; values below 1 are clamped to 1.
            attn_mask_type: mask type forwarded to the fused softmax.
        """
        super().__init__(config=config)

        self.config: TransformerConfig = config
        self.layer_number = max(1, layer_number)
        self.attn_mask_type = attn_mask_type

        num_heads = config.num_attention_heads
        projection_size = self.config.kv_channels * num_heads

        # Per attention head and per partition values.
        tp_world_size = parallel_state.get_tensor_model_parallel_world_size()
        self.hidden_size_per_partition = divide(projection_size, tp_world_size)
        self.hidden_size_per_attention_head = divide(projection_size, num_heads)
        self.num_attention_heads_per_partition = divide(num_heads, tp_world_size)

        # With query-key layer scaling the scores are additionally divided by
        # the layer number in the matmul, and the same coefficient is handed
        # to the softmax as its `scale`.
        softmax_scale = None
        norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.config.apply_query_key_layer_scaling:
            softmax_scale = self.layer_number
            norm_factor *= softmax_scale
        self.norm_factor = norm_factor

        self.scale_mask_softmax = FusedScaleMaskSoftmax(
            input_in_fp16=self.config.fp16,
            input_in_bf16=self.config.bf16,
            attn_mask_type=self.attn_mask_type,
            scaled_masked_softmax_fusion=self.config.masked_softmax_fusion,
            mask_func=attention_mask_func,
            softmax_in_fp32=self.config.attention_softmax_in_fp32,
            scale=softmax_scale,
        )

        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(self.config.attention_dropout)

    def forward(
        self,
        query_layer: Tensor,
        key_layer: Tensor,
        value_layer: Tensor,
        attention_mask: Tensor,
    ):
        """Compute scaled dot-product attention.

        Args:
            query_layer: query tensor, [sq, b, np, hn].
            key_layer: key tensor, [sk, b, np, hn].
            value_layer: value tensor, [sk, b, np, hn].
            attention_mask: mask passed to the fused scale-mask-softmax.

        Returns:
            Context tensor of shape [sq, b, hp].
        """
        # ===================================
        # Raw attention scores. [b, n/p, s, s]
        # ===================================

        batch = query_layer.size(1)
        heads = query_layer.size(2)
        sq = query_layer.size(0)
        sk = key_layer.size(0)
        # [b, np, sq, sk]
        scores_shape = (batch, heads, sq, sk)
        batch_heads = batch * heads

        # [sq, b, np, hn] -> [sq, b * np, hn]
        query_layer = query_layer.view(sq, batch_heads, -1)
        # [sk, b, np, hn] -> [sk, b * np, hn]
        key_layer = key_layer.view(sk, batch_heads, -1)

        # Preallocating input tensor: [b * np, sq, sk]
        score_buffer = parallel_state.get_global_memory_buffer().get_tensor(
            (batch_heads, sq, sk), query_layer.dtype, "mpu"
        )

        # Raw attention scores. [b * np, sq, sk]
        # beta=0.0 means the buffer's contents do not contribute to the result.
        raw_scores = torch.baddbmm(
            score_buffer,
            query_layer.transpose(0, 1),  # [b * np, sq, hn]
            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
            beta=0.0,
            alpha=(1.0 / self.norm_factor),
        )

        # change view to [b, np, sq, sk]
        attention_scores = raw_scores.view(*scores_shape)

        # ===========================
        # Attention probs and dropout
        # ===========================

        # attention scores and attention mask [b, np, sq, sk]
        attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        if self.config.sequence_parallel:
            attention_probs = self.attention_dropout(attention_probs)
        else:
            # Without sequence parallelism, dropout runs under the forked
            # CUDA RNG tracker state.
            with tensor_parallel.get_cuda_rng_tracker().fork():
                attention_probs = self.attention_dropout(attention_probs)

        # =========================
        # Context layer. [sq, b, hp]
        # =========================

        # value_layer -> context layer.
        # [sk, b, np, hn] --> [b, np, sq, hn]

        # context layer shape: [b, np, sq, hn]
        context_shape = (
            value_layer.size(1),
            value_layer.size(2),
            query_layer.size(0),
            value_layer.size(3),
        )
        value_batch_heads = context_shape[0] * context_shape[1]

        # change view [sk, b * np, hn]
        value_layer = value_layer.view(value_layer.size(0), value_batch_heads, -1)

        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(value_batch_heads, context_shape[2], -1)

        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))

        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*context_shape)

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, hp]
        merged_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.view(*merged_shape)

        return context_layer
megatron/core/transformer/custom_layers/transformer_engine.py
0 → 100644
View file @
d3dd8642
import
torch
import
transformer_engine
as
te
from
typing
import
Callable
from
megatron.core.transformer.transformer_config
import
TransformerConfig
from
megatron.core.transformer.enums
import
AttnMaskType
from
megatron.core.parallel_state
import
get_tensor_model_parallel_group
from
megatron.core.tensor_parallel
import
get_cuda_rng_tracker
class TELayerNorm(te.pytorch.module.LayerNorm):
    """
    Wrapper for the Transformer-Engine's `LayerNorm`.
    """

    def __init__(
        self,
        hidden_size: int,
        eps: float = 1e-5,
        sequence_parallel: bool = False,
        **kwargs,
    ):
        """Initialise the TE layer norm.

        Only `hidden_size`, `eps` and `sequence_parallel` are passed on to
        Transformer-Engine; any additional keyword arguments are accepted
        but not forwarded.
        """
        te_kwargs = dict(
            hidden_size=hidden_size,
            eps=eps,
            sequence_parallel=sequence_parallel,
        )
        super().__init__(**te_kwargs)
class TELinear(te.pytorch.module.Linear):
    """
    Wrapper for the Transformer-Engine's `Linear` layer.

    Note that if Megatron's parallel_state has not been initialized
    yet, the tp_group passed to TE will be None and must be set later
    via set_tensor_parallel_group().
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        config: TransformerConfig,
        parallel_mode: str,
        init_method: Callable,
        *,
        bias: bool = True,
        skip_bias_add: bool = False,
        **kwargs,
    ):
        """Configure the TE `Linear` layer from a Megatron `TransformerConfig`.

        Args:
            input_size: forwarded to TE as `in_features`.
            output_size: forwarded to TE as `out_features`.
            config: source of the sequence-parallel, gradient-accumulation
                fusion, tensor-parallel size and parameter dtype settings.
            parallel_mode: forwarded to TE unchanged.
            init_method: forwarded to TE unchanged.
            bias: whether the layer has a bias.
            skip_bias_add: when set together with `bias`, TE is asked to
                return the bias alongside the output.
            **kwargs: forwarded to TE unchanged.
        """
        self.config = config

        # TE returns a zero length Tensor when bias=False and
        # return_bias=True, but we prefer None.  So in that case we
        # tell TE to not return the bias, and return None
        # ourselves. This way our forward always returns two values
        # and we don't have to deal with the zero length Tensor.
        self.te_return_bias = skip_bias_add and bias

        super().__init__(
            in_features=input_size,
            out_features=output_size,
            sequence_parallel=config.sequence_parallel,
            fuse_wgrad_accumulation=config.gradient_accumulation_fusion,
            tp_group=get_tensor_model_parallel_group(check_initialized=False),
            tp_size=config.tensor_model_parallel_size,
            get_rng_state_tracker=get_cuda_rng_tracker,
            init_method=init_method,
            params_dtype=config.params_dtype,
            parallel_mode=parallel_mode,
            bias=bias,
            return_bias=self.te_return_bias,
            **kwargs,
        )

    def forward(self, x):
        """Run the TE linear layer and always return two values.

        Returns TE's own result when it was asked to return the bias,
        otherwise the pair `(output, None)`.
        """
        result = super().forward(x)

        # TE only returns a tuple when return_bias is True, otherwise
        # it returns a single Tensor, we always want to return two
        # values regardless of the arguments.
        return result if self.te_return_bias else (result, None)
class TEColumnParallelLinear(TELinear):
    """
    Wrapper for the Transformer-Engine's `Linear` layer but specialized similar
    to megatron's `ColumnParallelLinear` layer.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        config: TransformerConfig,
        **kwargs,
    ):
        """Build a `TELinear` with `parallel_mode` fixed to "column".

        All other keyword arguments are passed through to `TELinear`.
        """
        self.config = config
        super().__init__(
            input_size=input_size,
            output_size=output_size,
            config=config,
            parallel_mode="column",
            **kwargs,
        )
class TERowParallelLinear(TELinear):
    """
    Wrapper for the Transformer-Engine's `Linear` layer but specialized similar
    to megatron's `RowParallelLinear` layer.
    """

    def __init__(
        self,
        input_size: int,
        output_size: int,
        config: TransformerConfig,
        **kwargs,
    ):
        """Build a `TELinear` with `parallel_mode` fixed to "row".

        All other keyword arguments are passed through to `TELinear`.
        """
        self.config = config
        super().__init__(
            input_size=input_size,
            output_size=output_size,
            config=config,
            parallel_mode="row",
            **kwargs,
        )
class TECoreAttention(te.pytorch.transformer.DotProductAttention):
    """
    Wrapper for the Transformer-Engine's `DotProductAttention` layer that also
    has "flash attention" enabled.

    Note that if Megatron's parallel_state has not been initialized
    yet, the tp_group passed to TE will be None and must be set later
    via set_tensor_parallel_group().
    """

    def __init__(
        self,
        config: TransformerConfig,
        layer_number: int = 1,
        attn_mask_type: AttnMaskType = AttnMaskType.padding,
        **kwargs,
    ):
        """Configure TE's `DotProductAttention` from a Megatron `TransformerConfig`.

        Args:
            config: source of the head count, kv channels, attention dropout,
                sequence-parallel flag and tensor-parallel size.
            layer_number: forwarded to TE unchanged.
            attn_mask_type: enum member; its `.name` string is what TE receives.
            **kwargs: forwarded to TE unchanged.
        """
        self.config = config

        super().__init__(
            num_attention_heads=config.num_attention_heads,
            kv_channels=config.kv_channels,
            attention_dropout=config.attention_dropout,
            layer_number=layer_number,
            attn_mask_type=attn_mask_type.name,
            sequence_parallel=config.sequence_parallel,
            tp_size=config.tensor_model_parallel_size,
            get_rng_state_tracker=get_cuda_rng_tracker,
            tp_group=get_tensor_model_parallel_group(check_initialized=False),
            **kwargs,
        )
megatron/core/transformer/enums.py
0 → 100644
View file @
d3dd8642
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
enum
# TODO: can we get rid of ModelType?
# It is still being used in the pipeline schedules.
class ModelType(enum.Enum):
    """Enumeration of the two model layouts named in this module."""

    # Single stack: either an encoder or a decoder.
    encoder_or_decoder = 1
    # Both an encoder and a decoder.
    encoder_and_decoder = 2
# class LayerType(enum.Enum):
# encoder = 1
# decoder = 2
class AttnType(enum.Enum):
    """Enumeration of attention kinds: self-attention and cross-attention."""

    self_attn = 1
    cross_attn = 2
class AttnMaskType(enum.Enum):
    """Enumeration of attention mask kinds: padding and causal."""

    padding = 1
    causal = 2
Prev
1
…
7
8
9
10
11
12
13
14
15
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment