ModelZoo / GPT2_pytorch / Commits / 8ec5d678

Commit 8ec5d678, authored Apr 03, 2023 by hepj987

    GPT2 base on megatron-deepspeed

Changes: 248 files. Showing 20 changed files with 4080 additions and 0 deletions (+4080, -0).
megatron-deepspeed_dtk22.10/megatron/model/biencoder_model.py          +295  -0
megatron-deepspeed_dtk22.10/megatron/model/classification.py           +119  -0
megatron-deepspeed_dtk22.10/megatron/model/distributed.py              +218  -0
megatron-deepspeed_dtk22.10/megatron/model/fused_bias_gelu.py          +60   -0
megatron-deepspeed_dtk22.10/megatron/model/fused_layer_norm.py         +111  -0
megatron-deepspeed_dtk22.10/megatron/model/fused_softmax.py            +238  -0
megatron-deepspeed_dtk22.10/megatron/model/glu_activations.py          +52   -0
megatron-deepspeed_dtk22.10/megatron/model/gpt_model.py                +319  -0
megatron-deepspeed_dtk22.10/megatron/model/language_model.py           +505  -0
megatron-deepspeed_dtk22.10/megatron/model/module.py                   +189  -0
megatron-deepspeed_dtk22.10/megatron/model/multiple_choice.py          +130  -0
megatron-deepspeed_dtk22.10/megatron/model/positional_embeddings.py    +52   -0
megatron-deepspeed_dtk22.10/megatron/model/realm_model.py              +204  -0
megatron-deepspeed_dtk22.10/megatron/model/t5_model.py                 +174  -0
megatron-deepspeed_dtk22.10/megatron/model/transformer.py              +820  -0
megatron-deepspeed_dtk22.10/megatron/model/utils.py                    +91   -0
megatron-deepspeed_dtk22.10/megatron/model/vit_model.py                +210  -0
megatron-deepspeed_dtk22.10/megatron/mpu/__init__.py                   +67   -0
megatron-deepspeed_dtk22.10/megatron/mpu/cross_entropy.py              +110  -0
megatron-deepspeed_dtk22.10/megatron/mpu/data.py                       +116  -0

Too many changes to show. To preserve performance only 248 of 248+ files are displayed.
megatron-deepspeed_dtk22.10/megatron/model/biencoder_model.py  (new file, mode 100644)

import os
import torch
import sys

from megatron import get_args, print_rank_0
from megatron.checkpointing import fix_query_key_value_ordering
from megatron.checkpointing import get_checkpoint_tracker_filename
from megatron.checkpointing import get_checkpoint_name
from megatron import mpu, get_tokenizer
from megatron.model.bert_model import bert_position_ids
from megatron.enums import AttnMaskType
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule


def biencoder_model_provider(only_query_model=False,
                             only_context_model=False,
                             biencoder_shared_query_context_model=False):
    """Build the model."""
    args = get_args()
    assert mpu.get_tensor_model_parallel_world_size() == 1 and \
        mpu.get_pipeline_model_parallel_world_size() == 1, \
        "Model parallel size > 1 not supported for ICT"

    print_rank_0('building BiEncoderModel...')

    # simpler to just keep using 2 tokentypes since
    # the LM we initialize with has 2 tokentypes
    model = BiEncoderModel(
        num_tokentypes=2,
        parallel_output=False,
        only_query_model=only_query_model,
        only_context_model=only_context_model,
        biencoder_shared_query_context_model=\
            biencoder_shared_query_context_model)

    return model


class BiEncoderModel(MegatronModule):
    """Bert-based module for Biencoder model."""

    def __init__(self,
                 num_tokentypes=1,
                 parallel_output=True,
                 only_query_model=False,
                 only_context_model=False,
                 biencoder_shared_query_context_model=False):
        super(BiEncoderModel, self).__init__()
        args = get_args()

        bert_kwargs = dict(
            num_tokentypes=num_tokentypes,
            parallel_output=parallel_output)

        self.biencoder_shared_query_context_model = \
            biencoder_shared_query_context_model
        assert not (only_context_model and only_query_model)
        self.use_context_model = not only_query_model
        self.use_query_model = not only_context_model
        self.biencoder_projection_dim = args.biencoder_projection_dim

        if self.biencoder_shared_query_context_model:
            self.model = PretrainedBertModel(**bert_kwargs)
            self._model_key = 'shared_model'
            self.query_model, self.context_model = self.model, self.model
        else:
            if self.use_query_model:
                # this model embeds (pseudo-)queries - Embed_input in the paper
                self.query_model = PretrainedBertModel(**bert_kwargs)
                self._query_key = 'query_model'

            if self.use_context_model:
                # this model embeds evidence blocks - Embed_doc in the paper
                self.context_model = PretrainedBertModel(**bert_kwargs)
                self._context_key = 'context_model'

    def forward(self, query_tokens, query_attention_mask, query_types,
                context_tokens, context_attention_mask, context_types):
        """Run a forward pass for each of the models and
        return the respective embeddings."""

        if self.use_query_model:
            query_logits = self.embed_text(self.query_model,
                                           query_tokens,
                                           query_attention_mask,
                                           query_types)
        else:
            raise ValueError("Cannot embed query without the query model.")

        if self.use_context_model:
            context_logits = self.embed_text(self.context_model,
                                             context_tokens,
                                             context_attention_mask,
                                             context_types)
        else:
            raise ValueError("Cannot embed block without the block model.")

        return query_logits, context_logits

    @staticmethod
    def embed_text(model, tokens, attention_mask, token_types):
        """Embed a batch of tokens using the model"""
        logits = model(tokens, attention_mask, token_types)
        return logits

    def state_dict_for_save_checkpoint(self, destination=None,
                                       prefix='', keep_vars=False):
        """Save dict with state dicts of each of the models."""
        state_dict_ = {}
        if self.biencoder_shared_query_context_model:
            state_dict_[self._model_key] = \
                self.model.state_dict_for_save_checkpoint(
                    destination, prefix, keep_vars)
        else:
            if self.use_query_model:
                state_dict_[self._query_key] = \
                    self.query_model.state_dict_for_save_checkpoint(
                        destination, prefix, keep_vars)

            if self.use_context_model:
                state_dict_[self._context_key] = \
                    self.context_model.state_dict_for_save_checkpoint(
                        destination, prefix, keep_vars)

        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Load the state dicts of each of the models"""
        if self.biencoder_shared_query_context_model:
            print_rank_0("Loading shared query-context model")
            self.model.load_state_dict(state_dict[self._model_key],
                                       strict=strict)
        else:
            if self.use_query_model:
                print_rank_0("Loading query model")
                self.query_model.load_state_dict(
                    state_dict[self._query_key], strict=strict)

            if self.use_context_model:
                print_rank_0("Loading context model")
                self.context_model.load_state_dict(
                    state_dict[self._context_key], strict=strict)

    def init_state_dict_from_bert(self):
        """Initialize the state from a pretrained BERT model
        on iteration zero of ICT pretraining"""
        args = get_args()

        if args.bert_load is None:
            print_rank_0("bert-load argument is None")
            return

        tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
        if not os.path.isfile(tracker_filename):
            raise FileNotFoundError("Could not find BERT checkpoint")
        with open(tracker_filename, 'r') as f:
            iteration = int(f.read().strip())
            assert iteration > 0

        checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading BERT checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        # Load the checkpoint.
        try:
            state_dict = torch.load(checkpoint_name, map_location='cpu')
        except ModuleNotFoundError:
            from megatron.fp16_deprecated import loss_scaler
            # For backward compatibility.
            print_rank_0(' > deserializing using the old code structure ...')
            sys.modules['fp16.loss_scaler'] = sys.modules[
                'megatron.fp16_deprecated.loss_scaler']
            sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
                'megatron.fp16_deprecated.loss_scaler']
            state_dict = torch.load(checkpoint_name, map_location='cpu')
            sys.modules.pop('fp16.loss_scaler', None)
            sys.modules.pop('megatron.fp16.loss_scaler', None)
        except BaseException:
            print_rank_0('could not load the BERT checkpoint')
            sys.exit()

        checkpoint_version = state_dict.get('checkpoint_version', 0)

        # load the LM state dict into each model
        model_dict = state_dict['model']['language_model']

        if self.biencoder_shared_query_context_model:
            self.model.language_model.load_state_dict(model_dict)
            fix_query_key_value_ordering(self.model, checkpoint_version)
        else:
            if self.use_query_model:
                self.query_model.language_model.load_state_dict(model_dict)
                # give each model the same ict_head to begin with as well
                if self.biencoder_projection_dim > 0:
                    query_proj_state_dict = \
                        self.state_dict_for_save_checkpoint()\
                        [self._query_key]['projection_enc']
                fix_query_key_value_ordering(self.query_model,
                                             checkpoint_version)

            if self.use_context_model:
                self.context_model.language_model.load_state_dict(model_dict)
                if self.query_model is not None and \
                        self.biencoder_projection_dim > 0:
                    self.context_model.projection_enc.load_state_dict\
                        (query_proj_state_dict)
                fix_query_key_value_ordering(self.context_model,
                                             checkpoint_version)


class PretrainedBertModel(MegatronModule):
    """BERT-based encoder for queries or contexts used for
    learned information retrieval."""

    def __init__(self, num_tokentypes=2, parallel_output=True):
        super(PretrainedBertModel, self).__init__()

        args = get_args()
        tokenizer = get_tokenizer()
        self.pad_id = tokenizer.pad
        self.biencoder_projection_dim = args.biencoder_projection_dim
        self.parallel_output = parallel_output
        init_method = init_method_normal(args.init_method_std)
        scaled_init_method = scaled_init_method_normal(
            args.init_method_std, args.num_layers)

        self.language_model, self._language_model_key = get_language_model(
            num_tokentypes=num_tokentypes,
            add_pooler=False,
            encoder_attn_mask_type=AttnMaskType.padding,
            init_method=init_method,
            scaled_init_method=scaled_init_method)

        if args.biencoder_projection_dim > 0:
            self.projection_enc = get_linear_layer(
                args.hidden_size, args.biencoder_projection_dim, init_method)
            self._projection_enc_key = 'projection_enc'

    def forward(self, input_ids, attention_mask, tokentype_ids=None):
        extended_attention_mask = attention_mask.unsqueeze(1)
        #extended_attention_mask = bert_extended_attention_mask(attention_mask)
        position_ids = bert_position_ids(input_ids)

        lm_output = self.language_model(input_ids,
                                        position_ids,
                                        extended_attention_mask,
                                        tokentype_ids=tokentype_ids)
        # This mask will be used in average-pooling and max-pooling
        pool_mask = (input_ids == self.pad_id).unsqueeze(2)

        # Taking the representation of the [CLS] token of BERT
        pooled_output = lm_output[:, 0, :]

        # Converting to float16 dtype
        pooled_output = pooled_output.to(lm_output.dtype)

        # Output.
        if self.biencoder_projection_dim:
            pooled_output = self.projection_enc(pooled_output)

        return pooled_output

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
        add an extra key."""

        state_dict_ = {}
        state_dict_[self._language_model_key] \
            = self.language_model.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)

        if self.biencoder_projection_dim > 0:
            state_dict_[self._projection_enc_key] = \
                self.projection_enc.state_dict(destination, prefix, keep_vars)

        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
        print_rank_0("loading BERT weights")
        self.language_model.load_state_dict(
            state_dict[self._language_model_key], strict=strict)

        if self.biencoder_projection_dim > 0:
            print_rank_0("loading projection head weights")
            self.projection_enc.load_state_dict(
                state_dict[self._projection_enc_key], strict=strict)
megatron-deepspeed_dtk22.10/megatron/model/classification.py  (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classification model."""

import torch

from megatron import get_args, print_rank_last
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule


class Classification(MegatronModule):

    def __init__(self,
                 num_classes,
                 num_tokentypes=2,
                 pre_process=True,
                 post_process=True):
        super(Classification, self).__init__(share_word_embeddings=False)
        args = get_args()

        self.num_classes = num_classes
        self.pre_process = pre_process
        self.post_process = post_process
        init_method = init_method_normal(args.init_method_std)

        self.language_model, self._language_model_key = get_language_model(
            num_tokentypes=num_tokentypes,
            add_pooler=True,
            encoder_attn_mask_type=AttnMaskType.padding,
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers),
            pre_process=self.pre_process,
            post_process=self.post_process)

        # Multi-choice head.
        if self.post_process:
            self.classification_dropout = torch.nn.Dropout(args.hidden_dropout)
            self.classification_head = get_linear_layer(args.hidden_size,
                                                        self.num_classes,
                                                        init_method)
            self._classification_head_key = 'classification_head'

    def set_input_tensor(self, input_tensor):
        """See megatron.model.transformer.set_input_tensor()"""
        self.language_model.set_input_tensor(input_tensor)

    def forward(self, model_input, attention_mask, tokentype_ids=None):

        extended_attention_mask = bert_extended_attention_mask(attention_mask)
        input_ids = model_input
        position_ids = bert_position_ids(input_ids)

        lm_output = self.language_model(input_ids,
                                        position_ids,
                                        extended_attention_mask,
                                        tokentype_ids=tokentype_ids)

        if self.post_process:
            _, pooled_output = lm_output
            classification_output = self.classification_dropout(pooled_output)
            classification_logits = self.classification_head(classification_output)

            # Reshape back to separate choices.
            classification_logits = classification_logits.view(-1, self.num_classes)

            return classification_logits
        return lm_output

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
        add an extra key."""

        state_dict_ = {}
        state_dict_[self._language_model_key] \
            = self.language_model.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        if self.post_process:
            state_dict_[self._classification_head_key] \
                = self.classification_head.state_dict(
                    destination, prefix, keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

        self.language_model.load_state_dict(
            state_dict[self._language_model_key], strict=strict)
        if self.post_process:
            if self._classification_head_key in state_dict:
                self.classification_head.load_state_dict(
                    state_dict[self._classification_head_key], strict=strict)
            else:
                print_rank_last('***WARNING*** could not find {} in the checkpoint, '
                                'initializing to random'.format(
                                    self._classification_head_key))
megatron-deepspeed_dtk22.10/megatron/model/distributed.py  (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC
from abc import abstractmethod

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from megatron import get_args
from megatron import mpu
from .module import MegatronModule


class MemoryBuffer:

    def __init__(self, numel, dtype):
        self.numel = numel
        self.dtype = dtype
        self.data = torch.zeros(self.numel,
                                dtype=self.dtype,
                                device=torch.cuda.current_device(),
                                requires_grad=False)

    def zero(self):
        """Reset the buffer to zero."""
        self.data.zero_()

    def get(self, shape, start_index):
        """Return a tensor with the input `shape` as a view into the
        1-D data starting at `start_index`."""
        end_index = start_index + shape.numel()
        assert end_index <= self.numel, \
            'requested tensor is out of the buffer range.'
        buffer_tensor = self.data[start_index:end_index]
        buffer_tensor = buffer_tensor.view(shape)
        return buffer_tensor


class DistributedDataParallelBase(MegatronModule, ABC):
    """Abstract class for DDP."""

    def __init__(self, module):
        super(DistributedDataParallelBase, self).__init__()
        # Keep a pointer to the model.
        self.module = module

    @abstractmethod
    def allreduce_gradients(self):
        pass

    def forward(self, *inputs, **kwargs):
        return self.module(*inputs, **kwargs)

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.module.state_dict(destination, prefix, keep_vars)

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        return self.module.state_dict_for_save_checkpoint(destination, prefix,
                                                          keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)


class DistributedDataParallel(DistributedDataParallelBase):
    """DDP with contiguous buffers options to store and accumulate gradients.
    This class:
        - has the potential to reduce memory fragmentation.
        - provides the option to do the gradient accumulation
          in a type other than the params type (for example fp32)

    Arguments:
        module: input model.
        accumulate_allreduce_grads_in_fp32: if true do the gradient accumulation
            and the gradient all-reduce all in float32. If this option is
            true, we require `use_contiguous_buffers` to be true too.
        use_contiguous_buffers: if true, use a contiguous buffer to store the
            gradients.
    """

    def __init__(self, module,
                 accumulate_allreduce_grads_in_fp32,
                 use_contiguous_buffers):

        super(DistributedDataParallel, self).__init__(module)

        self.accumulate_allreduce_grads_in_fp32 \
            = accumulate_allreduce_grads_in_fp32
        self.use_contiguous_buffers = use_contiguous_buffers
        # If we are using fp32-accumulate-allreduce explicitly
        # this means we need main grads in a contiguous buffer.
        if self.accumulate_allreduce_grads_in_fp32:
            assert self.use_contiguous_buffers

        # ===================================
        # Rest of this part applies only to
        # the case we use contiguous buffers.
        # ===================================
        self._grad_buffers = None
        if self.use_contiguous_buffers:
            self._grad_buffers = {}

            # Simple function to define buffer type.
            def _get_buffer_type(param):
                return torch.float if \
                    self.accumulate_allreduce_grads_in_fp32 else param.dtype

            # First calculate total number of elements per type.
            type_num_elements = {}
            for param in self.module.parameters():
                if param.requires_grad:
                    dtype = _get_buffer_type(param)
                    type_num_elements[dtype] = type_num_elements.get(dtype, 0) \
                                               + param.data.nelement()

            # Allocate the buffer.
            for dtype, num_elements in type_num_elements.items():
                self._grad_buffers[dtype] = MemoryBuffer(num_elements, dtype)

            # Assume the back prop order is the reverse of the params order,
            # store the start index for the gradients.
            for param in self.module.parameters():
                if param.requires_grad:
                    dtype = _get_buffer_type(param)
                    type_num_elements[dtype] -= param.data.nelement()
                    param.main_grad = self._grad_buffers[dtype].get(
                        param.data.shape, type_num_elements[dtype])

            # Backward hook.
            # Accumulation function for the gradients. We need
            # to store them so they don't go out of scope.
            self.grad_accs = []
            # Loop over all the parameters in the model.
            for param in self.module.parameters():
                if param.requires_grad:
                    # Expand so we get access to grad_fn.
                    param_tmp = param.expand_as(param)
                    # Get the gradient accumulator function.
                    grad_acc = param_tmp.grad_fn.next_functions[0][0]
                    grad_acc.register_hook(self._make_param_hook(param))
                    self.grad_accs.append(grad_acc)

    def _make_param_hook(self, param):
        """Create the all-reduce hook for backprop."""
        # Hook used for back-prop.
        def param_hook(*unused):
            # Add the gradient to the buffer.
            if param.grad.data is not None:
                param.main_grad.add_(param.grad.data)
                # Now we can deallocate grad memory.
                param.grad = None
        return param_hook

    def zero_grad_buffer(self):
        """Set the grad buffer data to zero. Needs to be called at the
        beginning of each iteration."""
        assert self._grad_buffers is not None, 'buffers are not initialized.'
        for _, buffer_ in self._grad_buffers.items():
            buffer_.zero()

    def allreduce_gradients(self):
        """Reduce gradients across data parallel ranks."""
        # If we have buffers, simply reduce the data in the buffer.
        if self._grad_buffers is not None:
            for _, buffer_ in self._grad_buffers.items():
                buffer_.data /= mpu.get_data_parallel_world_size()
                torch.distributed.all_reduce(
                    buffer_.data, group=mpu.get_data_parallel_group())
        else:
            # Otherwise, bucketize and all-reduce
            buckets = {}
            # Pack the buckets.
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
                    param.main_grad = param.grad

            # For each bucket, all-reduce and copy all-reduced grads.
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                coalesced /= mpu.get_data_parallel_world_size()
                torch.distributed.all_reduce(
                    coalesced, group=mpu.get_data_parallel_group())
                for buf, synced in zip(grads,
                        _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)
megatron-deepspeed_dtk22.10/megatron/model/fused_bias_gelu.py  (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)

###### BIAS GELU FUSION/ NO AUTOGRAD ################
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2)   -> 0.70710678
# sqrt(2/pi)  -> 0.79788456
# this function is tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))

@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) \
        + 0.5 * (1 + tanh_out)
    return ff * g


class GeLUFunction(torch.autograd.Function):
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, bias):
        ctx.save_for_backward(input, bias)
        return bias_gelu(bias, input)

    @staticmethod
    def backward(ctx, grad_output):
        input, bias = ctx.saved_tensors
        tmp = bias_gelu_back(grad_output, bias, input)
        return tmp, tmp


bias_gelu_impl = GeLUFunction.apply
megatron-deepspeed_dtk22.10/megatron/model/fused_layer_norm.py  (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This code is copied fron NVIDIA apex:
https://github.com/NVIDIA/apex
with some changes. """
import
numbers
from
megatron
import
get_args
from
megatron
import
mpu
from
packaging
import
version
from
torch
import
nn
from
torch.nn
import
init
from
torch.nn.parameter
import
Parameter
import
importlib
import
torch
import
torch.nn.functional
as
F
global
fused_mix_prec_layer_norm_cuda
fused_mix_prec_layer_norm_cuda
=
None
class
FusedLayerNormAffineFunction
(
torch
.
autograd
.
Function
):
@
staticmethod
def
forward
(
ctx
,
input
,
weight
,
bias
,
normalized_shape
,
eps
):
ctx
.
normalized_shape
=
normalized_shape
ctx
.
eps
=
eps
input_
=
input
.
contiguous
()
weight_
=
weight
.
contiguous
()
bias_
=
bias
.
contiguous
()
output
,
mean
,
invvar
=
fused_mix_prec_layer_norm_cuda
.
forward_affine
(
input_
,
ctx
.
normalized_shape
,
weight_
,
bias_
,
ctx
.
eps
)
ctx
.
save_for_backward
(
input_
,
weight_
,
bias_
,
mean
,
invvar
)
return
output
@
staticmethod
def
backward
(
ctx
,
grad_output
):
input_
,
weight_
,
bias_
,
mean
,
invvar
=
ctx
.
saved_tensors
grad_input
=
grad_weight
=
grad_bias
=
None
grad_input
,
grad_weight
,
grad_bias
\
=
fused_mix_prec_layer_norm_cuda
.
backward_affine
(
grad_output
.
contiguous
(),
mean
,
invvar
,
input_
,
ctx
.
normalized_shape
,
weight_
,
bias_
,
ctx
.
eps
)
return
grad_input
,
grad_weight
,
grad_bias
,
None
,
None
class
MixedFusedLayerNorm
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
normalized_shape
,
eps
=
1e-5
):
super
(
MixedFusedLayerNorm
,
self
).
__init__
()
global
fused_mix_prec_layer_norm_cuda
fused_mix_prec_layer_norm_cuda
=
importlib
.
import_module
(
"fused_mix_prec_layer_norm_cuda"
)
if
isinstance
(
normalized_shape
,
numbers
.
Integral
):
normalized_shape
=
(
normalized_shape
,)
self
.
normalized_shape
=
torch
.
Size
(
normalized_shape
)
self
.
eps
=
eps
self
.
weight
=
Parameter
(
torch
.
Tensor
(
*
normalized_shape
))
self
.
bias
=
Parameter
(
torch
.
Tensor
(
*
normalized_shape
))
self
.
reset_parameters
()
args
=
get_args
()
self
.
layernorm_tp_auto_sync
=
args
.
sync_tp_duplicated_parameters
self
.
use_meg_ds_fused_layer_norm
=
(
args
.
bf16
# Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
or
version
.
parse
(
torch
.
__version__
)
>=
version
.
parse
(
"1.11.0"
)
# https://github.com/pytorch/pytorch/pull/66920
)
def
reset_parameters
(
self
):
init
.
ones_
(
self
.
weight
)
init
.
zeros_
(
self
.
bias
)
def
forward
(
self
,
input
):
if
self
.
layernorm_tp_auto_sync
:
torch
.
distributed
.
all_reduce
(
self
.
weight
,
op
=
torch
.
distributed
.
ReduceOp
.
AVG
,
group
=
mpu
.
get_tensor_model_parallel_group
())
torch
.
distributed
.
all_reduce
(
self
.
bias
,
op
=
torch
.
distributed
.
ReduceOp
.
AVG
,
group
=
mpu
.
get_tensor_model_parallel_group
())
if
self
.
use_meg_ds_fused_layer_norm
:
return
FusedLayerNormAffineFunction
.
apply
(
input
,
self
.
weight
,
self
.
bias
,
self
.
normalized_shape
,
self
.
eps
)
else
:
return
F
.
layer_norm
(
input
,
self
.
normalized_shape
,
self
.
weight
,
self
.
bias
)
megatron-deepspeed_dtk22.10/megatron/model/fused_softmax.py  (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from functools import lru_cache

import torch
import torch.nn as nn

from megatron.enums import AttnMaskType


class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    """
    Fused operation which performs following three operations in sequence
    1. Scale the tensor.
    2. Apply upper triangular mask (typically used in gpt models).
    3. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, scale):
        import scaled_upper_triang_masked_softmax_cuda

        scale_t = torch.tensor([scale])
        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
            inputs, scale_t[0])

        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        import scaled_upper_triang_masked_softmax_cuda

        softmax_results, scale_t = ctx.saved_tensors
        input_grads = scaled_upper_triang_masked_softmax_cuda.backward(
            output_grads, softmax_results, scale_t[0])

        return input_grads, None


class ScaledMaskedSoftmax(torch.autograd.Function):
    """
    Fused operation which performs following three operations in sequence
    1. Scale the tensor.
    2. Apply the mask.
    3. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, mask, scale):
        import scaled_masked_softmax_cuda

        scale_t = torch.tensor([scale])

        softmax_results = scaled_masked_softmax_cuda.forward(inputs, mask, scale_t[0])
        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        import scaled_masked_softmax_cuda

        softmax_results, scale_t = ctx.saved_tensors

        input_grads = scaled_masked_softmax_cuda.backward(
            output_grads, softmax_results, scale_t[0])
        return input_grads, None, None


class ScaledSoftmax(torch.autograd.Function):
    """
    Fused operation which performs following two operations in sequence
    1. Scale the tensor.
    2. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, scale):
        import scaled_softmax_cuda

        scale_t = torch.tensor([scale])

        softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        import scaled_softmax_cuda

        softmax_results, scale_t = ctx.saved_tensors

        input_grads = scaled_softmax_cuda.backward(
            output_grads, softmax_results, scale_t[0])
        return input_grads, None, None


class FusedScaleMaskSoftmax(nn.Module):
    """
    fused operation: scaling + mask + softmax

    Arguments:
        input_in_fp16: flag to indicate if input is in fp16 data format.
        input_in_bf16: flag to indicate if input is in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate the user wants to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax is performed at fp32 precision.
        scale: scaling factor used in input tensor scaling.
    """

    def __init__(
        self,
        input_in_fp16,
        input_in_bf16,
        attn_mask_type,
        scaled_masked_softmax_fusion,
        mask_func,
        softmax_in_fp32,
        scale,
    ):
        super(FusedScaleMaskSoftmax, self).__init__()
        self.input_in_fp16 = input_in_fp16
        self.input_in_bf16 = input_in_bf16
        assert not (
            self.input_in_fp16 and self.input_in_bf16
        ), "both fp16 and bf16 flags cannot be active at the same time."
        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
        self.attn_mask_type = attn_mask_type
        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
        self.mask_func = mask_func
        self.softmax_in_fp32 = softmax_in_fp32
        self.scale = scale

        assert (
            self.scale is None or softmax_in_fp32
        ), "softmax should be in fp32 when scaled"

    def forward(self, input, mask):
        # [b, np, sq, sk]
        assert input.dim() == 4

        if self.is_kernel_available(mask, *input.size()):
            return self.forward_fused_softmax(input, mask)
        else:
            return self.forward_torch_softmax(input, mask)

    def is_kernel_available(self, mask, b, np, sq, sk):
        attn_batches = b * np

        if (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16
            and 16 < sk <= 4096  # sk must be 16 ~ 4096
            and sq % 4 == 0  # sq must be a multiple of 4
            and attn_batches % 4 == 0  # np * b must be a multiple of 4
        ):
            if 0 <= sk <= 4096:
                batch_per_block = self.get_batch_per_block(sq, sk, b, np)

                if self.attn_mask_type == AttnMaskType.causal:
                    if attn_batches % batch_per_block == 0:
                        return True
                else:
                    if sq % batch_per_block == 0:
                        return True
        return False

    def forward_fused_softmax(self, input, mask):
        b, np, sq, sk = input.size()
        scale = self.scale if self.scale is not None else 1.0

        if self.attn_mask_type == AttnMaskType.causal:
            assert sq == sk, "causal mask is only for self attention"
            # assert mask is None, "Mask is silently ignored due to the use of a custom kernel"

            # input is 3D tensor (attn_batches, sq, sk)
            input = input.view(-1, sq, sk)
            probs = ScaledUpperTriangMaskedSoftmax.apply(input, scale)
            return probs.view(b, np, sq, sk)
        else:
            # input is 4D tensor (b, np, sq, sk)
            if mask is not None:
                return ScaledMaskedSoftmax.apply(input, mask, scale)
            else:
                return ScaledSoftmax.apply(input, scale)

    @staticmethod
    @lru_cache(maxsize=1)
    def get_causal_mask(sequence_length: int):
        mask = torch.ones(1, 1, sequence_length, sequence_length,
                          dtype=torch.bool,
                          device=torch.cuda.current_device())
        return torch.triu(mask, diagonal=1)

    def forward_torch_softmax(self, input, mask):
        if self.input_in_float16 and self.softmax_in_fp32:
            input = input.float()

        if self.scale is not None:
            input = input * self.scale

        if self.attn_mask_type == AttnMaskType.causal:
            # assert mask is None
            assert input.shape[2] == input.shape[3]
            mask = self.get_causal_mask(input.shape[2])

        mask_output = self.mask_func(input, mask) if mask is not None else input
        probs = torch.nn.Softmax(dim=-1)(mask_output)

        if self.input_in_float16 and self.softmax_in_fp32:
            if self.input_in_fp16:
                probs = probs.half()
            else:
                probs = probs.bfloat16()

        return probs

    @staticmethod
    def get_batch_per_block(sq, sk, b, np):
        import scaled_masked_softmax_cuda

        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
megatron-deepspeed_dtk22.10/megatron/model/glu_activations.py  (new file, mode 100644)

import torch
from torch import nn
from torch.nn import functional as F

from megatron import logging
from megatron.model.utils import log_debug_usage

logger = logging.get_logger(__name__)


class _GLUBaseModule(nn.Module):
    def __init__(self, activation_fn):
        super().__init__()
        self.activation_fn = activation_fn

    def forward(self, x):
        # dim=-1 breaks in jit for pt<1.10
        x1, x2 = x.chunk(2, dim=(x.ndim - 1))
        return x1 * self.activation_fn(x2)


class LiGLU(_GLUBaseModule):
    def __init__(self):
        super().__init__(nn.Identity())


class GEGLU(_GLUBaseModule):
    def __init__(self):
        super().__init__(F.gelu)


class ReGLU(_GLUBaseModule):
    def __init__(self):
        super().__init__(F.relu)


class SwiGLU(_GLUBaseModule):
    def __init__(self):
        super().__init__(F.silu)


liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU()))
geglu = log_debug_usage(logger, "Using GLU activation: GELU.")(torch.jit.script(GEGLU()))
reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU()))
swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU()))


GLU_ACTIVATIONS = {
    "geglu": geglu,
    "liglu": liglu,
    "reglu": reglu,
    "swiglu": swiglu,
}
megatron-deepspeed_dtk22.10/megatron/model/gpt_model.py  (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT-2 model."""

from functools import partial

import torch

from megatron import get_args
from megatron import mpu
from megatron.enums import AttnMaskType
from .module import MegatronModule, fp32_to_float16

from .language_model import parallel_lm_logits
from .language_model import get_language_model
from .utils import init_method_normal
from .utils import scaled_init_method_normal

from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec
from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from megatron.model.module import float16_to_fp32
from .language_model import EmbeddingPipe
from .transformer import ParallelTransformerLayerPipe


def post_language_model_processing(lm_output, labels, logit_weights,
                                   get_key_value, parallel_output,
                                   forward_method_parallel_output,
                                   fp16_lm_cross_entropy):
    if get_key_value:
        lm_output, presents = lm_output

    # Output.
    if forward_method_parallel_output is not None:
        parallel_output = forward_method_parallel_output
    output = parallel_lm_logits(
        lm_output,
        logit_weights,
        parallel_output)

    if get_key_value:
        output = [output, presents]

    if labels is None:
        return output
    else:
        if fp16_lm_cross_entropy:
            assert output.dtype == torch.half
            loss = mpu.vocab_parallel_cross_entropy(output, labels)
        else:
            loss = mpu.vocab_parallel_cross_entropy(output.float(), labels)
        return loss


class GPTModel(MegatronModule):
    """GPT-2 Language model."""

    def __init__(self,
                 num_tokentypes=0,
                 parallel_output=True,
                 pre_process=True,
                 post_process=True,
                 prefix_lm=False):
        super(GPTModel, self).__init__()
        args = get_args()

        self.parallel_output = parallel_output
        self.pre_process = pre_process
        self.post_process = post_process
        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy

        self.language_model, self._language_model_key = get_language_model(
            num_tokentypes=num_tokentypes,
            add_pooler=False,
            # TODO: Change naming of class from GPT to something that encapsulate prefix lm.
            encoder_attn_mask_type=AttnMaskType.prefix if prefix_lm else AttnMaskType.causal,
            init_method=init_method_normal(args.init_method_std),
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers),
            pre_process=self.pre_process,
            post_process=self.post_process)

        self.initialize_word_embeddings(init_method_normal)

    def set_input_tensor(self, input_tensor):
        """See megatron.model.transformer.set_input_tensor()"""
        self.language_model.set_input_tensor(input_tensor)

    def forward(self, input_ids, position_ids, attention_mask, labels=None,
                tokentype_ids=None, layer_past=None, get_key_value=False,
                forward_method_parallel_output=None, curriculum_seqlen=None):
        if curriculum_seqlen is not None:
            args = get_args()
            args.curriculum_seqlen = curriculum_seqlen
            if curriculum_seqlen < input_ids.size()[1]:
                # seqlen-based curriculum learning
                # input_ids, position_ids, labels have size [batch size, seqlen]
                input_ids = input_ids[:, :curriculum_seqlen].contiguous()
                position_ids = position_ids[:, :curriculum_seqlen].contiguous()
                labels = labels[:, :curriculum_seqlen].contiguous()

                # attention_mask has size [1, 1, seqlen, seqlen]
                attention_mask = attention_mask[:, :, :curriculum_seqlen,
                                                :curriculum_seqlen].contiguous()

        lm_output = self.language_model(input_ids,
                                        position_ids,
                                        attention_mask,
                                        layer_past=layer_past,
                                        get_key_value=get_key_value)

        if self.post_process:
            return post_language_model_processing(
                lm_output, labels,
                self.word_embeddings_weight(),
                get_key_value,
                self.parallel_output,
                forward_method_parallel_output,
                self.fp16_lm_cross_entropy)
        else:
            return lm_output

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):

        state_dict_ = {}
        state_dict_[self._language_model_key] \
            = self.language_model.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        # Save word_embeddings.
        if self.post_process and not self.pre_process:
            state_dict_[self._word_embeddings_for_head_key] \
                = self.word_embeddings.state_dict(destination, prefix, keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

        # Load word_embeddings.
        if self.post_process and not self.pre_process:
            self.word_embeddings.load_state_dict(
                state_dict[self._word_embeddings_for_head_key], strict=strict)
        if self._language_model_key in state_dict:
            state_dict = state_dict[self._language_model_key]
        self.language_model.load_state_dict(state_dict, strict=strict)


def get_cross_entropy(is_prefix: bool):
    def CrossEntropy(output, labels):
        labels, loss_mask = labels[0], labels[1]

        args = get_args()

        losses = mpu.vocab_parallel_cross_entropy(output.contiguous().float(), labels)

        if is_prefix:
            micro_batch_size, sequence_length = loss_mask.shape
            average_tokens_per_sample: torch.Tensor
            if args.loss_on_targets_only:
                # HACK: This is useful when we obtain loss masks that are microbatch dependent. Consequently, if we want to
                #   preserve the notion that all tokens have the same impact on the loss, we can only normalise using a
                #   microbatch independent value. It should be expected weight over a microbatch.
                #   Here we still use `sequence_length`, that's batch size dependent, in order to be backwards compatible with
                #   current experiment on vanilla gpt.
                if args.reweight_loss_based_on_position_frequency:
                    reweight = torch.arange(
                        sequence_length, 0, -1, dtype=torch.float,
                        device=loss_mask.device) / (sequence_length + 1) * 2
                    average_tokens_per_sample = reweight.flip(-1).cumsum(-1).mean()
                else:
                    average_tokens_per_sample = (sequence_length + 1) / 2
            else:
                average_tokens_per_sample = sequence_length
            expected_number_of_tokens = average_tokens_per_sample * micro_batch_size
        else:
            expected_number_of_tokens = loss_mask.sum()

        loss_mask = loss_mask.view(-1)
        loss = torch.sum(losses.view(-1) * loss_mask) / expected_number_of_tokens
        return loss
    return CrossEntropy


class GPTModelPipe(PipelineModule, MegatronModule):
    """GPT-2 Language model."""

    def __init__(self,
                 num_tokentypes=0,
                 parallel_output=True,
                 attn_mask_type: AttnMaskType = AttnMaskType.causal):
        args = get_args()
        self.parallel_output = parallel_output

        init_method = init_method_normal(args.init_method_std)

        self.specs = []

        def _to_float16(inputs):
            if args.fp16:
                return fp32_to_float16(inputs, lambda v: v.half())
            elif args.bf16:
                return fp32_to_float16(inputs, lambda v: v.bfloat16())
            else:
                return inputs

        self.specs.append(_to_float16)

        # Embedding layer
        self.specs.append(TiedLayerSpec('embed',
                                        EmbeddingPipe,
                                        args.hidden_size,
                                        args.padded_vocab_size,
                                        args.hidden_dropout,
                                        init_method=init_method,
                                        num_tokentypes=num_tokentypes,
                                        tied_weight_attr='word_embeddings_weight'))

        if args.fp32_residual_connection:
            if getattr(args, 'pretrain_causal_attention', False):
                self.specs.append(lambda x: x.transpose(0, 1).contiguous().float())
            else:
                # EmbeddingPipe returns attention mask as well
                self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous().float(), *x[1:]))
        else:
            if getattr(args, 'pretrain_causal_attention', False):
                self.specs.append(lambda x: x.transpose(0, 1).contiguous())
            else:
                # EmbeddingPipe returns attention mask as well
                self.specs.append(lambda x: (x[0].transpose(0, 1).contiguous(), *x[1:]))

        for layer_idx in range(args.num_layers):
            self.specs.append(
                LayerSpec(ParallelTransformerLayerPipe,
                          init_method=init_method,
                          output_layer_init_method=scaled_init_method_normal(
                              args.init_method_std, args.num_layers),
                          layer_number=layer_idx,
                          # TODO: Change naming of class from GPT to something that encapsulate prefix lm.
                          self_attn_mask_type=attn_mask_type))

        # Undo data format change
        def undo(x):
            if not getattr(args, 'pretrain_causal_attention', False):
                x = x[0]
            return x.transpose(0, 1).contiguous()
        self.specs.append(undo)

        # Final layernorm after transformer layers
        self.specs.append(
            LayerSpec(LayerNorm,
                      args.hidden_size,
                      eps=args.layernorm_epsilon))

        def _logits_helper(embedding, lm_output):
            """A wrapper to massage inputs/outputs from pipeline. """
            return parallel_lm_logits(
                lm_output,
                embedding.word_embeddings_weight,
                self.parallel_output)

        self.specs.append(
            TiedLayerSpec('embed',
                          EmbeddingPipe,
                          args.hidden_size,
                          args.padded_vocab_size,
                          args.hidden_dropout,
                          init_method=init_method,
                          num_tokentypes=num_tokentypes,
                          forward_fn=_logits_helper,
                          tied_weight_attr='word_embeddings_weight')
        )

        # Convert to fp32 if needed
        if args.fp16 or args.bf16:
            self.specs.append(float16_to_fp32)

        if args.checkpoint_activations:
            interval = args.checkpoint_num_layers
        else:
            interval = 0

        from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology
        topo = PipeModelDataParallelTopology(num_pp=mpu.get_pipeline_model_parallel_world_size(),
                                             num_mp=mpu.get_tensor_model_parallel_world_size(),
                                             num_dp=mpu.get_data_parallel_world_size())

        # here one can extend the regex to include more layers to be counted towards partitioning,
        # e.g. 'type:transformer|embedding' will add up all the transformer blocks and also the first
        # and last embedding layers and then partition that transformers+2 layers - so to get a good
        # balance you may want to use less transformer layers
        #
        # caveat emptor: the current implementation of PP fails unless each stage has at least one
        # transformer layer
        if args.pp_partition_method is not None:
            partition_method = args.pp_partition_method
        else:
            partition_method = 'type:transformer'

        super().__init__(layers=self.specs,
                         loss_fn=get_cross_entropy(is_prefix=attn_mask_type is AttnMaskType.prefix),
                         topology=topo,
                         activation_checkpoint_interval=interval,
                         partition_method=partition_method)
megatron-deepspeed_dtk22.10/megatron/model/language_model.py  (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer based language model."""
import
torch
import
torch.nn.functional
as
F
from
megatron
import
get_args
from
megatron
import
mpu
from
.module
import
MegatronModule
from
megatron.enums
import
LayerType
,
AttnMaskType
,
PositionEmbeddingType
from
megatron.model.transformer
import
ParallelTransformer
from
megatron.model.utils
import
get_linear_layer
from
megatron.model.utils
import
init_method_normal
,
scaled_init_method_normal
def
parallel_lm_logits
(
input_
,
word_embeddings_weight
,
parallel_output
,
bias
=
None
):
"""LM logits using word embedding weights."""
# Parallel logits.
input_parallel
=
mpu
.
copy_to_tensor_model_parallel_region
(
input_
)
# Matrix multiply.
if
bias
is
None
:
logits_parallel
=
F
.
linear
(
input_parallel
,
word_embeddings_weight
)
else
:
logits_parallel
=
F
.
linear
(
input_parallel
,
word_embeddings_weight
,
bias
)
# Gather if needed.
if
parallel_output
:
return
logits_parallel
return
mpu
.
gather_from_tensor_model_parallel_region
(
logits_parallel
)
def
get_language_model
(
num_tokentypes
,
add_pooler
,
encoder_attn_mask_type
,
init_method
=
None
,
scaled_init_method
=
None
,
add_decoder
=
False
,
decoder_attn_mask_type
=
AttnMaskType
.
causal
,
pre_process
=
True
,
post_process
=
True
):
"""Build language model and return along with the key to save."""
args
=
get_args
()
if
init_method
is
None
:
init_method
=
init_method_normal
(
args
.
init_method_std
)
if
scaled_init_method
is
None
:
scaled_init_method
=
scaled_init_method_normal
(
args
.
init_method_std
,
args
.
num_layers
)
# Language model.
language_model
=
TransformerLanguageModel
(
init_method
,
scaled_init_method
,
encoder_attn_mask_type
,
num_tokentypes
=
num_tokentypes
,
add_decoder
=
add_decoder
,
decoder_attn_mask_type
=
decoder_attn_mask_type
,
add_pooler
=
add_pooler
,
pre_process
=
pre_process
,
post_process
=
post_process
)
# key used for checkpoints.
language_model_key
=
'language_model'
return
language_model
,
language_model_key
class
Pooler
(
MegatronModule
):
"""Pooler layer.
Pool hidden states of a specific token (for example start of the
sequence) and add a linear transformation followed by a tanh.
Arguments:
hidden_size: hidden size
init_method: weight initialization method for the linear layer.
bias is set to zero.
"""
def
__init__
(
self
,
hidden_size
,
init_method
):
super
(
Pooler
,
self
).
__init__
()
self
.
dense
=
get_linear_layer
(
hidden_size
,
hidden_size
,
init_method
)
def
forward
(
self
,
hidden_states
,
sequence_index
=
0
):
# hidden_states: [b, s, h]
# sequence_index: index of the token to pool.
pooled
=
hidden_states
[:,
sequence_index
,
:]
pooled
=
self
.
dense
(
pooled
)
pooled
=
torch
.
tanh
(
pooled
)
        return pooled


class Embedding(MegatronModule):
    """Language model embeddings.

    Arguments:
        hidden_size: hidden size
        vocab_size: vocabulary size
        embedding_dropout_prob: dropout probability for embeddings
        init_method: weight initialization method
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    """

    def __init__(self,
                 hidden_size,
                 vocab_size,
                 embedding_dropout_prob,
                 init_method,
                 num_tokentypes=0):
        super(Embedding, self).__init__()

        self.hidden_size = hidden_size
        self.init_method = init_method
        self.num_tokentypes = num_tokentypes

        args = get_args()

        # Word embeddings (parallel).
        self.word_embeddings = mpu.VocabParallelEmbedding(
            vocab_size, self.hidden_size, init_method=self.init_method)
        self._word_embeddings_key = 'word_embeddings'

        # Position embedding (serial).
        self.position_embedding_type = args.position_embedding_type
        if self.position_embedding_type == PositionEmbeddingType.absolute:
            max_position_embeddings = args.max_position_embeddings
            assert max_position_embeddings is not None
            self.position_embeddings = torch.nn.Embedding(
                max_position_embeddings, self.hidden_size)
            self._position_embeddings_key = 'position_embeddings'
            # Initialize the position embeddings.
            self.init_method(self.position_embeddings.weight)
        else:
            self.position_embeddings = None

        # Token type embedding.
        # Add this as an optional field that can be added through
        # method call so we can load a pretrain model without
        # token types and add them as needed.
        self._tokentype_embeddings_key = 'tokentype_embeddings'
        if self.num_tokentypes > 0:
            self.tokentype_embeddings = torch.nn.Embedding(
                self.num_tokentypes, self.hidden_size)
            # Initialize the token-type embeddings.
            self.init_method(self.tokentype_embeddings.weight)
        else:
            self.tokentype_embeddings = None

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)

    def add_tokentype_embeddings(self, num_tokentypes):
        """Add token-type embedding. This function is provided so we can add
        token-type embeddings in case the pretrained model does not have it.
        This allows us to load the model normally and then add this embedding.
        """
        if self.tokentype_embeddings is not None:
            raise Exception('tokentype embeddings is already initialized')
        if torch.distributed.get_rank() == 0:
            print('adding embedding for {} tokentypes'.format(num_tokentypes),
                  flush=True)
        self.num_tokentypes = num_tokentypes
        # Note: the committed line read `self.torch.nn.Embedding(...)`, which is
        # an attribute-lookup bug; `torch.nn.Embedding` is what is intended.
        self.tokentype_embeddings = torch.nn.Embedding(num_tokentypes,
                                                       self.hidden_size)
        # Initialize the token-type embeddings.
        args = get_args()
        self.init_method(self.tokentype_embeddings.weight)

    def forward(self, input_ids, position_ids, tokentype_ids=None):
        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings
        if self.position_embedding_type == PositionEmbeddingType.absolute:
            assert self.position_embeddings is not None
            embeddings = embeddings + self.position_embeddings(position_ids)
        else:
            assert self.position_embeddings is None

        if tokentype_ids is not None:
            assert self.tokentype_embeddings is not None
            embeddings = embeddings + self.tokentype_embeddings(tokentype_ids)
        else:
            assert self.tokentype_embeddings is None

        # Dropout.
        embeddings = self.embedding_dropout(embeddings)

        return embeddings

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load."""
        state_dict_ = {}
        state_dict_[self._word_embeddings_key] \
            = self.word_embeddings.state_dict(destination, prefix, keep_vars)
        if self.position_embedding_type == PositionEmbeddingType.absolute:
            state_dict_[self._position_embeddings_key] \
                = self.position_embeddings.state_dict(destination, prefix,
                                                      keep_vars)
        if self.num_tokentypes > 0:
            state_dict_[self._tokentype_embeddings_key] \
                = self.tokentype_embeddings.state_dict(destination, prefix,
                                                       keep_vars)

        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

        # Word embedding.
        if self._word_embeddings_key in state_dict:
            state_dict_ = state_dict[self._word_embeddings_key]
        else:
            # for backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if 'word_embeddings' in key:
                    state_dict_[key.split('word_embeddings.')[1]] \
                        = state_dict[key]
        self.word_embeddings.load_state_dict(state_dict_, strict=strict)

        # Position embedding.
        if self.position_embedding_type == PositionEmbeddingType.absolute:
            if self._position_embeddings_key in state_dict:
                state_dict_ = state_dict[self._position_embeddings_key]
            else:
                # for backward compatibility.
                state_dict_ = {}
                for key in state_dict.keys():
                    if 'position_embeddings' in key:
                        state_dict_[key.split('position_embeddings.')[1]] \
                            = state_dict[key]
            self.position_embeddings.load_state_dict(state_dict_,
                                                     strict=strict)

        # Tokentype embedding.
        if self.num_tokentypes > 0:
            state_dict_ = {}
            if self._tokentype_embeddings_key in state_dict:
                state_dict_ = state_dict[self._tokentype_embeddings_key]
            else:
                # for backward compatibility.
                for key in state_dict.keys():
                    if 'tokentype_embeddings' in key:
                        state_dict_[key.split('tokentype_embeddings.')[1]] \
                            = state_dict[key]
            if len(state_dict_.keys()) > 0:
                self.tokentype_embeddings.load_state_dict(state_dict_,
                                                          strict=strict)
            else:
                print('***WARNING*** expected tokentype embeddings in the '
                      'checkpoint but could not find it', flush=True)
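As a quick illustration of the embedding path above, here is a minimal, standalone sketch: a plain torch.nn.Embedding stands in for mpu.VocabParallelEmbedding, and all sizes are made up.

import torch

vocab_size, max_pos, hidden = 100, 16, 8
word_emb = torch.nn.Embedding(vocab_size, hidden)
pos_emb = torch.nn.Embedding(max_pos, hidden)
dropout = torch.nn.Dropout(0.1)

input_ids = torch.randint(0, vocab_size, (2, 16))                  # [b, s]
position_ids = torch.arange(16).unsqueeze(0).expand_as(input_ids)  # [b, s]

# word lookup + absolute position lookup, summed, then dropout
hidden_states = dropout(word_emb(input_ids) + pos_emb(position_ids))
print(hidden_states.shape)  # torch.Size([2, 16, 8])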
class EmbeddingPipe(Embedding):

    def forward(self, inputs, **kwargs):
        if not hasattr(self, '_args'):
            self._args = get_args()

        input_ids = inputs[0]
        position_ids = inputs[1]
        if getattr(self._args, 'pretrain_causal_attention', False):
            attention_mask = None
        else:
            attention_mask = inputs[2]

        if len(inputs) == 4:
            tokentype_ids = inputs[3]
        else:
            tokentype_ids = None

        embeddings = super().forward(input_ids, position_ids,
                                     tokentype_ids=tokentype_ids)

        # If cmd args has attn_mask, we don't forward it as an activation.
        if getattr(self._args, 'pretrain_causal_attention', False):
            return embeddings
        else:
            return embeddings, attention_mask

    @property
    def word_embeddings_weight(self):
        """Easy accessory for the DeepSpeed pipeline engine to tie embeddings across stages."""
        return self.word_embeddings.weight
class TransformerLanguageModel(MegatronModule):
    """Transformer language model.

    Arguments:
        transformer_hparams: transformer hyperparameters
        vocab_size: vocabulary size
        embedding_dropout_prob: dropout probability for embeddings
        num_tokentypes: size of the token-type embeddings. 0 value
                        will ignore this embedding
    """

    def __init__(self,
                 init_method,
                 output_layer_init_method,
                 encoder_attn_mask_type,
                 num_tokentypes=0,
                 add_decoder=False,
                 decoder_attn_mask_type=AttnMaskType.causal,
                 add_pooler=False,
                 pre_process=True,
                 post_process=True):
        super(TransformerLanguageModel, self).__init__()
        args = get_args()

        self.pre_process = pre_process
        self.post_process = post_process
        self.hidden_size = args.hidden_size
        self.num_tokentypes = num_tokentypes
        self.init_method = init_method
        self.encoder_attn_mask_type = encoder_attn_mask_type
        self.add_decoder = add_decoder
        self.decoder_attn_mask_type = decoder_attn_mask_type
        self.add_pooler = add_pooler

        # Embeddings.
        if self.pre_process:
            self.embedding = Embedding(self.hidden_size,
                                       args.padded_vocab_size,
                                       args.hidden_dropout,
                                       self.init_method,
                                       self.num_tokentypes)
            self._embedding_key = 'embedding'

        # Transformer.
        self.encoder = ParallelTransformer(
            self.init_method,
            output_layer_init_method,
            self_attn_mask_type=self.encoder_attn_mask_type,
            pre_process=self.pre_process,
            post_process=self.post_process)
        self._encoder_key = 'encoder'

        # Decoder
        if self.add_decoder:
            assert args.pipeline_model_parallel_size == 1, \
                'pipeline parallelism is not supported in the presence of decoder'
            self.decoder = ParallelTransformer(
                self.init_method,
                output_layer_init_method,
                layer_type=LayerType.decoder,
                self_attn_mask_type=self.decoder_attn_mask_type)
            self._decoder_key = 'decoder'

        if self.post_process:
            # Pooler.
            if self.add_pooler:
                self.pooler = Pooler(self.hidden_size, self.init_method)
                self._pooler_key = 'pooler'

    def set_input_tensor(self, input_tensor):
        """See megatron.model.transformer.set_input_tensor()"""
        self.encoder.set_input_tensor(input_tensor)

    def forward(self, enc_input_ids, enc_position_ids, enc_attn_mask,
                dec_input_ids=None, dec_position_ids=None, dec_attn_mask=None,
                enc_dec_attn_mask=None, tokentype_ids=None, layer_past=None,
                get_key_value=False, pooling_sequence_index=0,
                enc_hidden_states=None, output_enc_hidden=False):

        # Embeddings.
        if self.pre_process:
            embedding_output = self.embedding(enc_input_ids, enc_position_ids,
                                              tokentype_ids=tokentype_ids)
            encoder_input = embedding_output
        else:
            encoder_input = None

        # encoder.
        if enc_hidden_states is None:
            encoder_output = self.encoder(encoder_input,
                                          enc_attn_mask,
                                          layer_past=layer_past,
                                          get_key_value=get_key_value)
        else:
            encoder_output = enc_hidden_states.to(encoder_input.dtype)

        if self.post_process:
            if self.add_pooler:
                pooled_output = self.pooler(encoder_output,
                                            pooling_sequence_index)

        # output_enc_hidden refers to when we just need the encoder's
        # output. For example, it is helpful to compute
        # similarity between two sequences by average pooling
        if not self.add_decoder or output_enc_hidden:
            if self.add_pooler and self.post_process:
                return encoder_output, pooled_output
            else:
                return encoder_output

        # Decoder Embedding
        dec_embedding_output = self.embedding(dec_input_ids,
                                              dec_position_ids)
        # decoder
        decoder_output = self.decoder(dec_embedding_output,
                                      dec_attn_mask,
                                      layer_past=layer_past,
                                      get_key_value=get_key_value,
                                      encoder_output=encoder_output,
                                      enc_dec_attn_mask=enc_dec_attn_mask)

        if self.add_pooler and self.post_process:
            return decoder_output, encoder_output, pooled_output
        else:
            return decoder_output, encoder_output

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load."""

        state_dict_ = {}
        if self.pre_process:
            state_dict_[self._embedding_key] \
                = self.embedding.state_dict_for_save_checkpoint(
                    destination, prefix, keep_vars)
        state_dict_[self._encoder_key] \
            = self.encoder.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        if self.post_process:
            if self.add_pooler:
                state_dict_[self._pooler_key] \
                    = self.pooler.state_dict_for_save_checkpoint(
                        destination, prefix, keep_vars)
        if self.add_decoder:
            state_dict_[self._decoder_key] \
                = self.decoder.state_dict_for_save_checkpoint(
                    destination, prefix, keep_vars)

        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

        # Embedding.
        if self.pre_process:
            if self._embedding_key in state_dict:
                state_dict_ = state_dict[self._embedding_key]
            else:
                # for backward compatibility.
                state_dict_ = {}
                for key in state_dict.keys():
                    if '_embeddings' in key:
                        state_dict_[key] = state_dict[key]
            self.embedding.load_state_dict(state_dict_, strict=strict)

        # Encoder.
        if self._encoder_key in state_dict:
            state_dict_ = state_dict[self._encoder_key]
        # for backward compatibility.
        elif 'transformer' in state_dict:
            state_dict_ = state_dict['transformer']
        else:
            # for backward compatibility.
            state_dict_ = {}
            for key in state_dict.keys():
                if 'transformer.' in key:
                    state_dict_[key.split('transformer.')[1]] = state_dict[key]

        # for backward compatibility.
        state_dict_self_attention = {}
        for key in state_dict_.keys():
            if '.attention.' in key:
                state_dict_self_attention[key.replace(
                    ".attention.", ".self_attention.")] = state_dict_[key]
            else:
                state_dict_self_attention[key] = state_dict_[key]
        state_dict_ = state_dict_self_attention

        self.encoder.load_state_dict(state_dict_, strict=strict)

        if self.post_process:
            # pooler
            if self.add_pooler:
                assert 'pooler' in state_dict, \
                    'could not find data for pooler in the checkpoint'
                self.pooler.load_state_dict(state_dict[self._pooler_key],
                                            strict=strict)
        # decoder
        if self.add_decoder:
            # message fixed: the committed assert reused the pooler wording.
            assert 'decoder' in state_dict, \
                'could not find data for decoder in the checkpoint'
            self.decoder.load_state_dict(state_dict[self._decoder_key],
                                         strict=strict)
megatron-deepspeed_dtk22.10/megatron/model/module.py
0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron Module"""

import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter

from megatron import get_args
from megatron import mpu


_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
_BF16_TYPES = (torch.BFloat16Tensor, torch.cuda.BFloat16Tensor)


def param_is_not_shared(param):
    return not hasattr(param, 'shared') or not param.shared


class MegatronModule(torch.nn.Module):
    """Megatron specific extensions of torch Module with support
    for pipelining."""

    def __init__(self, share_word_embeddings=True):
        super(MegatronModule, self).__init__()
        self.share_word_embeddings = share_word_embeddings

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """Use this function to override the state dict for
        saving checkpoints."""
        return self.state_dict(destination, prefix, keep_vars)

    def word_embeddings_weight(self):
        if mpu.is_pipeline_first_stage(ignore_virtual=True):
            return self.language_model.embedding.word_embeddings.weight
        if mpu.is_pipeline_last_stage(ignore_virtual=True):
            if not self.share_word_embeddings:
                raise Exception('word_embeddings_weight() called for last '
                                'stage, but share_word_embeddings is false')
            return self.word_embeddings.weight
        raise Exception('word_embeddings_weight() should be '
                        'called for first and last stage only')

    def initialize_word_embeddings(self, init_method_normal):
        args = get_args()
        if not self.share_word_embeddings:
            raise Exception('initialize_word_embeddings() was called but '
                            'share_word_embeddings is false')

        # This function just initializes the word embeddings in the final stage
        # when we are using pipeline parallelism. If we aren't using pipeline
        # parallelism there is nothing to do.
        if args.pipeline_model_parallel_size == 1:
            return

        # Parameters are shared between the word embeddings layer, and the
        # heads at the end of the model. In a pipelined setup with more than
        # one stage, the initial embedding layer and the head are on different
        # workers, so we do the following:
        # 1. Create a second copy of word_embeddings on the last stage, with
        #    initial parameters of 0.0.
        # 2. Do an all-reduce between the first and last stage to ensure that
        #    the two copies of word_embeddings start off with the same
        #    parameter values.
        # 3. In the training loop, before an all-reduce between the grads of
        #    the two word_embeddings layers to ensure that every applied weight
        #    update is the same on both stages.
        if mpu.is_pipeline_last_stage():
            assert not mpu.is_pipeline_first_stage()
            self._word_embeddings_for_head_key = 'word_embeddings_for_head'
            # set word_embeddings weights to 0 here, then copy first
            # stage's weights using all_reduce below.
            self.word_embeddings = mpu.VocabParallelEmbedding(
                args.padded_vocab_size, args.hidden_size,
                init_method=init_method_normal(args.init_method_std))
            self.word_embeddings.weight.data.fill_(0)
            self.word_embeddings.weight.shared = True

        # Ensure that first and last stages have the same initial parameter
        # values.
        if torch.distributed.is_initialized():
            if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
                torch.distributed.all_reduce(self.word_embeddings_weight().data,
                                             group=mpu.get_embedding_group())
        else:
            print("WARNING! Distributed processes aren't initialized, so "
                  "word embeddings in the last layer are not initialized. "
                  "If you are just manipulating a model this is fine, but "
                  "this needs to be handled manually. If you are training "
                  "something is definitely wrong.")
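The numbered comment above is the key idea: the last pipeline stage creates a zero-initialized copy of the word embeddings, and an all-reduce then makes both copies identical before training starts. A toy single-process sketch of why zero-init plus a sum gives that result (not the actual pipeline code, just two stand-in tensors):

import torch

first_stage_weight = torch.randn(10, 4)   # initialized normally on the first stage
last_stage_weight = torch.zeros(10, 4)    # filled with 0.0 on the last stage

# all_reduce(SUM) over the two stages is equivalent to adding the two tensors,
# so both copies end up equal to the first stage's initialization.
synced = first_stage_weight + last_stage_weight
assert torch.equal(synced, first_stage_weight)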
def conversion_helper(val, conversion):
    """Apply conversion to val. Recursively apply conversion if `val`
    is a nested tuple/list structure."""
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    if isinstance(val, tuple):
        rtn = tuple(rtn)
    return rtn


def fp32_to_float16(val, float16_convertor):
    """Convert fp32 `val` to fp16/bf16"""
    def half_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, _FLOAT_TYPES):
            val = float16_convertor(val)
        return val
    return conversion_helper(val, half_conversion)


def float16_to_fp32(val):
    """Convert fp16/bf16 `val` to fp32"""
    def float_conversion(val):
        val_typecheck = val
        if isinstance(val_typecheck, (Parameter, Variable)):
            val_typecheck = val.data
        if isinstance(val_typecheck, (_BF16_TYPES, _HALF_TYPES)):
            val = val.float()
        return val
    return conversion_helper(val, float_conversion)
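A small usage sketch of the recursion conversion_helper implements, reproduced standalone so it runs on its own; the nested tuple and the cast are made-up inputs:

import torch

def conversion_helper(val, conversion):
    # same recursion as above: apply `conversion` to every leaf of a nested tuple/list
    if not isinstance(val, (tuple, list)):
        return conversion(val)
    rtn = [conversion_helper(v, conversion) for v in val]
    return tuple(rtn) if isinstance(val, tuple) else rtn

nested = (torch.ones(2), [torch.zeros(3)])
converted = conversion_helper(nested, lambda t: t.half())
print(converted[0].dtype, converted[1][0].dtype)  # torch.float16 torch.float16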
class Float16Module(MegatronModule):

    def __init__(self, module, args):
        super(Float16Module, self).__init__()

        if args.fp16:
            self.add_module('module', module.half())

            def float16_convertor(val):
                return val.half()
        elif args.bf16:
            self.add_module('module', module.bfloat16())

            def float16_convertor(val):
                return val.bfloat16()
        else:
            raise Exception('should not be here')

        self.float16_convertor = float16_convertor

    def forward(self, *inputs, **kwargs):
        if mpu.is_pipeline_first_stage():
            inputs = fp32_to_float16(inputs, self.float16_convertor)
        outputs = self.module(*inputs, **kwargs)
        if mpu.is_pipeline_last_stage():
            outputs = float16_to_fp32(outputs)
        return outputs

    def state_dict(self, destination=None, prefix='', keep_vars=False):
        return self.module.state_dict(destination, prefix, keep_vars)

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        return self.module.state_dict_for_save_checkpoint(destination, prefix,
                                                          keep_vars)

    def load_state_dict(self, state_dict, strict=True):
        self.module.load_state_dict(state_dict, strict=strict)
megatron-deepspeed_dtk22.10/megatron/model/multiple_choice.py
0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Multiple choice model."""

import torch

from megatron import get_args, print_rank_last
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids
from megatron.model.language_model import get_language_model
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.utils import scaled_init_method_normal
from .module import MegatronModule


class MultipleChoice(MegatronModule):

    def __init__(self,
                 num_tokentypes=2,
                 pre_process=True,
                 post_process=True):
        super(MultipleChoice, self).__init__(share_word_embeddings=False)
        args = get_args()

        init_method = init_method_normal(args.init_method_std)
        self.pre_process = pre_process
        self.post_process = post_process

        self.language_model, self._language_model_key = get_language_model(
            num_tokentypes=num_tokentypes,
            add_pooler=True,
            encoder_attn_mask_type=AttnMaskType.padding,
            init_method=init_method,
            scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                         args.num_layers),
            pre_process=self.pre_process,
            post_process=self.post_process)

        # Multi-choice head.
        if self.post_process:
            self.multichoice_dropout = torch.nn.Dropout(args.hidden_dropout)
            self.multichoice_head = get_linear_layer(args.hidden_size, 1,
                                                     init_method)
            self._multichoice_head_key = 'multichoice_head'

    def set_input_tensor(self, input_tensor):
        """See megatron.model.transformer.set_input_tensor()"""
        self.language_model.set_input_tensor(input_tensor)

    def forward(self, model_input, attention_mask, tokentype_ids=None):

        # [batch, choices, sequence] --> [batch * choices, sequence] -->
        #    transformer --> [batch, choices] --> softmax

        # Ensure the shape is [batch-size, choices, sequence]
        assert len(attention_mask.shape) == 3
        num_choices = attention_mask.shape[1]

        # Reshape and treat choice dimension the same as batch.
        attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        extended_attention_mask = bert_extended_attention_mask(attention_mask)

        input_ids = model_input
        # Do the same as attention_mask for input_ids, tokentype_ids
        assert len(input_ids.shape) == 3
        assert len(tokentype_ids.shape) == 3
        input_ids = input_ids.view(-1, input_ids.size(-1))
        tokentype_ids = tokentype_ids.view(-1, tokentype_ids.size(-1))
        position_ids = bert_position_ids(input_ids)

        lm_output = self.language_model(
            input_ids,
            position_ids,
            extended_attention_mask,
            tokentype_ids=tokentype_ids)

        if self.post_process:
            _, pooled_output = lm_output
            multichoice_output = self.multichoice_dropout(pooled_output)
            multichoice_logits = self.multichoice_head(multichoice_output)

            # Reshape back to separate choices.
            multichoice_logits = multichoice_logits.view(-1, num_choices)

            return multichoice_logits
        return lm_output

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
        add an extra key."""

        state_dict_ = {}
        state_dict_[self._language_model_key] \
            = self.language_model.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        if self.post_process:
            state_dict_[self._multichoice_head_key] \
                = self.multichoice_head.state_dict(destination, prefix,
                                                   keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

        self.language_model.load_state_dict(
            state_dict[self._language_model_key], strict=strict)
        if self.post_process:
            if self._multichoice_head_key in state_dict:
                self.multichoice_head.load_state_dict(
                    state_dict[self._multichoice_head_key], strict=strict)
            else:
                print_rank_last('***WARNING*** could not find {} in the checkpoint, '
                                'initializing to random'.format(
                                    self._multichoice_head_key))
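The forward pass above folds the choice dimension into the batch, scores each (example, choice) pair with a single logit, then unfolds. A shape-only sketch of that round trip, with made-up sizes and a plain nn.Linear standing in for the pooled language model plus head:

import torch

batch, choices, seq, hidden = 2, 4, 8, 16
input_ids = torch.zeros(batch, choices, seq, dtype=torch.long)

flat = input_ids.view(-1, input_ids.size(-1))    # [batch * choices, seq]
pooled = torch.randn(flat.size(0), hidden)       # stand-in for the pooled LM output
logits = torch.nn.Linear(hidden, 1)(pooled)      # one score per (example, choice)
per_example = logits.view(-1, choices)           # [batch, choices]
print(flat.shape, per_example.shape)             # torch.Size([8, 8]) torch.Size([2, 4])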
megatron-deepspeed_dtk22.10/megatron/model/positional_embeddings.py
0 → 100644
# Extracted from: https://github.com/EleutherAI/gpt-neox
import torch


class RotaryEmbedding(torch.nn.Module):

    def __init__(self, dim, base=10000, precision=torch.half):
        super().__init__()
        inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        self.max_seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None
        self.precision = precision

    def forward(self, x, seq_dim=1, seq_len=None):
        if seq_len is None:
            seq_len = x.shape[seq_dim]
        if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
            self.max_seq_len_cached = seq_len
            t = torch.arange(self.max_seq_len_cached, device=x.device,
                             dtype=self.inv_freq.dtype)
            freqs = torch.einsum('i,j->ij', t, self.inv_freq)
            # Different from paper, but it uses a different permutation in order to obtain the same calculation
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            if self.precision == torch.bfloat16:
                emb = emb.float()
            # [sx, 1 (b * np), hn]
            self.cos_cached = emb.cos()[:, None, :]
            self.sin_cached = emb.sin()[:, None, :]
            if self.precision == torch.bfloat16:
                self.cos_cached = self.cos_cached.bfloat16()
                self.sin_cached = self.sin_cached.bfloat16()
        return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]


# rotary pos emb helpers:

def rotate_half(x):
    x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=x1.ndim - 1)  # dim=-1 triggers a bug in earlier torch versions


@torch.jit.script
def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
    cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...]
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)


def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0):  # jitting fails with bf16
    cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...]
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
\ No newline at end of file
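A usage sketch for the rotary helpers, assuming the RotaryEmbedding class and apply_rotary_pos_emb_torch defined above are in scope; the toy tensors follow the [seq, batch * heads, head_dim] layout the attention code feeds them, with an even head_dim:

import torch

head_dim, seq, bnp = 8, 6, 2
rotary = RotaryEmbedding(head_dim, precision=torch.float32)

q = torch.randn(seq, bnp, head_dim)
k = torch.randn(seq, bnp, head_dim)
cos, sin = rotary(k, seq_len=seq)                  # cached [seq, 1, head_dim] tables
q_rot, k_rot = apply_rotary_pos_emb_torch(q, k, cos, sin, offset=0)
print(q_rot.shape, k_rot.shape)                    # torch.Size([6, 2, 8]) twice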
megatron-deepspeed_dtk22.10/megatron/model/realm_model.py
0 → 100644
import os
import torch

from megatron import get_args, print_rank_0
from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
from megatron.model.bert_model import BertModel
from .module import MegatronModule
from megatron import mpu
from megatron.enums import AttnMaskType
from megatron.model.utils import get_linear_layer
from megatron.model.utils import init_method_normal
from megatron.model.language_model import get_language_model
from megatron.model.utils import scaled_init_method_normal
from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids


def general_ict_model_provider(only_query_model=False, only_block_model=False):
    """Build the model."""
    args = get_args()
    assert args.ict_head_size is not None, \
        "Need to specify --ict-head-size to provide an ICTBertModel"
    assert mpu.get_tensor_model_parallel_world_size() == 1 and \
        mpu.get_pipeline_model_parallel_world_size() == 1, \
        "Model parallel size > 1 not supported for ICT"

    print_rank_0('building ICTBertModel...')

    # simpler to just keep using 2 tokentypes since the LM we initialize with has 2 tokentypes
    model = ICTBertModel(
        ict_head_size=args.ict_head_size,
        num_tokentypes=2,
        parallel_output=True,
        only_query_model=only_query_model,
        only_block_model=only_block_model)

    return model


class ICTBertModel(MegatronModule):
    """Bert-based module for Inverse Cloze task."""

    def __init__(self,
                 ict_head_size,
                 num_tokentypes=1,
                 parallel_output=True,
                 only_query_model=False,
                 only_block_model=False):
        super(ICTBertModel, self).__init__()
        bert_kwargs = dict(
            ict_head_size=ict_head_size,
            num_tokentypes=num_tokentypes,
            parallel_output=parallel_output)
        assert not (only_block_model and only_query_model)
        self.use_block_model = not only_query_model
        self.use_query_model = not only_block_model

        if self.use_query_model:
            # this model embeds (pseudo-)queries - Embed_input in the paper
            self.query_model = IREncoderBertModel(**bert_kwargs)
            self._query_key = 'question_model'

        if self.use_block_model:
            # this model embeds evidence blocks - Embed_doc in the paper
            self.block_model = IREncoderBertModel(**bert_kwargs)
            self._block_key = 'context_model'

    def forward(self, query_tokens, query_attention_mask,
                block_tokens, block_attention_mask):
        """Run a forward pass for each of the models and return the respective embeddings."""
        query_logits = self.embed_query(query_tokens, query_attention_mask)
        block_logits = self.embed_block(block_tokens, block_attention_mask)
        return query_logits, block_logits

    def embed_query(self, query_tokens, query_attention_mask):
        """Embed a batch of tokens using the query model"""
        if self.use_query_model:
            query_types = torch.cuda.LongTensor(*query_tokens.shape).fill_(0)
            query_ict_logits, _ = self.query_model.forward(
                query_tokens, query_attention_mask, query_types)
            return query_ict_logits
        else:
            raise ValueError("Cannot embed query without query model.")

    def embed_block(self, block_tokens, block_attention_mask):
        """Embed a batch of tokens using the block model"""
        if self.use_block_model:
            block_types = torch.cuda.LongTensor(*block_tokens.shape).fill_(0)
            block_ict_logits, _ = self.block_model.forward(
                block_tokens, block_attention_mask, block_types)
            return block_ict_logits
        else:
            raise ValueError("Cannot embed block without block model.")

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """Save dict with state dicts of each of the models."""
        state_dict_ = {}
        if self.use_query_model:
            state_dict_[self._query_key] \
                = self.query_model.state_dict_for_save_checkpoint(
                    destination, prefix, keep_vars)
        if self.use_block_model:
            state_dict_[self._block_key] \
                = self.block_model.state_dict_for_save_checkpoint(
                    destination, prefix, keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Load the state dicts of each of the models"""
        if self.use_query_model:
            print("Loading ICT query model", flush=True)
            self.query_model.load_state_dict(
                state_dict[self._query_key], strict=strict)
        if self.use_block_model:
            print("Loading ICT block model", flush=True)
            self.block_model.load_state_dict(
                state_dict[self._block_key], strict=strict)

    def init_state_dict_from_bert(self):
        """Initialize the state from a pretrained BERT model on iteration zero of ICT pretraining"""
        args = get_args()
        tracker_filename = get_checkpoint_tracker_filename(args.bert_load)
        if not os.path.isfile(tracker_filename):
            raise FileNotFoundError("Could not find BERT load for ICT")
        with open(tracker_filename, 'r') as f:
            iteration = int(f.read().strip())
            assert iteration > 0

        checkpoint_name = get_checkpoint_name(args.bert_load, iteration, False)
        if mpu.get_data_parallel_rank() == 0:
            print('global rank {} is loading checkpoint {}'.format(
                torch.distributed.get_rank(), checkpoint_name))

        try:
            state_dict = torch.load(checkpoint_name, map_location='cpu')
        except BaseException:
            raise ValueError("Could not load checkpoint")

        # load the LM state dict into each model
        model_dict = state_dict['model']['language_model']
        self.query_model.language_model.load_state_dict(model_dict)
        self.block_model.language_model.load_state_dict(model_dict)

        # give each model the same ict_head to begin with as well
        query_ict_head_state_dict = self.state_dict_for_save_checkpoint()[
            self._query_key]['ict_head']
        self.block_model.ict_head.load_state_dict(query_ict_head_state_dict)


class IREncoderBertModel(MegatronModule):
    """BERT-based encoder for queries or blocks used for learned information retrieval."""

    def __init__(self, ict_head_size, num_tokentypes=2, parallel_output=True):
        super(IREncoderBertModel, self).__init__()
        args = get_args()

        self.ict_head_size = ict_head_size
        self.parallel_output = parallel_output
        init_method = init_method_normal(args.init_method_std)
        scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                       args.num_layers)

        self.language_model, self._language_model_key = get_language_model(
            num_tokentypes=num_tokentypes,
            add_pooler=True,
            encoder_attn_mask_type=AttnMaskType.padding,
            init_method=init_method,
            scaled_init_method=scaled_init_method)

        self.ict_head = get_linear_layer(args.hidden_size, ict_head_size,
                                         init_method)
        self._ict_head_key = 'ict_head'

    def forward(self, input_ids, attention_mask, tokentype_ids=None):
        extended_attention_mask = bert_extended_attention_mask(
            attention_mask, next(self.language_model.parameters()).dtype)
        position_ids = bert_position_ids(input_ids)

        lm_output, pooled_output = self.language_model(
            input_ids,
            position_ids,
            extended_attention_mask,
            tokentype_ids=tokentype_ids)

        # Output.
        ict_logits = self.ict_head(pooled_output)
        return ict_logits, None

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
        add an extra key."""
        state_dict_ = {}
        state_dict_[self._language_model_key] \
            = self.language_model.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        state_dict_[self._ict_head_key] \
            = self.ict_head.state_dict(destination, prefix, keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""
        self.language_model.load_state_dict(
            state_dict[self._language_model_key], strict=strict)
        self.ict_head.load_state_dict(
            state_dict[self._ict_head_key], strict=strict)
megatron-deepspeed_dtk22.10/megatron/model/t5_model.py
0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""T5 model."""

import torch

from megatron import (
    get_args,
    mpu
)
from megatron.enums import AttnMaskType
from megatron.model.language_model import parallel_lm_logits, get_language_model
from megatron.model.transformer import LayerNorm
from megatron.model.utils import (
    openai_gelu,
    get_linear_layer,
    init_method_normal,
    scaled_init_method_normal
)
from .module import MegatronModule


def t5_extended_attention_mask(attention_mask_list):

    def attn_mask_postprocess(attn_mask):
        # [b, 1, s, s]
        extended_attention_mask = attn_mask.unsqueeze(1)
        return extended_attention_mask

    return [attn_mask_postprocess(attn_mask) for attn_mask in attention_mask_list]


def t5_position_ids(token_ids):
    # Create position ids
    seq_length = token_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long,
                                device=token_ids.device)
    position_ids = position_ids.unsqueeze(0).expand_as(token_ids)

    return position_ids


class T5LMHead(MegatronModule):
    """Masked LM head for T5

    Arguments:
        mpu_vocab_size: model parallel size of vocabulary.
        hidden_size: hidden size
        init_method: init method for weight initialization
        layernorm_epsilon: tolerance for layer norm divisions
        parallel_output: whether output logits being distributed or not.
    """

    def __init__(self, mpu_vocab_size, parallel_output):
        super(T5LMHead, self).__init__()

        args = get_args()

        self.bias = torch.nn.Parameter(torch.zeros(mpu_vocab_size))
        self.bias.model_parallel = True
        self.bias.partition_dim = 0
        self.bias.stride = 1
        self.parallel_output = parallel_output

    def forward(self, hidden_states, word_embeddings_weight):
        output = parallel_lm_logits(hidden_states,
                                    word_embeddings_weight,
                                    self.parallel_output,
                                    bias=self.bias)
        return output


class T5Model(MegatronModule):
    """T5 Language model."""

    def __init__(self, num_tokentypes=0, parallel_output=True):
        super(T5Model, self).__init__()
        args = get_args()

        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
        self.parallel_output = parallel_output
        init_method = init_method_normal(args.init_method_std)
        scaled_init_method = scaled_init_method_normal(args.init_method_std,
                                                       args.num_layers)

        self.language_model, self._language_model_key = get_language_model(
            num_tokentypes=num_tokentypes,
            add_pooler=False,
            add_decoder=True,
            encoder_attn_mask_type=AttnMaskType.padding,
            init_method=init_method,
            scaled_init_method=scaled_init_method)

        self.lm_head = T5LMHead(
            self.language_model.embedding.word_embeddings.weight.size(0),
            parallel_output)
        self._lm_head_key = 'lm_head'

    def set_input_tensor(self, input_tensor):
        """See megatron.model.transformer.set_input_tensor()"""
        self.language_model.set_input_tensor(input_tensor)

    def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask,
                decoder_attn_mask, encoder_decoder_attn_mask,
                tokentype_ids=None, lm_labels=None, enc_hidden_states=None):

        # Converting the attention masks to proper parameter settings
        encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask = \
            t5_extended_attention_mask(
                [encoder_attn_mask, decoder_attn_mask, encoder_decoder_attn_mask])

        encoder_position_ids = t5_position_ids(encoder_input_ids)
        decoder_position_ids = t5_position_ids(decoder_input_ids)

        lm_output = self.language_model(encoder_input_ids,
                                        encoder_position_ids,
                                        encoder_attn_mask,
                                        decoder_input_ids,
                                        decoder_position_ids,
                                        decoder_attn_mask,
                                        encoder_decoder_attn_mask,
                                        tokentype_ids=tokentype_ids,
                                        enc_hidden_states=enc_hidden_states)

        decoder_output, encoder_output = lm_output

        # Output.
        lm_logits = self.lm_head(decoder_output,
                                 self.language_model.embedding.word_embeddings.weight)

        if lm_labels is None:
            return lm_logits, encoder_output
        else:
            if self.fp16_lm_cross_entropy:
                assert lm_logits.dtype == torch.half
                lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits, lm_labels)
            else:
                lm_loss = mpu.vocab_parallel_cross_entropy(lm_logits.float(),
                                                           lm_labels)
            return lm_loss, encoder_output

    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                       keep_vars=False):
        """For easy load when model is combined with other heads,
        add an extra key."""

        state_dict_ = {}
        state_dict_[self._language_model_key] \
            = self.language_model.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        state_dict_[self._lm_head_key] \
            = self.lm_head.state_dict_for_save_checkpoint(
                destination, prefix, keep_vars)
        return state_dict_

    def load_state_dict(self, state_dict, strict=True):
        """Customized load."""

        self.language_model.load_state_dict(
            state_dict[self._language_model_key], strict=strict)
        self.lm_head.load_state_dict(state_dict[self._lm_head_key],
                                     strict=strict)
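A standalone shape sketch of what the two small helpers at the top of this file produce (t5_position_ids and the attn_mask.unsqueeze(1) post-processing); sizes are made up:

import torch

token_ids = torch.zeros(2, 5, dtype=torch.long)             # [b, s]
position_ids = torch.arange(5).unsqueeze(0).expand_as(token_ids)
print(position_ids[0].tolist())                             # [0, 1, 2, 3, 4]

attn_mask = torch.ones(2, 5, 5)                             # [b, s, s]
extended = attn_mask.unsqueeze(1)                           # [b, 1, s, s], broadcast over heads
print(extended.shape)                                       # torch.Size([2, 1, 5, 5])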
megatron-deepspeed_dtk22.10/megatron/model/transformer.py
0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Transformer."""

import math
import torch
import torch.nn.functional as F
from torch import nn

from megatron import get_args, logging
from megatron import mpu
from .module import MegatronModule
from megatron.enums import AttnMaskType, LayerType, AttnType, PositionEmbeddingType
from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm
from megatron.model.fused_softmax import FusedScaleMaskSoftmax
from megatron.model.fused_bias_gelu import bias_gelu_impl
from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu

import deepspeed

from .glu_activations import GLU_ACTIVATIONS
from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb

# flags required to enable jit fusion kernels
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)

logger = logging.get_logger(__name__)

""" We use the following notation throughout this file:
     h: hidden size
     n: number of attention heads
     p: number of model parallel partitions
     np: n/p
     hp: h/p
     hn: h/n
     b: batch size
     s: sequence length
     l: number of layers
    Transformer takes input of size [s, b, h] and returns a
    tensor of the same size. We use the following arguments:
        hyperparameters: transformer hyperparameters
"""


class ParallelMLP(MegatronModule):
    """MLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension. At the end, dropout is also
    applied.
    """

    def __init__(self, init_method, output_layer_init_method):
        super(ParallelMLP, self).__init__()
        args = get_args()

        # Project to ffn_hidden_size
        self.dense_h_to_4h = mpu.ColumnParallelLinear(
            args.hidden_size,
            # GLU is a special activation that divides the dimension by a factor 2.
            2 * args.ffn_hidden_size if args.glu_activation else args.ffn_hidden_size,
            gather_output=False,
            init_method=init_method,
            skip_bias_add=True)

        self.bias_gelu_fusion = args.bias_gelu_fusion
        self.activation_func = F.gelu
        if args.glu_activation:
            self.activation_func = GLU_ACTIVATIONS[args.glu_activation]
        elif args.openai_gelu:
            self.activation_func = openai_gelu
        elif args.onnx_safe:
            self.activation_func = erf_gelu

        # Project back to h.
        self.dense_4h_to_h = mpu.RowParallelLinear(
            args.ffn_hidden_size,
            args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method,
            skip_bias_add=True)

    def forward(self, hidden_states):

        # [s, b, 4hp]
        intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states)

        if self.bias_gelu_fusion:
            intermediate_parallel = \
                bias_gelu_impl(intermediate_parallel, bias_parallel)
        else:
            intermediate_parallel = \
                self.activation_func(intermediate_parallel + bias_parallel)

        # [s, b, h]
        output, output_bias = self.dense_4h_to_h(intermediate_parallel)
        return output, output_bias


class ParallelAttention(MegatronModule):
    """Parallel self-attention layer abstract class.

    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    """

    def __init__(self, init_method,
                 output_layer_init_method, layer_number,
                 attention_type=AttnType.self_attn,
                 attn_mask_type=AttnMaskType.padding):
        super(ParallelAttention, self).__init__()
        args = get_args()
        self.fp16 = args.fp16
        self.bf16 = args.bf16
        self.position_embedding_type = args.position_embedding_type

        self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = args.attention_softmax_in_fp32
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)
        self.attention_type = attention_type
        self.attn_mask_type = attn_mask_type

        projection_size = args.kv_channels * args.num_attention_heads

        # Per attention head and per partition values.
        world_size = mpu.get_tensor_model_parallel_world_size()
        self.hidden_size_per_partition = mpu.divide(projection_size,
                                                    world_size)
        self.hidden_size_per_attention_head = mpu.divide(
            projection_size, args.num_attention_heads)
        self.num_attention_heads_per_partition = mpu.divide(
            args.num_attention_heads, world_size)

        # Strided linear layer.
        if attention_type == AttnType.self_attn:
            self.query_key_value = mpu.ColumnParallelLinear(
                args.hidden_size,
                3 * projection_size,
                gather_output=False,
                init_method=init_method)
        else:
            assert attention_type == AttnType.cross_attn
            self.query = mpu.ColumnParallelLinear(
                args.hidden_size,
                projection_size,
                gather_output=False,
                init_method=init_method)

            self.key_value = mpu.ColumnParallelLinear(
                args.hidden_size,
                2 * projection_size,
                gather_output=False,
                init_method=init_method)

        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            coeff = self.layer_number
            self.norm_factor *= coeff

        self.scale_mask_softmax = FusedScaleMaskSoftmax(
            self.fp16, self.bf16,
            self.attn_mask_type,
            args.masked_softmax_fusion,
            attention_mask_func,
            self.attention_softmax_in_fp32,
            coeff)

        # Dropout. Note that for a single iteration, this layer will generate
        # different outputs on different number of parallel partitions but
        # on average it should not be partition dependent.
        self.attention_dropout = torch.nn.Dropout(args.attention_dropout)

        # Output.
        self.dense = mpu.RowParallelLinear(
            projection_size,
            args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method,
            skip_bias_add=True)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

        if self.position_embedding_type == PositionEmbeddingType.rotary:
            self.rotary_emb = RotaryEmbedding(
                self.hidden_size_per_attention_head,
                precision=args.params_dtype)

    def forward(self, hidden_states, attention_mask, layer_past=None,
                get_key_value=False, encoder_output=None, alibi=None):
        # hidden_states: [sq, b, h]

        # =====================
        # Query, Key, and Value
        # =====================

        if self.attention_type == AttnType.self_attn:
            # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
            mixed_x_layer, _ = self.query_key_value(hidden_states)

            # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
            new_tensor_shape = mixed_x_layer.size()[:-1] + \
                (self.num_attention_heads_per_partition,
                 3 * self.hidden_size_per_attention_head)
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
            (query_layer,
             key_layer,
             value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3)
        else:
            # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
            mixed_kv_layer, _ = self.key_value(encoder_output)

            # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
            new_tensor_shape = mixed_kv_layer.size()[:-1] + \
                (self.num_attention_heads_per_partition,
                 2 * self.hidden_size_per_attention_head)
            mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)

            # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
            (key_layer,
             value_layer) = mpu.split_tensor_along_last_dim(mixed_kv_layer, 2)

            # Attention head [sq, b, h] --> [sq, b, hp]
            query_layer, _ = self.query(hidden_states)
            # [sq, b, hp] --> [sq, b, np, hn]
            new_tensor_shape = query_layer.size()[:-1] + \
                (self.num_attention_heads_per_partition,
                 self.hidden_size_per_attention_head)
            query_layer = query_layer.view(*new_tensor_shape)

        # ==================================
        # Adjust key and value for inference
        # ==================================

        if layer_past is not None:
            past_key, past_value = layer_past
            key_layer = torch.cat((past_key.type_as(key_layer),
                                   key_layer), dim=0)
            value_layer = torch.cat((past_value.type_as(value_layer),
                                     value_layer), dim=0)
        if get_key_value:
            present = (key_layer, value_layer)

        # ===================================
        # Raw attention scores. [b, np, s, s]
        # ===================================

        # [b, np, sq, sk]
        output_size = (query_layer.size(1),
                       query_layer.size(2),
                       query_layer.size(0),
                       key_layer.size(0))

        # [sq, b, np, hn] -> [sq, b * np, hn]
        query_layer = query_layer.view(output_size[2],
                                       output_size[0] * output_size[1], -1)
        # [sk, b, np, hn] -> [sk, b * np, hn]
        key_layer = key_layer.view(output_size[3],
                                   output_size[0] * output_size[1], -1)

        # preallocating result tensor: [b * np, sq, sk]
        if alibi is None:
            matmul_result = torch.empty(
                output_size[0] * output_size[1],
                output_size[2],
                output_size[3],
                dtype=query_layer.dtype,
                device=torch.cuda.current_device())
        else:
            matmul_result = alibi[:output_size[0] * output_size[1], :, :output_size[3]]

        # Rotary embeddings
        if self.position_embedding_type == PositionEmbeddingType.rotary:
            apply_rotary_fn = apply_rotary_pos_emb_torch if self.bf16 else apply_rotary_pos_emb

            seq_len = key_layer.shape[0]
            offset = 0
            if layer_past is not None and layer_past.numel() > 0:
                offset = layer_past[0].shape[0]
                seq_len += offset
            cos, sin = self.rotary_emb(value_layer, seq_len=seq_len)
            query_layer, key_layer = apply_rotary_fn(query_layer, key_layer,
                                                     cos, sin, offset=offset)

        # Raw attention scores. [b * np, sq, sk]
        if alibi is None:
            matmul_result = torch.baddbmm(
                matmul_result,
                query_layer.transpose(0, 1),                 # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),   # [b * np, hn, sk]
                beta=0.0, alpha=(1.0 / self.norm_factor))
        else:
            if not hasattr(self, "logged_alibi"):
                logger.debug("Using Alibi.")
                self.logged_alibi = True

            if self.apply_query_key_layer_scaling:
                beta = 1.0 / self.layer_number
            else:
                beta = 1.0

            matmul_result = torch.baddbmm(
                matmul_result,
                query_layer.transpose(0, 1),                 # [b * np, sq, hn]
                key_layer.transpose(0, 1).transpose(1, 2),   # [b * np, hn, sk]
                beta=beta, alpha=(1.0 / self.norm_factor))

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)

        # ==================================================
        # Update attention mask for inference. [b, np, sq, sk]
        # ==================================================

        if get_key_value:
            with torch.no_grad():
                # TODO @thomasw21 Handle case where `attention_mask` is None
                if layer_past is not None:
                    attention_mask = attention_mask[
                        ...,
                        attention_scores.size(3) - 1,
                        :attention_scores.size(3)].unsqueeze(2)
                else:
                    attention_mask = attention_mask[
                        ...,
                        :attention_scores.size(3),
                        :attention_scores.size(3)]

        # ===========================
        # Attention probs and dropout
        # ===========================

        # attention scores and attention mask [b, np, sq, sk]
        attention_probs = self.scale_mask_softmax(attention_scores,
                                                  attention_mask)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        with mpu.get_cuda_rng_tracker().fork():
            attention_probs = self.attention_dropout(attention_probs)

        # =========================
        # Context layer. [sq, b, hp]
        # =========================

        # value_layer -> context layer.
        # [sk, b, np, hn] --> [b, np, sq, hn]

        # context layer shape: [b, np, sq, hn]
        output_size = (value_layer.size(1),
                       value_layer.size(2),
                       query_layer.size(0),
                       value_layer.size(3))

        # change view [sk, b * np, hn]
        value_layer = value_layer.view(value_layer.size(0),
                                       output_size[0] * output_size[1], -1)

        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(output_size[0] * output_size[1],
                                               output_size[2], -1)

        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))

        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*output_size)

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, hp]
        new_context_layer_shape = context_layer.size()[:-2] + \
            (self.hidden_size_per_partition,)
        context_layer = context_layer.view(*new_context_layer_shape)

        # =================
        # Output. [sq, b, h]
        # =================

        output, bias = self.dense(context_layer)

        if get_key_value:
            output = [output, present]

        return output, bias
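The score computation above folds batch and heads into one leading dimension so a single baddbmm covers every head at once. A standalone shape sketch of that fold with made-up sizes:

import math
import torch

sq, sk, b, np_, hn = 5, 5, 2, 3, 4
query = torch.randn(sq, b, np_, hn)
key = torch.randn(sk, b, np_, hn)

q = query.view(sq, b * np_, hn).transpose(0, 1)                 # [b * np, sq, hn]
k = key.view(sk, b * np_, hn).transpose(0, 1).transpose(1, 2)   # [b * np, hn, sk]

out = torch.empty(b * np_, sq, sk)
# beta=0.0 means the preallocated buffer only supplies the shape
scores = torch.baddbmm(out, q, k, beta=0.0, alpha=1.0 / math.sqrt(hn))
print(scores.view(b, np_, sq, sk).shape)                        # torch.Size([2, 3, 5, 5])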
def bias_dropout_add(x, bias, residual, prob, training):
    # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    out = residual + out
    return out


def get_bias_dropout_add(training):
    def _bias_dropout_add(x, bias, residual, prob):
        return bias_dropout_add(x, bias, residual, prob, training)
    return _bias_dropout_add


@torch.jit.script
def bias_dropout_add_fused_train(x, bias, residual, prob):
    # type: (Tensor, Tensor, Tensor, float) -> Tensor
    return bias_dropout_add(x, bias, residual, prob, True)


@torch.jit.script
def bias_dropout_add_fused_inference(x, bias, residual, prob):
    # type: (Tensor, Tensor, Tensor, float) -> Tensor
    return bias_dropout_add(x, bias, residual, prob, False)
class ParallelTransformerLayer(MegatronModule):
    """A single transformer layer.

    Transformer layer takes input with size [b, s, h] and returns an
    output of the same size.
    """

    def __init__(self, init_method, output_layer_init_method,
                 layer_number, layer_type=LayerType.encoder,
                 self_attn_mask_type=AttnMaskType.padding):
        args = get_args()

        super(ParallelTransformerLayer, self).__init__()
        self.layer_number = layer_number
        self.layer_type = layer_type

        self.apply_residual_connection_post_layernorm \
            = args.apply_residual_connection_post_layernorm

        self.bf16 = args.bf16
        self.fp32_residual_connection = args.fp32_residual_connection

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

        # Self attention.
        self.self_attention = ParallelAttention(
            init_method,
            output_layer_init_method,
            layer_number,
            attention_type=AttnType.self_attn,
            attn_mask_type=self_attn_mask_type)
        self.hidden_dropout = args.hidden_dropout
        self.bias_dropout_fusion = args.bias_dropout_fusion

        # Layernorm on the attention output
        self.post_attention_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)

        if self.layer_type == LayerType.decoder:
            self.inter_attention = ParallelAttention(
                init_method,
                output_layer_init_method,
                layer_number,
                attention_type=AttnType.cross_attn)
            # Layernorm on the attention output.
            self.post_inter_attention_layernorm = LayerNorm(
                args.hidden_size,
                eps=args.layernorm_epsilon)

        # MLP
        self.mlp = ParallelMLP(init_method,
                               output_layer_init_method)

        # Alibi
        if args.position_embedding_type == PositionEmbeddingType.alibi:
            self.alibi = self._build_alibi_tensor(
                args.seq_length,
                args.num_attention_heads,
                args.micro_batch_size).to(torch.cuda.current_device())
            if args.params_dtype == torch.float16:
                self.alibi = self.alibi.to(torch.float16)
            elif args.params_dtype == torch.bfloat16:
                self.alibi = self.alibi.to(torch.bfloat16)
        else:
            self.alibi = None

    def forward(self, hidden_states, attention_mask,
                encoder_output=None, enc_dec_attn_mask=None,
                layer_past=None, get_key_value=False):
        # hidden_states: [b, s, h]

        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        # Self attention.
        attention_output, attention_bias = \
            self.self_attention(layernorm_output,
                                attention_mask,
                                layer_past=layer_past,
                                get_key_value=get_key_value,
                                alibi=self.alibi)

        if get_key_value:
            attention_output, presents = attention_output

        # Residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        # jit scripting for a nn.module (with dropout) is not
        # triggering the fusion kernel. For now, we use two
        # different nn.functional routines to account for varying
        # dropout semantics during training and inference phases.
        if self.bias_dropout_fusion:
            if self.training:
                bias_dropout_add_func = bias_dropout_add_fused_train
            else:
                bias_dropout_add_func = bias_dropout_add_fused_inference
        else:
            bias_dropout_add_func = get_bias_dropout_add(self.training)

        # re-enable torch grad to enable fused optimization.
        with torch.enable_grad():
            layernorm_input = bias_dropout_add_func(
                attention_output,
                attention_bias.expand_as(residual),
                residual,
                self.hidden_dropout)

        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)

        if self.layer_type == LayerType.decoder:
            attention_output, attention_bias = \
                self.inter_attention(layernorm_output,
                                     enc_dec_attn_mask,
                                     encoder_output=encoder_output)
            # residual connection
            if self.apply_residual_connection_post_layernorm:
                residual = layernorm_output
            else:
                residual = layernorm_input

            # re-enable torch grad to enable fused optimization.
            with torch.enable_grad():
                layernorm_input = bias_dropout_add_func(
                    attention_output,
                    attention_bias.expand_as(residual),
                    residual,
                    self.hidden_dropout)

            # Layer norm post the decoder attention
            layernorm_output = self.post_inter_attention_layernorm(layernorm_input)

        # MLP.
        mlp_output, mlp_bias = self.mlp(layernorm_output)

        # Second residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        # re-enable torch grad to enable fused optimization.
        with torch.enable_grad():
            output = bias_dropout_add_func(
                mlp_output,
                mlp_bias.expand_as(residual),
                residual,
                self.hidden_dropout)

        if get_key_value:
            output = [output, presents]

        return output

    @staticmethod
    def _build_alibi_tensor(max_seq_len, num_attention_heads, batch_size):
        # Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
        """Returns tensor shaped (batch_size * num_attention_heads, 1, max_seq_len)"""

        def get_slopes(n):
            def get_slopes_power_of_2(n):
                start = (2 ** (-2 ** -(math.log2(n) - 3)))
                ratio = start
                return [start * ratio ** i for i in range(n)]

            if math.log2(n).is_integer():
                return get_slopes_power_of_2(n)
            else:
                closest_power_of_2 = 2 ** math.floor(math.log2(n))
                return get_slopes_power_of_2(closest_power_of_2) + \
                    get_slopes(2 * closest_power_of_2)[0::2][:n - closest_power_of_2]

        slopes = torch.Tensor(get_slopes(num_attention_heads))
        alibi = slopes.unsqueeze(1).unsqueeze(1) * \
            torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand(
                num_attention_heads, -1, -1)

        # Select the part of the tensor that corresponds to our tensor parallel index.
        tp_world_size = mpu.get_tensor_model_parallel_world_size()
        tp_index = mpu.get_tensor_model_parallel_rank()
        alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index]

        alibi = alibi.repeat(batch_size, 1, 1)
        return alibi
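The nested get_slopes above assigns each head a geometrically decaying slope that multiplies the token distance. A standalone worked example for a power-of-two head count, reproducing only the slope formula:

import math
import torch

def slopes_power_of_2(n):
    # same formula as get_slopes_power_of_2 above
    start = 2 ** (-2 ** -(math.log2(n) - 3))
    return [start * start ** i for i in range(n)]

heads, seq = 8, 5
slopes = torch.tensor(slopes_power_of_2(heads))     # 1/2, 1/4, ..., 1/256
bias = slopes[:, None, None] * torch.arange(seq)[None, None, :]
print(slopes.tolist())   # [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625]
print(bias.shape)        # torch.Size([8, 1, 5]) -- added to attention scores per head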
class ParallelTransformerLayerPipe(ParallelTransformerLayer):
    """Extends ParallelTransformerLayer to forward attention_mask through the pipeline.

    Forward has two usages that affect attention mask communication:

    1) forward((input, attn_mask) , **kwargs) -> (output, mask)
       When the attention mask is provided as the second positional
       argument, typical pipeline behavior is used and both the output
       *and* mask are returned in a tuple. This tuple is then forwarded
       to the next stage in the pipeline.

       This version is useful if masks are dynamic.

    2) forward(input, **kwargs) -> output
       When the mask is static over all samples, it is advantageous to
       cache the mask and avoid communicating it.
    """
    def forward(self, inputs, **kwargs):
        assert torch.is_tensor(inputs) or isinstance(inputs, tuple)
        if torch.is_tensor(inputs) or len(inputs) == 1:
            hidden_states, attention_mask = inputs, None
            return super().forward(hidden_states, attention_mask, **kwargs)
        elif len(inputs) == 2:
            # Attention mask is an activation.
            hidden_states, attention_mask = inputs[0], inputs[1]
            return super().forward(*inputs, **kwargs), attention_mask
        else:
            raise RuntimeError('Received more inputs than understood.')
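The two documented usages amount to a small dispatch rule on the type of `inputs`. Below is a minimal sketch (not part of the commit) that mimics that rule with a toy layer so it can run without a Megatron setup; the `_Toy*` names are hypothetical and the single-element-tuple branch is omitted for brevity.

# Standalone sketch: the tensor-vs-tuple dispatch used by the pipe wrapper.
import torch

class _ToyLayer:
    def forward(self, hidden_states, attention_mask):
        # Stand-in for ParallelTransformerLayer.forward.
        return hidden_states + 1

class _ToyLayerPipe(_ToyLayer):
    def forward(self, inputs, **kwargs):
        if torch.is_tensor(inputs):
            # Usage 2: static mask, only the activations move through the pipe.
            return super().forward(inputs, None)
        if isinstance(inputs, tuple) and len(inputs) == 2:
            # Usage 1: dynamic mask travels with the activations.
            hidden_states, attention_mask = inputs
            return super().forward(hidden_states, attention_mask), attention_mask
        raise RuntimeError('Received more inputs than understood.')

layer = _ToyLayerPipe()
x = torch.zeros(2, 3)
mask = torch.ones(2, 2, dtype=torch.bool)
print(layer.forward(x).shape)              # usage 2 -> tensor only
out, fwd_mask = layer.forward((x, mask))   # usage 1 -> (output, mask) tuple
print(out.shape, fwd_mask.shape)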
class ParallelTransformer(MegatronModule):
    """Transformer class."""

    def __init__(self, init_method, output_layer_init_method,
                 layer_type=LayerType.encoder,
                 self_attn_mask_type=AttnMaskType.padding,
                 pre_process=True, post_process=True):
        super(ParallelTransformer, self).__init__()
        args = get_args()

        self.bf16 = args.bf16
        self.fp32_residual_connection = args.fp32_residual_connection
        self.pre_process = pre_process
        self.post_process = post_process
        self.input_tensor = None

        # Store activation checkpointing flag.
        self.checkpoint_activations = args.checkpoint_activations
        self.checkpoint_num_layers = args.checkpoint_num_layers

        # Number of layers.
        assert args.num_layers % mpu.get_pipeline_model_parallel_world_size() == 0, \
            'num_layers must be divisible by pipeline_model_parallel_size'
        self.num_layers = args.num_layers // mpu.get_pipeline_model_parallel_world_size()

        # Transformer layers.
        def build_layer(layer_number):
            return ParallelTransformerLayer(
                init_method,
                output_layer_init_method,
                layer_number,
                layer_type=layer_type,
                self_attn_mask_type=self_attn_mask_type)

        if args.virtual_pipeline_model_parallel_size is not None:
            assert args.num_layers % args.virtual_pipeline_model_parallel_size == 0, \
                'num_layers_per_stage must be divisible by ' \
                'virtual_pipeline_model_parallel_size'
            # Number of layers in each model chunk is the number of layers in the stage,
            # divided by the number of model chunks in a stage.
            self.num_layers = self.num_layers // args.virtual_pipeline_model_parallel_size
            # With 8 layers, 2 stages, and 4 model chunks, we want an assignment of
            # layers to stages like (each list is a model chunk):
            # Stage 0: [0]  [2]  [4]  [6]
            # Stage 1: [1]  [3]  [5]  [7]
            # With 8 layers, 2 stages, and 2 virtual stages, we want an assignment of
            # layers to stages like (each list is a model chunk):
            # Stage 0: [0, 1]  [4, 5]
            # Stage 1: [2, 3]  [6, 7]
            offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
                args.num_layers // args.virtual_pipeline_model_parallel_size) + \
                (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
        else:
            # Each stage gets a contiguous set of layers.
            offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers

        self.layers = torch.nn.ModuleList(
            [build_layer(i + 1 + offset) for i in range(self.num_layers)])

        if self.post_process:
            # Final layer norm before output.
            self.final_layernorm = LayerNorm(
                args.hidden_size,
                eps=args.layernorm_epsilon)

        if deepspeed.checkpointing.is_configured():
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _get_layer(self, layer_number):
        return self.layers[layer_number]

    def _checkpointed_forward(self, hidden_states, attention_mask,
                              encoder_output, enc_dec_attn_mask):
        """Forward method with activation checkpointing."""
        def custom(start, end):
            def custom_forward(*inputs):
                x_ = inputs[0]
                attention_mask = inputs[1]
                encoder_output = inputs[2]
                enc_dec_attn_mask = inputs[3]
                for index in range(start, end):
                    layer = self._get_layer(index)
                    x_ = layer(x_, attention_mask, encoder_output, enc_dec_attn_mask)
                return x_
            return custom_forward

        # Make sure memory is freed.
        mpu.reset_checkpointed_activations_memory_buffer()
        l = 0
        while l < self.num_layers:
            hidden_states = mpu.checkpoint(
                custom(l, l + self.checkpoint_num_layers),
                hidden_states, attention_mask, encoder_output, enc_dec_attn_mask)
            l += self.checkpoint_num_layers

        return hidden_states

    def set_input_tensor(self, input_tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by the
        forward_step_func"""
        self.input_tensor = input_tensor

    def forward(self, hidden_states, attention_mask, layer_past=None,
                get_key_value=False, encoder_output=None, enc_dec_attn_mask=None):

        # Checks.
        if layer_past is not None:
            assert get_key_value, \
                'for not None values in layer_past, ' \
                'expected get_key_value to be set'
        if get_key_value:
            assert not self.checkpoint_activations, \
                'get_key_value does not work with ' \
                'activation checkpointing'

        if self.pre_process:
            # Data format change to avoid explicit transposes: [b s h] --> [s b h].
            # If the input flag for fp32 residual connection is set, convert for float.
            if self.fp32_residual_connection:
                hidden_states = hidden_states.transpose(0, 1).contiguous().float()
            # Otherwise, leave it as is.
            else:
                hidden_states = hidden_states.transpose(0, 1).contiguous()
        else:
            # See set_input_tensor()
            hidden_states = self.input_tensor

        if encoder_output is not None:
            encoder_output = encoder_output.transpose(0, 1).contiguous()

        if self.checkpoint_activations:
            hidden_states = self._checkpointed_forward(hidden_states,
                                                       attention_mask,
                                                       encoder_output,
                                                       enc_dec_attn_mask)
        else:
            if get_key_value:
                presents = []
            for index in range(self.num_layers):
                layer = self._get_layer(index)
                past = None
                if layer_past is not None:
                    past = layer_past[index]
                hidden_states = layer(hidden_states,
                                      attention_mask,
                                      encoder_output=encoder_output,
                                      enc_dec_attn_mask=enc_dec_attn_mask,
                                      layer_past=past,
                                      get_key_value=get_key_value)
                if get_key_value:
                    hidden_states, present = hidden_states
                    presents.append(present)

        # Final layer norm.
        if self.post_process:
            # Reverting data format change [s b h] --> [b s h].
            hidden_states = hidden_states.transpose(0, 1).contiguous()
            output = self.final_layernorm(hidden_states)
        else:
            output = hidden_states
        if get_key_value:
            output = [output, presents]

        return output
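The layer-to-stage assignment described in the comments above follows directly from the offset arithmetic in `__init__`. This short standalone sketch (not part of the commit) evaluates that arithmetic for the commented example of 8 layers, 2 pipeline stages and 2 interleaved model chunks per stage.

# Standalone sketch: layer offsets for the interleaved (virtual pipeline) case.
num_layers = 8
pipeline_world_size = 2
virtual_size = 2
layers_per_chunk = num_layers // pipeline_world_size // virtual_size  # 2

for stage in range(pipeline_world_size):
    chunks = []
    for virtual_rank in range(virtual_size):
        offset = virtual_rank * (num_layers // virtual_size) + stage * layers_per_chunk
        chunks.append(list(range(offset, offset + layers_per_chunk)))
    print(f"Stage {stage}: {chunks}")
# Stage 0: [[0, 1], [4, 5]]
# Stage 1: [[2, 3], [6, 7]]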
megatron-deepspeed_dtk22.10/megatron/model/utils.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for models."""

import math
from functools import wraps

import torch

from megatron import get_args


def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""
    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
    return init_


def scaled_init_method_normal(sigma, num_layers):
    """Init method based on N(0, sigma/sqrt(2*num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)
    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
    return init_


def attention_mask_func(attention_scores, attention_mask):
    args = get_args()
    if args.curriculum_learning:
        attention_mask_ = attention_mask
        actual_seqlen = attention_scores.size()[2]
        if actual_seqlen != attention_mask_.size()[2]:
            # attention_mask has size [1, 1, seqlen, seqlen]
            attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous()
        attention_scores.masked_fill_(attention_mask_, torch.finfo(attention_scores.dtype).min)
    else:
        attention_scores.masked_fill_(attention_mask, torch.finfo(attention_scores.dtype).min)
    return attention_scores


def get_linear_layer(rows, columns, init_method):
    """Simple linear layer with weight initialization."""
    layer = torch.nn.Linear(rows, columns)
    init_method(layer.weight)
    with torch.no_grad():
        layer.bias.zero_()
    return layer


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)


# This is actually the Python equivalent of torch.nn.functional.gelu(), also with type hints for the ONNX exporter
@torch.jit.script
def erf_gelu(x):
    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype))


def log_debug_usage(logger, msg: str):
    def log_debug_usage_(func):
        """Helper function in order to log a message when using a function for the first time"""
        func.__logged_message__ = False

        @wraps(func)
        def wrapped(*args, **kwargs):
            if func.__logged_message__ is False:
                logger.debug(msg)
                func.__logged_message__ = True
            return func(*args, **kwargs)

        return wrapped
    return log_debug_usage_
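`log_debug_usage` is a decorator factory: it emits the given message only the first time the wrapped function is called. A minimal usage sketch (not part of the commit; the function name and logger are illustrative, and the import is only needed outside this file):

# Standalone sketch: first-call-only logging with log_debug_usage.
import logging
# from megatron.model.utils import log_debug_usage  # when used outside utils.py

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("example")

@log_debug_usage(logger, "Using toy_activation for the first time")
def toy_activation(x):
    return x * x

toy_activation(2.0)   # emits the debug message once
toy_activation(3.0)   # silent: func.__logged_message__ is already True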
megatron-deepspeed_dtk22.10/megatron/model/vit_model.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Vision Transformer (ViT) model."""

import math
import einops
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import (
    get_linear_layer,
    init_method_normal,
    scaled_init_method_normal,
)
from .module import MegatronModule


class VitMlpHead(MegatronModule):
    """Pooler layer.

    Pool hidden states of a specific token (for example start of the
    sequence) and add a linear transformation followed by a tanh.

    Arguments:
        hidden_size: hidden size
        init_method: weight initialization method for the linear layer.
            bias is set to zero.
    """

    def __init__(self, hidden_size, num_classes):
        super(VitMlpHead, self).__init__()
        self.dense_in = torch.nn.Linear(hidden_size, hidden_size)
        self.dense_out = torch.nn.Linear(hidden_size, num_classes)
        torch.nn.init.constant_(self.dense_out.bias, -10)

    def forward(self, hidden_states, sequence_index=0):
        # hidden_states: [b, s, h]
        # sequence_index: index of the token to pool.
        x = hidden_states[:, sequence_index, :]
        x = self.dense_in(x)
        x = torch.tanh(x)
        x = self.dense_out(x)
        return x


def twod_interpolate_position_embeddings_hook(
    state_dict,
    prefix,
    local_metadata,
    strict,
    missing_keys,
    unexpected_keys,
    error_msgs,
):
    args = get_args()
    num_patches_per_dim = args.img_dim // args.patch_dim
    num_patches = num_patches_per_dim ** 2
    seq_length = num_patches + 1
    hidden_size = args.hidden_size

    key = prefix + "weight"
    # import pdb
    # pdb.set_trace()

    assert key in state_dict
    if key in state_dict:
        input_param = state_dict[key]

        assert input_param.shape[1] == hidden_size
        if input_param.shape[0] != seq_length:
            # update input_param and load it to state_dict[key]
            num_tok_input = input_param.shape[0] - 1
            num_tok_new = seq_length - 1
            input_param_tok, input_param_grid = (
                input_param[:1, :],
                input_param[1:, :],
            )

            gs_input = int(math.sqrt(num_tok_input))
            gs_new = int(math.sqrt(num_tok_new))

            input_param_grid = input_param_grid.transpose(0, 1).contiguous()
            input_param_grid = input_param_grid.reshape(
                (1, -1, gs_input, gs_input)
            )
            input_param_grid = input_param_grid.float()
            scale_factor = gs_new / gs_input

            input_param_grid = F.interpolate(
                input_param_grid, scale_factor=scale_factor, mode="bilinear"
            )

            input_param_grid = input_param_grid.half()
            input_param_grid = input_param_grid.reshape((-1, gs_new * gs_new))
            input_param_grid = input_param_grid.transpose(0, 1).contiguous()

            assert input_param_grid.shape[1] == hidden_size
            input_param = torch.cat((input_param_tok, input_param_grid), dim=0)
            assert (
                input_param.shape[0] == seq_length
                and input_param.shape[1] == hidden_size
            )

            state_dict[key] = input_param


class VitModel(MegatronModule):
    """Vision Transformer Model."""

    def __init__(self, num_classes, finetune=False):
        super(VitModel, self).__init__()
        args = get_args()

        self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
        if args.init_method_xavier_uniform:
            self.init_method = torch.nn.init.xavier_uniform_
            self.scaled_init_method = torch.nn.init.xavier_uniform_
        else:
            self.init_method = init_method_normal(args.init_method_std)
            self.scaled_init_method = scaled_init_method_normal(
                args.init_method_std, args.num_layers
            )

        self.hidden_size = args.hidden_size
        self.num_classes = num_classes
        self.patch_dim = args.patch_dim
        self.img_dim = args.img_dim
        self.finetune = finetune

        assert self.img_dim % self.patch_dim == 0
        self.num_patches_per_dim = self.img_dim // self.patch_dim
        self.num_patches = self.num_patches_per_dim ** 2
        self.seq_length = self.num_patches + 1
        self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels

        # cls_token
        self.cls_token = torch.nn.Parameter(torch.randn(1, 1, self.hidden_size))
        torch.nn.init.zeros_(self.cls_token)

        # Linear encoder
        self.linear_encoder = torch.nn.Linear(self.flatten_dim, self.hidden_size)

        # embedding
        self.position_embeddings = torch.nn.Embedding(
            self.seq_length, self.hidden_size
        )
        init_method_normal(args.init_method_std)(self.position_embeddings.weight)
        self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda()

        self.position_embeddings._register_load_state_dict_pre_hook(
            twod_interpolate_position_embeddings_hook
        )

        self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout)

        # Transformer
        self.transformer = ParallelTransformer(
            self.init_method, self.scaled_init_method
        )

        # MLP head
        if not self.finetune:
            self.mlp_head = VitMlpHead(self.hidden_size, self.num_classes)
        else:
            self.class_head = get_linear_layer(
                self.hidden_size, num_classes, torch.nn.init.zeros_
            )

    def forward(self, x):
        x = einops.rearrange(
            x,
            "b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
            p1=self.patch_dim,
            p2=self.patch_dim,
        )

        assert x.dtype == torch.half
        x = self.linear_encoder(x)

        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        x = x + self.position_embeddings(self.position_ids)
        x = self.embedding_dropout(x)
        x = self.transformer(x, None)

        if not self.finetune:
            x = self.mlp_head(x)
        else:
            x = self.class_head(x[:, 0, :])

        return x
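The `twod_interpolate_position_embeddings_hook` above mostly does shape bookkeeping: it peels off the class token, reshapes the remaining grid tokens into a 2-D grid, bilinearly resizes it, and concatenates the class token back. The following standalone sketch (not part of the commit; the concrete image and patch sizes are illustrative) walks through those shapes on CPU.

# Standalone sketch: resizing a ViT position-embedding table to a new image size.
import math
import torch
import torch.nn.functional as F

hidden_size = 8
old_img_dim, new_img_dim, patch_dim = 64, 128, 16
num_tok_input = (old_img_dim // patch_dim) ** 2    # 16 grid tokens (+1 cls)
num_tok_new = (new_img_dim // patch_dim) ** 2      # 64 grid tokens (+1 cls)

input_param = torch.randn(num_tok_input + 1, hidden_size)
tok, grid = input_param[:1], input_param[1:]

gs_in, gs_new = int(math.sqrt(num_tok_input)), int(math.sqrt(num_tok_new))  # 4 -> 8
grid = grid.transpose(0, 1).reshape(1, hidden_size, gs_in, gs_in)
grid = F.interpolate(grid, scale_factor=gs_new / gs_in, mode="bilinear")
grid = grid.reshape(hidden_size, gs_new * gs_new).transpose(0, 1)

resized = torch.cat((tok, grid), dim=0)
print(resized.shape)   # torch.Size([65, 8]) == (num_patches + 1, hidden_size)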
megatron-deepspeed_dtk22.10/megatron/mpu/__init__.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Model parallel utility interface."""

from .cross_entropy import vocab_parallel_cross_entropy

from .data import broadcast_data

from .initialize import is_unitialized
from .initialize import destroy_model_parallel
from .initialize import get_data_parallel_group
from .initialize import get_data_parallel_rank
from .initialize import get_data_parallel_world_size
from .initialize import get_embedding_group
from .initialize import get_model_parallel_group
from .initialize import get_tensor_model_parallel_group
from .initialize import get_pipeline_model_parallel_group
from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank
from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank
from .initialize import is_pipeline_first_stage, is_pipeline_last_stage
from .initialize import get_tensor_model_parallel_src_rank
from .initialize import get_pipeline_model_parallel_first_rank
from .initialize import get_pipeline_model_parallel_last_rank
from .initialize import get_pipeline_model_parallel_next_rank
from .initialize import get_pipeline_model_parallel_prev_rank
from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size
from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size
from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank
from .initialize import initialize_model_parallel
from .initialize import model_parallel_is_initialized
from .initialize import get_model_parallel_world_size, get_model_parallel_rank

from .layers import ColumnParallelLinear
from .layers import RowParallelLinear
from .layers import VocabParallelEmbedding
from .layers import (set_tensor_model_parallel_attributes,
                     set_defaults_if_not_set_tensor_model_parallel_attributes,
                     copy_tensor_model_parallel_attributes)

from .mappings import copy_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region

from .random import checkpoint
from .random import get_cuda_rng_tracker
from .random import init_checkpointed_activations_memory_buffer
from .random import model_parallel_cuda_manual_seed
from .random import reset_checkpointed_activations_memory_buffer
from .random import gather_split_1d_tensor
from .random import split_tensor_into_1d_equal_chunks

from .utils import divide
from .utils import split_tensor_along_last_dim
megatron-deepspeed_dtk22.10/megatron/mpu/cross_entropy.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from .initialize import get_tensor_model_parallel_group
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size
from .utils import VocabUtility


class _VocabParallelCrossEntropy(torch.autograd.Function):

    @staticmethod
    def forward(ctx, vocab_parallel_logits, target):

        # Maximum value along vocab dimension across all GPUs.
        logits_max = torch.max(vocab_parallel_logits, dim=-1)[0]
        torch.distributed.all_reduce(logits_max,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=get_tensor_model_parallel_group())
        # Subtract the maximum value.
        vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))

        # Get the partition's vocab indices.
        get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_tensor_model_parallel_rank()
        world_size = get_tensor_model_parallel_world_size()
        vocab_start_index, vocab_end_index = get_vocab_range(
            partition_vocab_size, rank, world_size)

        # Create a mask of valid vocab ids (1 means it needs to be masked).
        target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
        masked_target = target.clone() - vocab_start_index
        masked_target[target_mask] = 0

        # Get predicted-logits = logits[target].
        # For simplicity, we convert logits to a 2-D tensor with size
        # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
        logits_2d = vocab_parallel_logits.view(-1, partition_vocab_size)
        masked_target_1d = masked_target.view(-1)
        arange_1d = torch.arange(start=0, end=logits_2d.size()[0],
                                 device=logits_2d.device)
        predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
        predicted_logits_1d = predicted_logits_1d.clone().contiguous()
        predicted_logits = predicted_logits_1d.view_as(target)
        predicted_logits[target_mask] = 0.0
        # All reduce is needed to get the chunks from other GPUs.
        torch.distributed.all_reduce(predicted_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Sum of exponential of logits along vocab dimension across all GPUs.
        exp_logits = vocab_parallel_logits
        torch.exp(vocab_parallel_logits, out=exp_logits)
        sum_exp_logits = exp_logits.sum(dim=-1)
        torch.distributed.all_reduce(sum_exp_logits,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_tensor_model_parallel_group())

        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits

        # Store softmax, target-mask and masked-target for backward pass.
        exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)

        return loss

    @staticmethod
    def backward(ctx, grad_output):

        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors

        # All the inputs have softmax as their gradient.
        grad_input = softmax
        # For simplicity, work with the 2D gradient.
        partition_vocab_size = softmax.size()[-1]
        grad_2d = grad_input.view(-1, partition_vocab_size)

        # Add the gradient from matching classes.
        arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
                                 device=grad_2d.device)
        grad_2d[arange_1d, masked_target_1d] -= (
            1.0 - target_mask.view(-1).float())

        # Finally elementwise multiplication with the output gradients.
        grad_input.mul_(grad_output.unsqueeze(dim=-1))

        return grad_input, None


def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
    """Helper function for the cross entropy."""
    return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
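The per-token loss computed above is the usual softmax cross entropy, just assembled from partition-local pieces and all-reduces. In the degenerate single-partition case the formula reduces to `log(sum(exp(logits))) - logits[target]`, which the following standalone sketch (not part of the commit, no distributed setup required) checks against `torch.nn.functional.cross_entropy`.

# Standalone sketch: the loss formula for a single vocab partition.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(4, 10)            # [tokens, vocab]
target = torch.randint(0, 10, (4,))

shifted = logits - logits.max(dim=-1, keepdim=True)[0]   # the max-subtraction step
predicted = shifted[torch.arange(4), target]
loss = torch.log(torch.exp(shifted).sum(dim=-1)) - predicted

reference = F.cross_entropy(logits, target, reduction='none')
print(torch.allclose(loss, reference, atol=1e-6))   # True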
megatron-deepspeed_dtk22.10/megatron/mpu/data.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from .initialize import get_tensor_model_parallel_group
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_src_rank


_MAX_DATA_DIM = 5


def _check_data_types(keys, data, target_dtype):
    """Check that all the keys have the same target data type."""
    for key in keys:
        assert data[key].dtype == target_dtype, '{} has data type {} which ' \
            'is different than {}'.format(key, data[key].dtype, target_dtype)


def _build_key_size_numel_dictionaries(keys, data):
    """Build the size on rank 0 and broadcast."""
    max_dim = _MAX_DATA_DIM
    sizes = [0 for _ in range(max_dim) for _ in keys]

    # Pack the sizes on rank zero.
    if get_tensor_model_parallel_rank() == 0:
        offset = 0
        for key in keys:
            assert data[key].dim() < max_dim, 'you should increase MAX_DATA_DIM'
            size = data[key].size()
            for i, s in enumerate(size):
                sizes[i + offset] = s
            offset += max_dim

    # Move to GPU and broadcast.
    sizes_cuda = torch.cuda.LongTensor(sizes)
    torch.distributed.broadcast(sizes_cuda, get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Move back to cpu and unpack.
    sizes_cpu = sizes_cuda.cpu()
    key_size = {}
    key_numel = {}
    total_numel = 0
    offset = 0
    for key in keys:
        i = 0
        size = []
        numel = 1
        while sizes_cpu[offset + i] > 0:
            this_size = sizes_cpu[offset + i]
            size.append(this_size)
            numel *= this_size
            i += 1
        key_size[key] = size
        key_numel[key] = numel
        total_numel += numel
        offset += max_dim

    return key_size, key_numel, total_numel


def broadcast_data(keys, data, datatype):
    """Broadcast data from rank zero of each model parallel group to the
    members of the same model parallel group.

    Arguments:
        keys: list of keys in the data dictionary to be broadcast
        data: data dictionary of string keys and cpu tensor values.
        datatype: torch data type of all tensors in data associated
                  with keys.
    """
    # Build (key, size) and (key, number of elements) dictionaries along
    # with the total number of elements on all ranks.
    key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(keys,
                                                                          data)

    # Pack on rank zero.
    if get_tensor_model_parallel_rank() == 0:
        # Check that all keys have the same data type.
        _check_data_types(keys, data, datatype)
        # Flatten the data associated with the keys.
        flatten_data = torch.cat(
            [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
    else:
        flatten_data = torch.empty(total_numel,
                                   device=torch.cuda.current_device(),
                                   dtype=datatype)

    # Broadcast.
    torch.distributed.broadcast(flatten_data, get_tensor_model_parallel_src_rank(),
                                group=get_tensor_model_parallel_group())

    # Unpack.
    output = {}
    offset = 0
    for key in keys:
        size = key_size[key]
        numel = key_numel[key]
        output[key] = flatten_data.narrow(0, offset, numel).view(size)
        offset += numel

    return output
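The size-packing scheme used by `_build_key_size_numel_dictionaries` gives every key a fixed `_MAX_DATA_DIM`-wide slot in one flat list, padded with zeros, so a single broadcast carries every tensor shape and the receiver stops reading a slot at the first zero. A minimal standalone sketch of the pack/unpack round trip (not part of the commit; the example keys and shapes are illustrative):

# Standalone sketch: packing and unpacking tensor shapes into a flat size list.
MAX_DATA_DIM = 5
shapes = {'text': (4, 512), 'labels': (4,)}

# Pack (what rank 0 does).
sizes = [0] * (MAX_DATA_DIM * len(shapes))
for slot, shape in enumerate(shapes.values()):
    for i, s in enumerate(shape):
        sizes[slot * MAX_DATA_DIM + i] = s
print(sizes)   # [4, 512, 0, 0, 0, 4, 0, 0, 0, 0]

# Unpack (what every rank does after the broadcast).
for slot, key in enumerate(shapes):
    size, i = [], 0
    while sizes[slot * MAX_DATA_DIM + i] > 0:
        size.append(sizes[slot * MAX_DATA_DIM + i])
        i += 1
    print(key, size)   # text [4, 512]  /  labels [4]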