Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Ecological Empowerment
megatron-lm_openwebtext
Commits
d444a97a
Commit
d444a97a
authored
Oct 30, 2025
by
yangzhong
Browse files
首次上传
parents
Pipeline
#3020
canceled with stages
Changes
443
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
922 additions
and
0 deletions
+922
-0
megatron/core/fusions/__pycache__/fused_bias_gelu.cpython-310.pyc
.../core/fusions/__pycache__/fused_bias_gelu.cpython-310.pyc
+0
-0
megatron/core/fusions/__pycache__/fused_bias_swiglu.cpython-310.pyc
...ore/fusions/__pycache__/fused_bias_swiglu.cpython-310.pyc
+0
-0
megatron/core/fusions/__pycache__/fused_cross_entropy.cpython-310.pyc
...e/fusions/__pycache__/fused_cross_entropy.cpython-310.pyc
+0
-0
megatron/core/fusions/__pycache__/fused_layer_norm.cpython-310.pyc
...core/fusions/__pycache__/fused_layer_norm.cpython-310.pyc
+0
-0
megatron/core/fusions/__pycache__/fused_softmax.cpython-310.pyc
...on/core/fusions/__pycache__/fused_softmax.cpython-310.pyc
+0
-0
megatron/core/fusions/fused_bias_dropout.py
megatron/core/fusions/fused_bias_dropout.py
+73
-0
megatron/core/fusions/fused_bias_geglu.py
megatron/core/fusions/fused_bias_geglu.py
+85
-0
megatron/core/fusions/fused_bias_gelu.py
megatron/core/fusions/fused_bias_gelu.py
+55
-0
megatron/core/fusions/fused_bias_swiglu.py
megatron/core/fusions/fused_bias_swiglu.py
+89
-0
megatron/core/fusions/fused_cross_entropy.py
megatron/core/fusions/fused_cross_entropy.py
+143
-0
megatron/core/fusions/fused_layer_norm.py
megatron/core/fusions/fused_layer_norm.py
+169
-0
megatron/core/fusions/fused_softmax.py
megatron/core/fusions/fused_softmax.py
+220
-0
megatron/core/inference/__init__.py
megatron/core/inference/__init__.py
+1
-0
megatron/core/inference/ammo_support/__init__.py
megatron/core/inference/ammo_support/__init__.py
+8
-0
megatron/core/inference/ammo_support/gpt/model_specs.py
megatron/core/inference/ammo_support/gpt/model_specs.py
+2
-0
megatron/core/inference/ammo_support/gpt/state_dict_hooks.py
megatron/core/inference/ammo_support/gpt/state_dict_hooks.py
+5
-0
megatron/core/inference/common_inference_params.py
megatron/core/inference/common_inference_params.py
+4
-0
megatron/core/inference/communication_utils.py
megatron/core/inference/communication_utils.py
+50
-0
megatron/core/inference/engines/__init__.py
megatron/core/inference/engines/__init__.py
+1
-0
megatron/core/inference/engines/abstract_engine.py
megatron/core/inference/engines/abstract_engine.py
+17
-0
No files found.
Too many changes to show.
To preserve performance only
443 of 443+
files are displayed.
Plain diff
Email patch
megatron/core/fusions/__pycache__/fused_bias_gelu.cpython-310.pyc
0 → 100644
View file @
d444a97a
File added
megatron/core/fusions/__pycache__/fused_bias_swiglu.cpython-310.pyc
0 → 100644
View file @
d444a97a
File added
megatron/core/fusions/__pycache__/fused_cross_entropy.cpython-310.pyc
0 → 100644
View file @
d444a97a
File added
megatron/core/fusions/__pycache__/fused_layer_norm.cpython-310.pyc
0 → 100644
View file @
d444a97a
File added
megatron/core/fusions/__pycache__/fused_softmax.cpython-310.pyc
0 → 100644
View file @
d444a97a
File added
megatron/core/fusions/fused_bias_dropout.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from
typing
import
Optional
,
Tuple
import
torch
from
megatron.core.jit
import
jit_fuser
def _bias_dropout_add_func(x_with_bias, residual, prob, training):
    # type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor
    """Add an optional bias, apply dropout, then add the residual.

    Args:
        x_with_bias: Pair ``(x, bias)``; ``bias`` may be ``None``.
        residual: Tensor added to the dropout output. It is cast to
            ``x.dtype`` when its dtype differs.
        prob: Dropout probability, forwarded as ``p``.
        training: Forwarded to dropout's ``training`` flag.

    Returns:
        ``residual + dropout(x + bias)``, or ``residual + dropout(x)`` when
        ``bias`` is ``None``.
    """
    # NOTE: `bias` used to be passed as `bias.expand_as(residual)` by the
    # transformer layer; plain broadcasting takes care of that here.
    x, bias = x_with_bias

    # In AMP O1 the residual arrives in fp32 and would up-cast the result,
    # which makes pipeline-parallel GPU communication hang. Keep the output in
    # the dtype of `x` by casting the residual down first.
    if residual.dtype != x.dtype:
        residual = residual.to(x.dtype)

    # Dropout + residual add are intentionally repeated in both branches:
    # hoisting them out of the conditional stops the bias-add / dropout /
    # residual-add sequence from being fused.
    if bias is None:
        dropped = torch.nn.functional.dropout(x, p=prob, training=training)
        result = residual + dropped
        return result
    else:
        biased = x + bias
        dropped = torch.nn.functional.dropout(biased, p=prob, training=training)
        result = residual + dropped
        return result
def bias_dropout_add_unfused(training):
    """Return a non-fused bias-dropout-add callable with ``training`` baked in.

    The returned function takes ``(x_with_bias, residual, prob)`` and forwards
    them, together with the captured ``training`` flag, to
    ``_bias_dropout_add_func``.
    """

    def _bias_dropout_add(x_with_bias, residual, prob):
        # `training` is captured from the enclosing call.
        result = _bias_dropout_add_func(x_with_bias, residual, prob, training)
        return result

    return _bias_dropout_add
@jit_fuser
def bias_dropout_add_fused_train(
    x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
    residual: torch.Tensor,
    prob: float,
) -> torch.Tensor:
    """Fused bias-dropout-add with dropout in training mode.

    Calls ``_bias_dropout_add_func`` with ``training`` fixed to ``True``.
    """
    fused_out = _bias_dropout_add_func(x_with_bias, residual, prob, True)
    return fused_out
@jit_fuser
def bias_dropout_add_fused_inference(
    x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
    residual: torch.Tensor,
    prob: float,
) -> torch.Tensor:
    """Fused bias-dropout-add with dropout in inference mode.

    Calls ``_bias_dropout_add_func`` with ``training`` fixed to ``False``.
    """
    fused_out = _bias_dropout_add_func(x_with_bias, residual, prob, False)
    return fused_out
def get_bias_dropout_add(training, fused):
    """Select the bias-dropout-add implementation.

    Args:
        training: Truthy to get the training-mode variant.
        fused: Truthy to get one of the ``jit_fuser``-decorated variants.

    Returns:
        A callable taking ``(x_with_bias, residual, prob)``.
    """
    if not fused:
        return bias_dropout_add_unfused(training)

    # jit scripting for a nn.module (with dropout) is not triggering the
    # fusion kernel. For now, two different nn.functional routines are used to
    # account for the differing dropout semantics in training and inference.
    if training:
        return bias_dropout_add_fused_train
    return bias_dropout_add_fused_inference
megatron/core/fusions/fused_bias_geglu.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
torch
from
megatron.core.jit
import
jit_fuser
###### BIAS GELU FUSION/ NO AUTOGRAD ################
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
# sqrt(2/pi) -> 0.79788456
# this function is tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@jit_fuser
def geglu(y):
    """GeGLU: tanh-approximated GELU of the first half times the second half.

    ``y`` is split into two chunks along the last dimension.
    """
    gate, value = torch.chunk(y, 2, -1)
    # 0.79788456 is sqrt(2/pi); this is the tanh approximation of GELU.
    tanh_arg = 0.79788456 * gate * (1 + 0.044715 * gate * gate)
    gelu_gate = gate * 0.5 * (1.0 + torch.tanh(tanh_arg))
    return gelu_gate * value
@jit_fuser
def bias_geglu(bias, y):
    """Add ``bias`` to ``y`` and apply ``geglu`` to the sum."""
    return geglu(y + bias)
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@jit_fuser
def geglu_back(g, y):
    """Backward of ``geglu``: gradient w.r.t. ``y`` given upstream gradient ``g``.

    Returns the gradients for both halves of ``y`` concatenated along the last
    dimension, in the same order as the halves.
    """
    gate, value = torch.chunk(y, 2, -1)
    tanh_out = torch.tanh(0.79788456 * gate * (1 + 0.044715 * gate * gate))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    one_minus_tanh_sq = 1 - tanh_out * tanh_out
    inner_derivative = 0.79788456 + 0.1070322243 * gate * gate
    # Derivative of the tanh-approximated GELU evaluated at `gate`.
    dgelu = 0.5 * gate * (one_minus_tanh_sq * inner_derivative) + 0.5 * (1 + tanh_out)
    grad_gate = (g * value) * dgelu
    grad_value = g * (gate * 0.5 * (1.0 + tanh_out))
    return torch.cat((grad_gate, grad_value), -1)
@jit_fuser
def bias_geglu_back(g, y, bias):
    """Backward of ``bias_geglu``: re-add the bias, then call ``geglu_back``."""
    return geglu_back(g, y + bias)
class BiasGeGLUFunction(torch.autograd.Function):
    """Autograd wrapper around ``bias_geglu`` / ``bias_geglu_back``."""

    @staticmethod
    # bias is an optional argument
    # NOTE(review): `bias_geglu` computes `y + bias`, so a None bias would
    # fail here; `bias_geglu_impl` routes the no-bias case to GeGLUFunction.
    def forward(ctx, input, bias):
        """Save ``input`` and ``bias`` for backward and return the GeGLU output."""
        ctx.save_for_backward(input, bias)
        # Arguments are passed as (input, bias) while `bias_geglu` names them
        # (bias, y); the function only adds them, so the result is the same.
        return bias_geglu(input, bias)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the same gradient tensor for both ``input`` and ``bias``."""
        saved_input, saved_bias = ctx.saved_tensors
        grad = bias_geglu_back(grad_output, saved_input, saved_bias)
        return grad, grad
class GeGLUFunction(torch.autograd.Function):
    """Autograd wrapper around ``geglu`` / ``geglu_back`` (no bias)."""

    @staticmethod
    def forward(ctx, input):
        """Save ``input`` for backward and return ``geglu(input)``."""
        ctx.save_for_backward(input)
        return geglu(input)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to ``input``."""
        saved = ctx.saved_tensors
        saved_input = saved[0]
        return geglu_back(grad_output, saved_input)
def bias_geglu_impl(input, bias):
    """Apply GeGLU (with optional bias) to a 2D or 3D tensor.

    The input is flattened to 2D over its leading dimensions, passed through
    ``BiasGeGLUFunction`` (or ``GeGLUFunction`` when ``bias`` is ``None``), and
    a 3D input gets its two leading dimensions restored on the output.
    """
    ori_shape = input.shape
    rank = len(ori_shape)
    assert rank in [2, 3]

    flat = input.view(-1, ori_shape[-1])
    if bias is None:
        output = GeGLUFunction.apply(flat)
    else:
        output = BiasGeGLUFunction.apply(flat, bias)

    if rank == 2:
        return output
    return output.view(ori_shape[0], ori_shape[1], -1)
megatron/core/fusions/fused_bias_gelu.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import
torch
from
megatron.core.jit
import
jit_fuser
# BIAS GELU FUSION/ NO AUTOGRAD ################
# 1/sqrt(2*pi)-> 0.3989423
# 1/sqrt(2) -> 0.70710678
# sqrt(2/pi) -> 0.79788456
# this function is tanh approximation of gelu
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@jit_fuser
def bias_gelu(bias, y):
    """Tanh-approximated GELU of ``bias + y``."""
    x = bias + y
    # 0.79788456 is sqrt(2/pi).
    tanh_arg = 0.79788456 * x * (1 + 0.044715 * x * x)
    return x * 0.5 * (1.0 + torch.tanh(tanh_arg))
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@jit_fuser
def bias_gelu_back(g, bias, y):
    """Backward of ``bias_gelu``: upstream gradient ``g`` times dGELU/dx at ``bias + y``."""
    x = bias + y
    tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
    # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
    one_minus_tanh_sq = 1 - tanh_out * tanh_out
    inner_derivative = 0.79788456 + 0.1070322243 * x * x
    dgelu = 0.5 * x * (one_minus_tanh_sq * inner_derivative) + 0.5 * (1 + tanh_out)
    return dgelu * g
class GeLUFunction(torch.autograd.Function):
    """Autograd wrapper around ``bias_gelu`` / ``bias_gelu_back``."""

    @staticmethod
    # bias is an optional argument
    # NOTE(review): `bias_gelu` computes `bias + y`; confirm None is really
    # accepted by callers before relying on the comment above.
    def forward(ctx, input, bias):
        """Save ``input`` and ``bias`` for backward and return the GELU output."""
        ctx.save_for_backward(input, bias)
        return bias_gelu(bias, input)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the same gradient tensor for both ``input`` and ``bias``."""
        saved_input, saved_bias = ctx.saved_tensors
        grad = bias_gelu_back(grad_output, saved_bias, saved_input)
        return grad, grad

    # This is required to make Sphinx happy :-(
    @classmethod
    def apply(cls, *args, **kwargs):
        """Forward to ``torch.autograd.Function.apply`` unchanged."""
        return super().apply(*args, **kwargs)


# Public entry point: call as bias_gelu_impl(input, bias).
bias_gelu_impl = GeLUFunction.apply
megatron/core/fusions/fused_bias_swiglu.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import
torch
import
torch.nn.functional
as
F
from
megatron.core.jit
import
jit_fuser
###### BIAS SWIGLU FUSION/ NO AUTOGRAD ################
@jit_fuser
def swiglu(y):
    """SwiGLU: SiLU of the first half of ``y`` times the second half.

    ``y`` is split into two chunks along the last dimension.
    """
    gate, value = torch.chunk(y, 2, -1)
    activated = F.silu(gate)
    return activated * value
@jit_fuser
def bias_swiglu(y, bias):
    """Add ``bias`` to ``y`` and apply ``swiglu`` to the sum."""
    return swiglu(y + bias)
# gradient of swiglu(y) = silu(y_1) * y_2, where y = [y_1, y_2] along the last dim:
#   d/dy_1 = sigmoid(y_1) * (1 + y_1 * (1 - sigmoid(y_1))) * y_2
#   d/dy_2 = silu(y_1)
@jit_fuser
def swiglu_back(g, y):
    """Backward of ``swiglu``: gradient w.r.t. ``y`` given upstream gradient ``g``.

    Returns the gradients for both halves of ``y`` concatenated along the last
    dimension, in the same order as the halves.
    """
    gate, value = torch.chunk(y, 2, -1)
    sig = torch.sigmoid(gate)
    # d silu(gate) / d gate = sig * (1 + gate * (1 - sig))
    grad_gate = g * sig * (1 + gate * (1 - sig)) * value
    grad_value = g * F.silu(gate)
    return torch.cat((grad_gate, grad_value), -1)
@jit_fuser
def bias_swiglu_back(g, y, bias):
    """Backward of ``bias_swiglu``: re-add the bias, then call ``swiglu_back``."""
    return swiglu_back(g, y + bias)
class BiasSwiGLUFunction(torch.autograd.Function):
    """Autograd wrapper around ``bias_swiglu`` / ``bias_swiglu_back``."""

    @staticmethod
    # bias is an optional argument
    # NOTE(review): `bias_swiglu` computes `y + bias`; `bias_swiglu_impl`
    # routes the no-bias case to SwiGLUFunction instead.
    def forward(ctx, input, bias, fp8_input_store):
        """Compute ``bias_swiglu(input, bias)``.

        When ``fp8_input_store`` is truthy the tensor saved for backward is
        ``input`` converted to ``torch.float8_e4m3fn``; the forward result is
        still computed from the unconverted ``input``.
        """
        if fp8_input_store:
            stashed = input.to(torch.float8_e4m3fn)
        else:
            stashed = input
        ctx.save_for_backward(stashed, bias)
        # Remembered so backward can convert the stored tensor back.
        ctx.ori_input_dtype = input.dtype
        ctx.fp8_input_store = fp8_input_store
        return bias_swiglu(input, bias)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad, grad, None)`` for ``(input, bias, fp8_input_store)``."""
        stashed, saved_bias = ctx.saved_tensors
        if ctx.fp8_input_store:
            restored = stashed.to(ctx.ori_input_dtype)
        else:
            restored = stashed
        grad = bias_swiglu_back(grad_output, restored, saved_bias)
        return grad, grad, None
class SwiGLUFunction(torch.autograd.Function):
    """Autograd wrapper around ``swiglu`` / ``swiglu_back`` (no bias)."""

    @staticmethod
    def forward(ctx, input, fp8_input_store):
        """Compute ``swiglu(input)``.

        When ``fp8_input_store`` is truthy the tensor saved for backward is
        ``input`` converted to ``torch.float8_e4m3fn``; the forward result is
        still computed from the unconverted ``input``.
        """
        if fp8_input_store:
            stashed = input.to(torch.float8_e4m3fn)
        else:
            stashed = input
        ctx.save_for_backward(stashed)
        # Remembered so backward can convert the stored tensor back.
        ctx.ori_input_dtype = input.dtype
        ctx.fp8_input_store = fp8_input_store
        return swiglu(input)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad, None)`` for ``(input, fp8_input_store)``."""
        stashed = ctx.saved_tensors[0]
        if ctx.fp8_input_store:
            restored = stashed.to(ctx.ori_input_dtype)
        else:
            restored = stashed
        grad = swiglu_back(grad_output, restored)
        return grad, None
def bias_swiglu_impl(input, bias, fp8_input_store=False):
    """Apply SwiGLU (with optional bias) to a 2D or 3D tensor.

    The input is flattened to 2D over its leading dimensions, passed through
    ``BiasSwiGLUFunction`` (or ``SwiGLUFunction`` when ``bias`` is ``None``),
    and a 3D input gets its two leading dimensions restored on the output.

    Args:
        input: Tensor with 2 or 3 dimensions.
        bias: Bias added before the activation, or ``None``.
        fp8_input_store: Forwarded to the autograd function, which then saves
            the input for backward as ``torch.float8_e4m3fn``.
    """
    ori_shape = input.shape
    rank = len(ori_shape)
    assert rank in [2, 3]

    flat = input.view(-1, ori_shape[-1])
    if bias is None:
        output = SwiGLUFunction.apply(flat, fp8_input_store)
    else:
        output = BiasSwiGLUFunction.apply(flat, bias, fp8_input_store)

    if rank == 2:
        return output
    return output.view(ori_shape[0], ori_shape[1], -1)
# bias_swiglu_impl = BiasSwiGLUFunction.apply
# swiglu_impl = SwiGLUFunction.apply
megatron/core/fusions/fused_cross_entropy.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from
typing
import
Tuple
import
torch
from
megatron.core.jit
import
jit_fuser
from
megatron.core.parallel_state
import
(
get_tensor_model_parallel_group
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
)
from
megatron.core.tensor_parallel.cross_entropy
import
VocabParallelCrossEntropy
from
megatron.core.tensor_parallel.utils
import
VocabUtility
@jit_fuser
def calculate_logits_max(vocab_parallel_logits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Fused wrapper around ``VocabParallelCrossEntropy.calculate_logits_max``.

    Returns the pair produced by that helper: the logits and their maximum.
    """
    logits, max_logits = VocabParallelCrossEntropy.calculate_logits_max(vocab_parallel_logits)
    return logits, max_logits
@jit_fuser
def calculate_predicted_logits(
    vocab_parallel_logits: torch.Tensor,
    target: torch.Tensor,
    logits_max: torch.Tensor,
    vocab_start_index: int,
    vocab_end_index: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Fused wrapper around ``VocabParallelCrossEntropy.calculate_predicted_logits``.

    The helper's ``predicted_logits`` and ``sum_exp_logits`` outputs are
    concatenated into one tensor so the caller can reduce both with a single
    all-reduce.

    Returns:
        ``(target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits)``.
    """
    helper_outputs = VocabParallelCrossEntropy.calculate_predicted_logits(
        vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index
    )
    (target_mask, masked_target_1d, predicted_logits, sum_exp_logits, exp_logits) = helper_outputs

    packed = torch.cat((predicted_logits, sum_exp_logits))
    return target_mask, masked_target_1d, packed, exp_logits
@jit_fuser
def calculate_cross_entropy_loss(
    exp_logits: torch.Tensor, predicted_logits_sum_exp_logits: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Split the packed tensor back in two and compute the loss.

    ``predicted_logits_sum_exp_logits`` is split along dim 0 into chunks of
    half its length, giving ``predicted_logits`` and ``sum_exp_logits``, which
    are passed to ``VocabParallelCrossEntropy.calculate_cross_entropy_loss``.

    Returns:
        ``(exp_logits, loss)`` as returned by that helper.
    """
    half = predicted_logits_sum_exp_logits.size()[0] // 2
    predicted_logits, sum_exp_logits = torch.split(predicted_logits_sum_exp_logits, half)

    updated_exp_logits, loss = VocabParallelCrossEntropy.calculate_cross_entropy_loss(
        exp_logits, predicted_logits, sum_exp_logits
    )
    return updated_exp_logits, loss
@jit_fuser
def calculate_gradients(
    softmax: torch.Tensor,
    grad_output: torch.Tensor,
    target_mask: torch.Tensor,
    masked_target_1d: torch.Tensor,
) -> torch.Tensor:
    """Fused gradient computation for the vocab-parallel cross entropy.

    Delegates to ``VocabParallelCrossEntropy.prepare_gradient_calculation_operands``
    and ``VocabParallelCrossEntropy.calculate_gradients``, then casts the
    result to ``torch.bfloat16``.
    """
    operands = VocabParallelCrossEntropy.prepare_gradient_calculation_operands(
        softmax, target_mask
    )
    (grad_2d, arange_1d, softmax_update, grad_input) = operands

    grad_input = VocabParallelCrossEntropy.calculate_gradients(
        grad_2d, arange_1d, masked_target_1d, softmax_update, grad_input, grad_output
    )

    # NOTE(review): the gradient is always cast to bfloat16, whatever dtype
    # the inputs have — confirm this is intended for non-bf16 runs.
    return grad_input.to(torch.bfloat16)
class _VocabParallelCrossEntropy(torch.autograd.Function):
    """Autograd function for the fused vocab-parallel cross entropy."""

    @staticmethod
    def forward(ctx, vocab_parallel_logits, target):
        """Compute the loss, issuing two all-reduces over the tensor-parallel group."""
        vocab_parallel_logits, logits_max = calculate_logits_max(vocab_parallel_logits)
        # MAX-reduce the per-rank maxima across the tensor-parallel group.
        torch.distributed.all_reduce(
            logits_max,
            op=torch.distributed.ReduceOp.MAX,
            group=get_tensor_model_parallel_group(),
        )

        # Get the partition's vocab indices
        partition_vocab_size = vocab_parallel_logits.size()[-1]
        rank = get_tensor_model_parallel_rank()
        world_size = get_tensor_model_parallel_world_size()
        vocab_start_index, vocab_end_index = (
            VocabUtility.vocab_range_from_per_partition_vocab_size(
                partition_vocab_size, rank, world_size
            )
        )

        predicted = calculate_predicted_logits(
            vocab_parallel_logits, target, logits_max, vocab_start_index, vocab_end_index
        )
        (target_mask, masked_target_1d, predicted_logits_sum_exp_logits, exp_logits) = predicted

        # All reduce is needed to get the chunks from other GPUs.
        # In the fused case, tensors are batched to invoke a single
        # AllReduce call.
        torch.distributed.all_reduce(
            predicted_logits_sum_exp_logits,
            op=torch.distributed.ReduceOp.SUM,
            group=get_tensor_model_parallel_group(),
        )

        exp_logits, loss = calculate_cross_entropy_loss(
            exp_logits, predicted_logits_sum_exp_logits
        )

        # Store softmax, target-mask and masked-target for backward pass.
        ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)

        return loss

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, None)`` for ``(vocab_parallel_logits, target)``."""
        # Retrieve tensors from the forward path.
        softmax, target_mask, masked_target_1d = ctx.saved_tensors

        logits_grad = calculate_gradients(softmax, grad_output, target_mask, masked_target_1d)

        # `target` gets no gradient.
        return logits_grad, None
def fused_vocab_parallel_cross_entropy(vocab_parallel_logits, target):
    """Compute cross entropy loss when logits are split across tensor parallel ranks.

    Args:
        vocab_parallel_logits: logits split across tensor parallel ranks.
            The original docstring gives the dimension as
            [sequence_length, batch_size, hidden_size]; the autograd function
            reads the last dimension as the per-partition vocab size
            (NOTE(review): confirm and update the wording).
        target: correct vocab ids of dimension [sequence_length, micro_batch_size]

    Returns:
        The loss produced by ``_VocabParallelCrossEntropy``.
    """
    loss = _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
    return loss
megatron/core/fusions/fused_layer_norm.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
import
importlib
import
inspect
import
numbers
import
torch
from
torch
import
Tensor
from
torch.nn
import
init
from
torch.nn.parameter
import
Parameter
from
megatron.core.transformer
import
TransformerConfig
from
megatron.core.utils
import
make_viewless_tensor
try
:
from
apex.contrib.layer_norm.layer_norm
import
FastLayerNormFN
HAVE_PERSIST_LAYER_NORM
=
True
except
ImportError
:
HAVE_PERSIST_LAYER_NORM
=
False
try
:
from
apex.normalization.fused_layer_norm
import
FusedLayerNormAffineFunction
HAVE_FUSED_LAYER_NORM
=
True
except
ImportError
:
HAVE_FUSED_LAYER_NORM
=
False
class FusedLayerNorm(torch.nn.Module):
    """Layer Norm, fused into a single CUDA kernel (Apex-backed).

    Args:
        config (TransformerConfig): Transformer config. The values actually
            used for ``persist_layer_norm``, zero-centered gamma, sequence
            parallelism and normalization type are read from here.
        hidden_size (int): Transformer hidden dimension.
        eps (float): Epsilon added to denominator, for numerical stability.
        persist_layer_norm (bool): Accepted for interface compatibility; the
            value is overwritten by ``config.persist_layer_norm``. The
            persistent kernel supports only the hidden sizes listed in
            ``persist_ln_hidden_sizes``.
        zero_centered_gamma (bool): Accepted for interface compatibility; not
            read. ``config.layernorm_zero_centered_gamma`` is used instead.
        normalization (str): Accepted to match the Transformer Engine
            interface; not read. ``config.normalization`` must equal
            'LayerNorm'.
    """

    def __init__(
        self,
        config: TransformerConfig,
        hidden_size: int,
        eps: float = 1e-5,
        persist_layer_norm: bool = True,
        zero_centered_gamma: bool = False,
        normalization: str = "LayerNorm",  # included to match TE interface
    ):
        super().__init__()

        self.config = config

        self.zero_centered_gamma = self.config.layernorm_zero_centered_gamma
        assert (
            self.config.normalization == "LayerNorm"
        ), f'({self.config.normalization}) is not supported in FusedLayerNorm'

        # Hidden sizes supported by the persistent layer norm kernel. Any
        # other size falls back to the non-persistent kernel.
        persist_ln_hidden_sizes = [
            1024,
            1536,
            2048,
            2304,
            3072,
            3840,
            4096,
            5120,
            6144,
            8192,
            10240,
            12288,
            12800,
            15360,
            16384,
            18432,
            20480,
            24576,
            25600,
            30720,
            32768,
            40960,
            49152,
            65536,
        ]
        persist_layer_norm = self.config.persist_layer_norm
        kernel_usable = hidden_size in persist_ln_hidden_sizes and HAVE_PERSIST_LAYER_NORM
        if not kernel_usable:
            persist_layer_norm = False

        if not persist_layer_norm and not HAVE_FUSED_LAYER_NORM:
            # TODO: Add pytorch only layer norm
            raise ValueError(f'Apex must be installed to use FusedLayerNorm.')

        if isinstance(hidden_size, numbers.Integral):
            hidden_size = (hidden_size,)
        self.hidden_size = torch.Size(hidden_size)
        self.eps = eps
        # Parameters need to be initialized with torch.empty rather than
        # torch.Tensor for correct device placement with nemo2.
        self.weight = Parameter(torch.empty(*hidden_size))
        self.bias = Parameter(torch.empty(*hidden_size))
        self.reset_parameters()
        self.persist_layer_norm = persist_layer_norm
        self.sequence_parallel = self.config.sequence_parallel

        # set sequence parallelism flag on weight and bias parameters
        for param in (self.weight, self.bias):
            setattr(param, 'sequence_parallel', self.sequence_parallel)

    def reset_parameters(self):
        """Initialize weight (zeros if zero-centered gamma, else ones) and zero the bias."""
        if self.zero_centered_gamma:
            init.zeros_(self.weight)
        else:
            init.ones_(self.weight)
        init.zeros_(self.bias)

    def forward(self, input: Tensor) -> Tensor:
        """Apply layer norm through the persistent or the regular Apex kernel."""
        # With zero-centered gamma the stored weight is an offset from one.
        if self.zero_centered_gamma:
            weight = self.weight + 1
        else:
            weight = self.weight

        if not self.persist_layer_norm:
            affine_args = [input, weight, self.bias, self.hidden_size, self.eps]
            # Only pass the flag to Apex versions whose forward accepts it.
            if (
                'memory_efficient'
                in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args
            ):
                affine_args.append(self.config.memory_efficient_layer_norm)
            return FusedLayerNormAffineFunction.apply(*affine_args)

        fast_args = [input, weight, self.bias, self.eps]
        if 'memory_efficient' in inspect.getfullargspec(FastLayerNormFN.forward).args:
            fast_args.append(self.config.memory_efficient_layer_norm)
        output = FastLayerNormFN.apply(*fast_args)

        # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
        # a populated '_base' field). This will result in schedule.py's
        # deallocate_output_tensor() throwing an error, so a viewless tensor is
        # created to prevent this.
        output = make_viewless_tensor(
            inp=output, requires_grad=input.requires_grad, keep_graph=True
        )
        return output
megatron/core/fusions/fused_softmax.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from
typing
import
Optional
import
torch
import
torch.nn
as
nn
from
megatron.core.transformer.enums
import
AttnMaskType
from
megatron.core.transformer.utils
import
get_default_causal_mask
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
    """
    Fused operation which performs following three operations in sequence
    1. Scale the tensor.
    2. Apply upper triangular mask (typically used in gpt models).
    3. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, scale):
        """Run the fused forward kernel and save its output and the scale."""
        # Imported lazily so the module can be loaded without the extension.
        import scaled_upper_triang_masked_softmax_cuda as kernels

        # The scale is wrapped in a tensor so it can go through
        # save_for_backward.
        scale_t = torch.tensor([scale])
        probs = kernels.forward(inputs, scale_t[0])

        ctx.save_for_backward(probs, scale_t)
        return probs

    @staticmethod
    def backward(ctx, output_grads):
        """Return ``(input_grads, None)`` for ``(inputs, scale)``."""
        import scaled_upper_triang_masked_softmax_cuda as kernels

        probs, scale_t = ctx.saved_tensors
        input_grads = kernels.backward(output_grads, probs, scale_t[0])

        return input_grads, None
class ScaledMaskedSoftmax(torch.autograd.Function):
    """
    Fused operation which performs following three operations in sequence
    1. Scale the tensor.
    2. Apply the mask.
    3. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, mask, scale):
        """Run the fused forward kernel and save its output and the scale."""
        # Imported lazily so the module can be loaded without the extension.
        import scaled_masked_softmax_cuda as kernels

        # The scale is wrapped in a tensor so it can go through
        # save_for_backward.
        scale_t = torch.tensor([scale])
        probs = kernels.forward(inputs, mask, scale_t[0])

        ctx.save_for_backward(probs, scale_t)
        return probs

    @staticmethod
    def backward(ctx, output_grads):
        """Return ``(input_grads, None, None)`` for ``(inputs, mask, scale)``."""
        import scaled_masked_softmax_cuda as kernels

        probs, scale_t = ctx.saved_tensors
        input_grads = kernels.backward(output_grads, probs, scale_t[0])

        return input_grads, None, None
class ScaledSoftmax(torch.autograd.Function):
    """
    Fused operation which performs following two operations in sequence
    1. Scale the tensor.
    2. Perform softmax.
    """

    @staticmethod
    def forward(ctx, inputs, scale):
        """Run the fused forward kernel and save its output and the scale.

        Args:
            ctx: Autograd context.
            inputs: Tensor handed to ``scaled_softmax_cuda.forward``.
            scale: Scaling factor; wrapped in a one-element tensor.

        Returns:
            The tensor returned by the fused kernel.
        """
        # Imported lazily so the module can be loaded without the extension.
        import scaled_softmax_cuda

        # The scale is wrapped in a tensor so it can go through
        # save_for_backward.
        scale_t = torch.tensor([scale])

        softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
        ctx.save_for_backward(softmax_results, scale_t)
        return softmax_results

    @staticmethod
    def backward(ctx, output_grads):
        """Return one gradient per forward input: ``(input_grads, None)``.

        ``forward`` takes ``(inputs, scale)``, so exactly two values are
        returned; ``scale`` gets no gradient. The previous version returned a
        third, superfluous ``None``.
        """
        import scaled_softmax_cuda

        softmax_results, scale_t = ctx.saved_tensors

        input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
        return input_grads, None
class FusedScaleMaskSoftmax(nn.Module):
    """
    fused operation: scaling + mask + softmax

    Args:
        input_in_fp16: flag to indicate if input in fp16 data format.
        input_in_bf16: flag to indicate if input in bf16 data format.
        attn_mask_type: attention mask type (pad or causal)
        scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion
        mask_func: mask function to be applied.
        softmax_in_fp32: if true, softmax in performed at fp32 precision.
        scale: scaling factor used in input tensor scaling.
    """

    def __init__(
        self,
        input_in_fp16,
        input_in_bf16,
        attn_mask_type,
        scaled_masked_softmax_fusion,
        mask_func,
        softmax_in_fp32,
        scale,
    ):
        super().__init__()
        self.input_in_fp16 = input_in_fp16
        self.input_in_bf16 = input_in_bf16
        assert not (
            self.input_in_fp16 and self.input_in_bf16
        ), "both fp16 and bf16 flags cannot be active at the same time."
        self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16
        self.attn_mask_type = attn_mask_type
        self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion
        self.mask_func = mask_func
        self.softmax_in_fp32 = softmax_in_fp32
        self.scale = scale

        # A scale may only be given together with fp32 softmax.
        assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled"

    def forward(self, input: torch.Tensor, mask: Optional[torch.Tensor]):
        """Forward pass of softmax with masked input.

        In case attn_mask_type is causal the mask is generated and None can be passed.
        A user-defined mask is only needed when attn_mask_type is not causal.
        """
        # [b, np, sq, sk]
        assert input.dim() == 4

        if self.is_kernel_available(mask, *input.size()):
            return self.forward_fused_softmax(input, mask)
        return self.forward_torch_softmax(input, mask)

    def is_kernel_available(self, mask, b, np, sq, sk):
        """Return True when the fused CUDA softmax kernel can handle this call."""
        attn_batches = b * np

        eligible = (
            self.scaled_masked_softmax_fusion  # user wants to fuse
            and self.input_in_float16  # input must be fp16 or bf16
            and 16 < sk <= 4096  # sk must be in (16, 4096]
            and sq % 4 == 0  # sq must be a multiple of 4
            and sk % 4 == 0  # sk must be a multiple of 4
            and attn_batches % 4 == 0  # np * b must be a multiple of 4
        )
        if not eligible:
            return False

        # The original also re-checked 0 <= sk <= 4096 here; that is already
        # implied by the range test above.
        batch_per_block = self.get_batch_per_block(sq, sk, b, np)

        if self.attn_mask_type == AttnMaskType.causal:
            return attn_batches % batch_per_block == 0
        return sq % batch_per_block == 0

    def forward_fused_softmax(self, input, mask):
        """Dispatch to the fused autograd function matching the mask type."""
        b, np, sq, sk = input.size()
        scale = 1.0 if self.scale is None else self.scale

        if self.attn_mask_type != AttnMaskType.causal:
            # input is 4D tensor (b, np, sq, sk)
            if mask is None:
                return ScaledSoftmax.apply(input, scale)
            return ScaledMaskedSoftmax.apply(input, mask, scale)

        assert sq == sk, "causal mask is only for self attention"

        # input is 3D tensor (attn_batches, sq, sk)
        flat_input = input.view(-1, sq, sk)
        probs = ScaledUpperTriangMaskedSoftmax.apply(flat_input, scale)
        return probs.view(b, np, sq, sk)

    def forward_torch_softmax(self, input, mask):
        """Unfused fallback: scale, mask and softmax with plain torch ops."""
        upcast = self.input_in_float16 and self.softmax_in_fp32
        if upcast:
            input = input.float()

        if self.scale is not None:
            input = input * self.scale

        # Generate causal mask if not given
        sq, sk = input.size(2), input.size(3)
        if self.attn_mask_type == AttnMaskType.causal and mask is None and sq > 1:
            # If sq == 1 then either KV cache is used or one-element context is passed
            # so keeping mask=None in this case; subsequent code should handle it
            assert sq == sk, "causal mask is only for self attention"
            mask = get_default_causal_mask(sq)

        if mask is not None:
            mask_output = self.mask_func(input, mask)
        else:
            mask_output = input
        softmax = torch.nn.Softmax(dim=-1)
        probs = softmax(mask_output)

        if upcast:
            # Cast back to the input's half-precision format.
            if self.input_in_fp16:
                probs = probs.half()
            else:
                probs = probs.bfloat16()

        return probs

    @staticmethod
    def get_batch_per_block(sq, sk, b, np):
        """Query the CUDA extension for its batch-per-block value."""
        # Imported lazily so the module can be loaded without the extension.
        import scaled_masked_softmax_cuda

        return scaled_masked_softmax_cuda.get_batch_per_block(sq, sk, b, np)
megatron/core/inference/__init__.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
megatron/core/inference/ammo_support/__init__.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import
warnings
# Emitted when this package is imported, to steer users to the replacement module.
warnings.warn(
    "The 'megatron.core.inference.ammo_support' module is deprecated and will be removed in a future release. "
    "Please use megatron.core.inference.modelopt_support instead",
    category=DeprecationWarning,
)
megatron/core/inference/ammo_support/gpt/model_specs.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from
megatron.core.inference.modelopt_support.gpt.model_specs
import
get_gpt_layer_modelopt_spec
megatron/core/inference/ammo_support/gpt/state_dict_hooks.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from
megatron.core.inference.modelopt_support.gpt.state_dict_hooks
import
(
mcore_gpt_load_legacy_state_dict_pre_hook
,
mcore_gpt_load_te_state_dict_pre_hook
,
)
megatron/core/inference/common_inference_params.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from
megatron.core.inference.sampling_params
import
(
# noqa: F401 # pylint: disable=unused-import
SamplingParams
as
CommonInferenceParams
,
)
megatron/core/inference/communication_utils.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import
torch
from
megatron.core
import
parallel_state
def _is_cuda(tensor):
    """Assert that ``tensor`` is not None and is a CUDA tensor.

    Raises:
        AssertionError: If ``tensor`` is ``None`` or ``tensor.is_cuda`` is falsy.
    """
    # Checked separately so the None case never reaches the attribute access.
    assert tensor is not None
    assert tensor.is_cuda
def broadcast_from_last_pipeline_stage(size, dtype, tensor=None):
    """Broadcast a tensor from last pipeline stage to all ranks.

    Args:
        size: Shape of the receive buffer allocated on ranks that are not the
            last pipeline stage.
        dtype: Dtype of that receive buffer.
        tensor: The tensor to send; on the last pipeline stage it is asserted
            to be a contiguous CUDA tensor. Replaced by a fresh buffer on all
            other ranks.

    Returns:
        The broadcast tensor (the input on the last stage, the filled buffer
        elsewhere).
    """
    if not parallel_state.is_pipeline_last_stage():
        # Receiving side: allocate the buffer on the current CUDA device.
        tensor = torch.empty(size, dtype=dtype, device=torch.cuda.current_device())
    else:
        _is_cuda(tensor)
        assert tensor.is_contiguous()

    # Get the group and corresponding source rank.
    source_rank = parallel_state.get_pipeline_model_parallel_last_rank()
    pipeline_group = parallel_state.get_pipeline_model_parallel_group()
    torch.distributed.broadcast(tensor, source_rank, pipeline_group)

    return tensor
def recv_from_prev_pipeline_rank_(recv_buffer=None):
    """Receive from previous pipeline stage and update the
    input buffer inplace.

    A single ``irecv`` P2P op is issued through ``batch_isend_irecv`` and
    waited on before returning.
    """
    prev_rank = parallel_state.get_pipeline_model_parallel_prev_rank()
    recv_op = torch.distributed.P2POp(torch.distributed.irecv, recv_buffer, prev_rank)
    pending = torch.distributed.batch_isend_irecv([recv_op])

    for request in pending:
        request.wait()
    # To protect against race condition when using batch_isend_irecv().
    torch.cuda.synchronize()
def send_to_next_pipeline_rank(tensor=None):
    """Send output to the next pipeline stage.

    A single ``isend`` P2P op is issued through ``batch_isend_irecv`` and
    waited on before returning.
    """
    next_rank = parallel_state.get_pipeline_model_parallel_next_rank()
    send_op = torch.distributed.P2POp(torch.distributed.isend, tensor, next_rank)
    pending = torch.distributed.batch_isend_irecv([send_op])

    for request in pending:
        request.wait()
    # To protect against race condition when using batch_isend_irecv().
    torch.cuda.synchronize()
megatron/core/inference/engines/__init__.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
megatron/core/inference/engines/abstract_engine.py
0 → 100644
View file @
d444a97a
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from
abc
import
ABC
,
abstractmethod
from
typing
import
List
class AbstractEngine(ABC):
    """Abstract base class that inference engines derive from."""

    # NOTE(review): declared as a staticmethod yet takes a `self` parameter —
    # confirm which of the two is intended before changing either.
    @staticmethod
    @abstractmethod
    def generate(self) -> dict:
        """The abstract backend's generate function.

        To define a new backend, implement this and return the outputs as a dictionary.

        Returns:
            dict: The output dictionary containing keys for `input_prompt`, `generated_text`, `generated_tokens`.
        """
Prev
1
…
13
14
15
16
17
18
19
20
21
…
23
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment