Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
Pipeline #2498 canceled
File mode changed from 100755 to 100644
@@ -7,4 +7,18 @@ from megatron.core.utils import is_torch_min_version
 jit_fuser = torch.jit.script
 # nvFuser is deprecated in PyTorch JIT starting from 2.2
 if is_torch_min_version("2.2.0a0"):
-    jit_fuser = torch.compile
+    jit_fuser = torch.compile(mode='max-autotune-no-cudagraphs')
+
+# Decorator to disable Torch Dynamo
+# See: https://github.com/NVIDIA/TransformerEngine/issues/308
+no_torch_dynamo = lambda recursive=True: lambda func: func
+if torch.__version__ >= "2":
+    import torch._dynamo
+
+    if torch.__version__ >= "2.1":
+        no_torch_dynamo = lambda recursive=True: lambda f: torch._dynamo.disable(
+            f, recursive=recursive
+        )
+    else:
+        # no "recursive" option in pyTorch 2.0 - it acts as if recursive was True
+        no_torch_dynamo = lambda recursive=True: torch._dynamo.disable
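Review note on the hunk above: when called without a function argument, torch.compile(mode='max-autotune-no-cudagraphs') returns a decorator, so jit_fuser keeps the same decorator-style usage that torch.jit.script had while enabling max-autotune kernel selection without CUDA graphs. A minimal usage sketch, assuming this hunk is megatron/core/jit.py; the import path and the decorated functions below are illustrative, not part of this diff:

import torch

# Assumed import path for the module patched above.
from megatron.core.jit import jit_fuser, no_torch_dynamo

@jit_fuser  # compiled with torch.compile(mode='max-autotune-no-cudagraphs') on PyTorch >= 2.2
def bias_gelu(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    x = bias + y
    return 0.5 * x * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))

@no_torch_dynamo()  # Dynamo tracing is skipped here (recursively on PyTorch >= 2.1)
def fetch_debug_stats(x: torch.Tensor) -> float:
    return float(x.abs().max().cpu())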
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+import warnings
+
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
@@ -28,38 +30,60 @@ try:
     HAVE_APEX = True
     LNImpl = FusedLayerNorm
 except ImportError:
-    import warnings
-
     from megatron.core.transformer.torch_norm import WrappedTorchNorm
 
-    warnings.warn(f'Apex is not installed. Falling back to Torch Norm')
+    warnings.warn('Apex is not installed. Falling back to Torch Norm')
     LNImpl = WrappedTorchNorm
 
-# Use this spec to use lower level Transformer Engine modules (required for fp8 training)
-bert_layer_with_transformer_engine_spec = ModuleSpec(
-    module=TransformerLayer,
-    submodules=TransformerLayerSubmodules(
-        self_attention=ModuleSpec(
-            module=SelfAttention,
-            params={"attn_mask_type": AttnMaskType.padding},
-            submodules=SelfAttentionSubmodules(
-                linear_qkv=TELayerNormColumnParallelLinear,
-                core_attention=TEDotProductAttention,
-                linear_proj=TERowParallelLinear,
-                q_layernorm=IdentityOp,
-                k_layernorm=IdentityOp,
-            ),
-        ),
-        self_attn_bda=get_bias_dropout_add,
-        mlp=ModuleSpec(
-            module=MLP,
-            submodules=MLPSubmodules(
-                linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
-            ),
-        ),
-        mlp_bda=get_bias_dropout_add,
-    ),
-)
+
+def get_bert_layer_with_transformer_engine_spec():
+    """Use this spec to use lower-level Transformer Engine modules (required for fp8 training).
+
+    Returns:
+        ModuleSpec: Module specification with TE modules
+    """
+    if not HAVE_TE:
+        raise ImportError(
+            "Transformer Engine is not installed. Please use local Bert layer spec instead."
+        )
+    return ModuleSpec(
+        module=TransformerLayer,
+        submodules=TransformerLayerSubmodules(
+            self_attention=ModuleSpec(
+                module=SelfAttention,
+                params={"attn_mask_type": AttnMaskType.padding},
+                submodules=SelfAttentionSubmodules(
+                    linear_qkv=TELayerNormColumnParallelLinear,
+                    core_attention=TEDotProductAttention,
+                    linear_proj=TERowParallelLinear,
+                    q_layernorm=IdentityOp,
+                    k_layernorm=IdentityOp,
+                ),
+            ),
+            self_attn_bda=get_bias_dropout_add,
+            mlp=ModuleSpec(
+                module=MLP,
+                submodules=MLPSubmodules(
+                    linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
+                ),
+            ),
+            mlp_bda=get_bias_dropout_add,
+        ),
+    )
+
+
+def __getattr__(name):
+    if name == 'bert_layer_with_transformer_engine_spec':
+        warnings.warn(
+            """Attribute bert_layer_specs.bert_layer_with_transformer_engine_spec is on a
+            deprecation track and will be removed in future releases. Please migrate to
+            bert_layer_specs.get_bert_layer_with_transformer_engine_spec()."""
+        )
+        return get_bert_layer_with_transformer_engine_spec()
 
 # Use this spec for an implementation using only modules in megatron core
 bert_layer_local_spec = ModuleSpec(
......
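Review note on the bert_layer_specs hunk above: replacing the module-level spec with get_bert_layer_with_transformer_engine_spec() plus a module-level __getattr__ (PEP 562) keeps old attribute accesses working while emitting a deprecation warning, and defers the HAVE_TE check until the spec is actually requested. A hedged sketch of how callers see the change; the module path megatron.core.models.bert.bert_layer_specs is inferred from the warning text, not shown in this diff:

import warnings

# Assumed import path, inferred from the deprecation message above.
from megatron.core.models.bert import bert_layer_specs

# Preferred: call the getter; it raises ImportError when Transformer Engine is absent.
spec = bert_layer_specs.get_bert_layer_with_transformer_engine_spec()

# Legacy attribute access still works but now routes through module __getattr__
# and emits the deprecation warning via warnings.warn.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy_spec = bert_layer_specs.bert_layer_with_transformer_engine_spec
    assert any("deprecation track" in str(w.message) for w in caught)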
@@ -17,23 +17,24 @@ from megatron.core.utils import is_te_min_version
 
 logger = logging.getLogger(__name__)
 
+# Prefer fused RoPE from Apex as we need the `transpose_output_memory` argument for the bshd trick.
+# See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2469.
 try:
-    from megatron.core.extensions.transformer_engine import (
-        fused_apply_rotary_pos_emb,
-        fused_apply_rotary_pos_emb_thd,
-    )
-
-    HAVE_APPLY_ROPE_FUSION = True
+    from apex.transformer.functional import fused_apply_rotary_pos_emb
 except ImportError:
     try:
-        from apex.transformer.functional import (
-            fused_apply_rotary_pos_emb,
-            fused_apply_rotary_pos_emb_thd,
-        )
-
-        HAVE_APPLY_ROPE_FUSION = True
+        from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb
+    except:
+        fused_apply_rotary_pos_emb = None
+
+try:
+    from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb_thd
+except ImportError:
+    try:
+        from apex.transformer.functional import fused_apply_rotary_pos_emb_thd
     except ImportError:
-        HAVE_APPLY_ROPE_FUSION = False
+        fused_apply_rotary_pos_emb_thd = None
 
 try:
@@ -188,8 +189,10 @@ def apply_rotary_pos_emb(
     if config.apply_rope_fusion:
         if cu_seqlens is None:
-            return fused_apply_rotary_pos_emb(t, freqs)
+            assert fused_apply_rotary_pos_emb is not None, "apply_rope_fusion is not available."
+            return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True)
         else:
+            assert fused_apply_rotary_pos_emb_thd is not None, "apply_rope_fusion is not available."
             cp_size = parallel_state.get_context_parallel_world_size()
             if cp_size > 1:
                 if not is_te_min_version("1.11.0", check_equality=False):
......
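Review note on the two rotary-embedding hunks above: the HAVE_APPLY_ROPE_FUSION flag is dropped; fused_apply_rotary_pos_emb (preferred from Apex for its transpose_output_memory argument) and fused_apply_rotary_pos_emb_thd (preferred from Transformer Engine) are now imported independently and left as None when unavailable, and apply_rotary_pos_emb asserts on the specific kernel it needs. A hedged sketch of an equivalent up-front availability check; validate_rope_fusion is a hypothetical helper, the import path is assumed, and config stands in for TransformerConfig:

# Assumed module path for the file patched above.
from megatron.core.models.common.embeddings.rotary_pos_embedding import (
    fused_apply_rotary_pos_emb,
    fused_apply_rotary_pos_emb_thd,
)


def validate_rope_fusion(config, packed_seq: bool) -> None:
    """Fail early instead of tripping the asserts inside apply_rotary_pos_emb."""
    if not config.apply_rope_fusion:
        return
    # Packed (thd) sequences use the thd kernel; regular batches use the bshd kernel.
    needed = fused_apply_rotary_pos_emb_thd if packed_seq else fused_apply_rotary_pos_emb
    if needed is None:
        raise RuntimeError(
            "apply_rope_fusion=True but the fused RoPE kernel is unavailable; "
            "install Apex or Transformer Engine, or set apply_rope_fusion=False."
        )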