Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
Pipeline #2498 canceled with stages
File mode changed from 100755 to 100644
@@ -13,8 +13,8 @@ from packaging.version import Version as PkgVersion
 from torch import Tensor
 from torch.nn.parameter import Parameter
 
-from megatron.core import ModelParallelConfig
 from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding
+from megatron.core.model_parallel_config import ModelParallelConfig
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.parallel_state import (
     get_context_parallel_global_ranks,
@@ -654,6 +654,23 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
         else:
             kv_channels = self.config.kv_channels
 
+        self.kept_packed_seq_params = set(
+            field.name for field in dataclasses.fields(PackedSeqParams)
+        )
+        if get_te_version() < PkgVersion("1.3.0"):
+            # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H
+            # copies (#555)
+            # These two arguments did not exist prior to 1.3.0
+            self.kept_packed_seq_params.discard("max_seqlen_q")
+            self.kept_packed_seq_params.discard("max_seqlen_kv")
+
+        if get_te_version() < PkgVersion("1.10.0"):
+            # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted
+            # in each individual sequence in THD format dataset
+            # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012)
+            self.kept_packed_seq_params.discard("cu_seqlens_q_padded")
+            self.kept_packed_seq_params.discard("cu_seqlens_kv_padded")
+
         super().__init__(
             num_attention_heads=self.config.num_attention_heads,
             kv_channels=kv_channels,
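Note on the hunk above: the constructor now computes, once, the set of PackedSeqParams fields that the installed TE version understands, and forward() later filters its kwargs against that set. A minimal, self-contained sketch of that behaviour (the PackedSeqParamsSketch dataclass and the hard-coded "installed" version are stand-ins for illustration, not part of the change):

# Illustrative sketch only: shows how the version-gated field filtering behaves,
# using a stand-in dataclass and a pretend installed TE version.
import dataclasses
from packaging.version import Version as PkgVersion


@dataclasses.dataclass
class PackedSeqParamsSketch:  # stand-in for megatron.core.packed_seq_params.PackedSeqParams
    qkv_format: str = "thd"
    cu_seqlens_q: object = None
    cu_seqlens_kv: object = None
    cu_seqlens_q_padded: object = None
    cu_seqlens_kv_padded: object = None
    max_seqlen_q: object = None
    max_seqlen_kv: object = None


installed_te_version = PkgVersion("1.7.0")  # pretend TE 1.7 is installed

kept = {field.name for field in dataclasses.fields(PackedSeqParamsSketch)}
if installed_te_version < PkgVersion("1.3.0"):
    kept.discard("max_seqlen_q")
    kept.discard("max_seqlen_kv")
if installed_te_version < PkgVersion("1.10.0"):
    kept.discard("cu_seqlens_q_padded")
    kept.discard("cu_seqlens_kv_padded")

params = PackedSeqParamsSketch()
packed_seq_kwargs = {key: getattr(params, key) for key in kept}
print(sorted(packed_seq_kwargs))  # padded cu_seqlens keys are dropped for TE < 1.10.0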
@@ -683,7 +700,9 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
     ):
         """Forward."""
         packed_seq_kwargs = (
-            dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {}
+            {key: getattr(packed_seq_params, key) for key in self.kept_packed_seq_params}
+            if packed_seq_params is not None
+            else {}
         )
         # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set
         # after init
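A likely motivation for the comprehension over dataclasses.asdict, beyond reusing the precomputed field set (an assumption, not stated in the commit): asdict deep-copies field values, so tensor-valued fields get duplicated, while getattr simply forwards the existing tensors. A small sketch of the difference:

# Sketch of the behavioural difference between asdict and a getattr comprehension
# (assumption: avoiding tensor copies is part of the intent of the change above).
import dataclasses
import torch


@dataclasses.dataclass
class ParamsSketch:
    cu_seqlens_q: torch.Tensor = None


p = ParamsSketch(cu_seqlens_q=torch.tensor([0, 4, 8]))

as_dict = dataclasses.asdict(p)                                    # deep-copies the tensor
by_getattr = {f.name: getattr(p, f.name) for f in dataclasses.fields(p)}

print(as_dict["cu_seqlens_q"] is p.cu_seqlens_q)     # False: a new tensor object
print(by_getattr["cu_seqlens_q"] is p.cu_seqlens_q)  # True: the same tensor, no copy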
@@ -692,24 +711,10 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
         qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format)
 
-        if get_te_version() < PkgVersion("1.3.0"):
-            # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H
-            # copies (#555)
-            # These two arguments did not exist prior to 1.3.0
-            packed_seq_kwargs.pop("max_seqlen_q", None)
-            packed_seq_kwargs.pop("max_seqlen_kv", None)
-
-        if get_te_version() < PkgVersion("1.10.0"):
-            # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted
-            # in each individual sequence in THD format dataset
-            # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012)
-            packed_seq_kwargs.pop("cu_seqlens_q_padded", None)
-            packed_seq_kwargs.pop("cu_seqlens_kv_padded", None)
-
         # WAR for peak memory usage.
         # See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2388
         if self.config.apply_rope_fusion and qkv_format == 'bshd':
-            query, key, value = [x.contiguous().transpose(0, 1) for x in (query, key, value)]
+            query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)]
             # In PyTorch, the following two tensors are in fact the same:
             # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1)
             # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1)
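For context on the one-line change inside the WAR block: the order of transpose() and contiguous() decides whether the storage is actually re-laid out. x.transpose(0, 1).contiguous() materializes a bshd-contiguous copy, whereas x.contiguous().transpose(0, 1) keeps the original sbhd storage and returns a strided view, which is what the stride comments above refer to. An illustrative sketch (shapes chosen arbitrarily):

# Sketch of the memory-layout difference; not part of the patch.
import torch

x = torch.randn(128, 2, 8, 64)  # sbhd-style: (S, B, H, D), contiguous

a = x.transpose(0, 1).contiguous()  # materializes a bshd-contiguous copy
b = x.contiguous().transpose(0, 1)  # no copy: keeps sbhd storage, returns a strided bshd view

print(a.shape, a.is_contiguous())   # torch.Size([2, 128, 8, 64]) True
print(b.shape, b.is_contiguous())   # torch.Size([2, 128, 8, 64]) False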
@@ -760,7 +765,7 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
 if is_te_min_version("1.9.0.dev0"):
 
-    class TEGroupedLinear(te.pytorch.GroupedLinear):
+    class TEGroupedLinear(te.pytorch.BatchLinear if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')) else te.pytorch.GroupedLinear):
         """
         Wrapper for the Transformer-Engine's `GroupedLinear` layer.
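The new TEGroupedLinear base class is picked at import time from the GROUPED_GEMM_BatchLinear environment variable; whether te.pytorch.BatchLinear is available depends on the Transformer Engine build this branch targets. A generic sketch of the pattern, with stand-in base classes:

# Generic sketch of env-gated base-class selection; the classes here are stand-ins,
# not Transformer Engine APIs.
import os


class GroupedLinearStandIn:
    backend = "grouped"


class BatchLinearStandIn:
    backend = "batch"


_use_batch = int(os.getenv("GROUPED_GEMM_BatchLinear", "0"))


class GroupedLinearSketch(BatchLinearStandIn if _use_batch else GroupedLinearStandIn):
    """Chooses its parent class once, when the class statement executes."""


print(GroupedLinearSketch.backend)  # "grouped" unless GROUPED_GEMM_BatchLinear=1 at import

Because the choice is made when the class statement executes, the environment variable has to be set before the module is imported.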
@@ -1229,8 +1234,14 @@ try:
     from transformer_engine.pytorch.attention import FusedRoPEFunc
 
-    def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    def fused_apply_rotary_pos_emb(
+        t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False
+    ) -> torch.Tensor:
         """Apply rotary positional embedding to input tensor T in `sbhd` format."""
+        if transpose_output_memory:
+            warnings.warn(
+                "transpose_output_memory is not supported by TE's fused RoPE and will be ignored."
+            )
         return FusedRoPEFunc.apply(t, freqs, "sbhd")
 
     def fused_apply_rotary_pos_emb_thd(
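The widened fused_apply_rotary_pos_emb signature keeps older call sites that still pass transpose_output_memory working, while warning that the flag is ignored. A sketch of the shim with the TE kernel call stubbed out, so the behaviour can be checked without Transformer Engine installed:

# Sketch of the backward-compatibility shim; FusedRoPEFunc is replaced by a pass-through stub.
import warnings
import torch


def fused_apply_rotary_pos_emb_sketch(
    t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False
) -> torch.Tensor:
    if transpose_output_memory:
        warnings.warn(
            "transpose_output_memory is not supported by TE's fused RoPE and will be ignored."
        )
    return t  # stub for FusedRoPEFunc.apply(t, freqs, "sbhd")


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    fused_apply_rotary_pos_emb_sketch(torch.zeros(1), torch.zeros(1), transpose_output_memory=True)
print(len(caught))  # 1: a legacy call still runs, but is warned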
File mode changed from 100755 to 100644 (12 files)
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-from dataclasses import dataclass
-
-
-@dataclass
-class CommonInferenceParams:
-    """Inference parameters sent along with the prompts
-
-    For an explanation of these parameters refer to this blog
-    https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910
-    """
-
-    temperature: float = 1.0
-    top_k: int = 0
-    top_p: float = 0.0
-    return_log_probs: bool = False
-    num_tokens_to_generate: int = 30
-
-    def add_attributes(self, attribute_value_pair: dict):
-        """Utility to add more attributes to inference params
-
-        Use this method to pass in a custom dictionary to add more inference parameter attributes
-        to the instance you created. Use as follows:
-
-        c = CommonInferenceParams
-        c.add_attributes({'min_length':4, 'eod_id':153})
-
-        Args:
-            attribute_value_pair (dict): A dictionary containing attributes as the key names and
-                their values as the values.
-        """
-        for key, value in attribute_value_pair.items():
-            setattr(self, key, value)
+from megatron.core.inference.sampling_params import (  # noqa: F401 # pylint: disable=unused-import
+    SamplingParams as CommonInferenceParams,
+)
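This last hunk replaces the old CommonInferenceParams dataclass with an aliased re-export of SamplingParams, so existing imports of the old name keep resolving. A usage sketch (it assumes a Megatron-LM checkout that contains megatron.core.inference.sampling_params and that SamplingParams keeps the old field names):

# Sketch of the backward-compatible alias; requires Megatron-LM on the Python path.
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.sampling_params import SamplingParams

params = CommonInferenceParams(temperature=1.0, top_k=0, top_p=0.0, num_tokens_to_generate=30)
assert isinstance(params, SamplingParams)  # old name, same class as the new SamplingParams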
File mode changed from 100755 to 100644 (3 files)