Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
Pipeline #2498 canceled with stages
File mode changed from 100755 to 100644
@@ -13,8 +13,8 @@ from packaging.version import Version as PkgVersion
 from torch import Tensor
 from torch.nn.parameter import Parameter
 
-from megatron.core import ModelParallelConfig
 from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding
+from megatron.core.model_parallel_config import ModelParallelConfig
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.parallel_state import (
     get_context_parallel_global_ranks,
@@ -654,6 +654,23 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
         else:
             kv_channels = self.config.kv_channels
 
+        self.kept_packed_seq_params = set(
+            field.name for field in dataclasses.fields(PackedSeqParams)
+        )
+        if get_te_version() < PkgVersion("1.3.0"):
+            # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H
+            # copies (#555)
+            # These two arguments did not exist prior to 1.3.0
+            self.kept_packed_seq_params.discard("max_seqlen_q")
+            self.kept_packed_seq_params.discard("max_seqlen_kv")
+
+        if get_te_version() < PkgVersion("1.10.0"):
+            # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted
+            # in each individual sequence in THD format dataset
+            # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012)
+            self.kept_packed_seq_params.discard("cu_seqlens_q_padded")
+            self.kept_packed_seq_params.discard("cu_seqlens_kv_padded")
+
         super().__init__(
             num_attention_heads=self.config.num_attention_heads,
             kv_channels=kv_channels,
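Note on the hunk above: the constructor now computes, once, the set of PackedSeqParams fields that the installed TE version understands, and forward() later filters its kwargs against that set. A minimal, self-contained sketch of that behaviour (the PackedSeqParamsSketch dataclass and the hard-coded "installed" version are stand-ins for illustration, not part of the change):

# Illustrative sketch only: shows how the version-gated field filtering behaves,
# using a stand-in dataclass and a pretend installed TE version.
import dataclasses
from packaging.version import Version as PkgVersion


@dataclasses.dataclass
class PackedSeqParamsSketch:  # stand-in for megatron.core.packed_seq_params.PackedSeqParams
    qkv_format: str = "thd"
    cu_seqlens_q: object = None
    cu_seqlens_kv: object = None
    cu_seqlens_q_padded: object = None
    cu_seqlens_kv_padded: object = None
    max_seqlen_q: object = None
    max_seqlen_kv: object = None


installed_te_version = PkgVersion("1.7.0")  # pretend TE 1.7 is installed

kept = {field.name for field in dataclasses.fields(PackedSeqParamsSketch)}
if installed_te_version < PkgVersion("1.3.0"):
    kept.discard("max_seqlen_q")
    kept.discard("max_seqlen_kv")
if installed_te_version < PkgVersion("1.10.0"):
    kept.discard("cu_seqlens_q_padded")
    kept.discard("cu_seqlens_kv_padded")

params = PackedSeqParamsSketch()
packed_seq_kwargs = {key: getattr(params, key) for key in kept}
print(sorted(packed_seq_kwargs))  # padded cu_seqlens keys are dropped for TE < 1.10.0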
@@ -683,7 +700,9 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
     ):
         """Forward."""
         packed_seq_kwargs = (
-            dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {}
+            {key: getattr(packed_seq_params, key) for key in self.kept_packed_seq_params}
+            if packed_seq_params is not None
+            else {}
         )
         # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set
         # after init
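A likely motivation for the comprehension over dataclasses.asdict, beyond reusing the precomputed field set (an assumption, not stated in the commit): asdict deep-copies field values, so tensor-valued fields get duplicated, while getattr simply forwards the existing tensors. A small sketch of the difference:

# Sketch of the behavioural difference between asdict and a getattr comprehension
# (assumption: avoiding tensor copies is part of the intent of the change above).
import dataclasses
import torch


@dataclasses.dataclass
class ParamsSketch:
    cu_seqlens_q: torch.Tensor = None


p = ParamsSketch(cu_seqlens_q=torch.tensor([0, 4, 8]))

as_dict = dataclasses.asdict(p)                                    # deep-copies the tensor
by_getattr = {f.name: getattr(p, f.name) for f in dataclasses.fields(p)}

print(as_dict["cu_seqlens_q"] is p.cu_seqlens_q)     # False: a new tensor object
print(by_getattr["cu_seqlens_q"] is p.cu_seqlens_q)  # True: the same tensor, no copy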
@@ -692,24 +711,10 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
         qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format)
 
-        if get_te_version() < PkgVersion("1.3.0"):
-            # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H
-            # copies (#555)
-            # These two arguments did not exist prior to 1.3.0
-            packed_seq_kwargs.pop("max_seqlen_q", None)
-            packed_seq_kwargs.pop("max_seqlen_kv", None)
-
-        if get_te_version() < PkgVersion("1.10.0"):
-            # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted
-            # in each individual sequence in THD format dataset
-            # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012)
-            packed_seq_kwargs.pop("cu_seqlens_q_padded", None)
-            packed_seq_kwargs.pop("cu_seqlens_kv_padded", None)
-
         # WAR for peak memory usage.
         # See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2388
         if self.config.apply_rope_fusion and qkv_format == 'bshd':
-            query, key, value = [x.contiguous().transpose(0, 1) for x in (query, key, value)]
+            query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)]
             # In PyTorch, the following two tensors are in fact the same:
             # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1)
             # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1)
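For context on the one-line change inside the WAR block: the order of transpose() and contiguous() decides whether the storage is actually re-laid out. x.transpose(0, 1).contiguous() materializes a bshd-contiguous copy, whereas x.contiguous().transpose(0, 1) keeps the original sbhd storage and returns a strided view, which is what the stride comments above refer to. An illustrative sketch (shapes chosen arbitrarily):

# Sketch of the memory-layout difference; not part of the patch.
import torch

x = torch.randn(128, 2, 8, 64)  # sbhd-style: (S, B, H, D), contiguous

a = x.transpose(0, 1).contiguous()  # materializes a bshd-contiguous copy
b = x.contiguous().transpose(0, 1)  # no copy: keeps sbhd storage, returns a strided bshd view

print(a.shape, a.is_contiguous())   # torch.Size([2, 128, 8, 64]) True
print(b.shape, b.is_contiguous())   # torch.Size([2, 128, 8, 64]) False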
@@ -760,7 +765,7 @@ class TEDotProductAttention(te.pytorch.DotProductAttention):
 if is_te_min_version("1.9.0.dev0"):
 
-    class TEGroupedLinear(te.pytorch.GroupedLinear):
+    class TEGroupedLinear(te.pytorch.BatchLinear if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')) else te.pytorch.GroupedLinear):
         """
         Wrapper for the Transformer-Engine's `GroupedLinear` layer.
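The new TEGroupedLinear base class is picked at import time from the GROUPED_GEMM_BatchLinear environment variable; whether te.pytorch.BatchLinear is available depends on the Transformer Engine build this branch targets. A generic sketch of the pattern, with stand-in base classes:

# Generic sketch of env-gated base-class selection; the classes here are stand-ins,
# not Transformer Engine APIs.
import os


class GroupedLinearStandIn:
    backend = "grouped"


class BatchLinearStandIn:
    backend = "batch"


_use_batch = int(os.getenv("GROUPED_GEMM_BatchLinear", "0"))


class GroupedLinearSketch(BatchLinearStandIn if _use_batch else GroupedLinearStandIn):
    """Chooses its parent class once, when the class statement executes."""


print(GroupedLinearSketch.backend)  # "grouped" unless GROUPED_GEMM_BatchLinear=1 at import

Because the choice is made when the class statement executes, the environment variable has to be set before the module is imported.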
@@ -1229,8 +1234,14 @@ try:
     from transformer_engine.pytorch.attention import FusedRoPEFunc
 
-    def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    def fused_apply_rotary_pos_emb(
+        t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False
+    ) -> torch.Tensor:
         """Apply rotary positional embedding to input tensor T in `sbhd` format."""
+        if transpose_output_memory:
+            warnings.warn(
+                "transpose_output_memory is not supported by TE's fused RoPE and will be ignored."
+            )
         return FusedRoPEFunc.apply(t, freqs, "sbhd")
 
     def fused_apply_rotary_pos_emb_thd(
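The widened fused_apply_rotary_pos_emb signature keeps older call sites that still pass transpose_output_memory working, while warning that the flag is ignored. A sketch of the shim with the TE kernel call stubbed out, so the behaviour can be checked without Transformer Engine installed:

# Sketch of the backward-compatibility shim; FusedRoPEFunc is replaced by a pass-through stub.
import warnings
import torch


def fused_apply_rotary_pos_emb_sketch(
    t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False
) -> torch.Tensor:
    if transpose_output_memory:
        warnings.warn(
            "transpose_output_memory is not supported by TE's fused RoPE and will be ignored."
        )
    return t  # stub for FusedRoPEFunc.apply(t, freqs, "sbhd")


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    fused_apply_rotary_pos_emb_sketch(torch.zeros(1), torch.zeros(1), transpose_output_memory=True)
print(len(caught))  # 1: a legacy call still runs, but is warned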
File mode changed from 100755 to 100644 (12 files)
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-from dataclasses import dataclass
-
-
-@dataclass
-class CommonInferenceParams:
-    """Inference parameters sent along with the prompts
-
-    For an explanation of these parameters refer to this blog
-    https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910
-    """
-
-    temperature: float = 1.0
-    top_k: int = 0
-    top_p: float = 0.0
-    return_log_probs: bool = False
-    num_tokens_to_generate: int = 30
-
-    def add_attributes(self, attribute_value_pair: dict):
-        """Utility to add more attributes to inference params
-
-        Use this method to pass in a custom dictionary to add more inference parameter attributes
-        to the instance you created. Use as follows:
-
-        c = CommonInferenceParams
-        c.add_attributes({'min_length':4, 'eod_id':153})
-
-        Args:
-            attribute_value_pair (dict): A dictionary containing attributes as the key names and
-                their values as the values.
-        """
-        for key, value in attribute_value_pair.items():
-            setattr(self, key, value)
+from megatron.core.inference.sampling_params import (  # noqa: F401 # pylint: disable=unused-import
+    SamplingParams as CommonInferenceParams,
+)
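This last hunk replaces the old CommonInferenceParams dataclass with an aliased re-export of SamplingParams, so existing imports of the old name keep resolving. A usage sketch (it assumes a Megatron-LM checkout that contains megatron.core.inference.sampling_params and that SamplingParams keeps the old field names):

# Sketch of the backward-compatible alias; requires Megatron-LM on the Python path.
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.sampling_params import SamplingParams

params = CommonInferenceParams(temperature=1.0, top_k=0, top_p=0.0, num_tokens_to_generate=30)
assert isinstance(params, SamplingParams)  # old name, same class as the new SamplingParams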
File mode changed from 100755 to 100644 (3 files)