Commit 64c0033e authored by liangjing

update

parent 4b097dee
Pipeline #1849 passed
@@ -11,40 +11,18 @@
 - [Reference](#参考)
 # Environment Setup
-1. Install the base dependencies
+1. Pull a suitable image
 <pre>
-pip install -r requirements.txt
+docker pull nvcr.io/nvidia/pytorch:24.06-py3
 </pre>
-2. Install the DCU-related whl packages
-Download directory for the DCU packages: [https://cancon.hpccube.com:65024/4/main](https://cancon.hpccube.com:65024/4/main)
-pytorch whl package: pytorch ---> dtk-24.04.1
-Download the pytorch whl that matches your Python version
-<pre>
-pip install torch*   (the downloaded torch whl)
-</pre>
-torchvision whl package: vision ---> dtk-24.04.1
-Download the torchvision whl that matches your Python version
-<pre>
-pip install torchvision*   (the downloaded torchvision whl)
-</pre>
-apex whl package: apex ---> dtk-24.04.1
-Download the apex whl that matches your Python version
-<pre>
-pip install apex*   (the downloaded apex whl)
-</pre>
-If installation via pip install is slow, add a mirror: -i https://pypi.tuna.tsinghua.edu.cn/simple/
-3. Install unsloth
-<pre>
-cd ./unsloth
-pip3 install -e .
-</pre>
+2. Start a container and enter it
+<pre>
+docker run -it --name xx --gpus all --network=host --ipc=host --privileged -v /path_to_work/:/path_to_work/ --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash
+docker exec -it xx bash
+</pre>
 # Download the vocabulary files
......
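A minimal sanity check (an editorial sketch, assuming the `nvcr.io/nvidia/pytorch:24.06-py3` container above is running) is to confirm that PyTorch can see the GPUs before continuing:

<pre>
# Hypothetical verification snippet, not part of the README diff: run inside the container.
import torch

print(torch.__version__)          # PyTorch build shipped with the 24.06 image
print(torch.cuda.is_available())  # expect True when the devices are passed through
print(torch.cuda.device_count())  # number of visible accelerators
</pre>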
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron initialization."""
import logging
import random
import os
import time
import numpy as np
import torch
from datetime import timedelta
from megatron.legacy import fused_kernels
from megatron.training import get_adlr_autoresume
from megatron.training import get_args
from megatron.training import get_tensorboard_writer
from megatron.core import mpu, tensor_parallel
from megatron.training.arguments import parse_args, validate_args
from megatron.training.yaml_arguments import validate_yaml
from megatron.training.checkpointing import load_args_from_checkpoint
from megatron.training.global_vars import set_global_variables
from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train
from megatron.core.fusions.fused_bias_gelu import bias_gelu
from megatron.core.fusions.fused_bias_swiglu import bias_swiglu
logger = logging.getLogger(__name__)
def initialize_megatron(
extra_args_provider=None,
args_defaults={},
ignore_unknown_args=False,
allow_no_cuda=False,
skip_mpu_initialization=False,
get_embedding_ranks=None,
get_position_embedding_ranks=None
):
"""Set global variables, initialize distributed, and
set autoresume and random seeds.
`allow_no_cuda` should not be set unless using megatron for cpu only
data processing. In general this arg should not be set unless you know
what you are doing.
Returns a function to finalize distributed env initialization
(optionally, only when args.lazy_mpu_init == True)
"""
if not allow_no_cuda:
# Make sure cuda is available.
assert torch.cuda.is_available(), "Megatron requires CUDA."
# Parse arguments
args = parse_args(extra_args_provider, ignore_unknown_args)
# Prep for checkpoint conversion.
if args.ckpt_convert_format is not None:
assert args.ckpt_convert_save is not None
assert args.load is not None
args.exit_on_missing_checkpoint = True
if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False):
assert args.load is not None, "--use-checkpoint-args requires --load argument"
load_args_from_checkpoint(args)
if args.yaml_cfg is not None:
args = validate_yaml(args, args_defaults)
else:
validate_args(args, args_defaults)
# set global args, build tokenizer, and set adlr-autoresume,
# tensorboard-writer, and timers.
set_global_variables(args)
# set logging level
setup_logging()
# torch.distributed initialization
def finish_mpu_init():
args = get_args()
# Pytorch distributed.
_initialize_distributed(get_embedding_ranks, get_position_embedding_ranks)
# Random seeds for reproducibility.
if args.rank == 0:
print("> setting random seeds to {} ...".format(args.seed))
_set_random_seed(args.seed, args.data_parallel_random_init)
if skip_mpu_initialization:
return None
args = get_args()
if args.lazy_mpu_init:
# TODO is this still a necessary option?
args.use_cpu_initialization = True
# delayed initialization of DDP-related stuff
# We only set basic DDP globals
mpu.set_tensor_model_parallel_world_size(args.tensor_model_parallel_size)
# and return function for external DDP manager
# to call when it has DDP initialized
mpu.set_tensor_model_parallel_rank(args.rank)
return finish_mpu_init
else:
# Megatron's MPU is the master. Complete initialization right away.
finish_mpu_init()
# Autoresume.
_init_autoresume()
# Compile dependencies.
_compile_dependencies()
if args.tp_comm_overlap:
_initialize_tp_communicators()
# No continuation function
return None
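# Usage sketch (editorial comment; names below are hypothetical): a typical caller does
#     maybe_finalize = initialize_megatron(extra_args_provider=my_args_provider)
#     if maybe_finalize is not None:   # only returned when args.lazy_mpu_init is set
#         maybe_finalize()             # external DDP manager completes distributed/mpu init later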
def _compile_dependencies():
args = get_args()
# =========================
# Compile dataset C++ code.
# =========================
# TODO: move this to ninja
if torch.distributed.get_rank() == 0:
start_time = time.time()
print("> compiling dataset index builder ...")
from megatron.core.datasets.utils import compile_helpers
compile_helpers()
print(
">>> done with dataset index builder. Compilation time: {:.3f} "
"seconds".format(time.time() - start_time),
flush=True,
)
# ==================
# Load fused kernels
# ==================
# Custom kernel constraints check.
seq_len = args.seq_length
attn_batch_size = (
args.num_attention_heads / args.tensor_model_parallel_size
) * args.micro_batch_size
# Constraints on sequence length and attn_batch_size to enable warp based
# optimization and upper triangular optimization (for causal mask)
custom_kernel_constraint = (
seq_len > 16
and seq_len <= 16384
and seq_len % 4 == 0
and attn_batch_size % 4 == 0
)
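# Worked example with illustrative values (not taken from any config): num_attention_heads=32,
# tensor_model_parallel_size=8 and micro_batch_size=4 give attn_batch_size = (32 / 8) * 4 = 16;
# combined with seq_length=4096 (16 < 4096 <= 16384 and 4096 % 4 == 0) the constraint holds,
# so the fused softmax kernel is eligible when fp16/bf16 and masked_softmax_fusion are enabled.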
# Print a warning.
if not (
(args.fp16 or args.bf16)
and custom_kernel_constraint
and args.masked_softmax_fusion
):
if args.rank == 0:
print(
"WARNING: constraints for invoking optimized"
" fused softmax kernel are not met. We default"
" back to unfused kernel invocations.",
flush=True,
)
# Always build on rank zero first.
if torch.distributed.get_rank() == 0:
start_time = time.time()
print("> compiling and loading fused kernels ...", flush=True)
#fused_kernels.load(args)
torch.distributed.barrier()
else:
torch.distributed.barrier()
#fused_kernels.load(args)
# Simple barrier to make sure all ranks have passed the
# compilation phase successfully before moving on to the
# rest of the program. We think this might ensure that
# the lock is released.
torch.distributed.barrier()
if torch.distributed.get_rank() == 0:
print(
">>> done with compiling and loading fused kernels. "
"Compilation time: {:.3f} seconds".format(time.time() - start_time),
flush=True,
)
def _initialize_tp_communicators():
""" initializing the communicators with user buffers for high-performance tensor-model-parallel
communication overlap """
try:
import yaml
import transformer_engine
from transformer_engine.pytorch import module as te_module
except ImportError:
raise RuntimeError("Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and "
"'transformer_engine' packages")
args = get_args()
if args.tp_comm_overlap_cfg is not None:
with open(args.tp_comm_overlap_cfg,"r") as stream:
ub_cfgs = yaml.safe_load(stream)
else:
ub_cfgs = {}
input_shape = [(args.seq_length * args.micro_batch_size) // args.context_parallel_size , args.hidden_size]
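# Illustrative example (values are not defaults): seq_length=4096, micro_batch_size=1,
# context_parallel_size=1, hidden_size=8192 gives input_shape = [4096, 8192].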
#We create a MPI process group, which is needed to bootstrap the pipelined
#tensor-model-parallel communication overlap
torch.distributed.new_group(backend='mpi')
te_module.base.initialize_ub(shape = input_shape, tp_size = args.tensor_model_parallel_size,
use_fp8 = (args.fp8 is not None) , ub_cfgs = ub_cfgs,)
def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks):
"""Initialize torch.distributed and core model parallel."""
args = get_args()
device_count = torch.cuda.device_count()
if torch.distributed.is_initialized():
if args.rank == 0:
print(
"torch distributed is already initialized, "
"skipping initialization ...",
flush=True,
)
args.rank = torch.distributed.get_rank()
args.world_size = torch.distributed.get_world_size()
else:
if args.rank == 0:
print("> initializing torch distributed ...", flush=True)
# Manually set the device ids.
if device_count > 0:
#torch.cuda.set_device(args.local_rank)
#device_id = torch.device(f'cuda:{args.local_rank}')
device_id = args.rank % device_count
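# Illustrative example: rank 11 on nodes with 8 visible devices gives device_id = 11 % 8 = 3,
# which must agree with the launcher-provided local_rank below.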
if args.local_rank is not None:
assert (
args.local_rank == device_id
), "expected local-rank to be the same as rank % device-count."
else:
args.local_rank = device_id
torch.cuda.set_device(device_id)
else:
device_id = None
# Call the init process
torch.distributed.init_process_group(
backend=args.distributed_backend,
world_size=args.world_size,
rank=args.rank,
init_method=args.dist_url,
timeout=timedelta(minutes=args.distributed_timeout_minutes),
)
#init_process_group_kwargs = {
# 'backend' : args.distributed_backend,
# 'world_size': args.world_size,
# 'rank': args.rank,
# 'timeout': timedelta(minutes=args.distributed_timeout_minutes),
#}
#torch.distributed.init_process_group(**init_process_group_kwargs)
# Set the tensor model-parallel, pipeline model-parallel, and
# data-parallel communicators.
if device_count > 0:
if mpu.model_parallel_is_initialized():
print("model parallel is already initialized")
else:
mpu.initialize_model_parallel(
args.tensor_model_parallel_size,
args.pipeline_model_parallel_size,
args.virtual_pipeline_model_parallel_size,
args.pipeline_model_parallel_split_rank,
context_parallel_size=args.context_parallel_size,
expert_model_parallel_size=args.expert_model_parallel_size,
distributed_timeout_minutes=args.distributed_timeout_minutes,
nccl_communicator_config_path=args.nccl_communicator_config_path,
order='tp-cp-ep-dp-pp' if not args.use_tp_pp_dp_mapping else 'tp-pp-dp',
encoder_tensor_model_parallel_size=args.encoder_tensor_model_parallel_size,
encoder_pipeline_model_parallel_size=args.encoder_pipeline_model_parallel_size,
get_embedding_ranks=get_embedding_ranks,
get_position_embedding_ranks=get_position_embedding_ranks,
)
if args.rank == 0:
print(
f"> initialized tensor model parallel with size "
f"{mpu.get_tensor_model_parallel_world_size()}"
)
print(
f"> initialized pipeline model parallel with size "
f"{mpu.get_pipeline_model_parallel_world_size()}"
)
def _init_autoresume():
"""Set autoresume start time."""
autoresume = get_adlr_autoresume()
if autoresume:
torch.distributed.barrier()
autoresume.init()
torch.distributed.barrier()
def _set_random_seed(seed_, data_parallel_random_init=False):
"""Set random seed for reproducability."""
if seed_ is not None and seed_ > 0:
# Ensure that different pipeline MP stages get different seeds.
seed = seed_ + (100 * mpu.get_pipeline_model_parallel_rank())
# Ensure different data parallel ranks get different seeds
if data_parallel_random_init:
seed = seed + (10 * mpu.get_data_parallel_rank())
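# Illustrative example: seed_=1234, pipeline-parallel rank 2, data-parallel rank 3 gives
# seed = 1234 + 200 = 1434, or 1434 + 30 = 1464 when data_parallel_random_init is enabled.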
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.device_count() > 0:
tensor_parallel.model_parallel_cuda_manual_seed(seed)
else:
raise ValueError("Seed ({}) should be a positive integer.".format(seed))
def write_args_to_tensorboard():
"""Write arguments to tensorboard."""
args = get_args()
writer = get_tensorboard_writer()
if writer:
for arg in vars(args):
writer.add_text(arg, str(getattr(args, arg)), global_step=args.iteration)
def set_jit_fusion_options():
"""Set PyTorch JIT layer fusion options."""
# flags required to enable jit fusion kernels
TORCH_MAJOR = int(torch.__version__.split(".")[0])
TORCH_MINOR = int(torch.__version__.split(".")[1])
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10):
# nvfuser
torch._C._jit_set_profiling_executor(True)
torch._C._jit_set_profiling_mode(True)
torch._C._jit_override_can_fuse_on_cpu(False)
torch._C._jit_override_can_fuse_on_gpu(False)
torch._C._jit_set_texpr_fuser_enabled(False)
torch._C._jit_set_nvfuser_enabled(False) #True
torch._C._debug_set_autodiff_subgraph_inlining(False)
else:
# legacy pytorch fuser
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
_warmup_jit_function()
def _warmup_jit_function():
"""Compilie JIT functions before the main training steps"""
args = get_args()
if args.bf16:
dtype = torch.bfloat16
elif args.fp16:
dtype = torch.float16
else:
dtype = torch.float32
# Warmup fused bias+gelu
bias = torch.rand(
args.ffn_hidden_size // args.tensor_model_parallel_size,
dtype=dtype,
device="cuda",
)
input = torch.rand(
(
args.seq_length // args.context_parallel_size,
args.micro_batch_size,
args.ffn_hidden_size // args.tensor_model_parallel_size,
),
dtype=dtype,
device="cuda",
)
# Warmup JIT fusions with the input grad_enable state of both forward
# prop and recomputation
for bias_grad, input_grad in zip([True, True], [False, True]):
bias.requires_grad, input.requires_grad = bias_grad, input_grad
for _ in range(5):
if args.swiglu:
output = bias_swiglu(input, bias)
else:
output = bias_gelu(bias, input)
del bias, input, output
# Warmup fused bias+dropout+add
if args.sequence_parallel:
seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
else:
seq_length = args.seq_length
input = torch.rand(
(seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size),
dtype=dtype,
device="cuda",
)
residual = torch.rand(
(seq_length // args.context_parallel_size, args.micro_batch_size, args.hidden_size),
dtype=dtype,
device="cuda",
)
bias = torch.rand((args.hidden_size), dtype=dtype, device="cuda").expand_as(
residual
)
dropout_rate = 0.1
# Warmup JIT fusions with the input grad_enable state of both forward
# prop and recomputation
for input_grad, bias_grad, residual_grad in zip(
[False, True], [True, True], [True, True]
):
input.requires_grad = input_grad
bias.requires_grad = bias_grad
residual.requires_grad = residual_grad
for _ in range(5):
output = bias_dropout_add_fused_train([input, bias], residual, dropout_rate)
del bias, input, residual, output
torch.cuda.empty_cache()
def setup_logging() -> None:
""" Sets the default logging level based on cmdline args and env vars.
Precedence:
1. Command line argument `--logging-level`
2. Env var `MEGATRON_LOGGING_LEVEL`
3. Default logging level (INFO)
Returns: None
"""
args = get_args()
logging_level = None
env_logging_level = os.getenv('MEGATRON_LOGGING_LEVEL', None)
if env_logging_level is not None:
logging_level = int(env_logging_level)
if args.logging_level is not None:
logging_level = args.logging_level
if logging_level is not None:
logger.info(f'Setting logging level to {logging_level}')
logging.getLogger().setLevel(logging_level)
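# Illustrative usage: MEGATRON_LOGGING_LEVEL=10 selects DEBUG (20 is INFO); a --logging-level
# command line argument takes precedence over the environment variable.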
@@ -26,11 +26,9 @@ class RMSNorm(torch.nn.Module):
             setattr(self.weight, 'sequence_parallel', sequence_parallel)

-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def _norm(self, x):
         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

-    @torch.compile(mode="max-autotune-no-cudagraphs")
     def forward(self, x):
         output = self._norm(x.float()).type_as(x)
         return output * self.weight
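For reference, the `_norm`/`forward` pair shown above computes the standard RMSNorm (in fp32, then cast back to the input dtype):

$$\mathrm{RMSNorm}(x) = \frac{x}{\sqrt{\tfrac{1}{d}\sum_{i=1}^{d} x_i^{2} + \varepsilon}} \odot w$$

where `d` is the hidden dimension, `self.eps` corresponds to $\varepsilon$, and `self.weight` is the learned scale $w$.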
@@ -40,9 +40,6 @@ from megatron.legacy.model.utils import (
 )
 from megatron.training import get_args, get_timers

-import torch._dynamo
-torch._dynamo.config.suppress_errors = True
-
 from .module import MegatronModule

 try:
@@ -59,10 +56,6 @@ except ImportError:
         )
     except ImportError:
         flash_attn_unpadded_func = None
-try:
-    from flash_attn.flash_attn_triton import flash_attn_func
-except ImportError:
-    flash_attn_func = None

 """ We use the following notation throughout this file:
     h: hidden size
@@ -140,7 +133,6 @@ class ParallelMLP(MegatronModule):
         elif args.onnx_safe:
             self.activation_func = erf_gelu
         elif args.swiglu:
-            @torch.compile(mode="max-autotune-no-cudagraphs")
             def swiglu(x):
                 x = torch.chunk(x, 2, dim=-1)
                 return F.silu(x[0]) * x[1]
@@ -164,7 +156,7 @@ class ParallelMLP(MegatronModule):
             input_is_parallel=True,
             is_expert=is_expert,
         )
-    @torch.compile(mode="max-autotune-no-cudagraphs")
+
     def forward(self, hidden_states):
         # [s, b, 4hp]
@@ -475,10 +467,6 @@ class FlashSelfAttention(torch.nn.Module):
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout

-        # Use FlashAttention-2 when args.use_flash_attn_ck is True
-        args = get_args()
-        self.flash_attn_func = flash_attn_unpadded_func
-
     def forward(self, q, k, v):
         """Implements the multihead softmax attention.
         Arguments
@@ -520,38 +508,6 @@ class FlashSelfAttention(torch.nn.Module):
         output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
         return output

-
-class FlashSelfAttentionTriton(torch.nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Arguments
-    ---------
-        softmax_scale: The temperature to use for the softmax attention.
-                       (default: 1/sqrt(d_keys) where d_keys is computed at
-                       runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-    """
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0,
-                 device=None, dtype=None):
-        super().__init__()
-        assert flash_attn_func is not None, ('Triton version of FlashAttention is not installed.')
-        assert rearrange is not None, 'Please install einops first, e.g., with pip install einops'
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.dropout_p = attention_dropout
-
-    def forward(self, q, k, v):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
-        """
-        assert q.dtype in [torch.float16, torch.bfloat16]
-        assert q.is_cuda
-        q, k, v = [rearrange(x, 's b h d -> b h s d').contiguous()
-                   for x in (q, k, v)]
-        output = flash_attn_func(q, k, v, self.causal)
-        output = rearrange(output, 'b s h d -> h b (s d)').contiguous()
-        return output
-

 class ParallelAttention(MegatronModule):
     """Parallel self-attention layer abstract class.
@@ -580,19 +536,13 @@ class ParallelAttention(MegatronModule):
         else:
             kv_projection_size = args.kv_channels * args.num_attention_heads

-        self.use_flash_attn = (args.use_flash_attn_ck or args.use_flash_attn_triton) \
+        self.use_flash_attn = args.use_flash_attn \
             and attention_type == AttnType.self_attn \
             and self.attn_mask_type == AttnMaskType.causal
-        self.use_flash_attn_triton = args.use_flash_attn_triton
         if self.use_flash_attn:
-            if args.use_flash_attn_ck:
-                if flash_attn_unpadded_func is None:
-                    raise ImportError('FlashAttention is not installed, please install with '
-                                      'pip install flash-attn')
-            if args.use_flash_attn_triton:
-                assert flash_attn_func != None, "Cannot import FlashAttention triton "
-
+            if flash_attn_unpadded_func is None:
+                raise ImportError('FlashAttention is not installed, please install with '
+                                  'pip install flash-attn')
             assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
                                                           'self-attention for now')
             assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
@@ -652,10 +602,7 @@ class ParallelAttention(MegatronModule):
                 self.attn_mask_type)
             self.checkpoint_core_attention = config.recompute_granularity == 'selective'

-        # Currently FlashAttention only works with causal mask
-        if self.use_flash_attn_triton:
-            self.core_attention_flash = FlashSelfAttentionTriton(causal=True, attention_dropout=args.attention_dropout)
-        elif self.use_flash_attn:
+        if self.use_flash_attn:
             self.core_attention_flash = FlashSelfAttention(
                 causal=True, attention_dropout=config.attention_dropout
             )
@@ -763,7 +710,7 @@
                                                              dim=3)

             # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
-            query_layer = query_layer.contiguous().view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+            query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
         else:
             # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
             mixed_kv_layer, _ = self.key_value(encoder_output)
@@ -868,17 +815,14 @@
             context_layer = self.core_attention(
                 query_layer, key_layer, value_layer, attention_mask)
         else:
-            if not self.use_flash_attn_triton:
-                query_layer, key_layer, value_layer = [rearrange(x, 's b ... -> b s ...').contiguous()
-                #q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
-                        for x in (query_layer, key_layer, value_layer)]
+            q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
+                       for x in (query_layer, key_layer, value_layer)]
             if not self.sequence_parallel:
                 with tensor_parallel.get_cuda_rng_tracker().fork():
-                    context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
+                    context_layer = self.core_attention_flash(q, k, v)
             else:
-                context_layer = self.core_attention_flash(query_layer, key_layer, value_layer)
-            if not self.use_flash_attn_triton:
-                context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()
+                context_layer = self.core_attention_flash(q, k, v)
+            context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous()

         # =================
         # Output. [sq, b, h]
@@ -1229,9 +1173,7 @@ class ParallelTransformerLayer(MegatronModule):
         # hidden_states: [s, b, h]

         # Layer norm at the beginning of the transformer layer.
-        from unsloth.kernels.rms_layernorm import fast_rms_layernorm
-        norm_output = self.input_norm(hidden_states) if not args.use_fast_rms_layernorm else fast_rms_layernorm(self.input_norm, hidden_states)
-        #norm_output = self.input_norm(hidden_states)
+        norm_output = self.input_norm(hidden_states)

         # Self attention.
         attention_output, attention_bias = \
......
@@ -60,7 +60,6 @@ def openai_gelu(x):
 def erf_gelu(x):
     return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))

-@torch.compile(mode="max-autotune-no-cudagraphs")
 def get_norm(config):
     args = get_args()
     if args.normalization == "LayerNorm":
......
@@ -6,14 +6,9 @@ export OMP_NUM_THREADS=1
 export NCCL_P2P_LEVEL=SYS
 export NCCL_ALGO=Ring
-export NCCL_NCHANNELS_PER_PEER=16
-export NCCL_MIN_NCHANNELS=20
 export NCCL_IB_TIMEOUT=22
 export CUDA_DEVICE_MAX_CONNECTIONS=1
-export NCCL_NET_GDR_LEVEL=SYS
-export NCCL_NET_GDR_READ=0

 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
 RANK=$OMPI_COMM_WORLD_RANK
 WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
@@ -47,7 +42,7 @@ TRAINING_ARGS=(
     --clip-grad 1.0
     --bf16
     --use-distributed-optimizer
-    --use-flash-attn-triton
+    --use-flash-attn
    --disable-bias-linear
     --attention-dropout 0
     --hidden-dropout 0
@@ -57,8 +52,6 @@ TRAINING_ARGS=(
     --lr-decay-style cosine
     --min-lr 3.0e-6
     --lr-warmup-iters 1
-    --use-fast-rms-layernorm
-    --use-fast-cross-entropy-loss
 )
 MODEL_PARALLEL_ARGS=(
     --sequence-parallel
@@ -99,7 +92,7 @@ APP="python3 -u pretrain_gpt.py \
     --dist_url tcp://${1}:34566 \
     "

-#for hygon cpu
+#for intel cpu
 case ${lrank} in
 [0])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
@@ -107,31 +100,31 @@ case ${lrank} in
     ;;
 [1])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=1 --membind=1 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [2])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=2 --membind=2 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [3])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=3 --membind=3 ${APP}
+    numactl --cpunodebind=0 --membind=0 ${APP}
     ;;
 [4])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=4 --membind=4 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [5])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=5 --membind=5 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [6])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=6 --membind=6 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 [7])
     export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-    numactl --cpunodebind=7 --membind=7 ${APP}
+    numactl --cpunodebind=1 --membind=1 ${APP}
     ;;
 esac
......
unsloth @ a2f8db3e
Subproject commit a2f8db3e7341f983af5814a2c56f54fa29ee548d