"megatron/legacy/model/transformer.py" did not exist on "5d6473811f2398757a99ab1ac3b0e53b22f5d89c"
rms_norm.py 3.89 KB
Newer Older
xingjinliang's avatar
xingjinliang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import torch
from torch import nn

class RMSNorm(torch.nn.Module):

    def __init__(self,
                 dim: int,
                 eps: float = 1e-6,
                 sequence_parallel: bool = False,
                 config: dict = None):
        """RMS Normaliation module

        Args:
            dim (int): The width of input, i.e. hidden size
            eps (float): epsilon to use for the norm, default to 1e-6
            sequence_parallel (bool): Set to true if sequence parallelism is being used,
              this marks the weights as needing to be allreduced.
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

        setattr(self.weight, 'sequence_parallel', sequence_parallel)

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    @torch.compile(mode="max-autotune-no-cudagraphs")
    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight
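

# Illustrative reference (not part of the original module): RMSNorm scales each vector
# by the reciprocal of its root-mean-square, y = x / sqrt(mean(x^2) + eps) * weight,
# i.e. LayerNorm without mean subtraction or bias. The helper below is a minimal
# sketch in plain torch ops that mirrors RMSNorm.forward above, including the fp32
# accumulation of the mean of squares.
def _reference_rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    mean_sq = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(mean_sq + eps)).type_as(x) * weight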


from typing import Optional
import lightop  # provides rmsnorm_forward / rmsnorm_backward

from functools import partial
from megatron.core.utils import is_torch_min_version
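# PyTorch 2.4 moved the AMP custom-op decorators to torch.amp (taking an explicit
# device_type argument); fall back to the torch.cuda.amp names on older builds.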
if is_torch_min_version("2.4.0a0"):
    custom_fwd = partial(torch.amp.custom_fwd, device_type="cuda")
    custom_bwd = partial(torch.amp.custom_bwd, device_type="cuda")
else:
    custom_fwd = torch.cuda.amp.custom_fwd
    custom_bwd = torch.cuda.amp.custom_bwd

def print_rank_0(message):
    """If distributed is initialized, print only on rank 0."""
    if torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            print(message, flush=True)
    else:
        print(message, flush=True)



class _LightopRMSNorm(torch.autograd.Function):
    """ 使用lightop实现rmsnorm"""

    @staticmethod
    # @custom_fwd
    def forward(ctx,
                inp: torch.Tensor,
                weight: torch.Tensor,
                ln_out: torch.Tensor,
                eps: float,
                is_grad_enabled: bool):
        # lightop.rmsnorm_forward returns the pair (output, rsigma); rsigma is kept for backward.
        output = lightop.rmsnorm_forward(inp, weight, ln_out, eps, training=True)
        # print_rank_0(f"_LightopRMSNorm: output({output[0].shape, output[1].shape}) = lightop.rmsfwd(inp{inp.shape}, weight{weight.shape}, ...)")
        rsigma = output[1]
        if is_grad_enabled:
            ctx.save_for_backward(inp, weight, rsigma)
        return output[0]


    @staticmethod
    # @custom_bwd
    def backward(ctx, grad_output):
        inp, weight, rsigma = ctx.saved_tensors
        
        dgrad, dgamma = lightop.rmsnorm_backward(grad_output, inp, rsigma, weight)
        # print_rank_0(f"_LightopRMSNorm: dgrad{dgrad.shape}, dgamma{dgamma.shape} = lightop.rmsbwd(grad_output{grad_output.shape}, inp{inp.shape}, rsigma{rsigma.shape}, weight{weight.shape})")
        return dgrad, dgamma, None, None, None
        

class LightopRMSNorm(torch.nn.Module):
    
    def __init__(self,
                 dim: int,
                 eps: float = 1e-6):
        """RMS Normaliation module

        Args:
            dim (int): The width of input, i.e. hidden size
            eps (float): epsilon to use for the norm, default to 1e-6
        """
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))
    
    # @no_torch_dynamo()  # dynamically applies torch._dynamo.disable
    def forward(self, inp: torch.Tensor, is_first_microbatch: Optional[bool] = None):

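        # With grad enabled, dispatch through autograd via .apply(); otherwise call the
        # Function's forward directly, passing None in place of ctx (ctx is only touched
        # when is_grad_enabled is True, so None is safe).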
        if torch.is_grad_enabled():
            fwd_fn = _LightopRMSNorm.apply
            args = []
        else:
            fwd_fn = _LightopRMSNorm.forward
            args = [None]
        ln_out = torch.empty_like(inp, dtype=inp.dtype, memory_format=torch.contiguous_format)
        args += (inp, self.weight, ln_out, self.eps, torch.is_grad_enabled())
        out = fwd_fn(*args)
        return out
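

# Minimal usage sketch (an illustration, not part of the original file): assumes a
# CUDA device and that the `lightop` extension is importable. It compares the fused
# LightopRMSNorm against the eager RMSNorm and exercises the custom backward pass.
if __name__ == "__main__":
    torch.manual_seed(0)
    hidden = 4096
    x = torch.randn(2, 8, hidden, device="cuda", dtype=torch.bfloat16, requires_grad=True)

    eager = RMSNorm(hidden).to(device="cuda", dtype=torch.bfloat16)
    fused = LightopRMSNorm(hidden).to(device="cuda", dtype=torch.bfloat16)

    y_eager = eager(x)
    y_fused = fused(x)
    print("max abs diff:", (y_eager - y_fused).abs().max().item())

    # Backward through the fused path runs _LightopRMSNorm.backward (lightop.rmsnorm_backward).
    y_fused.sum().backward()
    print("dgrad shape:", tuple(x.grad.shape), "dgamma shape:", tuple(fused.weight.grad.shape))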