Commit eeac1cc4 authored by liangjing

add fp16 support

parent c138a95c
Pipeline #442 failed
@@ -44,6 +44,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/vscode-pytorch:1.10.0-centos
 ```
 pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
+pip install lightop-0.1-cp37-cp37m-linux_x86_64.whl  # install the optimized operator library
 ```
 ### Training (single node)
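Before launching training, it can be worth checking that the wheel actually exposes the entry points the rest of this commit imports. A minimal smoke test, assuming the lightop-0.1 wheel above is installed in the active Python 3.7 environment:

```python
# Smoke test for the lightop wheel: import the two entry points this commit uses.
# If either import fails, the fused layernorm/softmax/dropout paths below cannot work.
from lightop import op                       # fused layernorm and add+dropout kernels
from lightop.fusesoftmax import FuseSoftmax  # fused softmax module

print("lightop op module loaded:", op)
print("FuseSoftmax available:", FuseSoftmax)
```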
@@ -95,7 +96,7 @@ pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
 ```
 rm megatron/arguments.py
 cp megatron/arguments.py-nodes megatron/arguments.py
-sbatch run-16B.sh  (main parameters are in single-16B.sh)
+sbatch run-16B.sh  (main parameters are in single-16B.sh; training defaults to fp32 precision, run sbatch run-16B-fp16.sh to train in fp16)
 ```
 ```
...
@@ -32,7 +32,7 @@ import torch.nn.functional as F
 global fused_mix_prec_layer_norm_cuda
 fused_mix_prec_layer_norm_cuda = None
+from lightop import op
 class FusedLayerNormAffineFunction(torch.autograd.Function):
@@ -108,4 +108,5 @@ class MixedFusedLayerNorm(torch.nn.Module):
             return FusedLayerNormAffineFunction.apply(
                 input, self.weight, self.bias, self.normalized_shape, self.eps)
         else:
-            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
+            #return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
+            return op.layernorm_forward_autograd(input, self.weight, self.bias, self.eps)
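This hunk routes the non-fused fallback of MixedFusedLayerNorm through lightop's layer-norm kernel. Below is a minimal sketch, not the repository code, of the same substitution with a guarded import so that an environment without the wheel still falls back to F.layer_norm; the argument order of op.layernorm_forward_autograd is taken from the call in this commit.

```python
# Sketch only: prefer the fused lightop layer-norm kernel when available,
# otherwise use the stock PyTorch functional layer norm.
import torch
import torch.nn.functional as F

try:
    from lightop import op          # optimized operator library from the wheel
    _HAVE_LIGHTOP = True
except ImportError:
    _HAVE_LIGHTOP = False

def layer_norm_forward(input, weight, bias, normalized_shape, eps=1e-5):
    """LayerNorm forward that takes the fused path when lightop is installed."""
    if _HAVE_LIGHTOP:
        # fused kernel; argument order follows the call added in this commit
        return op.layernorm_forward_autograd(input, weight, bias, eps)
    # stock fallback, identical to the line commented out above
    return F.layer_norm(input, normalized_shape, weight, bias, eps)
```

The guard is illustrative only; the commit itself imports lightop unconditionally and always takes the fused path.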
@@ -17,6 +17,7 @@ from functools import lru_cache
 import torch
 import torch.nn as nn
 from megatron.enums import AttnMaskType
+from lightop.fusesoftmax import FuseSoftmax
 class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
     """
@@ -221,7 +222,8 @@ class FusedScaleMaskSoftmax(nn.Module):
         mask_output = self.mask_func(input, mask) if mask is not None else input
-        probs = torch.nn.Softmax(dim=-1)(mask_output)
+        #probs = torch.nn.Softmax(dim=-1)(mask_output)
+        probs = FuseSoftmax(dim=-1)(mask_output)
         if self.input_in_float16 and self.softmax_in_fp32:
             if self.input_in_fp16:
...
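In the torch-softmax fallback of FusedScaleMaskSoftmax, torch.nn.Softmax(dim=-1) is replaced by lightop's FuseSoftmax, constructed and called the same way. The sketch below shows that drop-in pattern; the masked_softmax helper and the masked_fill-based mask handling are illustrative stand-ins, not Megatron's actual mask_func.

```python
# Sketch only: FuseSoftmax used as a drop-in for torch.nn.Softmax(dim=-1).
import torch
import torch.nn as nn

try:
    from lightop.fusesoftmax import FuseSoftmax  # fused softmax from the lightop wheel
    _softmax_cls = FuseSoftmax
except ImportError:
    _softmax_cls = nn.Softmax                    # stock PyTorch fallback

def masked_softmax(scores, mask=None, mask_value=-10000.0):
    """Illustrative stand-in for the masked softmax step in FusedScaleMaskSoftmax."""
    if mask is not None:
        # illustrative masking; Megatron uses its own mask_func here
        scores = scores.masked_fill(mask, mask_value)
    # constructed with dim=-1 and called on the masked scores, like nn.Softmax
    return _softmax_cls(dim=-1)(scores)

# Example: attention-like scores of shape [batch, heads, seq, seq]
scores = torch.randn(2, 4, 16, 16)
probs = masked_softmax(scores)
print(probs.sum(dim=-1)[0, 0, 0])  # each row sums to 1
```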
@@ -32,7 +32,7 @@ import deepspeed
 from .glu_activations import GLU_ACTIVATIONS
 from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb
+from lightop import op
 # flags required to enable jit fusion kernels
 torch._C._jit_set_profiling_mode(False)
 torch._C._jit_set_profiling_executor(False)
@@ -407,8 +407,9 @@ class ParallelAttention(MegatronModule):
 def bias_dropout_add(x, bias, residual, prob, training):
     # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
-    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
-    out = residual + out
+    #out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
+    #out = residual + out
+    out = op.add_dropout_forward_autograd(x + bias, residual, prob, training)
     return out
@@ -418,13 +419,13 @@ def get_bias_dropout_add(training):
     return _bias_dropout_add
-@torch.jit.script
+#@torch.jit.script
 def bias_dropout_add_fused_train(x, bias, residual, prob):
     # type: (Tensor, Tensor, Tensor, float) -> Tensor
     return bias_dropout_add(x, bias, residual, prob, True)
-@torch.jit.script
+#@torch.jit.script
 def bias_dropout_add_fused_inference(x, bias, residual, prob):
     # type: (Tensor, Tensor, Tensor, float) -> Tensor
     return bias_dropout_add(x, bias, residual, prob, False)
...
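Here the two-step dropout(x + bias) followed by residual + out is collapsed into one fused call, and the @torch.jit.script decorators are commented out, presumably because the function now calls into an external extension that TorchScript cannot compile. The sketch below compares the two paths under the assumption, implied by the replaced lines, that op.add_dropout_forward_autograd(input, residual, prob, training) computes residual + dropout(input, p=prob); with prob=0.0 both paths should agree exactly. The fused kernel may additionally require CUDA or fp16 tensors, which this sketch does not assume.

```python
# Sketch only: sanity-check the fused add+dropout against the stock two-step version.
import torch

def bias_dropout_add_reference(x, bias, residual, prob, training):
    # original path: dropout(x + bias) then residual add
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    return residual + out

def bias_dropout_add_fused(x, bias, residual, prob, training):
    # fused path added by this commit; requires the lightop wheel
    from lightop import op
    return op.add_dropout_forward_autograd(x + bias, residual, prob, training)

if __name__ == "__main__":
    x, bias, residual = (torch.randn(2, 4, 8) for _ in range(3))
    ref = bias_dropout_add_reference(x, bias, residual, 0.0, True)
    try:
        fused = bias_dropout_add_fused(x, bias, residual, 0.0, True)
        print("max abs diff:", (ref - fused).abs().max().item())
    except ImportError:
        print("lightop not installed; only the reference path was run.")
```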