Commit eeac1cc4 authored by liangjing

add fp16 support

parent c138a95c
Pipeline #442 failed
@@ -44,6 +44,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/vscode-pytorch:1.10.0-centos
 ```
 pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
+pip install lightop-0.1-cp37-cp37m-linux_x86_64.whl  # install the optimized operator library
 ```
 ### Training (single node)
@@ -95,7 +96,7 @@ pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --t
 ```
 rm megatron/arguments.py
 cp megatron/arguments.py-nodes megatron/arguments.py
-sbatch run-16B.sh (main parameters are in single-16B.sh)
+sbatch run-16B.sh (main parameters are in single-16B.sh; trains with fp32 precision by default, run sbatch run-16B-fp16.sh to train with fp16)
 ```
 ```
......
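The new wheel provides the fused kernels used throughout this commit. As a quick post-install sanity check, the two module paths the diffs below rely on can be imported directly; this is only a sketch based on the imports in this commit, not documented lightop API.

```python
# Minimal post-install check; module paths are the ones this commit imports.
from lightop import op                       # fused layernorm and add+dropout kernels
from lightop.fusesoftmax import FuseSoftmax  # fused softmax

print("lightop import OK:", op is not None and FuseSoftmax is not None)
```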
@@ -32,7 +32,7 @@ import torch.nn.functional as F
 global fused_mix_prec_layer_norm_cuda
 fused_mix_prec_layer_norm_cuda = None
+from lightop import op
 class FusedLayerNormAffineFunction(torch.autograd.Function):
@@ -108,4 +108,5 @@ class MixedFusedLayerNorm(torch.nn.Module):
         return FusedLayerNormAffineFunction.apply(
             input, self.weight, self.bias, self.normalized_shape, self.eps)
     else:
-        return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
+        #return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
+        return op.layernorm_forward_autograd(input, self.weight,self.bias,self.eps)
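The non-fused branch of MixedFusedLayerNorm now calls the lightop kernel instead of F.layer_norm. A minimal sketch of that dispatch with an import-guarded fallback (the wrapper itself is illustrative, not part of the commit; the op.layernorm_forward_autograd signature is taken from the diff above):

```python
import torch
import torch.nn.functional as F

try:
    from lightop import op
    _HAVE_LIGHTOP = True
except ImportError:          # fall back to stock PyTorch if the wheel is not installed
    _HAVE_LIGHTOP = False


def layer_norm_forward(x, weight, bias, normalized_shape, eps=1e-5):
    """Fused lightop layer norm when available, otherwise the stock PyTorch op."""
    if _HAVE_LIGHTOP:
        # Signature as used in the diff: op.layernorm_forward_autograd(input, weight, bias, eps)
        return op.layernorm_forward_autograd(x, weight, bias, eps)
    return F.layer_norm(x, normalized_shape, weight, bias, eps)


# Example: normalize the hidden dimension of a (seq, batch, hidden) activation.
hidden = 1024
x = torch.randn(16, 4, hidden)
y = layer_norm_forward(x, torch.ones(hidden), torch.zeros(hidden), (hidden,))
```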
@@ -17,6 +17,7 @@ from functools import lru_cache
 import torch
 import torch.nn as nn
 from megatron.enums import AttnMaskType
+from lightop.fusesoftmax import FuseSoftmax
 class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
     """
@@ -221,7 +222,8 @@ class FusedScaleMaskSoftmax(nn.Module):
         mask_output = self.mask_func(input, mask) if mask is not None else input
-        probs = torch.nn.Softmax(dim=-1)(mask_output)
+        #probs = torch.nn.Softmax(dim=-1)(mask_output)
+        probs = FuseSoftmax(dim=-1)(mask_output)
         if self.input_in_float16 and self.softmax_in_fp32:
             if self.input_in_fp16:
......
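FuseSoftmax is constructed and applied exactly like torch.nn.Softmax in the diff above, so it can be swapped in behind a small guard. The masking below is a stand-in for self.mask_func, not the Megatron implementation, and the fused path may require device tensors on a DCU/GPU:

```python
import torch

try:
    from lightop.fusesoftmax import FuseSoftmax  # constructed like torch.nn.Softmax in the diff
    softmax = FuseSoftmax(dim=-1)
except ImportError:
    softmax = torch.nn.Softmax(dim=-1)

# (batch, heads, query, key) attention scores with a causal mask as a stand-in for mask_func.
scores = torch.randn(2, 8, 128, 128)
causal_mask = torch.triu(torch.ones(128, 128), diagonal=1).bool()
mask_output = scores.masked_fill(causal_mask, -10000.0)
probs = softmax(mask_output)
assert torch.allclose(probs.sum(dim=-1), torch.ones(2, 8, 128), atol=1e-5)
```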
@@ -32,7 +32,7 @@ import deepspeed
 from .glu_activations import GLU_ACTIVATIONS
 from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb
+from lightop import op
 # flags required to enable jit fusion kernels
 torch._C._jit_set_profiling_mode(False)
 torch._C._jit_set_profiling_executor(False)
@@ -407,8 +407,9 @@ class ParallelAttention(MegatronModule):
 def bias_dropout_add(x, bias, residual, prob, training):
     # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
-    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
-    out = residual + out
+    #out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
+    #out = residual + out
+    out = op.add_dropout_forward_autograd(x + bias, residual, prob, training)
     return out
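op.add_dropout_forward_autograd collapses the dropout and the residual add into one fused call. A hedged sketch of the same dispatch, using the commented-out eager lines as a fallback (the import guard is illustrative; the call signature is taken from the diff):

```python
import torch

try:
    from lightop import op
    _HAVE_LIGHTOP = True
except ImportError:
    _HAVE_LIGHTOP = False


def bias_dropout_add(x, bias, residual, prob, training):
    """residual + dropout(x + bias), fused via lightop when available."""
    if _HAVE_LIGHTOP:
        # Fused add + dropout + residual, as called in the diff above.
        return op.add_dropout_forward_autograd(x + bias, residual, prob, training)
    # Eager reference path (the two lines commented out in the diff).
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    return residual + out
```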
@@ -418,13 +419,13 @@ def get_bias_dropout_add(training):
     return _bias_dropout_add
-@torch.jit.script
+#@torch.jit.script
 def bias_dropout_add_fused_train(x, bias, residual, prob):
     # type: (Tensor, Tensor, Tensor, float) -> Tensor
     return bias_dropout_add(x, bias, residual, prob, True)
-@torch.jit.script
+#@torch.jit.script
 def bias_dropout_add_fused_inference(x, bias, residual, prob):
     # type: (Tensor, Tensor, Tensor, float) -> Tensor
     return bias_dropout_add(x, bias, residual, prob, False)
......
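The @torch.jit.script decorators are commented out, presumably because TorchScript cannot compile a call into the external lightop extension. In inference mode dropout is the identity, so the fused and eager paths can be spot-checked against each other; this sketch assumes the fused op accepts CPU tensors (move everything to the device on a DCU if it does not):

```python
import torch
import torch.nn.functional as F

x, bias, residual = torch.randn(3, 8, 1024).unbind(0)
prob = 0.1

# Eager reference: with training=False, dropout is the identity, so this is residual + x + bias.
ref = residual + F.dropout(x + bias, p=prob, training=False)

try:
    from lightop import op
    fused = op.add_dropout_forward_autograd(x + bias, residual, prob, False)  # signature from the diff
    print("max abs diff (inference):", (fused - ref).abs().max().item())
except ImportError:
    print("lightop not installed; eager path only, shape:", tuple(ref.shape))
```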