Commit eeac1cc4 authored by liangjing

add fp16 support

parent c138a95c
Pipeline #442 failed
@@ -44,6 +44,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/vscode-pytorch:1.10.0-centos
 ```
 pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
+pip install lightop-0.1-cp37-cp37m-linux_x86_64.whl  # install the optimized operator library
 ```
 ### Training (single node)
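Before launching training, it can be worth checking that the wheel actually exposes the entry points the rest of this commit imports. A minimal smoke test, assuming the lightop-0.1 wheel above is installed in the active Python 3.7 environment:

```python
# Smoke test for the lightop wheel: import the two entry points this commit uses.
# If either import fails, the fused layernorm/softmax/dropout paths below cannot work.
from lightop import op                       # fused layernorm and add+dropout kernels
from lightop.fusesoftmax import FuseSoftmax  # fused softmax module

print("lightop op module loaded:", op)
print("FuseSoftmax available:", FuseSoftmax)
```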
@@ -95,7 +96,7 @@ pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
 ```
 rm megatron/arguments.py
 cp megatron/arguments.py-nodes megatron/arguments.py
-sbatch run-16B.sh  (main parameters are in single-16B.sh)
+sbatch run-16B.sh  (main parameters are in single-16B.sh; training defaults to fp32 precision, run sbatch run-16B-fp16.sh to train in fp16)
 ```
 ```
...
@@ -32,7 +32,7 @@ import torch.nn.functional as F
 global fused_mix_prec_layer_norm_cuda
 fused_mix_prec_layer_norm_cuda = None
+from lightop import op
 class FusedLayerNormAffineFunction(torch.autograd.Function):
@@ -108,4 +108,5 @@ class MixedFusedLayerNorm(torch.nn.Module):
             return FusedLayerNormAffineFunction.apply(
                 input, self.weight, self.bias, self.normalized_shape, self.eps)
         else:
-            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
+            #return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
+            return op.layernorm_forward_autograd(input, self.weight, self.bias, self.eps)
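This hunk routes the non-fused fallback of MixedFusedLayerNorm through lightop's layer-norm kernel. Below is a minimal sketch, not the repository code, of the same substitution with a guarded import so that an environment without the wheel still falls back to F.layer_norm; the argument order of op.layernorm_forward_autograd is taken from the call in this commit.

```python
# Sketch only: prefer the fused lightop layer-norm kernel when available,
# otherwise use the stock PyTorch functional layer norm.
import torch
import torch.nn.functional as F

try:
    from lightop import op          # optimized operator library from the wheel
    _HAVE_LIGHTOP = True
except ImportError:
    _HAVE_LIGHTOP = False

def layer_norm_forward(input, weight, bias, normalized_shape, eps=1e-5):
    """LayerNorm forward that takes the fused path when lightop is installed."""
    if _HAVE_LIGHTOP:
        # fused kernel; argument order follows the call added in this commit
        return op.layernorm_forward_autograd(input, weight, bias, eps)
    # stock fallback, identical to the line commented out above
    return F.layer_norm(input, normalized_shape, weight, bias, eps)
```

The guard is illustrative only; the commit itself imports lightop unconditionally and always takes the fused path.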
@@ -17,6 +17,7 @@ from functools import lru_cache
 import torch
 import torch.nn as nn
 from megatron.enums import AttnMaskType
+from lightop.fusesoftmax import FuseSoftmax
 class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
     """
@@ -221,7 +222,8 @@ class FusedScaleMaskSoftmax(nn.Module):
         mask_output = self.mask_func(input, mask) if mask is not None else input
-        probs = torch.nn.Softmax(dim=-1)(mask_output)
+        #probs = torch.nn.Softmax(dim=-1)(mask_output)
+        probs = FuseSoftmax(dim=-1)(mask_output)
         if self.input_in_float16 and self.softmax_in_fp32:
             if self.input_in_fp16:
...
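In the torch-softmax fallback of FusedScaleMaskSoftmax, torch.nn.Softmax(dim=-1) is replaced by lightop's FuseSoftmax, constructed and called the same way. The sketch below shows that drop-in pattern; the masked_softmax helper and the masked_fill-based mask handling are illustrative stand-ins, not Megatron's actual mask_func.

```python
# Sketch only: FuseSoftmax used as a drop-in for torch.nn.Softmax(dim=-1).
import torch
import torch.nn as nn

try:
    from lightop.fusesoftmax import FuseSoftmax  # fused softmax from the lightop wheel
    _softmax_cls = FuseSoftmax
except ImportError:
    _softmax_cls = nn.Softmax                    # stock PyTorch fallback

def masked_softmax(scores, mask=None, mask_value=-10000.0):
    """Illustrative stand-in for the masked softmax step in FusedScaleMaskSoftmax."""
    if mask is not None:
        # illustrative masking; Megatron uses its own mask_func here
        scores = scores.masked_fill(mask, mask_value)
    # constructed with dim=-1 and called on the masked scores, like nn.Softmax
    return _softmax_cls(dim=-1)(scores)

# Example: attention-like scores of shape [batch, heads, seq, seq]
scores = torch.randn(2, 4, 16, 16)
probs = masked_softmax(scores)
print(probs.sum(dim=-1)[0, 0, 0])  # each row sums to 1
```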
@@ -32,7 +32,7 @@ import deepspeed
 from .glu_activations import GLU_ACTIVATIONS
 from .positional_embeddings import RotaryEmbedding, apply_rotary_pos_emb_torch, apply_rotary_pos_emb
+from lightop import op
 # flags required to enable jit fusion kernels
 torch._C._jit_set_profiling_mode(False)
 torch._C._jit_set_profiling_executor(False)
@@ -407,8 +407,9 @@ class ParallelAttention(MegatronModule):
 def bias_dropout_add(x, bias, residual, prob, training):
     # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
-    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
-    out = residual + out
+    #out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
+    #out = residual + out
+    out = op.add_dropout_forward_autograd(x + bias, residual, prob, training)
     return out
@@ -418,13 +419,13 @@ def get_bias_dropout_add(training):
     return _bias_dropout_add
-@torch.jit.script
+#@torch.jit.script
 def bias_dropout_add_fused_train(x, bias, residual, prob):
     # type: (Tensor, Tensor, Tensor, float) -> Tensor
     return bias_dropout_add(x, bias, residual, prob, True)
-@torch.jit.script
+#@torch.jit.script
 def bias_dropout_add_fused_inference(x, bias, residual, prob):
     # type: (Tensor, Tensor, Tensor, float) -> Tensor
     return bias_dropout_add(x, bias, residual, prob, False)
...
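Here the two-step dropout(x + bias) followed by residual + out is collapsed into one fused call, and the @torch.jit.script decorators are commented out, presumably because the function now calls into an external extension that TorchScript cannot compile. The sketch below compares the two paths under the assumption, implied by the replaced lines, that op.add_dropout_forward_autograd(input, residual, prob, training) computes residual + dropout(input, p=prob); with prob=0.0 both paths should agree exactly. The fused kernel may additionally require CUDA or fp16 tensors, which this sketch does not assume.

```python
# Sketch only: sanity-check the fused add+dropout against the stock two-step version.
import torch

def bias_dropout_add_reference(x, bias, residual, prob, training):
    # original path: dropout(x + bias) then residual add
    out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
    return residual + out

def bias_dropout_add_fused(x, bias, residual, prob, training):
    # fused path added by this commit; requires the lightop wheel
    from lightop import op
    return op.add_dropout_forward_autograd(x + bias, residual, prob, training)

if __name__ == "__main__":
    x, bias, residual = (torch.randn(2, 4, 8) for _ in range(3))
    ref = bias_dropout_add_reference(x, bias, residual, 0.0, True)
    try:
        fused = bias_dropout_add_fused(x, bias, residual, 0.0, True)
        print("max abs diff:", (ref - fused).abs().max().item())
    except ImportError:
        print("lightop not installed; only the reference path was run.")
```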