Commit c25a91b6 authored by aiss

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class GELUGemmOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(GELUGemmOp, self).__init__(config)
if self.config.fp16:
......@@ -19,14 +23,9 @@ class GELUGemmOp(BaseOp):
bias: torch.Tensor,
weight_out: torch.Tensor,
async_op: bool = False):
output = self.fused_gemm_gelu(input,
weight,
weight.scale,
bias,
weight_out,
weight_out.scale,
self.config.epsilon,
self.config.pre_layer_norm,
self.config.q_int8,
async_op)
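        # Note: weights that are not quantized carry no 'scale' attribute; the call below passes a
        # 1-element placeholder tensor to the kernel in that case.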
output = self.fused_gemm_gelu(input, weight, weight.scale if hasattr(weight, 'scale') else torch.empty(1),
bias, weight_out,
weight_out.scale if hasattr(weight_out, 'scale') else torch.empty(1),
self.config.epsilon, self.config.pre_layer_norm, self.config.q_int8, async_op,
self.config.transposed_mode)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class LinearOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(LinearOp, self).__init__(config)
if self.config.fp16:
......@@ -22,10 +26,6 @@ class LinearOp(BaseOp):
num_heads: int,
external_cache: bool = None,
num_layers: int = None):
qkv_out = self.linear_func(input,
weight,
bias,
add_bias,
do_flash_attn,
num_heads)
qkv_out = self.linear_func(input, weight, bias, add_bias, do_flash_attn, num_heads,
self.config.transposed_mode)
return qkv_out
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class MLPGemmOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(MLPGemmOp, self).__init__(config)
if self.config.fp16:
......@@ -13,29 +17,13 @@ class MLPGemmOp(BaseOp):
else:
self.mlp_gemm_func = self.inference_cuda_module.mlp_gemm_fp32
def forward(self,
input: torch.Tensor,
residual: torch.Tensor,
input_bias: torch.Tensor,
weight_interm: torch.Tensor,
weight_out: torch.Tensor,
bias: torch.Tensor,
gamma: torch.Tensor,
def forward(self, input: torch.Tensor, residual: torch.Tensor, input_bias: torch.Tensor,
weight_interm: torch.Tensor, weight_out: torch.Tensor, bias: torch.Tensor, gamma: torch.Tensor,
beta: torch.Tensor):
output, residual_add = self.mlp_gemm_func(
input,
residual,
input_bias,
weight_interm,
weight_out,
bias,
gamma,
beta,
self.config.epsilon,
self.config.pre_layer_norm,
self.config.mlp_after_attn,
weight_interm.scale,
weight_out.scale,
self.config.q_int8,
self.config.mlp_act_func_type)
input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, self.config.epsilon,
self.config.pre_layer_norm, self.config.mlp_after_attn,
weight_interm.scale if hasattr(weight_interm, 'scale') else torch.empty(1),
weight_out.scale if hasattr(weight_out, 'scale') else torch.empty(1), self.config.q_int8,
self.config.mlp_act_func_type, self.config.transposed_mode)
return output, residual_add
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -7,6 +10,7 @@ from deepspeed import comm as dist
class QKVGemmOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(QKVGemmOp, self).__init__(config)
if self.config.fp16:
......@@ -24,21 +28,11 @@ class QKVGemmOp(BaseOp):
num_layers: int,
num_heads: int = None,
max_out_tokens: int = None):
q_scale = weight.scale
q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1)
external_cache = self.config.bigscience_bloom
rank = dist.get_rank() if dist.is_initialized() else 0
q_int8 = self.config.q_int8
output = self.qkv_gemm_func(input,
weight,
q_scale,
bias,
gamma,
beta,
self.config.epsilon,
add_bias,
num_layers,
external_cache,
self.config.mp_size,
rank,
q_int8)
output = self.qkv_gemm_func(input, weight, q_scale, bias, gamma, beta, self.config.epsilon, add_bias,
num_layers, external_cache, self.config.mp_size, rank, q_int8,
self.config.transposed_mode)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class ResidualAddOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(ResidualAddOp, self).__init__(config)
if self.config.fp16 or self.config.q_int8:
......@@ -13,26 +17,13 @@ class ResidualAddOp(BaseOp):
else:
self.residual_add_func = self.inference_cuda_module.residual_add_bias_fp32
def forward(self,
hidden_state: torch.Tensor,
residual: torch.Tensor,
attention_output: torch.Tensor,
attention_bias: torch.Tensor,
final_bias: torch.Tensor,
add_bias: bool,
residual_add: torch.Tensor):
def forward(self, hidden_state: torch.Tensor, residual: torch.Tensor, attention_output: torch.Tensor,
attention_bias: torch.Tensor, final_bias: torch.Tensor, add_bias: bool, residual_add: torch.Tensor):
if not self.config.pre_layer_norm and residual_add is not None:
            # only use residual_add if it's set and we are not using pre-layer norm
residual = residual_add
self.residual_add_func(hidden_state,
residual,
attention_output,
attention_bias,
final_bias,
self.config.mp_size,
self.config.mlp_after_attn,
add_bias,
self.config.pre_layer_norm)
self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias,
self.config.mp_size, self.config.mlp_after_attn, add_bias, self.config.pre_layer_norm)
return residual
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class SoftmaxOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(SoftmaxOp, self).__init__(config)
if self.config.fp16:
......@@ -16,26 +20,9 @@ class SoftmaxOp(BaseOp):
def _not_implemented(self, *args, **kwargs):
raise NotImplementedError
def forward(self,
attn_scores: torch.Tensor,
attn_mask: torch.Tensor,
alibi: torch.Tensor,
triangular: bool,
recompute: bool,
local_attention: bool,
window_size: int,
async_op: bool,
layer_scale: float,
def forward(self, attn_scores: torch.Tensor, attn_mask: torch.Tensor, alibi: torch.Tensor, triangular: bool,
recompute: bool, local_attention: bool, window_size: int, async_op: bool, layer_scale: float,
head_offset: int):
output = self.softmax_func(attn_scores,
attn_mask,
alibi,
triangular,
recompute,
local_attention,
window_size,
async_op,
layer_scale,
head_offset,
self.config.mp_size)
output = self.softmax_func(attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size,
async_op, layer_scale, head_offset, self.config.mp_size)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from deepspeed import comm as dist
......@@ -7,6 +10,7 @@ from .base import BaseOp
class SoftmaxContextOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(SoftmaxContextOp, self).__init__(config)
if self.config.fp16:
......@@ -14,15 +18,8 @@ class SoftmaxContextOp(BaseOp):
else:
self.softmax_context_func = self.inference_cuda_module.softmax_context_fp32
def forward(self,
query_key_value: torch.Tensor,
attn_mask: torch.Tensor,
heads: int,
norm_factor: float,
no_masking: bool,
layer_id: int,
num_layers: int,
alibi: torch.Tensor):
def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, norm_factor: float,
no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor):
if alibi is not None:
batch_heads = query_key_value.shape[0] * heads
......@@ -31,18 +28,8 @@ class SoftmaxContextOp(BaseOp):
else:
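            # No alibi bias provided: pass a 1-element placeholder tensor to the kernel instead.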
alibi = torch.empty(1)
output = self.softmax_context_func(query_key_value,
attn_mask,
self.config.rotary_dim,
self.config.rotate_half,
self.config.rotate_every_two,
heads,
norm_factor,
self.config.triangular_masking,
self.config.local_attention,
self.config.window_size,
no_masking,
layer_id,
num_layers,
alibi)
output = self.softmax_context_func(query_key_value, attn_mask, self.config.rotary_dim, self.config.rotate_half,
self.config.rotate_every_two, heads, norm_factor,
self.config.triangular_masking, self.config.local_attention,
self.config.window_size, no_masking, layer_id, num_layers, alibi)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class VectorMatMulOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(VectorMatMulOp, self).__init__(config)
if self.config.fp16:
......@@ -14,7 +18,7 @@ class VectorMatMulOp(BaseOp):
self.vector_matmul_func = self.inference_cuda_module.vector_matmul_fp32
def forward(self, input: torch.Tensor, weight: torch.Tensor, async_op: bool = False):
q_scale = weight.scale
q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1)
q_int8 = self.config.q_int8
output = self.vector_matmul_func(input, weight, async_op, q_scale, q_int8)
output = self.vector_matmul_func(input, weight, async_op, q_scale, q_int8, self.config.transposed_mode)
return output
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import json
import math
import importlib
import torch
from torch import nn
from torch.autograd import Function
import time
from ... import op_builder
import torch.nn as nn
import torch.distributed as dist
# Cuda modules will be imported if needed
inference_cuda_module = None
class TransformerConfig():
def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers):
self.layer_id = -1
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.heads = heads
self.num_hidden_layers = num_hidden_layers
class DeepSpeedInferenceConfig(TransformerConfig):
"""Initialize the DeepSpeed Transformer Config.
Arguments:
hidden_size: The hidden size of the transformer layer
intermediate_size: The intermediate size of the feed-forward part of transformer layer
heads: The number of heads in the self-attention of the transformer layer
num_hidden_layers: The number of transformer layers
layer_norm_eps: The epsilon value for the layer norm
            local_rank: Optional: The rank of the GPU running the transformer kernel. It is not required
                if the model has already set the current device; otherwise it must be set so that
                the transformer kernel can work on the right device
mp_size (optional): This argument is mainly used to create the parameters on the kernel side
using model-parallel architecture. If the client model already takes care of this, there is no
need to pass this argument.
fp16: Enable half-precision computation
pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
            stochastic_mode: Enable for higher performance. Note that this flag introduces some
                non-determinism and can produce different results on different runs. However, we have seen
                that enabling it does not affect pretraining tasks such as BERT, which still reach
                a high accuracy level. For downstream tasks such as fine-tuning, we recommend turning
                it off in order to reproduce the same result through the regular kernel execution.
            scale_attention: If true, both q and k are scaled so that the attention scores carry the usual
                1/sqrt(attention head size) factor before the attention computation.
return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor
"""
def __init__(self,
hidden_size=-1,
intermediate_size=-1,
heads=-1,
num_hidden_layers=-1,
layer_norm_eps=1e-12,
local_rank=-1,
mp_size=1,
fp16=False,
q_int8=False,
pre_layer_norm=True,
stochastic_mode=False,
scale_attention=True,
triangular_masking=True,
local_attention=False,
window_size=256,
rotary_dim=-1,
rotate_half=False,
rotate_every_two=True,
return_tuple=True,
mlp_after_attn=True,
training_mp_size=1):
super(DeepSpeedInferenceConfig,
self).__init__(
hidden_size,
(intermediate_size if intermediate_size > 0 else 4 * hidden_size),
heads,
num_hidden_layers)
self.fp16 = fp16
self.pre_layer_norm = pre_layer_norm
self.local_rank = local_rank
self.stochastic_mode = stochastic_mode
self.epsilon = layer_norm_eps
self.mp_size = mp_size
self.q_int8 = q_int8
self.scale_attention = scale_attention
self.triangular_masking = triangular_masking
self.local_attention = local_attention
self.window_size = window_size
self.rotary_dim = rotary_dim
self.rotate_half = rotate_half
self.rotate_every_two = rotate_every_two
self.return_tuple = return_tuple
self.mlp_after_attn = mlp_after_attn
self.specialized_mode = False
self.training_mp_size = training_mp_size
@classmethod
def from_dict(cls, json_object):
config = DeepSpeedInferenceConfig()
for key, value in json_object.items():
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
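# Illustrative sketch only (not part of this change): the inference config can be built directly
# or from a plain dict via from_dict; the sizes below are made up.
def _example_inference_config():
    return DeepSpeedInferenceConfig.from_dict({
        'hidden_size': 1024,
        'intermediate_size': 4096,
        'heads': 16,
        'num_hidden_layers': 24,
        'fp16': True,
        'mp_size': 1,
    })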
class DeepSpeedSelfAttentionFunction(Function):
@staticmethod
def forward(ctx,
input,
input_mask,
head_mask,
layer_past,
get_present,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
norm_w,
norm_b,
config,
attn_qkvw,
attn_qkvb,
num_attention_heads_per_partition,
norm_factor,
hidden_size_per_partition,
attn_ow,
attn_ob,
mp_group,
q_scales,
q_groups,
merge_count,
qkv_merging):
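        # Reshapes (batch, seq, hidden) into (batch, heads, seq, head_size); keys are instead laid
        # out as (batch, heads, head_size, seq) so the attention scores reduce to a plain matmul.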
def _transpose_for_scores(x, key=False, reshape=False):
attention_head_size = x.shape[-1] // num_attention_heads_per_partition
new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition,
attention_head_size)
x_1 = x.view(*new_x_shape)
if key:
x_1 = x_1.permute(0, 2, 3, 1)
else:
x_1 = x_1.permute(0, 2, 1, 3)
if reshape:
return x_1.reshape(x.shape)
return x_1.contiguous()
def _transpose_for_context(x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_layer_shape = x.size()[:-2] + \
(hidden_size_per_partition,)
return x.view(*new_x_layer_shape).contiguous()
def compute_attention(qkv_out, input_mask):
score_context_func = inference_cuda_module.softmax_context_fp32 if (not config.fp16) else \
inference_cuda_module.softmax_context_fp16
if merge_count > 0 and config.q_int8:
split_dim = (qkv_out.dim() - 1)
qkv_split = torch.split(qkv_out,
(qkv_out.shape[-1] // (2**merge_count)),
dim=split_dim)
qkv_split = [
torch.split(s,
(s.shape[-1] // 3),
dim=split_dim) for s in qkv_split
]
(mixed_query,
key_layer,
value_layer) = [
torch.cat([s[i] for s in qkv_split],
axis=-1) for i in range(len(qkv_split[0]))
]
else:
(mixed_query,
key_layer,
value_layer) = torch.split(qkv_out,
(qkv_out.shape[-1] // 3),
dim=(qkv_out.dim() - 1))
no_masking = input_mask is None
if no_masking:
input_mask = torch.empty(1)
head_size = (mixed_query.shape[-1] // num_attention_heads_per_partition)
unfused_mode = not config.specialized_mode or \
mixed_query.shape[1] >= 32 or head_size > 128
if config.rotary_dim > 0:
mixed_query, key_layer = inference_cuda_module.apply_rotary_pos_emb(
mixed_query,
key_layer,
config.rotary_dim,
0 if layer_past is None else layer_past[0].shape[-2],
num_attention_heads_per_partition,
config.rotate_half,
config.rotate_every_two)
if layer_past is not None:
past_key, past_value = layer_past
if unfused_mode:
key_layer = torch.cat((past_key.type_as(key_layer),
key_layer),
dim=-2)
value_layer = torch.cat((past_value.type_as(value_layer),
value_layer),
dim=-2)
presents = (key_layer, value_layer)
if unfused_mode:
mixed_query = _transpose_for_scores(mixed_query, False, True)
key_layer = _transpose_for_scores(
key_layer,
True,
True) / (norm_factor if config.scale_attention else 1.0)
value_layer = _transpose_for_scores(value_layer, False, True)
#print(f'[{torch.distributed.get_rank()}] {config.layer_id}: {mixed_query.norm()}')
if layer_past is None:
attn_key_value = score_context_func(
mixed_query,
key_layer,
torch.empty(1),
input_mask,
value_layer,
torch.empty(1),
num_attention_heads_per_partition,
(1 / norm_factor if config.scale_attention else 1.0),
(not unfused_mode),
config.triangular_masking,
config.local_attention,
config.window_size,
no_masking)
else:
attn_key_value = score_context_func(
mixed_query,
(key_layer if unfused_mode else past_key.type_as(key_layer)),
key_layer,
input_mask,
(value_layer if unfused_mode else past_value.type_as(value_layer)),
value_layer,
num_attention_heads_per_partition,
(1 / norm_factor if config.scale_attention else 1.0),
(not unfused_mode),
config.triangular_masking,
config.local_attention,
config.window_size,
no_masking)
if unfused_mode:
context_layer, _, _ = attn_key_value
else:
context_layer, key_layer, value_layer = attn_key_value
# Transpose Context
context_layer = _transpose_for_context(context_layer)
            return context_layer, presents[0], presents[1]  # attn_output, key_layer, value_layer
def selfAttention_fp():
vector_matmul_func = inference_cuda_module.vector_matmul_fp16 if config.fp16 else \
inference_cuda_module.vector_matmul_fp32
if not config.pre_layer_norm:
linear_func = inference_cuda_module.linear_layer_fp16 if config.fp16 else \
inference_cuda_module.linear_layer_fp32
qkv_out = linear_func(input, attn_qkvw, attn_qkvb)
else:
qkv_func = inference_cuda_module.qkv_gemm_fp16 if config.fp16 else \
inference_cuda_module.qkv_gemm_fp32
qkv_out = qkv_func(input,
attn_qkvw,
(attn_qkvb if attn_qkvb is not None else norm_b),
norm_w,
norm_b,
config.epsilon,
(attn_qkvb is not None))
context_layer, key_layer, value_layer = compute_attention(qkv_out[0] if isinstance(qkv_out, list) else qkv_out, input_mask)
output = vector_matmul_func(context_layer, attn_ow, False)
#print(f'[{torch.distributed.get_rank()}] {config.layer_id}: oooooo -> {output.norm()}')
return output, key_layer, value_layer, context_layer, qkv_out[-1] # attn_out, present_key, present_value, context_output, inp_norm
def selfAttention_int8():
if not config.pre_layer_norm:
qkv_out = inference_cuda_module.linear_layer_int8(
input,
attn_qkvw,
attn_qkvb,
q_scales[0],
(q_groups * (3 if qkv_merging else 1) * (2**merge_count)))
else:
qkv_out = inference_cuda_module.qkv_gemm_int8(
input,
attn_qkvw,
attn_qkvb,
norm_w,
norm_b,
config.epsilon,
q_scales[0],
(q_groups * (3 if qkv_merging else 1) * (2**merge_count)),
(attn_qkvb is not None))
context_layer, key_layer, value_layer = compute_attention(qkv_out)
output = inference_cuda_module.vector_matmul_int8(context_layer,
attn_ow,
q_scales[1],
q_groups,
(merge_count))
return output, key_layer, value_layer, context_layer
if config.q_int8:
output, key_layer, value_layer, context_layer = selfAttention_int8()
else:
output, key_layer, value_layer, context_layer, inp_norm = selfAttention_fp()
if config.mlp_after_attn and mp_group is not None and dist.get_world_size(
group=mp_group) > 1:
dist.all_reduce(output, group=mp_group)
return (output, key_layer, value_layer, context_layer, inp_norm)
@staticmethod
def backward(ctx, grad_output, grad_output1, grad_output2, grad_output3):
raise RuntimeError('You are running with DeepSpeed Inference mode. \
Please switch to Training mode for running backward!')
class DeepSpeedSelfAttention(nn.Module):
num_layers = 0
def __init__(self,
config,
mp_group=None,
q_scales=None,
q_groups=1,
merge_count=1,
qkv_merging=False):
super(DeepSpeedSelfAttention, self).__init__()
self.config = config
self.config.layer_id = DeepSpeedSelfAttention.num_layers
DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1
self.attn_qkvw = nn.Parameter(
torch.Tensor(self.config.hidden_size,
(self.config.hidden_size // self.config.mp_size) * 3))
self.attn_qkvb = nn.Parameter(
torch.Tensor((self.config.hidden_size // self.config.mp_size) * 3))
self.attn_ow = nn.Parameter(
torch.Tensor(self.config.hidden_size // self.config.mp_size,
self.config.hidden_size))
self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size
self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size
self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads
self.mp_group = mp_group
# used for quantization
self.q_scales = q_scales
self.q_groups = q_groups
self.merge_count = int(math.log2(merge_count))
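        # Fourth root of the head size: since it is applied to both q and k, the attention scores
        # end up scaled by the usual 1/sqrt(head_size).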
self.norm_factor = math.sqrt(
math.sqrt(self.config.hidden_size // self.config.heads))
self.qkv_merging = qkv_merging
def forward(self,
input,
input_mask,
head_mask=None,
layer_past=None,
get_present=False,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=False,
norm_w=None,
norm_b=None):
output = DeepSpeedSelfAttentionFunction.apply(
input,
input_mask,
head_mask,
layer_past,
get_present,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
norm_w,
norm_b,
self.config,
self.attn_qkvw,
self.attn_qkvb,
self.num_attention_heads_per_partition,
self.norm_factor,
self.hidden_size_per_partition,
self.attn_ow,
self.attn_ob,
self.mp_group,
self.q_scales,
self.q_groups,
self.merge_count,
self.qkv_merging)
return output
class DeepSpeedMLPFunction(Function):
@staticmethod
def forward(ctx,
input,
residual,
residual_norm,
bias,
inter_w,
inter_b,
attn_nw,
attn_nb,
config,
mp_group,
output_b,
output_w,
q_scales,
q_groups,
merge_count,
mlp_gemm_func,
fused_gemm_gelu,
vector_matmul_func,
bias_residual_func):
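        # Three paths below: the int8 path uses the quantized mlp_gemm/vector_matmul kernels;
        # otherwise the fused GEMM+GeLU kernel is used when no attention layer-norm weights are
        # given, and the separate mlp_gemm + vector_matmul kernels are used in the remaining case.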
if config.q_int8:
(intermediate,
residual_add) = inference_cuda_module.mlp_gemm_int8(
input,
residual,
bias,
inter_w,
inter_b,
attn_nw,
attn_nb,
config.epsilon,
q_scales[2],
(q_groups * (2**merge_count)),
config.pre_layer_norm)
output = inference_cuda_module.vector_matmul_int8(intermediate,
output_w,
q_scales[3],
q_groups,
(merge_count))
else:
if attn_nw is None:
output = fused_gemm_gelu(residual_norm,
inter_w,
inter_b,
output_w,
config.epsilon,
config.pre_layer_norm,
False)
else:
intermediate = mlp_gemm_func(input,
residual,
bias,
inter_w,
inter_b,
attn_nw,
attn_nb,
config.epsilon,
config.pre_layer_norm,
config.mlp_after_attn)
output = vector_matmul_func(intermediate, output_w, False)
inference_cuda_module.residual_add(output,
residual,
input,
output_b,
bias,
config.mp_size,
config.mlp_after_attn)
if mp_group is not None and dist.get_world_size(group=mp_group) > 1:
dist.all_reduce(output, group=mp_group)
return output
@staticmethod
def backward(ctx, grad_output):
raise RuntimeError('You are running with DeepSpeed Inference mode. \
Please switch to Training mode for running backward!')
class DeepSpeedMLP(nn.Module):
def __init__(self,
config,
mp_group=None,
q_scales=None,
q_groups=1,
merge_count=1,
mlp_extra_grouping=False):
super(DeepSpeedMLP, self).__init__()
self.config = config
self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.inter_w = nn.Parameter(
torch.Tensor(self.config.hidden_size,
self.config.intermediate_size // self.config.mp_size))
self.inter_b = nn.Parameter(
torch.Tensor(self.config.intermediate_size // self.config.mp_size))
self.output_w = nn.Parameter(
torch.Tensor((self.config.intermediate_size // self.config.mp_size),
self.config.hidden_size))
self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
# used for quantization
self.q_scales = q_scales
self.q_groups = q_groups * 2 if mlp_extra_grouping else q_groups
self.merge_count = int(math.log2(merge_count))
self.mp_group = mp_group
self.mlp_gemm_func = inference_cuda_module.mlp_gemm_fp16 if config.fp16 else \
inference_cuda_module.mlp_gemm_fp32
self.vector_matmul_func = inference_cuda_module.vector_matmul_fp16 if config.fp16 else \
inference_cuda_module.vector_matmul_fp32
self.fused_gemm_gelu = inference_cuda_module.fused_gemm_gelu_fp16 if config.fp16 else \
inference_cuda_module.fused_gemm_gelu_fp32
self.bias_residual_func = inference_cuda_module.bias_residual_fp16 if config.fp16 or config.q_int8 else \
inference_cuda_module.bias_residual_fp32
def forward(self, input, residual, residual_norm, bias):
return DeepSpeedMLPFunction.apply(input,
residual,
residual_norm,
bias,
self.inter_w,
self.inter_b,
self.attn_nw,
self.attn_nb,
self.config,
self.mp_group,
self.output_b,
self.output_w,
self.q_scales,
self.q_groups,
self.merge_count,
self.mlp_gemm_func,
self.fused_gemm_gelu,
self.vector_matmul_func,
self.bias_residual_func)
class DeepSpeedTransformerInference(nn.Module):
"""Initialize the DeepSpeed Transformer Layer.
Arguments:
            layer_id: The layer index starting from 0; e.g., if the model has 24 transformer layers,
                layer_id will be 0, 1, 2, ..., 23 as each layer object is instantiated
config: An object of DeepSpeedInferenceConfig
mp_group: Model parallelism group initialized on the modeling side.
quantize_scales: This argument groups all the layers' scales used for quantization
quantize_groups: Number of groups used for quantizing the model
            merge_count: The number of model-parallel checkpoints merged before running inference.
                We use this argument to control the quantization scale of the model parameters when a
                quantization grouping larger than 1 is used.
            mlp_extra_grouping: When enabled, twice as many quantization groups are used for the MLP part
                of a transformer layer. We use this feature to reduce the convergence impact of
                quantization for specific downstream tasks.
"""
layer_id = 0
def __init__(self,
config,
mp_group=None,
quantize_scales=None,
quantize_groups=1,
merge_count=1,
mlp_extra_grouping=False,
qkv_merging=False):
super(DeepSpeedTransformerInference, self).__init__()
self.config = config
self.config.layer_id = DeepSpeedTransformerInference.layer_id
DeepSpeedTransformerInference.layer_id += 1
global inference_cuda_module
if inference_cuda_module is None:
builder = op_builder.InferenceBuilder()
inference_cuda_module = builder.load()
print("DeepSpeed Transformer Inference config is ", self.config.__dict__)
self.attention = DeepSpeedSelfAttention(self.config,
mp_group,
quantize_scales,
quantize_groups,
merge_count,
qkv_merging)
self.mlp = DeepSpeedMLP(self.config,
mp_group,
quantize_scales,
quantize_groups,
merge_count,
mlp_extra_grouping)
self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.layer_past = None
def forward(self,
input,
input_mask=None,
attention_mask=None,
head_mask=None,
layer_past=None,
get_key_value=False,
get_present=False,
encoder_output=None,
enc_dec_attn_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
use_cache=False,
output_attentions=False):
get_present = (get_present or get_key_value or use_cache)
input_mask = input_mask if attention_mask is None else attention_mask
layer_past = layer_past if layer_past is not None else self.layer_past
attn_mask = None
if isinstance(input, tuple):
attn_mask = input[1]
input = input[0]
input_type = input.dtype
if (self.config.fp16 or self.config.q_int8) \
and input.dtype == torch.float:
input = input.half()
with torch.no_grad():
attention_output, key, value, context_outputtn_ctx, inp_norm = \
self.attention(input,
input_mask,
head_mask,
layer_past,
get_present,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
self.norm_w,
self.norm_b)
presents = (key, value)
self.layer_past = presents
output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob)
if not self.config.pre_layer_norm:
ds_layernorm = inference_cuda_module.layer_norm_fp16 if self.config.fp16 or self.config.q_int8 else \
inference_cuda_module.layer_norm_fp32
output = ds_layernorm(output,
self.norm_w,
self.norm_b,
self.config.epsilon)
output = output.to(input_type)
#print(f'[{torch.distributed.get_rank()}] {self.config.layer_id}: {output.norm()}')
#exit()
if get_present:
output = (output, presents)
if self.config.return_tuple:
return output if type(output) is tuple else (output, attn_mask)
else:
return output
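# Usage sketch (illustrative only, not part of this change): driving a single inference layer end
# to end; the sizes are hypothetical and the weights are left uninitialized here, whereas they
# would normally be loaded from a checkpoint before use.
def _example_layer_forward():
    config = DeepSpeedInferenceConfig(hidden_size=1024, heads=16, num_hidden_layers=24, fp16=True)
    layer = DeepSpeedTransformerInference(config).half().cuda()
    hidden_states = torch.randn(1, 128, 1024, dtype=torch.float16, device='cuda')
    output, _ = layer(hidden_states)  # return_tuple=True yields (output, attn_mask)
    return output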
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Inspired by original Triton implementation:
https://github.com/openai/triton/blob/b244db06da24a87453a40ad35b085ee37dac3705/python/tutorials/06-fused-attention.py
......@@ -102,6 +105,7 @@ def _fwd_kernel(
class triton_flash_attn(torch.nn.Module):
def __init__(self, ):
super(triton_flash_attn, self).__init__()
......@@ -111,10 +115,7 @@ class triton_flash_attn(torch.nn.Module):
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
o = torch.empty_like(q)
grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1])
tmp = torch.empty((q.shape[0] * q.shape[1],
q.shape[2]),
device=q.device,
dtype=torch.float32)
tmp = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
num_warps = 4 if Lk <= 64 else 8
_fwd_kernel[grid](
......
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import json
import math
import torch
......@@ -15,15 +17,9 @@ stochastic_transformer_cuda_module = None
class TransformerConfig():
def __init__(self,
batch_size,
hidden_size,
intermediate_size,
heads,
attn_dropout_ratio,
hidden_dropout_ratio,
num_hidden_layers,
initializer_range):
def __init__(self, batch_size, hidden_size, intermediate_size, heads, attn_dropout_ratio, hidden_dropout_ratio,
num_hidden_layers, initializer_range):
self.layer_id = -1
self.batch_size = batch_size
self.hidden_size = hidden_size
......@@ -89,6 +85,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
training: Enable for training rather than inference.
"""
def __init__(self,
batch_size=-1,
hidden_size=-1,
......@@ -111,15 +108,9 @@ class DeepSpeedTransformerConfig(TransformerConfig):
return_tuple=False,
training=True):
super(DeepSpeedTransformerConfig,
self).__init__(
batch_size,
hidden_size,
(intermediate_size if intermediate_size > 0 else 4 * hidden_size),
heads,
attn_dropout_ratio,
hidden_dropout_ratio,
num_hidden_layers,
initializer_range)
self).__init__(batch_size, hidden_size,
(intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads,
attn_dropout_ratio, hidden_dropout_ratio, num_hidden_layers, initializer_range)
self.fp16 = fp16
self.pre_layer_norm = pre_layer_norm
self.local_rank = local_rank
......@@ -150,97 +141,42 @@ class DeepSpeedTransformerConfig(TransformerConfig):
class DeepSpeedTransformerFunction(Function):
@staticmethod
def forward(ctx,
input,
input_mask,
self,
grads,
layer_id,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b,
config):
def forward(ctx, input, input_mask, self, grads, layer_id, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw,
attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b, config):
cuda_module = stochastic_transformer_cuda_module if config.stochastic_mode else transformer_cuda_module
forward_func = cuda_module.forward_fp16 if config.fp16 else cuda_module.forward_fp32
inp_size = input.size()
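        # Pad the sequence length up to a multiple of 16: the padded input positions are filled
        # with random values and masked out with -10000 in the attention mask.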
if inp_size[1] % 16 != 0:
input = torch.cat((input,
torch.randn((inp_size[0],
(16 - (inp_size[1] % 16)),
inp_size[2]),
device=input.device,
dtype=input.dtype)),
1)
input = torch.cat(
(input,
torch.randn(
(inp_size[0], (16 - (inp_size[1] % 16)), inp_size[2]), device=input.device, dtype=input.dtype)),
1)
input_mask = torch.cat((input_mask, torch.ones((inp_size[0], input_mask.shape[1], input_mask.shape[2], \
(16 - (inp_size[1] % 16))), device=input_mask.device, dtype=input_mask.dtype) * -10000), 3)
(output,
inp_norm,
qkv_tf,
soft_inp,
ctx_bufB,
attn_o_inp,
add_res,
ff1_inp,
gelu_inp,
ff2_inp,
attn_prob_dropout_mask,
attn_output_dropout_mask,
layer_output_dropout_mask,
attn_layer_norm_var,
attn_layer_norm_mean,
layer_norm_var,
layer_norm_mean) = forward_func(config.layer_id,
input,
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b,
config.training and config.is_grad_enabled,
config.pre_layer_norm,
config.attn_dropout_checkpoint,
config.normalize_invertible,
config.gelu_checkpoint)
(output, inp_norm, qkv_tf, soft_inp, ctx_bufB, attn_o_inp, add_res, ff1_inp, gelu_inp, ff2_inp,
attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, attn_layer_norm_var,
attn_layer_norm_mean, layer_norm_var, layer_norm_mean) = forward_func(
config.layer_id, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w,
inter_b, output_w, output_b, norm_w, norm_b, config.training and config.is_grad_enabled,
config.pre_layer_norm, config.attn_dropout_checkpoint, config.normalize_invertible,
config.gelu_checkpoint)
# For testing only.
if grads is not None:
for i in [2]:
attn_qkvw.register_hook(
lambda x,
i=i,
self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)],
("Q_W" if i == 0 else "K_W" if i == 1 else "V_W")
]))
attn_qkvw.register_hook(lambda x, i=i, self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_W" if i == 0 else "K_W" if i == 1 else "V_W")
]))
for i in [2]:
attn_qkvb.register_hook(
lambda x,
i=i,
self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)],
("Q_B" if i == 0 else "K_B" if i == 1 else "V_B")
]))
attn_qkvb.register_hook(lambda x, i=i, self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_B" if i == 0 else "K_B" if i == 1 else "V_B")
]))
attn_ow.register_hook(lambda x, self=self: grads.append([x, "O_W"]))
attn_ob.register_hook(lambda x, self=self: grads.append([x, "O_B"]))
......@@ -255,35 +191,11 @@ class DeepSpeedTransformerFunction(Function):
if config.is_grad_enabled and config.training:
if (config.pre_layer_norm and config.normalize_invertible):
ctx.save_for_backward(input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b)
ctx.save_for_backward(input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w,
inter_b, output_w, output_b, norm_w, norm_b)
else:
ctx.save_for_backward(output,
input,
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b)
ctx.save_for_backward(output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw,
attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b)
ctx.config = config
if (config.pre_layer_norm or not config.normalize_invertible):
......@@ -331,88 +243,28 @@ class DeepSpeedTransformerFunction(Function):
assert ctx.config.training
if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible):
(input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b) = ctx.saved_tensors
(input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w,
output_b, norm_w, norm_b) = ctx.saved_tensors
else:
(output,
input,
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b) = ctx.saved_tensors
(output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b,
output_w, output_b, norm_w, norm_b) = ctx.saved_tensors
cuda_module = stochastic_transformer_cuda_module if ctx.config.stochastic_mode else transformer_cuda_module
backward_func = cuda_module.backward_fp16 if ctx.config.fp16 else cuda_module.backward_fp32
(grad_input,
grad_attn_qkvw,
grad_attn_qkvb,
grad_attn_ow,
grad_attn_ob,
grad_attn_nw,
grad_attn_nb,
grad_inter_w,
grad_inter_b,
grad_output_w,
grad_output_b,
grad_norm_w,
grad_norm_b) = backward_func(
ctx.config.layer_id,
grad_output,
(ctx.inp_norm if (ctx.config.pre_layer_norm
and ctx.config.normalize_invertible) else output),
(ctx.inp_norm if (ctx.config.pre_layer_norm
or not ctx.config.normalize_invertible) else input),
ctx.qkv_tf,
ctx.soft_inp,
(ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB),
ctx.attn_o_inp,
(ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res),
ctx.ff1_inp,
(ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp),
ctx.ff2_inp,
ctx.attn_prob_dropout_mask,
ctx.attn_output_dropout_mask,
ctx.layer_output_dropout_mask,
ctx.attn_layer_norm_var,
ctx.attn_layer_norm_mean,
ctx.layer_norm_var,
ctx.layer_norm_mean,
(ctx.inp_norm if (ctx.config.pre_layer_norm
and ctx.config.normalize_invertible) else input),
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b)
(grad_input, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob, grad_attn_nw, grad_attn_nb,
grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w, grad_norm_b) = backward_func(
ctx.config.layer_id, grad_output,
(ctx.inp_norm if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else output),
(ctx.inp_norm if (ctx.config.pre_layer_norm or not ctx.config.normalize_invertible) else input),
ctx.qkv_tf, ctx.soft_inp, (ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB),
ctx.attn_o_inp, (ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res), ctx.ff1_inp,
(ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp), ctx.ff2_inp, ctx.attn_prob_dropout_mask,
ctx.attn_output_dropout_mask, ctx.layer_output_dropout_mask, ctx.attn_layer_norm_var,
ctx.attn_layer_norm_mean, ctx.layer_norm_var, ctx.layer_norm_mean,
(ctx.inp_norm if
(ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else input), input_mask, attn_qkvw,
attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b)
# This appears to be an effective way to release context memory
ctx.qkv_tf = None
......@@ -436,24 +288,9 @@ class DeepSpeedTransformerFunction(Function):
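        # Remove the gradient positions that correspond to the sequence padding added in forward.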
if grad_output_shape[1] % 16 != 0:
grad_input = torch.narrow(grad_input, 1, 0, grad_output_shape[1])
return (grad_input,
None,
None,
None,
None,
grad_attn_qkvw,
grad_attn_qkvb,
grad_attn_ow,
grad_attn_ob,
grad_attn_nw,
grad_attn_nb,
grad_inter_w,
grad_inter_b,
grad_output_w,
grad_output_b,
grad_norm_w,
grad_norm_b,
None)
return (grad_input, None, None, None, None, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob,
grad_attn_nw, grad_attn_nb, grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w,
grad_norm_b, None)
class DeepSpeedTransformerLayer(nn.Module):
......@@ -484,23 +321,15 @@ class DeepSpeedTransformerLayer(nn.Module):
get_accelerator().set_device(self.config.local_rank)
if initial_weights is None and initial_biases is None:
self.attn_qkvw = nn.Parameter(
torch.Tensor(self.config.hidden_size * 3,
self.config.hidden_size))
self.attn_qkvw = nn.Parameter(torch.Tensor(self.config.hidden_size * 3, self.config.hidden_size))
self.attn_qkvb = nn.Parameter(torch.Tensor(self.config.hidden_size * 3))
self.attn_ow = nn.Parameter(
torch.Tensor(self.config.hidden_size,
self.config.hidden_size))
self.attn_ow = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.hidden_size))
self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.inter_w = nn.Parameter(
torch.Tensor(self.config.intermediate_size,
self.config.hidden_size))
self.inter_w = nn.Parameter(torch.Tensor(self.config.intermediate_size, self.config.hidden_size))
self.inter_b = nn.Parameter(torch.Tensor(self.config.intermediate_size))
self.output_w = nn.Parameter(
torch.Tensor(self.config.hidden_size,
self.config.intermediate_size))
self.output_w = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.intermediate_size))
self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
......@@ -539,21 +368,11 @@ class DeepSpeedTransformerLayer(nn.Module):
cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module
create_layer_func = cuda_module.create_transformer_layer_fp16 if self.config.fp16 else cuda_module.create_transformer_layer_fp32
create_layer_func(self.config.layer_id,
self.config.batch_size,
self.config.hidden_size,
self.config.heads,
self.config.intermediate_size,
self.config.attn_dropout_ratio,
self.config.hidden_dropout_ratio,
self.config.layer_norm_eps,
self.config.seed,
self.config.pre_layer_norm,
self.config.test_gemm,
self.config.attn_dropout_checkpoint,
self.config.normalize_invertible,
self.config.gelu_checkpoint,
self.config.stochastic_mode)
create_layer_func(self.config.layer_id, self.config.batch_size, self.config.hidden_size, self.config.heads,
self.config.intermediate_size, self.config.attn_dropout_ratio,
self.config.hidden_dropout_ratio, self.config.layer_norm_eps, self.config.seed,
self.config.pre_layer_norm, self.config.test_gemm, self.config.attn_dropout_checkpoint,
self.config.normalize_invertible, self.config.gelu_checkpoint, self.config.stochastic_mode)
def init_transformer_weights(self, adjust_init_range=False):
num_layers = self.config.num_hidden_layers
......@@ -587,21 +406,7 @@ class DeepSpeedTransformerLayer(nn.Module):
grads=None):
self.config.is_grad_enabled = torch.is_grad_enabled()
self.config.training = self.training
return DeepSpeedTransformerFunction.apply(hidden_states,
attention_mask,
self,
grads,
self.config.layer_id,
self.attn_qkvw,
self.attn_qkvb,
self.attn_ow,
self.attn_ob,
self.attn_nw,
self.attn_nb,
self.inter_w,
self.inter_b,
self.output_w,
self.output_b,
self.norm_w,
self.norm_b,
self.config)
return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, grads, self.config.layer_id,
self.attn_qkvw, self.attn_qkvb, self.attn_ow, self.attn_ob,
self.attn_nw, self.attn_nb, self.inter_w, self.inter_b,
self.output_w, self.output_b, self.norm_w, self.norm_b, self.config)
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
from deepspeed.profiling.constants import *
class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject):
def __init__(self, param_dict):
super(DeepSpeedFlopsProfilerConfig, self).__init__()
......@@ -25,26 +25,18 @@ class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject):
self._initialize(flops_profiler_dict)
def _initialize(self, flops_profiler_dict):
self.enabled = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_ENABLED,
FLOPS_PROFILER_ENABLED_DEFAULT)
self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT)
self.profile_step = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_PROFILE_STEP,
self.profile_step = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_PROFILE_STEP,
FLOPS_PROFILER_PROFILE_STEP_DEFAULT)
self.module_depth = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_MODULE_DEPTH,
self.module_depth = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_MODULE_DEPTH,
FLOPS_PROFILER_MODULE_DEPTH_DEFAULT)
self.top_modules = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_TOP_MODULES,
self.top_modules = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_TOP_MODULES,
FLOPS_PROFILER_TOP_MODULES_DEFAULT)
self.detailed = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_DETAILED,
FLOPS_PROFILER_DETAILED_DEFAULT)
self.detailed = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_DETAILED, FLOPS_PROFILER_DETAILED_DEFAULT)
self.output_file = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_OUTPUT_FILE,
self.output_file = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_OUTPUT_FILE,
FLOPS_PROFILER_OUTPUT_FILE_DEFAULT)
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
#########################################
# flops profiler
......
......@@ -309,8 +309,9 @@ The following example shows how to profile AlexNet using the DeepSpeed flops pro
import torchvision.models as models
import torch
from deepspeed.profiling.flops_profiler import get_model_profile
from deepspeed.accelerator import get_accelerator
with torch.cuda.device(0):
with get_accelerator().device(0):
model = models.alexnet()
batch_size = 256
flops, macs, params = get_model_profile(model=model, # model
......@@ -334,6 +335,7 @@ from functools import partial
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from deepspeed.profiling.flops_profiler import get_model_profile
from deepspeed.accelerator import get_accelerator
def bert_input_constructor(batch_size, seq_len, tokenizer):
......@@ -350,7 +352,7 @@ def bert_input_constructor(batch_size, seq_len, tokenizer):
return inputs
with torch.cuda.device(0):
with get_accelerator().device(0):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
batch_size = 4
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from .profiler import *
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import time
import torch
......@@ -53,6 +56,7 @@ class FlopsProfiler(object):
Args:
object (torch.nn.Module): The PyTorch model to profile.
"""
def __init__(self, model, ds_engine=None):
self.model = model
self.ds_engine = ds_engine
......@@ -78,8 +82,7 @@ class FlopsProfiler(object):
# if computing the flops of a module directly
if type(module) in MODULE_HOOK_MAPPING:
if not hasattr(module, "__flops_handle__"):
module.__flops_handle__ = module.register_forward_hook(
MODULE_HOOK_MAPPING[type(module)])
module.__flops_handle__ = module.register_forward_hook(MODULE_HOOK_MAPPING[type(module)])
return
# if computing the flops of the functionals in a module
......@@ -105,16 +108,14 @@ class FlopsProfiler(object):
module.__start_time__ = time.time()
if not hasattr(module, "__start_time_hook_handle"):
module.__start_time_hook_handle__ = module.register_forward_pre_hook(
start_time_hook)
module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook)
def end_time_hook(module, input, output):
get_accelerator().synchronize()
module.__duration__ += time.time() - module.__start_time__
if not hasattr(module, "__end_time_hook_handle__"):
module.__end_time_hook_handle__ = module.register_forward_hook(
end_time_hook)
module.__end_time_hook_handle__ = module.register_forward_hook(end_time_hook)
self.model.apply(partial(register_module_hooks, ignore_list=ignore_list))
self.started = True
......@@ -154,6 +155,7 @@ class FlopsProfiler(object):
Adds or resets the extra attributes.
"""
def add_or_reset_attrs(module):
module.__flops__ = 0
module.__macs__ = 0
......@@ -232,15 +234,9 @@ class FlopsProfiler(object):
Returns:
The number of parameters in the model.
"""
return params_to_string(
self.model.__params__) if as_string else self.model.__params__
def print_model_profile(self,
profile_step=1,
module_depth=-1,
top_modules=1,
detailed=True,
output_file=None):
return params_to_string(self.model.__params__) if as_string else self.model.__params__
def print_model_profile(self, profile_step=1, module_depth=-1, top_modules=1, detailed=True, output_file=None):
"""Prints the model graph with the measured profile attached to each module.
Args:
......@@ -273,28 +269,21 @@ class FlopsProfiler(object):
self.macs = total_macs
self.params = total_params
print(
"\n-------------------------- DeepSpeed Flops Profiler --------------------------"
)
print("\n-------------------------- DeepSpeed Flops Profiler --------------------------")
print(f'Profile Summary at step {profile_step}:')
print(
"Notations:\ndata parallel size (dp_size), model parallel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\nnumber of floating-point operations (flops), floating-point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n"
)
if self.ds_engine:
print('{:<60} {:<8}'.format('world size: ', self.ds_engine.world_size))
print('{:<60} {:<8}'.format('data parallel size: ',
self.ds_engine.dp_world_size))
print('{:<60} {:<8}'.format('model parallel size: ',
self.ds_engine.mp_world_size))
print('{:<60} {:<8}'.format(
'batch size per GPU: ',
self.ds_engine.train_micro_batch_size_per_gpu()))
print('{:<60} {:<8}'.format('data parallel size: ', self.ds_engine.dp_world_size))
print('{:<60} {:<8}'.format('model parallel size: ', self.ds_engine.mp_world_size))
print('{:<60} {:<8}'.format('batch size per GPU: ', self.ds_engine.train_micro_batch_size_per_gpu()))
print('{:<60} {:<8}'.format('params per gpu: ', params_to_string(total_params)))
print('{:<60} {:<8}'.format(
'params of model = params per GPU * mp_size: ',
params_to_string(total_params *
((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
params_to_string(total_params * ((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
print('{:<60} {:<8}'.format('fwd MACs per GPU: ', macs_to_string(total_macs)))
......@@ -302,43 +291,33 @@ class FlopsProfiler(object):
print('{:<60} {:<8}'.format(
'fwd flops of model = fwd flops per GPU * mp_size: ',
num_to_string(total_flops *
((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
num_to_string(total_flops * ((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
fwd_latency = self.get_total_duration()
if self.ds_engine and self.ds_engine.wall_clock_breakdown():
fwd_latency = self.ds_engine.timers('forward').elapsed(False) / 1000.0
print('{:<60} {:<8}'.format('fwd latency: ', duration_to_string(fwd_latency)))
print('{:<60} {:<8}'.format(
'fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ',
flops_to_string(total_flops / fwd_latency)))
print('{:<60} {:<8}'.format('fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ',
flops_to_string(total_flops / fwd_latency)))
if self.ds_engine and self.ds_engine.wall_clock_breakdown():
bwd_latency = self.ds_engine.timers('backward').elapsed(False) / 1000.0
step_latency = self.ds_engine.timers('step').elapsed(False) / 1000.0
print('{:<60} {:<8}'.format('bwd latency: ',
duration_to_string(bwd_latency)))
print('{:<60} {:<8}'.format(
'bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ',
flops_to_string(2 * total_flops / bwd_latency)))
print('{:<60} {:<8}'.format(
'fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ',
flops_to_string(3 * total_flops / (fwd_latency + bwd_latency))))
print('{:<60} {:<8}'.format('step latency: ',
duration_to_string(step_latency)))
print('{:<60} {:<8}'.format('bwd latency: ', duration_to_string(bwd_latency)))
print('{:<60} {:<8}'.format('bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ',
flops_to_string(2 * total_flops / bwd_latency)))
print('{:<60} {:<8}'.format('fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ',
flops_to_string(3 * total_flops / (fwd_latency + bwd_latency))))
print('{:<60} {:<8}'.format('step latency: ', duration_to_string(step_latency)))
iter_latency = fwd_latency + bwd_latency + step_latency
print('{:<60} {:<8}'.format('iter latency: ',
duration_to_string(iter_latency)))
print('{:<60} {:<8}'.format(
'FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ',
flops_to_string(3 * total_flops / iter_latency)))
print('{:<60} {:<8}'.format('iter latency: ', duration_to_string(iter_latency)))
print('{:<60} {:<8}'.format('FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ',
flops_to_string(3 * total_flops / iter_latency)))
samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu(
) * self.ds_engine.world_size
print('{:<60} {:<8.2f}'.format('samples/second: ',
samples_per_iter / iter_latency))
samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu() * self.ds_engine.world_size
print('{:<60} {:<8.2f}'.format('samples/second: ', samples_per_iter / iter_latency))
def flops_repr(module):
params = module.__params__
......@@ -353,9 +332,7 @@ class FlopsProfiler(object):
duration = get_module_duration(module)
items.append(duration_to_string(duration))
items.append(
"{:.2%} latency".format(0.0 if total_duration == 0 else duration /
total_duration))
items.append("{:.2%} latency".format(0.0 if total_duration == 0 else duration / total_duration))
items.append(flops_to_string(0.0 if duration == 0 else flops / duration))
items.append(module.original_extra_repr())
return ", ".join(items)
......@@ -374,16 +351,11 @@ class FlopsProfiler(object):
self.model.apply(add_extra_repr)
print(
"\n----------------------------- Aggregated Profile per GPU -----------------------------"
)
self.print_model_aggregated_profile(module_depth=module_depth,
top_modules=top_modules)
print("\n----------------------------- Aggregated Profile per GPU -----------------------------")
self.print_model_aggregated_profile(module_depth=module_depth, top_modules=top_modules)
if detailed:
print(
"\n------------------------------ Detailed Profile per GPU ------------------------------"
)
print("\n------------------------------ Detailed Profile per GPU ------------------------------")
print(
"Each module profile is listed after its name in the following order: \nparams, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS"
)
......@@ -394,9 +366,7 @@ class FlopsProfiler(object):
self.model.apply(del_extra_repr)
print(
"------------------------------------------------------------------------------"
)
print("------------------------------------------------------------------------------")
if output_file:
sys.stdout = original_stdout
......@@ -411,9 +381,7 @@ class FlopsProfiler(object):
"""
info = {}
if not hasattr(self.model, "__flops__"):
print(
"no __flops__ attribute in the model, call this function after start_profile and before end_profile"
)
print("no __flops__ attribute in the model, call this function after start_profile and before end_profile")
return
def walk_module(module, curr_depth, info):
......@@ -439,33 +407,22 @@ class FlopsProfiler(object):
if module_depth == -1:
depth = len(info) - 1
print(
f'Top {top_modules} modules in terms of params, MACs or fwd latency at different model depths:'
)
print(f'Top {top_modules} modules in terms of params, MACs or fwd latency at different model depths:')
for d in range(depth):
num_items = min(top_modules, len(info[d]))
sort_macs = {
k: macs_to_string(v[0])
for k,
v in sorted(info[d].items(),
key=lambda item: item[1][0],
reverse=True)[:num_items]
for k, v in sorted(info[d].items(), key=lambda item: item[1][0], reverse=True)[:num_items]
}
sort_params = {
k: params_to_string(v[1])
for k,
v in sorted(info[d].items(),
key=lambda item: item[1][1],
reverse=True)[:num_items]
for k, v in sorted(info[d].items(), key=lambda item: item[1][1], reverse=True)[:num_items]
}
sort_time = {
k: duration_to_string(v[2])
for k,
v in sorted(info[d].items(),
key=lambda item: item[1][2],
reverse=True)[:num_items]
for k, v in sorted(info[d].items(), key=lambda item: item[1][2], reverse=True)[:num_items]
}
print(f"depth {d}:")
......@@ -499,9 +456,7 @@ def _elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool = False)
return input.numel(), 0
def _leaky_relu_flops_compute(input: Tensor,
negative_slope: float = 0.01,
inplace: bool = False):
def _leaky_relu_flops_compute(input: Tensor, negative_slope: float = 0.01, inplace: bool = False):
return input.numel(), 0
......@@ -529,13 +484,7 @@ def _pool_flops_compute(input,
return input.numel(), 0
def _conv_flops_compute(input,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1):
def _conv_flops_compute(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
assert weight.shape[1] * groups == input.shape[1]
batch_size = input.shape[0]
......@@ -552,8 +501,8 @@ def _conv_flops_compute(input,
output_dims = []
for idx, input_dim in enumerate(input_dims):
output_dim = (input_dim + 2 * paddings[idx] -
(dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] *
(kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dims.append(output_dim)
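        # e.g. input_dim=224, padding=3, dilation=1, kernel=7, stride=2 -> (224 + 2*3 - 7) // 2 + 1 = 112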
filters_per_channel = out_channels // groups
......@@ -594,8 +543,8 @@ def _conv_trans_flops_compute(
output_dims = []
for idx, input_dim in enumerate(input_dims):
output_dim = (input_dim + 2 * paddings[idx] -
(dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] *
(kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dims.append(output_dim)
paddings = padding if type(padding) is tuple else (padding, padding)
......@@ -1210,8 +1159,7 @@ def get_model_profile(
assert len(input_shape) >= 1, "input_shape must have at least one element"
try:
input = torch.ones(()).new_empty(
(*input_shape,
),
(*input_shape, ),
dtype=next(model.parameters()).dtype,
device=next(model.parameters()).device,
)
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
class DeepSpeedOptimizer(object):
......
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
'''Copyright The Microsoft DeepSpeed Team'''