Commit c25a91b6 authored by aiss

Merge branch 'ds-v0.9.2-rocm' into 'main'

Ds v0.9.2 rocm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class GELUGemmOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(GELUGemmOp, self).__init__(config)
if self.config.fp16:
......@@ -19,14 +23,9 @@ class GELUGemmOp(BaseOp):
bias: torch.Tensor,
weight_out: torch.Tensor,
async_op: bool = False):
output = self.fused_gemm_gelu(input,
weight,
weight.scale,
bias,
weight_out,
weight_out.scale,
self.config.epsilon,
self.config.pre_layer_norm,
self.config.q_int8,
async_op)
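        # Note: weights that are not quantized carry no 'scale' attribute; the call below passes a
        # 1-element placeholder tensor to the kernel in that case.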
output = self.fused_gemm_gelu(input, weight, weight.scale if hasattr(weight, 'scale') else torch.empty(1),
bias, weight_out,
weight_out.scale if hasattr(weight_out, 'scale') else torch.empty(1),
self.config.epsilon, self.config.pre_layer_norm, self.config.q_int8, async_op,
self.config.transposed_mode)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class LinearOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(LinearOp, self).__init__(config)
if self.config.fp16:
......@@ -22,10 +26,6 @@ class LinearOp(BaseOp):
num_heads: int,
external_cache: bool = None,
num_layers: int = None):
qkv_out = self.linear_func(input,
weight,
bias,
add_bias,
do_flash_attn,
num_heads)
qkv_out = self.linear_func(input, weight, bias, add_bias, do_flash_attn, num_heads,
self.config.transposed_mode)
return qkv_out
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class MLPGemmOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(MLPGemmOp, self).__init__(config)
if self.config.fp16:
......@@ -13,29 +17,13 @@ class MLPGemmOp(BaseOp):
else:
self.mlp_gemm_func = self.inference_cuda_module.mlp_gemm_fp32
def forward(self,
input: torch.Tensor,
residual: torch.Tensor,
input_bias: torch.Tensor,
weight_interm: torch.Tensor,
weight_out: torch.Tensor,
bias: torch.Tensor,
gamma: torch.Tensor,
def forward(self, input: torch.Tensor, residual: torch.Tensor, input_bias: torch.Tensor,
weight_interm: torch.Tensor, weight_out: torch.Tensor, bias: torch.Tensor, gamma: torch.Tensor,
beta: torch.Tensor):
output, residual_add = self.mlp_gemm_func(
input,
residual,
input_bias,
weight_interm,
weight_out,
bias,
gamma,
beta,
self.config.epsilon,
self.config.pre_layer_norm,
self.config.mlp_after_attn,
weight_interm.scale,
weight_out.scale,
self.config.q_int8,
self.config.mlp_act_func_type)
input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, self.config.epsilon,
self.config.pre_layer_norm, self.config.mlp_after_attn,
weight_interm.scale if hasattr(weight_interm, 'scale') else torch.empty(1),
weight_out.scale if hasattr(weight_out, 'scale') else torch.empty(1), self.config.q_int8,
self.config.mlp_act_func_type, self.config.transposed_mode)
return output, residual_add
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -7,6 +10,7 @@ from deepspeed import comm as dist
class QKVGemmOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(QKVGemmOp, self).__init__(config)
if self.config.fp16:
......@@ -24,21 +28,11 @@ class QKVGemmOp(BaseOp):
num_layers: int,
num_heads: int = None,
max_out_tokens: int = None):
q_scale = weight.scale
q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1)
external_cache = self.config.bigscience_bloom
rank = dist.get_rank() if dist.is_initialized() else 0
q_int8 = self.config.q_int8
output = self.qkv_gemm_func(input,
weight,
q_scale,
bias,
gamma,
beta,
self.config.epsilon,
add_bias,
num_layers,
external_cache,
self.config.mp_size,
rank,
q_int8)
output = self.qkv_gemm_func(input, weight, q_scale, bias, gamma, beta, self.config.epsilon, add_bias,
num_layers, external_cache, self.config.mp_size, rank, q_int8,
self.config.transposed_mode)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class ResidualAddOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(ResidualAddOp, self).__init__(config)
if self.config.fp16 or self.config.q_int8:
......@@ -13,26 +17,13 @@ class ResidualAddOp(BaseOp):
else:
self.residual_add_func = self.inference_cuda_module.residual_add_bias_fp32
def forward(self,
hidden_state: torch.Tensor,
residual: torch.Tensor,
attention_output: torch.Tensor,
attention_bias: torch.Tensor,
final_bias: torch.Tensor,
add_bias: bool,
residual_add: torch.Tensor):
def forward(self, hidden_state: torch.Tensor, residual: torch.Tensor, attention_output: torch.Tensor,
attention_bias: torch.Tensor, final_bias: torch.Tensor, add_bias: bool, residual_add: torch.Tensor):
if not self.config.pre_layer_norm and residual_add is not None:
            # only use residual_add if it's set and we are not using pre-layer norm
residual = residual_add
self.residual_add_func(hidden_state,
residual,
attention_output,
attention_bias,
final_bias,
self.config.mp_size,
self.config.mlp_after_attn,
add_bias,
self.config.pre_layer_norm)
self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias,
self.config.mp_size, self.config.mlp_after_attn, add_bias, self.config.pre_layer_norm)
return residual
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class SoftmaxOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(SoftmaxOp, self).__init__(config)
if self.config.fp16:
......@@ -16,26 +20,9 @@ class SoftmaxOp(BaseOp):
def _not_implemented(self, *args, **kwargs):
raise NotImplementedError
def forward(self,
attn_scores: torch.Tensor,
attn_mask: torch.Tensor,
alibi: torch.Tensor,
triangular: bool,
recompute: bool,
local_attention: bool,
window_size: int,
async_op: bool,
layer_scale: float,
def forward(self, attn_scores: torch.Tensor, attn_mask: torch.Tensor, alibi: torch.Tensor, triangular: bool,
recompute: bool, local_attention: bool, window_size: int, async_op: bool, layer_scale: float,
head_offset: int):
output = self.softmax_func(attn_scores,
attn_mask,
alibi,
triangular,
recompute,
local_attention,
window_size,
async_op,
layer_scale,
head_offset,
self.config.mp_size)
output = self.softmax_func(attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size,
async_op, layer_scale, head_offset, self.config.mp_size)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from deepspeed import comm as dist
......@@ -7,6 +10,7 @@ from .base import BaseOp
class SoftmaxContextOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(SoftmaxContextOp, self).__init__(config)
if self.config.fp16:
......@@ -14,15 +18,8 @@ class SoftmaxContextOp(BaseOp):
else:
self.softmax_context_func = self.inference_cuda_module.softmax_context_fp32
def forward(self,
query_key_value: torch.Tensor,
attn_mask: torch.Tensor,
heads: int,
norm_factor: float,
no_masking: bool,
layer_id: int,
num_layers: int,
alibi: torch.Tensor):
def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, norm_factor: float,
no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor):
if alibi is not None:
batch_heads = query_key_value.shape[0] * heads
......@@ -31,18 +28,8 @@ class SoftmaxContextOp(BaseOp):
else:
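            # No alibi bias provided: pass a 1-element placeholder tensor to the kernel instead.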
alibi = torch.empty(1)
output = self.softmax_context_func(query_key_value,
attn_mask,
self.config.rotary_dim,
self.config.rotate_half,
self.config.rotate_every_two,
heads,
norm_factor,
self.config.triangular_masking,
self.config.local_attention,
self.config.window_size,
no_masking,
layer_id,
num_layers,
alibi)
output = self.softmax_context_func(query_key_value, attn_mask, self.config.rotary_dim, self.config.rotate_half,
self.config.rotate_every_two, heads, norm_factor,
self.config.triangular_masking, self.config.local_attention,
self.config.window_size, no_masking, layer_id, num_layers, alibi)
return output
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import torch
from ..config import DeepSpeedInferenceConfig
......@@ -6,6 +9,7 @@ from .base import BaseOp
class VectorMatMulOp(BaseOp):
def __init__(self, config: DeepSpeedInferenceConfig):
super(VectorMatMulOp, self).__init__(config)
if self.config.fp16:
......@@ -14,7 +18,7 @@ class VectorMatMulOp(BaseOp):
self.vector_matmul_func = self.inference_cuda_module.vector_matmul_fp32
def forward(self, input: torch.Tensor, weight: torch.Tensor, async_op: bool = False):
q_scale = weight.scale
q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1)
q_int8 = self.config.q_int8
output = self.vector_matmul_func(input, weight, async_op, q_scale, q_int8)
output = self.vector_matmul_func(input, weight, async_op, q_scale, q_int8, self.config.transposed_mode)
return output
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import json
import math
import importlib
import torch
from torch import nn
from torch.autograd import Function
import time
from ... import op_builder
import torch.nn as nn
import torch.distributed as dist
# Cuda modules will be imported if needed
inference_cuda_module = None
class TransformerConfig():
def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers):
self.layer_id = -1
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.heads = heads
self.num_hidden_layers = num_hidden_layers
class DeepSpeedInferenceConfig(TransformerConfig):
"""Initialize the DeepSpeed Transformer Config.
Arguments:
hidden_size: The hidden size of the transformer layer
intermediate_size: The intermediate size of the feed-forward part of transformer layer
heads: The number of heads in the self-attention of the transformer layer
num_hidden_layers: The number of transformer layers
layer_norm_eps: The epsilon value for the layer norm
            local_rank: Optional: The rank of the GPU running the transformer kernel. It is not required
                if the model has already set the current device; otherwise it must be set so that
                the transformer kernel can work on the right device
mp_size (optional): This argument is mainly used to create the parameters on the kernel side
using model-parallel architecture. If the client model already takes care of this, there is no
need to pass this argument.
fp16: Enable half-precision computation
pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture
            stochastic_mode: Enable for higher performance. Note that this flag introduces some
                non-determinism and can produce different results on different runs. However, we have seen
                that enabling it does not affect pretraining tasks such as BERT, which still reach
                a high accuracy level. For downstream tasks such as fine-tuning, we recommend turning
                it off in order to reproduce the same result through the regular kernel execution.
            scale_attention: If true, both q and k are scaled so that the attention scores carry the usual
                1/sqrt(attention head size) factor before the attention computation.
return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor
"""
def __init__(self,
hidden_size=-1,
intermediate_size=-1,
heads=-1,
num_hidden_layers=-1,
layer_norm_eps=1e-12,
local_rank=-1,
mp_size=1,
fp16=False,
q_int8=False,
pre_layer_norm=True,
stochastic_mode=False,
scale_attention=True,
triangular_masking=True,
local_attention=False,
window_size=256,
rotary_dim=-1,
rotate_half=False,
rotate_every_two=True,
return_tuple=True,
mlp_after_attn=True,
training_mp_size=1):
super(DeepSpeedInferenceConfig,
self).__init__(
hidden_size,
(intermediate_size if intermediate_size > 0 else 4 * hidden_size),
heads,
num_hidden_layers)
self.fp16 = fp16
self.pre_layer_norm = pre_layer_norm
self.local_rank = local_rank
self.stochastic_mode = stochastic_mode
self.epsilon = layer_norm_eps
self.mp_size = mp_size
self.q_int8 = q_int8
self.scale_attention = scale_attention
self.triangular_masking = triangular_masking
self.local_attention = local_attention
self.window_size = window_size
self.rotary_dim = rotary_dim
self.rotate_half = rotate_half
self.rotate_every_two = rotate_every_two
self.return_tuple = return_tuple
self.mlp_after_attn = mlp_after_attn
self.specialized_mode = False
self.training_mp_size = training_mp_size
@classmethod
def from_dict(cls, json_object):
config = DeepSpeedInferenceConfig()
for key, value in json_object.items():
config.__dict__[key] = value
return config
@classmethod
def from_json_file(cls, json_file):
with open(json_file, "r", encoding='utf-8') as reader:
text = reader.read()
return cls.from_dict(json.loads(text))
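# Illustrative sketch only (not part of this change): the inference config can be built directly
# or from a plain dict via from_dict; the sizes below are made up.
def _example_inference_config():
    return DeepSpeedInferenceConfig.from_dict({
        'hidden_size': 1024,
        'intermediate_size': 4096,
        'heads': 16,
        'num_hidden_layers': 24,
        'fp16': True,
        'mp_size': 1,
    })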
class DeepSpeedSelfAttentionFunction(Function):
@staticmethod
def forward(ctx,
input,
input_mask,
head_mask,
layer_past,
get_present,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
norm_w,
norm_b,
config,
attn_qkvw,
attn_qkvb,
num_attention_heads_per_partition,
norm_factor,
hidden_size_per_partition,
attn_ow,
attn_ob,
mp_group,
q_scales,
q_groups,
merge_count,
qkv_merging):
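        # Reshapes (batch, seq, hidden) into (batch, heads, seq, head_size); keys are instead laid
        # out as (batch, heads, head_size, seq) so the attention scores reduce to a plain matmul.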
def _transpose_for_scores(x, key=False, reshape=False):
attention_head_size = x.shape[-1] // num_attention_heads_per_partition
new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition,
attention_head_size)
x_1 = x.view(*new_x_shape)
if key:
x_1 = x_1.permute(0, 2, 3, 1)
else:
x_1 = x_1.permute(0, 2, 1, 3)
if reshape:
return x_1.reshape(x.shape)
return x_1.contiguous()
def _transpose_for_context(x):
x = x.permute(0, 2, 1, 3).contiguous()
new_x_layer_shape = x.size()[:-2] + \
(hidden_size_per_partition,)
return x.view(*new_x_layer_shape).contiguous()
def compute_attention(qkv_out, input_mask):
score_context_func = inference_cuda_module.softmax_context_fp32 if (not config.fp16) else \
inference_cuda_module.softmax_context_fp16
if merge_count > 0 and config.q_int8:
split_dim = (qkv_out.dim() - 1)
qkv_split = torch.split(qkv_out,
(qkv_out.shape[-1] // (2**merge_count)),
dim=split_dim)
qkv_split = [
torch.split(s,
(s.shape[-1] // 3),
dim=split_dim) for s in qkv_split
]
(mixed_query,
key_layer,
value_layer) = [
torch.cat([s[i] for s in qkv_split],
axis=-1) for i in range(len(qkv_split[0]))
]
else:
(mixed_query,
key_layer,
value_layer) = torch.split(qkv_out,
(qkv_out.shape[-1] // 3),
dim=(qkv_out.dim() - 1))
no_masking = input_mask is None
if no_masking:
input_mask = torch.empty(1)
head_size = (mixed_query.shape[-1] // num_attention_heads_per_partition)
unfused_mode = not config.specialized_mode or \
mixed_query.shape[1] >= 32 or head_size > 128
if config.rotary_dim > 0:
mixed_query, key_layer = inference_cuda_module.apply_rotary_pos_emb(
mixed_query,
key_layer,
config.rotary_dim,
0 if layer_past is None else layer_past[0].shape[-2],
num_attention_heads_per_partition,
config.rotate_half,
config.rotate_every_two)
if layer_past is not None:
past_key, past_value = layer_past
if unfused_mode:
key_layer = torch.cat((past_key.type_as(key_layer),
key_layer),
dim=-2)
value_layer = torch.cat((past_value.type_as(value_layer),
value_layer),
dim=-2)
presents = (key_layer, value_layer)
if unfused_mode:
mixed_query = _transpose_for_scores(mixed_query, False, True)
key_layer = _transpose_for_scores(
key_layer,
True,
True) / (norm_factor if config.scale_attention else 1.0)
value_layer = _transpose_for_scores(value_layer, False, True)
#print(f'[{torch.distributed.get_rank()}] {config.layer_id}: {mixed_query.norm()}')
if layer_past is None:
attn_key_value = score_context_func(
mixed_query,
key_layer,
torch.empty(1),
input_mask,
value_layer,
torch.empty(1),
num_attention_heads_per_partition,
(1 / norm_factor if config.scale_attention else 1.0),
(not unfused_mode),
config.triangular_masking,
config.local_attention,
config.window_size,
no_masking)
else:
attn_key_value = score_context_func(
mixed_query,
(key_layer if unfused_mode else past_key.type_as(key_layer)),
key_layer,
input_mask,
(value_layer if unfused_mode else past_value.type_as(value_layer)),
value_layer,
num_attention_heads_per_partition,
(1 / norm_factor if config.scale_attention else 1.0),
(not unfused_mode),
config.triangular_masking,
config.local_attention,
config.window_size,
no_masking)
if unfused_mode:
context_layer, _, _ = attn_key_value
else:
context_layer, key_layer, value_layer = attn_key_value
# Transpose Context
context_layer = _transpose_for_context(context_layer)
            return context_layer, presents[0], presents[1]  # attn_output, key_layer, value_layer
def selfAttention_fp():
vector_matmul_func = inference_cuda_module.vector_matmul_fp16 if config.fp16 else \
inference_cuda_module.vector_matmul_fp32
if not config.pre_layer_norm:
linear_func = inference_cuda_module.linear_layer_fp16 if config.fp16 else \
inference_cuda_module.linear_layer_fp32
qkv_out = linear_func(input, attn_qkvw, attn_qkvb)
else:
qkv_func = inference_cuda_module.qkv_gemm_fp16 if config.fp16 else \
inference_cuda_module.qkv_gemm_fp32
qkv_out = qkv_func(input,
attn_qkvw,
(attn_qkvb if attn_qkvb is not None else norm_b),
norm_w,
norm_b,
config.epsilon,
(attn_qkvb is not None))
context_layer, key_layer, value_layer = compute_attention(qkv_out[0] if isinstance(qkv_out, list) else qkv_out, input_mask)
output = vector_matmul_func(context_layer, attn_ow, False)
#print(f'[{torch.distributed.get_rank()}] {config.layer_id}: oooooo -> {output.norm()}')
return output, key_layer, value_layer, context_layer, qkv_out[-1] # attn_out, present_key, present_value, context_output, inp_norm
def selfAttention_int8():
if not config.pre_layer_norm:
qkv_out = inference_cuda_module.linear_layer_int8(
input,
attn_qkvw,
attn_qkvb,
q_scales[0],
(q_groups * (3 if qkv_merging else 1) * (2**merge_count)))
else:
qkv_out = inference_cuda_module.qkv_gemm_int8(
input,
attn_qkvw,
attn_qkvb,
norm_w,
norm_b,
config.epsilon,
q_scales[0],
(q_groups * (3 if qkv_merging else 1) * (2**merge_count)),
(attn_qkvb is not None))
context_layer, key_layer, value_layer = compute_attention(qkv_out)
output = inference_cuda_module.vector_matmul_int8(context_layer,
attn_ow,
q_scales[1],
q_groups,
(merge_count))
return output, key_layer, value_layer, context_layer
if config.q_int8:
output, key_layer, value_layer, context_layer = selfAttention_int8()
else:
output, key_layer, value_layer, context_layer, inp_norm = selfAttention_fp()
if config.mlp_after_attn and mp_group is not None and dist.get_world_size(
group=mp_group) > 1:
dist.all_reduce(output, group=mp_group)
return (output, key_layer, value_layer, context_layer, inp_norm)
@staticmethod
def backward(ctx, grad_output, grad_output1, grad_output2, grad_output3):
raise RuntimeError('You are running with DeepSpeed Inference mode. \
Please switch to Training mode for running backward!')
class DeepSpeedSelfAttention(nn.Module):
num_layers = 0
def __init__(self,
config,
mp_group=None,
q_scales=None,
q_groups=1,
merge_count=1,
qkv_merging=False):
super(DeepSpeedSelfAttention, self).__init__()
self.config = config
self.config.layer_id = DeepSpeedSelfAttention.num_layers
DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1
self.attn_qkvw = nn.Parameter(
torch.Tensor(self.config.hidden_size,
(self.config.hidden_size // self.config.mp_size) * 3))
self.attn_qkvb = nn.Parameter(
torch.Tensor((self.config.hidden_size // self.config.mp_size) * 3))
self.attn_ow = nn.Parameter(
torch.Tensor(self.config.hidden_size // self.config.mp_size,
self.config.hidden_size))
self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size
self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size
self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads
self.mp_group = mp_group
# used for quantization
self.q_scales = q_scales
self.q_groups = q_groups
self.merge_count = int(math.log2(merge_count))
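        # Fourth root of the head size: since it is applied to both q and k, the attention scores
        # end up scaled by the usual 1/sqrt(head_size).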
self.norm_factor = math.sqrt(
math.sqrt(self.config.hidden_size // self.config.heads))
self.qkv_merging = qkv_merging
def forward(self,
input,
input_mask,
head_mask=None,
layer_past=None,
get_present=False,
encoder_hidden_states=None,
encoder_attention_mask=None,
output_attentions=False,
norm_w=None,
norm_b=None):
output = DeepSpeedSelfAttentionFunction.apply(
input,
input_mask,
head_mask,
layer_past,
get_present,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
norm_w,
norm_b,
self.config,
self.attn_qkvw,
self.attn_qkvb,
self.num_attention_heads_per_partition,
self.norm_factor,
self.hidden_size_per_partition,
self.attn_ow,
self.attn_ob,
self.mp_group,
self.q_scales,
self.q_groups,
self.merge_count,
self.qkv_merging)
return output
class DeepSpeedMLPFunction(Function):
@staticmethod
def forward(ctx,
input,
residual,
residual_norm,
bias,
inter_w,
inter_b,
attn_nw,
attn_nb,
config,
mp_group,
output_b,
output_w,
q_scales,
q_groups,
merge_count,
mlp_gemm_func,
fused_gemm_gelu,
vector_matmul_func,
bias_residual_func):
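        # Three paths below: the int8 path uses the quantized mlp_gemm/vector_matmul kernels;
        # otherwise the fused GEMM+GeLU kernel is used when no attention layer-norm weights are
        # given, and the separate mlp_gemm + vector_matmul kernels are used in the remaining case.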
if config.q_int8:
(intermediate,
residual_add) = inference_cuda_module.mlp_gemm_int8(
input,
residual,
bias,
inter_w,
inter_b,
attn_nw,
attn_nb,
config.epsilon,
q_scales[2],
(q_groups * (2**merge_count)),
config.pre_layer_norm)
output = inference_cuda_module.vector_matmul_int8(intermediate,
output_w,
q_scales[3],
q_groups,
(merge_count))
else:
if attn_nw is None:
output = fused_gemm_gelu(residual_norm,
inter_w,
inter_b,
output_w,
config.epsilon,
config.pre_layer_norm,
False)
else:
intermediate = mlp_gemm_func(input,
residual,
bias,
inter_w,
inter_b,
attn_nw,
attn_nb,
config.epsilon,
config.pre_layer_norm,
config.mlp_after_attn)
output = vector_matmul_func(intermediate, output_w, False)
inference_cuda_module.residual_add(output,
residual,
input,
output_b,
bias,
config.mp_size,
config.mlp_after_attn)
if mp_group is not None and dist.get_world_size(group=mp_group) > 1:
dist.all_reduce(output, group=mp_group)
return output
@staticmethod
def backward(ctx, grad_output):
raise RuntimeError('You are running with DeepSpeed Inference mode. \
Please switch to Training mode for running backward!')
class DeepSpeedMLP(nn.Module):
def __init__(self,
config,
mp_group=None,
q_scales=None,
q_groups=1,
merge_count=1,
mlp_extra_grouping=False):
super(DeepSpeedMLP, self).__init__()
self.config = config
self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.inter_w = nn.Parameter(
torch.Tensor(self.config.hidden_size,
self.config.intermediate_size // self.config.mp_size))
self.inter_b = nn.Parameter(
torch.Tensor(self.config.intermediate_size // self.config.mp_size))
self.output_w = nn.Parameter(
torch.Tensor((self.config.intermediate_size // self.config.mp_size),
self.config.hidden_size))
self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
# used for quantization
self.q_scales = q_scales
self.q_groups = q_groups * 2 if mlp_extra_grouping else q_groups
self.merge_count = int(math.log2(merge_count))
self.mp_group = mp_group
self.mlp_gemm_func = inference_cuda_module.mlp_gemm_fp16 if config.fp16 else \
inference_cuda_module.mlp_gemm_fp32
self.vector_matmul_func = inference_cuda_module.vector_matmul_fp16 if config.fp16 else \
inference_cuda_module.vector_matmul_fp32
self.fused_gemm_gelu = inference_cuda_module.fused_gemm_gelu_fp16 if config.fp16 else \
inference_cuda_module.fused_gemm_gelu_fp32
self.bias_residual_func = inference_cuda_module.bias_residual_fp16 if config.fp16 or config.q_int8 else \
inference_cuda_module.bias_residual_fp32
def forward(self, input, residual, residual_norm, bias):
return DeepSpeedMLPFunction.apply(input,
residual,
residual_norm,
bias,
self.inter_w,
self.inter_b,
self.attn_nw,
self.attn_nb,
self.config,
self.mp_group,
self.output_b,
self.output_w,
self.q_scales,
self.q_groups,
self.merge_count,
self.mlp_gemm_func,
self.fused_gemm_gelu,
self.vector_matmul_func,
self.bias_residual_func)
class DeepSpeedTransformerInference(nn.Module):
"""Initialize the DeepSpeed Transformer Layer.
Arguments:
            layer_id: The layer index starting from 0; e.g., if the model has 24 transformer layers,
                layer_id will be 0, 1, 2, ..., 23 as each layer object is instantiated
config: An object of DeepSpeedInferenceConfig
mp_group: Model parallelism group initialized on the modeling side.
quantize_scales: This argument groups all the layers' scales used for quantization
quantize_groups: Number of groups used for quantizing the model
            merge_count: The number of model-parallel checkpoints merged before running inference.
                We use this argument to control the quantization scale of the model parameters when a
                quantization grouping larger than 1 is used.
            mlp_extra_grouping: When enabled, twice as many quantization groups are used for the MLP part
                of a transformer layer. We use this feature to reduce the convergence impact of
                quantization for specific downstream tasks.
"""
layer_id = 0
def __init__(self,
config,
mp_group=None,
quantize_scales=None,
quantize_groups=1,
merge_count=1,
mlp_extra_grouping=False,
qkv_merging=False):
super(DeepSpeedTransformerInference, self).__init__()
self.config = config
self.config.layer_id = DeepSpeedTransformerInference.layer_id
DeepSpeedTransformerInference.layer_id += 1
global inference_cuda_module
if inference_cuda_module is None:
builder = op_builder.InferenceBuilder()
inference_cuda_module = builder.load()
print("DeepSpeed Transformer Inference config is ", self.config.__dict__)
self.attention = DeepSpeedSelfAttention(self.config,
mp_group,
quantize_scales,
quantize_groups,
merge_count,
qkv_merging)
self.mlp = DeepSpeedMLP(self.config,
mp_group,
quantize_scales,
quantize_groups,
merge_count,
mlp_extra_grouping)
self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.layer_past = None
def forward(self,
input,
input_mask=None,
attention_mask=None,
head_mask=None,
layer_past=None,
get_key_value=False,
get_present=False,
encoder_output=None,
enc_dec_attn_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
use_cache=False,
output_attentions=False):
get_present = (get_present or get_key_value or use_cache)
input_mask = input_mask if attention_mask is None else attention_mask
layer_past = layer_past if layer_past is not None else self.layer_past
attn_mask = None
if isinstance(input, tuple):
attn_mask = input[1]
input = input[0]
input_type = input.dtype
if (self.config.fp16 or self.config.q_int8) \
and input.dtype == torch.float:
input = input.half()
with torch.no_grad():
attention_output, key, value, context_outputtn_ctx, inp_norm = \
self.attention(input,
input_mask,
head_mask,
layer_past,
get_present,
encoder_hidden_states,
encoder_attention_mask,
output_attentions,
self.norm_w,
self.norm_b)
presents = (key, value)
self.layer_past = presents
output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob)
if not self.config.pre_layer_norm:
ds_layernorm = inference_cuda_module.layer_norm_fp16 if self.config.fp16 or self.config.q_int8 else \
inference_cuda_module.layer_norm_fp32
output = ds_layernorm(output,
self.norm_w,
self.norm_b,
self.config.epsilon)
output = output.to(input_type)
#print(f'[{torch.distributed.get_rank()}] {self.config.layer_id}: {output.norm()}')
#exit()
if get_present:
output = (output, presents)
if self.config.return_tuple:
return output if type(output) is tuple else (output, attn_mask)
else:
return output
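# Usage sketch (illustrative only, not part of this change): driving a single inference layer end
# to end; the sizes are hypothetical and the weights are left uninitialized here, whereas they
# would normally be loaded from a checkpoint before use.
def _example_layer_forward():
    config = DeepSpeedInferenceConfig(hidden_size=1024, heads=16, num_hidden_layers=24, fp16=True)
    layer = DeepSpeedTransformerInference(config).half().cuda()
    hidden_states = torch.randn(1, 128, 1024, dtype=torch.float16, device='cuda')
    output, _ = layer(hidden_states)  # return_tuple=True yields (output, attn_mask)
    return output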
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
"""
Inspired by original Triton implementation:
https://github.com/openai/triton/blob/b244db06da24a87453a40ad35b085ee37dac3705/python/tutorials/06-fused-attention.py
......@@ -102,6 +105,7 @@ def _fwd_kernel(
class triton_flash_attn(torch.nn.Module):
def __init__(self, ):
super(triton_flash_attn, self).__init__()
......@@ -111,10 +115,7 @@ class triton_flash_attn(torch.nn.Module):
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
o = torch.empty_like(q)
grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1])
tmp = torch.empty((q.shape[0] * q.shape[1],
q.shape[2]),
device=q.device,
dtype=torch.float32)
tmp = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
num_warps = 4 if Lk <= 64 else 8
_fwd_kernel[grid](
......
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import json
import math
import torch
......@@ -15,15 +17,9 @@ stochastic_transformer_cuda_module = None
class TransformerConfig():
def __init__(self,
batch_size,
hidden_size,
intermediate_size,
heads,
attn_dropout_ratio,
hidden_dropout_ratio,
num_hidden_layers,
initializer_range):
def __init__(self, batch_size, hidden_size, intermediate_size, heads, attn_dropout_ratio, hidden_dropout_ratio,
num_hidden_layers, initializer_range):
self.layer_id = -1
self.batch_size = batch_size
self.hidden_size = hidden_size
......@@ -89,6 +85,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
training: Enable for training rather than inference.
"""
def __init__(self,
batch_size=-1,
hidden_size=-1,
......@@ -111,15 +108,9 @@ class DeepSpeedTransformerConfig(TransformerConfig):
return_tuple=False,
training=True):
super(DeepSpeedTransformerConfig,
self).__init__(
batch_size,
hidden_size,
(intermediate_size if intermediate_size > 0 else 4 * hidden_size),
heads,
attn_dropout_ratio,
hidden_dropout_ratio,
num_hidden_layers,
initializer_range)
self).__init__(batch_size, hidden_size,
(intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads,
attn_dropout_ratio, hidden_dropout_ratio, num_hidden_layers, initializer_range)
self.fp16 = fp16
self.pre_layer_norm = pre_layer_norm
self.local_rank = local_rank
......@@ -150,97 +141,42 @@ class DeepSpeedTransformerConfig(TransformerConfig):
class DeepSpeedTransformerFunction(Function):
@staticmethod
def forward(ctx,
input,
input_mask,
self,
grads,
layer_id,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b,
config):
def forward(ctx, input, input_mask, self, grads, layer_id, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw,
attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b, config):
cuda_module = stochastic_transformer_cuda_module if config.stochastic_mode else transformer_cuda_module
forward_func = cuda_module.forward_fp16 if config.fp16 else cuda_module.forward_fp32
inp_size = input.size()
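        # Pad the sequence length up to a multiple of 16: the padded input positions are filled
        # with random values and masked out with -10000 in the attention mask.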
if inp_size[1] % 16 != 0:
input = torch.cat((input,
torch.randn((inp_size[0],
(16 - (inp_size[1] % 16)),
inp_size[2]),
device=input.device,
dtype=input.dtype)),
1)
input = torch.cat(
(input,
torch.randn(
(inp_size[0], (16 - (inp_size[1] % 16)), inp_size[2]), device=input.device, dtype=input.dtype)),
1)
input_mask = torch.cat((input_mask, torch.ones((inp_size[0], input_mask.shape[1], input_mask.shape[2], \
(16 - (inp_size[1] % 16))), device=input_mask.device, dtype=input_mask.dtype) * -10000), 3)
(output,
inp_norm,
qkv_tf,
soft_inp,
ctx_bufB,
attn_o_inp,
add_res,
ff1_inp,
gelu_inp,
ff2_inp,
attn_prob_dropout_mask,
attn_output_dropout_mask,
layer_output_dropout_mask,
attn_layer_norm_var,
attn_layer_norm_mean,
layer_norm_var,
layer_norm_mean) = forward_func(config.layer_id,
input,
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b,
config.training and config.is_grad_enabled,
config.pre_layer_norm,
config.attn_dropout_checkpoint,
config.normalize_invertible,
config.gelu_checkpoint)
(output, inp_norm, qkv_tf, soft_inp, ctx_bufB, attn_o_inp, add_res, ff1_inp, gelu_inp, ff2_inp,
attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, attn_layer_norm_var,
attn_layer_norm_mean, layer_norm_var, layer_norm_mean) = forward_func(
config.layer_id, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w,
inter_b, output_w, output_b, norm_w, norm_b, config.training and config.is_grad_enabled,
config.pre_layer_norm, config.attn_dropout_checkpoint, config.normalize_invertible,
config.gelu_checkpoint)
# For testing only.
if grads is not None:
for i in [2]:
attn_qkvw.register_hook(
lambda x,
i=i,
self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)],
("Q_W" if i == 0 else "K_W" if i == 1 else "V_W")
]))
attn_qkvw.register_hook(lambda x, i=i, self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_W" if i == 0 else "K_W" if i == 1 else "V_W")
]))
for i in [2]:
attn_qkvb.register_hook(
lambda x,
i=i,
self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)],
("Q_B" if i == 0 else "K_B" if i == 1 else "V_B")
]))
attn_qkvb.register_hook(lambda x, i=i, self=self: grads.append([
x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_B" if i == 0 else "K_B" if i == 1 else "V_B")
]))
attn_ow.register_hook(lambda x, self=self: grads.append([x, "O_W"]))
attn_ob.register_hook(lambda x, self=self: grads.append([x, "O_B"]))
......@@ -255,35 +191,11 @@ class DeepSpeedTransformerFunction(Function):
if config.is_grad_enabled and config.training:
if (config.pre_layer_norm and config.normalize_invertible):
ctx.save_for_backward(input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b)
ctx.save_for_backward(input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w,
inter_b, output_w, output_b, norm_w, norm_b)
else:
ctx.save_for_backward(output,
input,
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b)
ctx.save_for_backward(output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw,
attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b)
ctx.config = config
if (config.pre_layer_norm or not config.normalize_invertible):
......@@ -331,88 +243,28 @@ class DeepSpeedTransformerFunction(Function):
assert ctx.config.training
if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible):
(input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b) = ctx.saved_tensors
(input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w,
output_b, norm_w, norm_b) = ctx.saved_tensors
else:
(output,
input,
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b) = ctx.saved_tensors
(output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b,
output_w, output_b, norm_w, norm_b) = ctx.saved_tensors
cuda_module = stochastic_transformer_cuda_module if ctx.config.stochastic_mode else transformer_cuda_module
backward_func = cuda_module.backward_fp16 if ctx.config.fp16 else cuda_module.backward_fp32
(grad_input,
grad_attn_qkvw,
grad_attn_qkvb,
grad_attn_ow,
grad_attn_ob,
grad_attn_nw,
grad_attn_nb,
grad_inter_w,
grad_inter_b,
grad_output_w,
grad_output_b,
grad_norm_w,
grad_norm_b) = backward_func(
ctx.config.layer_id,
grad_output,
(ctx.inp_norm if (ctx.config.pre_layer_norm
and ctx.config.normalize_invertible) else output),
(ctx.inp_norm if (ctx.config.pre_layer_norm
or not ctx.config.normalize_invertible) else input),
ctx.qkv_tf,
ctx.soft_inp,
(ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB),
ctx.attn_o_inp,
(ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res),
ctx.ff1_inp,
(ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp),
ctx.ff2_inp,
ctx.attn_prob_dropout_mask,
ctx.attn_output_dropout_mask,
ctx.layer_output_dropout_mask,
ctx.attn_layer_norm_var,
ctx.attn_layer_norm_mean,
ctx.layer_norm_var,
ctx.layer_norm_mean,
(ctx.inp_norm if (ctx.config.pre_layer_norm
and ctx.config.normalize_invertible) else input),
input_mask,
attn_qkvw,
attn_qkvb,
attn_ow,
attn_ob,
attn_nw,
attn_nb,
inter_w,
inter_b,
output_w,
output_b,
norm_w,
norm_b)
(grad_input, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob, grad_attn_nw, grad_attn_nb,
grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w, grad_norm_b) = backward_func(
ctx.config.layer_id, grad_output,
(ctx.inp_norm if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else output),
(ctx.inp_norm if (ctx.config.pre_layer_norm or not ctx.config.normalize_invertible) else input),
ctx.qkv_tf, ctx.soft_inp, (ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB),
ctx.attn_o_inp, (ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res), ctx.ff1_inp,
(ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp), ctx.ff2_inp, ctx.attn_prob_dropout_mask,
ctx.attn_output_dropout_mask, ctx.layer_output_dropout_mask, ctx.attn_layer_norm_var,
ctx.attn_layer_norm_mean, ctx.layer_norm_var, ctx.layer_norm_mean,
(ctx.inp_norm if
(ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else input), input_mask, attn_qkvw,
attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b)
# This appears to be an effective way to release context memory
ctx.qkv_tf = None
......@@ -436,24 +288,9 @@ class DeepSpeedTransformerFunction(Function):
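        # Remove the gradient positions that correspond to the sequence padding added in forward.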
if grad_output_shape[1] % 16 != 0:
grad_input = torch.narrow(grad_input, 1, 0, grad_output_shape[1])
return (grad_input,
None,
None,
None,
None,
grad_attn_qkvw,
grad_attn_qkvb,
grad_attn_ow,
grad_attn_ob,
grad_attn_nw,
grad_attn_nb,
grad_inter_w,
grad_inter_b,
grad_output_w,
grad_output_b,
grad_norm_w,
grad_norm_b,
None)
return (grad_input, None, None, None, None, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob,
grad_attn_nw, grad_attn_nb, grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w,
grad_norm_b, None)
class DeepSpeedTransformerLayer(nn.Module):
......@@ -484,23 +321,15 @@ class DeepSpeedTransformerLayer(nn.Module):
get_accelerator().set_device(self.config.local_rank)
if initial_weights is None and initial_biases is None:
self.attn_qkvw = nn.Parameter(
torch.Tensor(self.config.hidden_size * 3,
self.config.hidden_size))
self.attn_qkvw = nn.Parameter(torch.Tensor(self.config.hidden_size * 3, self.config.hidden_size))
self.attn_qkvb = nn.Parameter(torch.Tensor(self.config.hidden_size * 3))
self.attn_ow = nn.Parameter(
torch.Tensor(self.config.hidden_size,
self.config.hidden_size))
self.attn_ow = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.hidden_size))
self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.inter_w = nn.Parameter(
torch.Tensor(self.config.intermediate_size,
self.config.hidden_size))
self.inter_w = nn.Parameter(torch.Tensor(self.config.intermediate_size, self.config.hidden_size))
self.inter_b = nn.Parameter(torch.Tensor(self.config.intermediate_size))
self.output_w = nn.Parameter(
torch.Tensor(self.config.hidden_size,
self.config.intermediate_size))
self.output_w = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.intermediate_size))
self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size))
self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
......@@ -539,21 +368,11 @@ class DeepSpeedTransformerLayer(nn.Module):
cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module
create_layer_func = cuda_module.create_transformer_layer_fp16 if self.config.fp16 else cuda_module.create_transformer_layer_fp32
create_layer_func(self.config.layer_id,
self.config.batch_size,
self.config.hidden_size,
self.config.heads,
self.config.intermediate_size,
self.config.attn_dropout_ratio,
self.config.hidden_dropout_ratio,
self.config.layer_norm_eps,
self.config.seed,
self.config.pre_layer_norm,
self.config.test_gemm,
self.config.attn_dropout_checkpoint,
self.config.normalize_invertible,
self.config.gelu_checkpoint,
self.config.stochastic_mode)
create_layer_func(self.config.layer_id, self.config.batch_size, self.config.hidden_size, self.config.heads,
self.config.intermediate_size, self.config.attn_dropout_ratio,
self.config.hidden_dropout_ratio, self.config.layer_norm_eps, self.config.seed,
self.config.pre_layer_norm, self.config.test_gemm, self.config.attn_dropout_checkpoint,
self.config.normalize_invertible, self.config.gelu_checkpoint, self.config.stochastic_mode)
def init_transformer_weights(self, adjust_init_range=False):
num_layers = self.config.num_hidden_layers
......@@ -587,21 +406,7 @@ class DeepSpeedTransformerLayer(nn.Module):
grads=None):
self.config.is_grad_enabled = torch.is_grad_enabled()
self.config.training = self.training
return DeepSpeedTransformerFunction.apply(hidden_states,
attention_mask,
self,
grads,
self.config.layer_id,
self.attn_qkvw,
self.attn_qkvb,
self.attn_ow,
self.attn_ob,
self.attn_nw,
self.attn_nb,
self.inter_w,
self.inter_b,
self.output_w,
self.output_b,
self.norm_w,
self.norm_b,
self.config)
return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, grads, self.config.layer_id,
self.attn_qkvw, self.attn_qkvb, self.attn_ow, self.attn_ob,
self.attn_nw, self.attn_nb, self.inter_w, self.inter_b,
self.output_w, self.output_b, self.norm_w, self.norm_b, self.config)
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
from deepspeed.profiling.constants import *
class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject):
def __init__(self, param_dict):
super(DeepSpeedFlopsProfilerConfig, self).__init__()
......@@ -25,26 +25,18 @@ class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject):
self._initialize(flops_profiler_dict)
def _initialize(self, flops_profiler_dict):
self.enabled = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_ENABLED,
FLOPS_PROFILER_ENABLED_DEFAULT)
self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT)
self.profile_step = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_PROFILE_STEP,
self.profile_step = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_PROFILE_STEP,
FLOPS_PROFILER_PROFILE_STEP_DEFAULT)
self.module_depth = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_MODULE_DEPTH,
self.module_depth = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_MODULE_DEPTH,
FLOPS_PROFILER_MODULE_DEPTH_DEFAULT)
self.top_modules = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_TOP_MODULES,
self.top_modules = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_TOP_MODULES,
FLOPS_PROFILER_TOP_MODULES_DEFAULT)
self.detailed = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_DETAILED,
FLOPS_PROFILER_DETAILED_DEFAULT)
self.detailed = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_DETAILED, FLOPS_PROFILER_DETAILED_DEFAULT)
self.output_file = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_OUTPUT_FILE,
self.output_file = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_OUTPUT_FILE,
FLOPS_PROFILER_OUTPUT_FILE_DEFAULT)
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
#########################################
# flops profiler
......
......@@ -309,8 +309,9 @@ The following example shows how to profile AlexNet using the DeepSpeed flops pro
import torchvision.models as models
import torch
from deepspeed.profiling.flops_profiler import get_model_profile
from deepspeed.accelerator import get_accelerator
with torch.cuda.device(0):
with get_accelerator().device(0):
model = models.alexnet()
batch_size = 256
flops, macs, params = get_model_profile(model=model, # model
......@@ -334,6 +335,7 @@ from functools import partial
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from deepspeed.profiling.flops_profiler import get_model_profile
from deepspeed.accelerator import get_accelerator
def bert_input_constructor(batch_size, seq_len, tokenizer):
......@@ -350,7 +352,7 @@ def bert_input_constructor(batch_size, seq_len, tokenizer):
return inputs
with torch.cuda.device(0):
with get_accelerator().device(0):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
batch_size = 4
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from .profiler import *
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import time
import torch
......@@ -53,6 +56,7 @@ class FlopsProfiler(object):
Args:
object (torch.nn.Module): The PyTorch model to profile.
"""
def __init__(self, model, ds_engine=None):
self.model = model
self.ds_engine = ds_engine
......@@ -78,8 +82,7 @@ class FlopsProfiler(object):
# if computing the flops of a module directly
if type(module) in MODULE_HOOK_MAPPING:
if not hasattr(module, "__flops_handle__"):
module.__flops_handle__ = module.register_forward_hook(
MODULE_HOOK_MAPPING[type(module)])
module.__flops_handle__ = module.register_forward_hook(MODULE_HOOK_MAPPING[type(module)])
return
# if computing the flops of the functionals in a module
......@@ -105,16 +108,14 @@ class FlopsProfiler(object):
module.__start_time__ = time.time()
if not hasattr(module, "__start_time_hook_handle"):
module.__start_time_hook_handle__ = module.register_forward_pre_hook(
start_time_hook)
module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook)
def end_time_hook(module, input, output):
get_accelerator().synchronize()
module.__duration__ += time.time() - module.__start_time__
if not hasattr(module, "__end_time_hook_handle__"):
module.__end_time_hook_handle__ = module.register_forward_hook(
end_time_hook)
module.__end_time_hook_handle__ = module.register_forward_hook(end_time_hook)
self.model.apply(partial(register_module_hooks, ignore_list=ignore_list))
self.started = True
......@@ -154,6 +155,7 @@ class FlopsProfiler(object):
Adds or resets the extra attributes.
"""
def add_or_reset_attrs(module):
module.__flops__ = 0
module.__macs__ = 0
......@@ -232,15 +234,9 @@ class FlopsProfiler(object):
Returns:
The number of parameters in the model.
"""
return params_to_string(
self.model.__params__) if as_string else self.model.__params__
def print_model_profile(self,
profile_step=1,
module_depth=-1,
top_modules=1,
detailed=True,
output_file=None):
return params_to_string(self.model.__params__) if as_string else self.model.__params__
def print_model_profile(self, profile_step=1, module_depth=-1, top_modules=1, detailed=True, output_file=None):
"""Prints the model graph with the measured profile attached to each module.
Args:
......@@ -273,28 +269,21 @@ class FlopsProfiler(object):
self.macs = total_macs
self.params = total_params
print(
"\n-------------------------- DeepSpeed Flops Profiler --------------------------"
)
print("\n-------------------------- DeepSpeed Flops Profiler --------------------------")
print(f'Profile Summary at step {profile_step}:')
print(
"Notations:\ndata parallel size (dp_size), model parallel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\nnumber of floating-point operations (flops), floating-point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n"
)
if self.ds_engine:
print('{:<60} {:<8}'.format('world size: ', self.ds_engine.world_size))
print('{:<60} {:<8}'.format('data parallel size: ',
self.ds_engine.dp_world_size))
print('{:<60} {:<8}'.format('model parallel size: ',
self.ds_engine.mp_world_size))
print('{:<60} {:<8}'.format(
'batch size per GPU: ',
self.ds_engine.train_micro_batch_size_per_gpu()))
print('{:<60} {:<8}'.format('data parallel size: ', self.ds_engine.dp_world_size))
print('{:<60} {:<8}'.format('model parallel size: ', self.ds_engine.mp_world_size))
print('{:<60} {:<8}'.format('batch size per GPU: ', self.ds_engine.train_micro_batch_size_per_gpu()))
print('{:<60} {:<8}'.format('params per gpu: ', params_to_string(total_params)))
print('{:<60} {:<8}'.format(
'params of model = params per GPU * mp_size: ',
params_to_string(total_params *
((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
params_to_string(total_params * ((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
print('{:<60} {:<8}'.format('fwd MACs per GPU: ', macs_to_string(total_macs)))
......@@ -302,43 +291,33 @@ class FlopsProfiler(object):
print('{:<60} {:<8}'.format(
'fwd flops of model = fwd flops per GPU * mp_size: ',
num_to_string(total_flops *
((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
num_to_string(total_flops * ((self.ds_engine.mp_world_size) if self.ds_engine else 1))))
fwd_latency = self.get_total_duration()
if self.ds_engine and self.ds_engine.wall_clock_breakdown():
fwd_latency = self.ds_engine.timers('forward').elapsed(False) / 1000.0
print('{:<60} {:<8}'.format('fwd latency: ', duration_to_string(fwd_latency)))
print('{:<60} {:<8}'.format(
'fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ',
flops_to_string(total_flops / fwd_latency)))
print('{:<60} {:<8}'.format('fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ',
flops_to_string(total_flops / fwd_latency)))
if self.ds_engine and self.ds_engine.wall_clock_breakdown():
bwd_latency = self.ds_engine.timers('backward').elapsed(False) / 1000.0
step_latency = self.ds_engine.timers('step').elapsed(False) / 1000.0
print('{:<60} {:<8}'.format('bwd latency: ',
duration_to_string(bwd_latency)))
print('{:<60} {:<8}'.format(
'bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ',
flops_to_string(2 * total_flops / bwd_latency)))
print('{:<60} {:<8}'.format(
'fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ',
flops_to_string(3 * total_flops / (fwd_latency + bwd_latency))))
print('{:<60} {:<8}'.format('step latency: ',
duration_to_string(step_latency)))
print('{:<60} {:<8}'.format('bwd latency: ', duration_to_string(bwd_latency)))
print('{:<60} {:<8}'.format('bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ',
flops_to_string(2 * total_flops / bwd_latency)))
print('{:<60} {:<8}'.format('fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ',
flops_to_string(3 * total_flops / (fwd_latency + bwd_latency))))
print('{:<60} {:<8}'.format('step latency: ', duration_to_string(step_latency)))
iter_latency = fwd_latency + bwd_latency + step_latency
print('{:<60} {:<8}'.format('iter latency: ',
duration_to_string(iter_latency)))
print('{:<60} {:<8}'.format(
'FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ',
flops_to_string(3 * total_flops / iter_latency)))
print('{:<60} {:<8}'.format('iter latency: ', duration_to_string(iter_latency)))
print('{:<60} {:<8}'.format('FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ',
flops_to_string(3 * total_flops / iter_latency)))
samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu(
) * self.ds_engine.world_size
print('{:<60} {:<8.2f}'.format('samples/second: ',
samples_per_iter / iter_latency))
samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu() * self.ds_engine.world_size
print('{:<60} {:<8.2f}'.format('samples/second: ', samples_per_iter / iter_latency))
def flops_repr(module):
params = module.__params__
......@@ -353,9 +332,7 @@ class FlopsProfiler(object):
duration = get_module_duration(module)
items.append(duration_to_string(duration))
items.append(
"{:.2%} latency".format(0.0 if total_duration == 0 else duration /
total_duration))
items.append("{:.2%} latency".format(0.0 if total_duration == 0 else duration / total_duration))
items.append(flops_to_string(0.0 if duration == 0 else flops / duration))
items.append(module.original_extra_repr())
return ", ".join(items)
......@@ -374,16 +351,11 @@ class FlopsProfiler(object):
self.model.apply(add_extra_repr)
print(
"\n----------------------------- Aggregated Profile per GPU -----------------------------"
)
self.print_model_aggregated_profile(module_depth=module_depth,
top_modules=top_modules)
print("\n----------------------------- Aggregated Profile per GPU -----------------------------")
self.print_model_aggregated_profile(module_depth=module_depth, top_modules=top_modules)
if detailed:
print(
"\n------------------------------ Detailed Profile per GPU ------------------------------"
)
print("\n------------------------------ Detailed Profile per GPU ------------------------------")
print(
"Each module profile is listed after its name in the following order: \nparams, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS"
)
......@@ -394,9 +366,7 @@ class FlopsProfiler(object):
self.model.apply(del_extra_repr)
print(
"------------------------------------------------------------------------------"
)
print("------------------------------------------------------------------------------")
if output_file:
sys.stdout = original_stdout
......@@ -411,9 +381,7 @@ class FlopsProfiler(object):
"""
info = {}
if not hasattr(self.model, "__flops__"):
print(
"no __flops__ attribute in the model, call this function after start_profile and before end_profile"
)
print("no __flops__ attribute in the model, call this function after start_profile and before end_profile")
return
def walk_module(module, curr_depth, info):
......@@ -439,33 +407,22 @@ class FlopsProfiler(object):
if module_depth == -1:
depth = len(info) - 1
print(
f'Top {top_modules} modules in terms of params, MACs or fwd latency at different model depths:'
)
print(f'Top {top_modules} modules in terms of params, MACs or fwd latency at different model depths:')
for d in range(depth):
num_items = min(top_modules, len(info[d]))
sort_macs = {
k: macs_to_string(v[0])
for k,
v in sorted(info[d].items(),
key=lambda item: item[1][0],
reverse=True)[:num_items]
for k, v in sorted(info[d].items(), key=lambda item: item[1][0], reverse=True)[:num_items]
}
sort_params = {
k: params_to_string(v[1])
for k,
v in sorted(info[d].items(),
key=lambda item: item[1][1],
reverse=True)[:num_items]
for k, v in sorted(info[d].items(), key=lambda item: item[1][1], reverse=True)[:num_items]
}
sort_time = {
k: duration_to_string(v[2])
for k,
v in sorted(info[d].items(),
key=lambda item: item[1][2],
reverse=True)[:num_items]
for k, v in sorted(info[d].items(), key=lambda item: item[1][2], reverse=True)[:num_items]
}
print(f"depth {d}:")
......@@ -499,9 +456,7 @@ def _elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool = False)
return input.numel(), 0
def _leaky_relu_flops_compute(input: Tensor,
negative_slope: float = 0.01,
inplace: bool = False):
def _leaky_relu_flops_compute(input: Tensor, negative_slope: float = 0.01, inplace: bool = False):
return input.numel(), 0
......@@ -529,13 +484,7 @@ def _pool_flops_compute(input,
return input.numel(), 0
def _conv_flops_compute(input,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1):
def _conv_flops_compute(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
assert weight.shape[1] * groups == input.shape[1]
batch_size = input.shape[0]
......@@ -552,8 +501,8 @@ def _conv_flops_compute(input,
output_dims = []
for idx, input_dim in enumerate(input_dims):
output_dim = (input_dim + 2 * paddings[idx] -
(dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] *
(kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dims.append(output_dim)
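        # e.g. input_dim=224, padding=3, dilation=1, kernel=7, stride=2 -> (224 + 2*3 - 7) // 2 + 1 = 112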
filters_per_channel = out_channels // groups
......@@ -594,8 +543,8 @@ def _conv_trans_flops_compute(
output_dims = []
for idx, input_dim in enumerate(input_dims):
output_dim = (input_dim + 2 * paddings[idx] -
(dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] *
(kernel_dims[idx] - 1) + 1)) // strides[idx] + 1
output_dims.append(output_dim)
paddings = padding if type(padding) is tuple else (padding, padding)
......@@ -1210,8 +1159,7 @@ def get_model_profile(
assert len(input_shape) >= 1, "input_shape must have at least one element"
try:
input = torch.ones(()).new_empty(
(*input_shape,
),
(*input_shape, ),
dtype=next(model.parameters()).dtype,
device=next(model.parameters()).device,
)
......
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
class DeepSpeedOptimizer(object):
......
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
'''Copyright The Microsoft DeepSpeed Team'''