Commit c25a91b6 authored by aiss's avatar aiss
Browse files

Merge branch 'ds-v0.9.2-rocm' into 'main'

DS v0.9.2 ROCm

See merge request dcutoolkit/deeplearing/deepspeed!2
parents d1596c94 af82b300
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include <limits>
#include "conversion_utils.h"
#include "inference_cuda_layers.h"
#ifndef __HIP_PLATFORM_HCC__
......@@ -12,7 +14,6 @@ Copyright 2022 The Microsoft DeepSpeed Team
#include <cstdlib>
#include <ctime>
#define ATTN_THREADS 256
#define MAX_REG_SIZE 8
#define minus_infinity -10000.0
......@@ -30,9 +31,10 @@ void CheckCudaErrorAux(const char* file, unsigned line)
namespace cg = cooperative_groups;
__global__ void attn_softmax_v2(__half* vals,
__half* mask,
__half* alibi,
template <typename T, int iterations>
__global__ void attn_softmax_v2(T* vals,
T* mask,
T* alibi,
float layer_scale,
bool triangular,
bool recompute,
......@@ -45,7 +47,6 @@ __global__ void attn_softmax_v2(__half* vals,
int head_offset,
int mask_stride,
int mp_size,
int iterations,
int reduceWidth)
{
cg::thread_block b = cg::this_thread_block();
......@@ -53,7 +54,7 @@ __global__ void attn_softmax_v2(__half* vals,
float2 low_data[MAX_REG_SIZE];
float2 high_data[MAX_REG_SIZE];
const __half zero_h = __float2half(0.f);
const T zero_h = conversion::to<T>(0.f);
int wid = threadIdx.x >> 5;
int lane = threadIdx.x & 0x1f;
......@@ -75,7 +76,6 @@ __global__ void attn_softmax_v2(__half* vals,
alibi_offset = (alibi_offset + ((iter_offset / num_seq) % heads)) * sequence_length;
mask_offset = mask_offset * sequence_length;
int seq_id = iter_offset % num_seq;
int seq_id4 = seq_id >> 2;
int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length);
int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2))
......@@ -87,83 +87,109 @@ __global__ void attn_softmax_v2(__half* vals,
float max_val = minus_infinity;
// if (lane == 0) printf("%d, %d: %d \n", wid, blockIdx.x, mask_offset);
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 &&
data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
low_data[i].x = data_id > window_stride
? __half2float(vals[data_id]) * layer_scale
: minus_infinity;
low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride)
? __half2float(vals[data_id + 1]) * layer_scale
: minus_infinity;
high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride)
? __half2float(vals[data_id + 2]) * layer_scale
: minus_infinity;
high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) &&
(data_id + 3) > window_stride)
? __half2float(vals[data_id + 3]) * layer_scale
: minus_infinity;
if (alibi) {
low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]);
low_data[i].y =
low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]);
high_data[i].x =
high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]);
high_data[i].y =
high_data[i].y + __half2float(alibi[data_id + alibi_offset + 3]);
}
if (mask) {
low_data[i].x += __half2float(mask[data_id + mask_offset]);
low_data[i].y += __half2float(mask[data_id + mask_offset + 1]);
high_data[i].x += __half2float(mask[data_id + mask_offset + 2]);
high_data[i].y += __half2float(mask[data_id + mask_offset + 3]);
}
} else {
low_data[i].x = data_id > window_stride
? __half2float(vals[data_id]) * layer_scale
: minus_infinity;
low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) &&
(data_id + 1) > window_stride) &&
(data_id + 1) < sequence_length)
? __half2float(vals[data_id + 1]) * layer_scale
: minus_infinity;
high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) &&
(data_id + 2) > window_stride) &&
(data_id + 2) < sequence_length)
? __half2float(vals[data_id + 2]) * layer_scale
: minus_infinity;
if (alibi) {
low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]);
if ((data_id + 1) < sequence_length)
low_data[i].y =
low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]);
if ((data_id + 2) < sequence_length)
high_data[i].x =
high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]);
}
high_data[i].y = minus_infinity;
if (mask) {
low_data[i].x += __half2float(mask[data_id + mask_offset]);
if ((data_id + 1) < sequence_length)
low_data[i].y += __half2float(mask[data_id + mask_offset + 1]);
if ((data_id + 2) < sequence_length)
high_data[i].x += __half2float(mask[data_id + mask_offset + 2]);
}
}
// if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id);
max_val = (low_data[i].x > max_val ? low_data[i].x : max_val);
max_val = (low_data[i].y > max_val ? low_data[i].y : max_val);
max_val = (high_data[i].x > max_val ? high_data[i].x : max_val);
max_val = (high_data[i].y > max_val ? high_data[i].y : max_val);
int data_id = i * (reduceWidth << 2) + (seq_lane);
bool check = (data_id >> 2) >= window_stride4;
bool low_x_check = check && (data_id < sequence_length) &&
(!triangular || (data_id <= seq_id)) && (data_id > window_stride);
bool low_y_check = check && ((data_id + reduceWidth) < sequence_length) &&
(!triangular || ((data_id + reduceWidth) <= seq_id)) &&
((data_id + reduceWidth) > window_stride);
bool high_x_check = check && ((data_id + reduceWidth * 2) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 2) <= seq_id)) &&
((data_id + reduceWidth * 2) > window_stride);
bool high_y_check = check && ((data_id + reduceWidth * 3) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 3) <= seq_id)) &&
((data_id + reduceWidth * 3) > window_stride);
if (mask && alibi) {
low_data[i].x = low_x_check
? conversion::to<float>(vals[data_id]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset])) +
(conversion::to<float>(mask[data_id + mask_offset]))
: minus_infinity;
low_data[i].y =
low_y_check
? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset + reduceWidth])) +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth]))
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 2])) +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 2]))
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 3])) +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 3]))
: minus_infinity;
} else if (mask) {
low_data[i].x = low_x_check
? conversion::to<float>(vals[data_id]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset]))
: minus_infinity;
low_data[i].y =
low_y_check
? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth]))
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 2]))
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale +
(conversion::to<float>(mask[data_id + mask_offset + reduceWidth * 3]))
: minus_infinity;
} else if (alibi) {
low_data[i].x = low_x_check
? conversion::to<float>(vals[data_id]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset]))
: minus_infinity;
low_data[i].y =
low_y_check
? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale +
(conversion::to<float>(alibi[data_id + alibi_offset + reduceWidth]))
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 2]))
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale +
(conversion::to<float>(
alibi[data_id + alibi_offset + reduceWidth * 3]))
: minus_infinity;
} else {
low_data[i].x = minus_infinity;
low_data[i].y = minus_infinity;
high_data[i].x = minus_infinity;
high_data[i].y = minus_infinity;
low_data[i].x = low_x_check ? conversion::to<float>(vals[data_id]) * layer_scale
: minus_infinity;
low_data[i].y =
low_y_check ? conversion::to<float>(vals[data_id + reduceWidth]) * layer_scale
: minus_infinity;
high_data[i].x =
high_x_check
? conversion::to<float>(vals[data_id + reduceWidth * 2]) * layer_scale
: minus_infinity;
high_data[i].y =
high_y_check
? conversion::to<float>(vals[data_id + reduceWidth * 3]) * layer_scale
: minus_infinity;
}
// if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id);
max_val = (low_data[i].x > max_val ? low_data[i].x : max_val);
max_val = (low_data[i].y > max_val ? low_data[i].y : max_val);
max_val = (high_data[i].x > max_val ? high_data[i].x : max_val);
max_val = (high_data[i].y > max_val ? high_data[i].y : max_val);
}
for (int i = 1; i < WARP_SIZE; i *= 2) {
......@@ -212,26 +238,21 @@ __global__ void attn_softmax_v2(__half* vals,
}
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
int data_id = i * (reduceWidth << 2) + (seq_lane);
if (data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
vals[data_id] = __float2half(low_data[i].x / sum);
vals[data_id + 1] = __float2half(low_data[i].y / sum);
vals[data_id + 2] = __float2half(high_data[i].x / sum);
vals[data_id + 3] = __float2half(high_data[i].y / sum);
} else {
vals[data_id] = __float2half(low_data[i].x / sum);
if ((data_id + 1) < sequence_length)
vals[data_id + 1] = __float2half(low_data[i].y / sum);
if ((data_id + 2) < sequence_length)
vals[data_id + 2] = __float2half(high_data[i].x / sum);
}
vals[data_id] = conversion::to<T>(low_data[i].x / sum);
if ((data_id + reduceWidth) < sequence_length)
vals[data_id + reduceWidth] = conversion::to<T>(low_data[i].y / sum);
if ((data_id + reduceWidth * 2) < sequence_length)
vals[data_id + reduceWidth * 2] = conversion::to<T>(high_data[i].x / sum);
if ((data_id + reduceWidth * 3) < sequence_length)
vals[data_id + reduceWidth * 3] = conversion::to<T>(high_data[i].y / sum);
}
}
}
}
template <int iterations>
__global__ void attn_softmax_v2(float* vals,
float* attn_mask,
float* alibi,
......@@ -247,7 +268,6 @@ __global__ void attn_softmax_v2(float* vals,
int head_offset,
int mask_stride,
int mp_size,
int iterations,
int reduceWidth)
{
cg::thread_block b = cg::this_thread_block();
......@@ -269,11 +289,9 @@ __global__ void attn_softmax_v2(float* vals,
vals += (iter_offset * sequence_length);
int batch_idx = iter_offset / (num_seq * heads);
int alibi_offset = batch_idx * heads * mp_size + head_offset;
int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride);
mask_offset = mask_offset * sequence_length;
int seq_id = iter_offset % num_seq;
int seq_id4 = seq_id >> 2;
int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length);
int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2))
......@@ -285,58 +303,43 @@ __global__ void attn_softmax_v2(float* vals,
float max_val = minus_infinity;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 &&
data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity);
data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride)
? vals[data_id + 1]
int data_id = i * (reduceWidth << 2) + (seq_lane);
bool check = (data_id >> 2) >= window_stride4;
bool x_check = check && (data_id < sequence_length) &&
(!triangular || (data_id <= seq_id)) && (data_id > window_stride);
bool y_check = check && ((data_id + reduceWidth) < sequence_length) &&
(!triangular || ((data_id + reduceWidth) <= seq_id)) &&
((data_id + reduceWidth) > window_stride);
bool z_check = check && ((data_id + reduceWidth * 2) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 2) <= seq_id)) &&
((data_id + reduceWidth * 2) > window_stride);
bool w_check = check && ((data_id + reduceWidth * 3) < sequence_length) &&
(!triangular || ((data_id + reduceWidth * 3) <= seq_id)) &&
((data_id + reduceWidth * 3) > window_stride);
if (attn_mask) {
data[i].x = x_check ? vals[data_id] + attn_mask[data_id + mask_offset]
: minus_infinity;
data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride)
? vals[data_id + 2]
data[i].y = y_check ? vals[data_id + reduceWidth] +
attn_mask[data_id + mask_offset + reduceWidth]
: minus_infinity;
data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) &&
(data_id + 3) > window_stride)
? vals[data_id + 3]
data[i].z = z_check ? vals[data_id + reduceWidth * 2] +
attn_mask[data_id + mask_offset + reduceWidth * 2]
: minus_infinity;
if (attn_mask) {
data[i].x += attn_mask[data_id + mask_offset];
data[i].y += attn_mask[data_id + mask_offset + 1];
data[i].z += attn_mask[data_id + mask_offset + 2];
data[i].w += attn_mask[data_id + mask_offset + 3];
}
} else {
data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity;
data[i].y = (((!triangular || (data_id + 1) <= seq_id)) &&
(data_id + 1) > window_stride && (data_id + 1) < sequence_length)
? (vals[data_id + 1])
data[i].w = w_check ? vals[data_id + reduceWidth * 3] +
attn_mask[data_id + mask_offset + reduceWidth * 3]
: minus_infinity;
data[i].z = (((!triangular || (data_id + 2) <= seq_id)) &&
(data_id + 2) > window_stride && (data_id + 2) < sequence_length)
? (vals[data_id + 2])
: minus_infinity;
data[i].w = minus_infinity;
if (attn_mask) {
data[i].x += attn_mask[data_id + mask_offset];
if ((data_id + 1) < sequence_length)
data[i].y += attn_mask[data_id + mask_offset + 1];
if ((data_id + 2) < sequence_length)
data[i].z += attn_mask[data_id + mask_offset + 2];
}
}
max_val = (data[i].x > max_val ? data[i].x : max_val);
max_val = (data[i].y > max_val ? data[i].y : max_val);
max_val = (data[i].z > max_val ? data[i].z : max_val);
max_val = (data[i].w > max_val ? data[i].w : max_val);
} else {
data[i].x = minus_infinity;
data[i].y = minus_infinity;
data[i].z = minus_infinity;
data[i].w = minus_infinity;
data[i].x = x_check ? vals[data_id] : minus_infinity;
data[i].y = y_check ? vals[data_id + reduceWidth] : minus_infinity;
data[i].z = z_check ? vals[data_id + reduceWidth * 2] : minus_infinity;
data[i].w = w_check ? vals[data_id + reduceWidth * 3] : minus_infinity;
}
max_val = (data[i].x > max_val ? data[i].x : max_val);
max_val = (data[i].y > max_val ? data[i].y : max_val);
max_val = (data[i].z > max_val ? data[i].z : max_val);
max_val = (data[i].w > max_val ? data[i].w : max_val);
}
for (int i = 1; i < WARP_SIZE; i *= 2) {
......@@ -387,24 +390,38 @@ __global__ void attn_softmax_v2(float* vals,
sum += 1e-6;
for (int i = 0; i < iterations; i++) {
int data_id = i * (reduceWidth << 2) + (seq_lane << 2);
int data_id = i * (reduceWidth << 2) + (seq_lane);
if (data_id < sequence_length) {
if ((sequence_length - data_id) >= 4) {
vals[data_id] = data[i].x / sum;
vals[data_id + 1] = data[i].y / sum;
vals[data_id + 2] = data[i].z / sum;
vals[data_id + 3] = data[i].w / sum;
} else {
vals[data_id] = data[i].x / sum;
if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum;
if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum;
}
vals[data_id] = data[i].x / sum;
if ((data_id + reduceWidth) < sequence_length)
vals[data_id + reduceWidth] = data[i].y / sum;
if ((data_id + reduceWidth * 2) < sequence_length)
vals[data_id + reduceWidth * 2] = data[i].z / sum;
if ((data_id + reduceWidth * 3) < sequence_length)
vals[data_id + reduceWidth * 3] = data[i].w / sum;
}
}
}
}
// Launches attn_softmax_v2 for a compile-time iteration count. Expands at a
// site where `T`, `grid`, `block`, `stream`, and all kernel arguments are in
// scope (see the launch_attn_softmax_v2 host function, which dispatches on the
// runtime `iterations` value to one of these fixed-count instantiations).
#define LAUNCH_ATTN_SOFTMAX_V2(iterations) \
attn_softmax_v2<T, iterations><<<grid, block, 0, stream>>>(vals, \
mask, \
alibi, \
layer_scale, \
triangular, \
recompute, \
local_attention, \
window_size, \
total_count, \
heads, \
sequence_length, \
num_seq, \
head_offset, \
mask_stride, \
mp_size, \
reduce_width);
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
......@@ -423,34 +440,50 @@ void launch_attn_softmax_v2(T* vals,
int mp_size,
cudaStream_t stream)
{
int total_count = batch_size * heads * num_seq;
int warp_num = ATTN_THREADS / WARP_SIZE;
int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1);
reduce_width = (int)pow(2.0, floor(log2((float)(reduce_width)))) * WARP_SIZE;
dim3 grid_dim((total_count - 1) / (ATTN_THREADS / reduce_width) + 1);
dim3 block_dim(ATTN_THREADS);
const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1;
if (sequence_length <= 32768)
attn_softmax_v2<<<grid_dim, block_dim, 0, stream>>>(vals,
mask,
alibi,
layer_scale,
triangular,
recompute,
local_attention,
window_size,
total_count,
heads,
sequence_length,
num_seq,
head_offset,
mask_stride,
mp_size,
iterations,
reduce_width);
else
const int total_count = batch_size * heads * num_seq;
// Scheduling Overview
// 4 element unroll with power of 2 `reduce_width` threads to a ceiling of `attn_threads`
// Each block should be partitioned into as many `reduce_width` blocks
// as can be fit.
constexpr int attn_threads = 256;
constexpr int min_reduce_width = hw_warp_size;
constexpr int internal_unroll = 4;
// Handle internal unroll then round to next power of 2. Bump up to minimum granularity.
const int thread_steps_rounded =
next_pow2((sequence_length + internal_unroll - 1) / internal_unroll);
const int thread_steps_schedule =
(thread_steps_rounded < min_reduce_width) ? min_reduce_width : thread_steps_rounded;
// Bound reduce width to the number of threads
const int reduce_width = (thread_steps_schedule < attn_threads) ? thread_steps_schedule
: attn_threads;
// Scale for the excess
const int iterations = thread_steps_schedule / reduce_width;
// Should be safe since reduce_width is capped to attn_threads
const int partitions = attn_threads / reduce_width;
// Launch params
dim3 grid((total_count + partitions - 1) / partitions);
dim3 block(attn_threads);
if (sequence_length <= 32768) {
if (iterations == 1) {
LAUNCH_ATTN_SOFTMAX_V2(1);
} else if (iterations == 2) {
LAUNCH_ATTN_SOFTMAX_V2(2);
} else if (iterations == 4) {
LAUNCH_ATTN_SOFTMAX_V2(4);
} else if (iterations == 8) {
LAUNCH_ATTN_SOFTMAX_V2(8);
} else if (iterations == 16) {
LAUNCH_ATTN_SOFTMAX_V2(16);
} else if (iterations == 32) {
LAUNCH_ATTN_SOFTMAX_V2(32);
} else if (iterations == 64) {
LAUNCH_ATTN_SOFTMAX_V2(64);
}
} else
throw std::runtime_error("Unsupport Seq_Length!");
}
......@@ -470,6 +503,26 @@ template void launch_attn_softmax_v2(float* vals,
int mask_stride,
int mp_size,
cudaStream_t stream);
// Explicit instantiation of the bfloat16 softmax launcher; compiled only when
// the toolchain/architecture provides __nv_bfloat16 support.
#ifdef BF16_AVAILABLE
template void launch_attn_softmax_v2(__nv_bfloat16* vals,
__nv_bfloat16* mask,
__nv_bfloat16* alibi,
float layer_scale,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
int head_offset,
int mask_stride,
int mp_size,
cudaStream_t stream);
#endif
template void launch_attn_softmax_v2(__half* vals,
__half* mask,
__half* alibi,
......@@ -486,3 +539,53 @@ template void launch_attn_softmax_v2(__half* vals,
int mask_stride,
int mp_size,
cudaStream_t stream);
// Explicitly instantiates the __half attn_softmax_v2 kernel for one
// compile-time iteration count; invoked below via FOREACH_ITERATIONS so every
// count the launcher can dispatch to has a definition in this TU.
#define DEF_ATTN_SOFTMAX_V2_HALF(_iter) \
template __global__ void attn_softmax_v2<__half, _iter>(__half * vals, \
__half * mask, \
__half * alibi, \
float layer_scale, \
bool triangular, \
bool recompute, \
bool local_attention, \
int window_size, \
int total_count, \
int heads, \
int sequence_length, \
int num_seq, \
int head_offset, \
int mask_stride, \
int mp_size, \
int reduceWidth)
// bfloat16 counterpart of DEF_ATTN_SOFTMAX_V2_HALF: explicitly instantiates
// the attn_softmax_v2 kernel for one compile-time iteration count.
#define DEF_ATTN_SOFTMAX_V2_BF16(_iter) \
template __global__ void attn_softmax_v2<__nv_bfloat16, _iter>(__nv_bfloat16 * vals, \
__nv_bfloat16 * mask, \
__nv_bfloat16 * alibi, \
float layer_scale, \
bool triangular, \
bool recompute, \
bool local_attention, \
int window_size, \
int total_count, \
int heads, \
int sequence_length, \
int num_seq, \
int head_offset, \
int mask_stride, \
int mp_size, \
int reduceWidth)
// Applies `cb` to every iteration count the runtime dispatcher in
// launch_attn_softmax_v2 can select (powers of two, 1..64).
#define FOREACH_ITERATIONS(cb) \
cb(1); \
cb(2); \
cb(4); \
cb(8); \
cb(16); \
cb(32); \
cb(64)
// Emit explicit kernel instantiations for all supported iteration counts.
FOREACH_ITERATIONS(DEF_ATTN_SOFTMAX_V2_HALF);
#ifdef BF16_AVAILABLE
FOREACH_ITERATIONS(DEF_ATTN_SOFTMAX_V2_BF16);
#endif
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include "conversion_utils.h"
#include "inference_cuda_layers.h"
namespace cg = cooperative_groups;
// only used to avoid compilation error due to lack of definition.
#ifndef BF16_AVAILABLE
using __nv_bfloat162 = __half2;
#endif
// Bias add
__global__ void bias_add_transform_0213(float* output,
......@@ -74,11 +81,12 @@ __global__ void bias_add_transform_0213(float* output,
#define ATTN_H 3
#define MAX_SEQ_LINE 10
__global__ void bias_add_transform_0213(__half* output, // q
__half* k_cache,
__half* v_cache,
const __half* vals, // qkv
const __half* bias,
template <typename T>
__global__ void bias_add_transform_0213(T* output, // q
T* k_cache,
T* v_cache,
const T* vals, // qkv
const T* bias,
int hidden_dim,
int seq_length,
unsigned seq_offset,
......@@ -90,6 +98,8 @@ __global__ void bias_add_transform_0213(__half* output, // q
int head_ext,
int max_out_tokens)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
unsigned half_dim = (rotary_dim << 3) >> 1;
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
......@@ -107,8 +117,8 @@ __global__ void bias_add_transform_0213(__half* output, // q
float4 vals_arr;
float4 output_arr;
__half2* vals_half = reinterpret_cast<__half2*>(&vals_arr);
__half2* output_half = reinterpret_cast<__half2*>(&output_arr);
T2* vals_half = reinterpret_cast<T2*>(&vals_arr);
T2* output_half = reinterpret_cast<T2*>(&output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
float4* output_vec =
......@@ -128,17 +138,19 @@ __global__ void bias_add_transform_0213(__half* output, // q
int lane = d3 & 0x1f;
if (cnt < 2 && rotary_dim > 0 && d3 < rotary_dim) {
float4 q = vals_vec[d3];
__half2* q_h = reinterpret_cast<__half2*>(&q);
T2* q_h = reinterpret_cast<T2*>(&q);
if (rotate_every_two) {
#pragma unroll
for (int o = 0; o < 4; o++) {
float inv_freq = (float)(((d3 << 2) + o) * 2) / (float)(rotary_dim << 3);
inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id;
float q_data[2];
q_data[0] = (float)q_h[o].x;
q_data[1] = (float)q_h[o].y;
q_h[o].x = (__half)(-1.0 * q_data[1] * sinf(inv_freq) + q_data[0] * cosf(inv_freq));
q_h[o].y = (__half)(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq));
q_data[0] = conversion::to<float>(q_h[o].x);
q_data[1] = conversion::to<float>(q_h[o].y);
q_h[o].x = conversion::to<T>(-1.0 * q_data[1] * sinf(inv_freq) +
q_data[0] * cosf(inv_freq));
q_h[o].y =
conversion::to<T>(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq));
}
}
output_vec[d3] = q;
......@@ -187,16 +199,17 @@ void launch_bias_add_transform_0213<float>(float* output,
head_ext,
max_out_tokens);
}
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
T* vals,
T* vals1,
const T* vals2,
void launch_bias_add_transform_0213(T* output,
T* k_cache,
T* v_cache,
const T* vals,
const T* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int seq_length1,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
......@@ -204,25 +217,7 @@ void launch_bias_add_transform_0213(T* outputs,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
template <>
void launch_bias_add_transform_0213<__half>(__half* output,
__half* k_cache,
__half* v_cache,
const __half* vals,
const __half* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens)
int max_out_tokens)
{
hidden_dim >>= 3;
int head_ext = 1; // (hidden_dim - 1) / MAX_THREADS + 1;
......@@ -245,6 +240,44 @@ void launch_bias_add_transform_0213<__half>(__half* output,
max_out_tokens);
}
// Explicit instantiations of the templated bias_add_transform_0213 launcher
// for the half-precision types (bf16 guarded by toolchain availability).
#ifdef BF16_AVAILABLE
template void launch_bias_add_transform_0213(__nv_bfloat16* output,
__nv_bfloat16* k_cache,
__nv_bfloat16* v_cache,
const __nv_bfloat16* vals,
const __nv_bfloat16* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
#endif
template void launch_bias_add_transform_0213(__half* output,
__half* k_cache,
__half* v_cache,
const __half* vals,
const __half* bias,
int batch_size,
int seq_length,
unsigned seq_offset,
int all_tokens,
int hidden_dim,
int heads,
int rotary_dim,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream,
int trans_count,
int max_out_tokens);
// Bias add
__global__ void pad_add_transform_0213(float* output,
......@@ -257,17 +290,20 @@ __global__ void pad_add_transform_0213(float* output,
{
}
__global__ void pad_add_transform_0213(__half* output,
const __half* vals,
template <typename T>
__global__ void pad_add_transform_0213(T* output,
const T* vals,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
float4 ZERO;
const __half2 zero_h = __float2half2_rn(0.f);
__half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO);
const T2 zero_h = conversion::to<T2>(0.f);
T2* ZERO_h = reinterpret_cast<T2*>(&ZERO);
#pragma unroll
for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h;
......@@ -300,17 +336,6 @@ __global__ void pad_add_transform_0213(__half* output,
output_vec[d3] = ZERO;
}
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
// [B S C*H] - > C * [B A S N]
template <>
void launch_pad_add_transform_0213<float>(float* output,
......@@ -324,16 +349,17 @@ void launch_pad_add_transform_0213<float>(float* output,
cudaStream_t stream)
{
}
template <>
void launch_pad_add_transform_0213<__half>(__half* output,
const __half* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream)
template <typename T>
void launch_pad_add_transform_0213(T* output,
const T* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream)
{
hidden_dim >>= 3;
dim3 block_dim((padded_head_size >> 3), heads, 2);
......@@ -342,6 +368,28 @@ void launch_pad_add_transform_0213<__half>(__half* output,
output, vals, hidden_dim, seq_length, padded_seq_len, heads, padded_head_size >> 3);
}
// Explicit instantiations of the templated pad/transform launcher for the
// half-precision types (bf16 guarded by toolchain availability).
#ifdef BF16_AVAILABLE
template void launch_pad_add_transform_0213(__nv_bfloat16* output,
const __nv_bfloat16* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
#endif
template void launch_pad_add_transform_0213(__half* output,
const __half* vals,
int batch_size,
int hidden_dim,
int seq_length,
int padded_seq_len,
int heads,
int padded_head_size,
cudaStream_t stream);
// Bias add
template <typename T>
__global__ void bias_add_transform_0213(T* output,
......@@ -393,15 +441,17 @@ __global__ void bias_add_transform_0213<float>(float* output,
d2 * d2_out_stride + d3] = outputs;
}
template <>
__global__ void bias_add_transform_0213<__half>(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
template <typename T>
__global__ void bias_add_transform_0213(T* output,
const T* vals,
const T* bias,
int hidden_dim,
int seq_length,
int heads,
int head_ext)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
int d0_stride = hidden_dim * seq_length;
int d1_stride = hidden_dim;
int d2_stride = hidden_dim / heads;
......@@ -417,9 +467,9 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
float4 vals_arr;
float4 bias_arr;
float4 output_arr;
__half2* vals_half = reinterpret_cast<__half2*>(&vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(&bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(&output_arr);
T2* vals_half = reinterpret_cast<T2*>(&vals_arr);
T2* bias_half = reinterpret_cast<T2*>(&bias_arr);
T2* output_half = reinterpret_cast<T2*>(&output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
......@@ -448,13 +498,16 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
output_vec[d3] = output_arr;
}
__global__ void bias_add_transform_0213_v2(__half* output,
const __half* vals,
const __half* bias,
template <typename T>
__global__ void bias_add_transform_0213_v2(T* output,
const T* vals,
const T* bias,
int hidden_dim,
int seq_length,
int heads)
{
using T2 =
typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type;
__shared__ float4 in_data[3072];
int d0_stride = hidden_dim * seq_length;
......@@ -476,9 +529,9 @@ __global__ void bias_add_transform_0213_v2(__half* output,
float4 vals_arr[1];
float4 bias_arr[1];
float4 output_arr[1];
__half2* vals_half = reinterpret_cast<__half2*>(vals_arr);
__half2* bias_half = reinterpret_cast<__half2*>(bias_arr);
__half2* output_half = reinterpret_cast<__half2*>(output_arr);
T2* vals_half = reinterpret_cast<T2*>(vals_arr);
T2* bias_half = reinterpret_cast<T2*>(bias_arr);
T2* output_half = reinterpret_cast<T2*>(output_arr);
const float4* vals_vec = reinterpret_cast<const float4*>(vals);
const float4* bias_vec = reinterpret_cast<const float4*>(bias);
......@@ -518,6 +571,22 @@ __global__ void bias_add_transform_0213_v2(__half* output,
}
}
// Explicit kernel instantiations of bias_add_transform_0213_v2 for the
// half-precision types (bf16 guarded by toolchain availability).
template __global__ void bias_add_transform_0213_v2(__half* output,
const __half* vals,
const __half* bias,
int hidden_dim,
int seq_length,
int heads);
#ifdef BF16_AVAILABLE
template __global__ void bias_add_transform_0213_v2(__nv_bfloat16* output,
const __nv_bfloat16* vals,
const __nv_bfloat16* bias,
int hidden_dim,
int seq_length,
int heads);
#endif
template <typename T>
__global__ void transform4d_0213(T* out,
const T* in,
......@@ -559,13 +628,13 @@ __global__ void transform4d_0213<float>(float* out,
}
}
template <>
__global__ void transform4d_0213<__half>(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
template <typename T>
__global__ void transform4d_0213(T* out,
const T* in,
int heads,
int seq_length,
int hidden_dim,
int head_ext)
{
int d0_stride = hidden_dim * (seq_length / head_ext);
int d1_stride = hidden_dim;
......@@ -593,11 +662,8 @@ __global__ void transform4d_0213<__half>(__half* out,
out_vec[d3] = in_vec[d3];
}
__global__ void transform4d_0213_v2(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim)
template <typename T>
__global__ void transform4d_0213_v2(T* out, const T* in, int heads, int seq_length, int hidden_dim)
{
__shared__ float4 in_data[3072];
......@@ -641,6 +707,20 @@ __global__ void transform4d_0213_v2(__half* out,
}
}
// Explicit kernel instantiations of transform4d_0213_v2 for the
// half-precision types (bf16 guarded by toolchain availability).
#ifdef BF16_AVAILABLE
template __global__ void transform4d_0213_v2(__nv_bfloat16* out,
const __nv_bfloat16* in,
int heads,
int seq_length,
int hidden_dim);
#endif
template __global__ void transform4d_0213_v2(__half* out,
const __half* in,
int heads,
int seq_length,
int hidden_dim);
// 3 * [B A S N] - > [B S C*H]
template <>
void launch_transform4d_0213<float>(float* out,
......@@ -659,20 +739,40 @@ void launch_transform4d_0213<float>(float* out,
<<<grid_dims, block_dims, 0, stream>>>(out, in, heads, seq_length, hidden_dim, 1);
}
template <>
void launch_transform4d_0213<__half>(__half* out,
const __half* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count)
template <typename T>
void launch_transform4d_0213<T>(T* out,
const T* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count)
{
hidden_dim >>= 3;
int head_ext = (hidden_dim - 1) / MAX_THREADS + 1;
dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext));
dim3 block_dims(hidden_dim / heads, (heads / head_ext));
transform4d_0213<__half>
<<<grid_dims, block_dims, 0, stream>>>(out, in, heads, seq_length, hidden_dim, head_ext);
transform4d_0213<<<grid_dims, block_dims, 0, stream>>>(
out, in, heads, seq_length, hidden_dim, head_ext);
}
#ifdef BF16_AVAILABLE
template void launch_transform4d_0213(__nv_bfloat16* out,
const __nv_bfloat16* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
#endif
template void launch_transform4d_0213(__half* out,
const __half* in,
int batch_size,
int heads,
int seq_length,
int hidden_dim,
cudaStream_t stream,
int trans_count);
#pragma once
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime_api.h>

#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
cudaError_t error_code = callstr; \
if (error_code != cudaSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
return std::max(
std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
// Use at least 1 block, since CUDA does not allow empty block
1);
}
class Context {
public:
Context() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0)
{
curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(_gen, 123);
if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
auto message = std::string("Fail to create cublas handle.");
std::cerr << message << std::endl;
throw std::runtime_error(message);
}
#ifndef __HIP_PLATFORM_HCC__
cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
cudaEventCreate(&_comp1_event, (cudaEventDisableTiming | cudaEventBlockingSync));
cudaEventCreate(&_comp2_event, (cudaEventDisableTiming | cudaEventBlockingSync));
cudaEventCreate(&_comp_event, (cudaEventDisableTiming | cudaEventBlockingSync));
cudaEventCreate(&_comm_event, (cudaEventDisableTiming | cudaEventBlockingSync));
#else
cudaEventCreate(&_comp1_event);
cudaEventCreate(&_comp2_event);
cudaEventCreate(&_comp_event);
cudaEventCreate(&_comm_event);
#endif
}
virtual ~Context()
{
cublasDestroy(_cublasHandle);
cudaFree(_workspace);
cudaEventDestroy(_comp1_event);
cudaEventDestroy(_comp2_event);
cudaEventDestroy(_comp_event);
cudaEventDestroy(_comm_event);
}
static Context& Instance()
{
static Context _ctx;
return _ctx;
}
void GenWorkSpace(size_t size)
{
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, size);
} else if (_workSpaceSize < size) {
cudaFree(_workspace);
cudaMalloc(&_workspace, size);
}
_workSpaceSize = size;
}
cudaEvent_t GetCompEvent(int id) { return id == 1 ? _comp1_event : _comp2_event; }
size_t get_workspace_size() const { return _workSpaceSize; }
void* GetWorkSpace() { return _workspace; }
inline unsigned new_token(unsigned layer_id)
{
if (layer_id == 0) _token_length++;
return _token_length;
}
inline void reset_tokens(unsigned initial_tokens = 0)
{
_num_tokens = initial_tokens;
} //_token_length = 0; }
inline unsigned current_tokens() const { return _num_tokens; }
inline void advance_tokens() { _num_tokens++; }
curandGenerator_t& GetRandGenerator() { return _gen; }
cudaStream_t GetCommStream(bool async_op = false)
{
if (!_comm_stream)
_comm_stream = async_op ? at::cuda::getStreamFromPool(true)
: at::cuda::getCurrentCUDAStream();
return _comm_stream;
}
cudaStream_t GetCurrentStream(bool other_stream = false)
{
// get current pytorch stream.
if (other_stream) {
if (!_stream) _stream = at::cuda::getStreamFromPool(true);
return _stream;
}
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
return stream;
}
cublasHandle_t GetCublasHandle() { return _cublasHandle; }
std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
{
uint64_t offset = _curr_offset;
_curr_offset += offset_inc;
return std::pair<uint64_t, uint64_t>(_seed, offset);
}
void SetSeed(uint64_t new_seed) { _seed = new_seed; }
const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }
inline void SynchComp()
{
cudaEventRecord(_comp_event, _comp_stream);
cudaStreamWaitEvent(_comm_stream, _comp_event, 0);
}
inline void SynchComm()
{
cudaEventRecord(_comm_event, _comm_stream);
cudaStreamWaitEvent(_comp_stream, _comm_event, 0);
}
private:
curandGenerator_t _gen;
cublasHandle_t _cublasHandle;
cudaEvent_t _comp_event;
cudaEvent_t _comm_event;
void* _workspace;
uint64_t _seed;
uint64_t _curr_offset;
size_t _workSpaceSize;
cudaEvent_t _comp1_event;
cudaEvent_t _comp2_event;
cudaStream_t _stream;
unsigned _token_length;
unsigned _num_tokens;
std::vector<std::array<int, 3>> _gemm_algos;
cudaStream_t _comp_stream;
cudaStream_t _comm_stream;
std::unordered_map<int, int> _world_sizes;
};
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
#include <mma.h>
#endif
#include <stdio.h>
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f32_r,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f32_r,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
C,
rocblas_datatype_f32_r,
m,
C,
rocblas_datatype_f32_r,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_32F,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_32F,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
C,
CUDA_R_32F,
m,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
rocblas_operation transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f16_r,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f16_r,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
(void*)C,
rocblas_datatype_f16_r,
m,
(void*)C,
rocblas_datatype_f16_r,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
m,
n,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_16F,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_16F,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
(void*)C,
CUDA_R_16F,
m,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
rocblas_gemm_algo algo)
#else
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const float* A,
const float* B,
float* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
rocblas_datatype_f32_r,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f32_r,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f32_r,
m,
stride_C,
C,
rocblas_datatype_f32_r,
m,
stride_C,
batch,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
CUDA_R_32F,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_32F,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_32F,
m,
stride_C,
batch,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
batch,
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
rocblas_gemm_algo algo)
#else
int cublas_strided_batched_gemm(cublasHandle_t handle,
int m,
int n,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
int stride_B,
int stride_C,
int batch,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
rocblas_datatype_f16_r,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f16_r,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f16_r,
m,
stride_C,
C,
rocblas_datatype_f16_r,
m,
stride_C,
batch,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
m,
n,
k,
alpha,
A,
CUDA_R_16F,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_16F,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_16F,
m,
stride_C,
batch,
CUDA_R_32F,
algo);
#endif
#ifdef __HIP_PLATFORM_HCC__
if (status != rocblas_status_success) {
#else
if (status != CUBLAS_STATUS_SUCCESS) {
#endif
fprintf(stderr,
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
m,
n,
k,
(int)status);
return EXIT_FAILURE;
}
return 0;
}
#pragma once
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
#define MAX_WARP_NUM 32
#define WARP_SIZE 32
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
T* mask,
bool triangular,
bool recompute,
bool local_attention,
int window_size,
int batch_size,
int heads,
int num_seq,
int sequence_length,
float scale,
cudaStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(T* input,
const T* bias,
int intermediate_size,
int batch_size,
cudaStream_t stream);
template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);
template <typename T>
void launch_bias_residual(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int hidden_dim,
int mp_size,
cudaStream_t stream);
template <typename T>
void launch_layer_norm(T* out,
T* vals,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
cudaStream_t stream);
template <typename T>
void launch_residual_layer_norm(T* norm,
T* res_add,
T* vals,
T* residual,
const T* bias,
const T* gamma,
const T* beta,
float epsilon,
int batch_size,
int hidden_dim,
bool preLN,
bool mlp_after_attn,
cudaStream_t stream);
template <typename T>
void launch_dequantize(T* output,
const int8_t* input,
const float* qscale,
unsigned output_size,
unsigned hidden_dim,
unsigned groups,
unsigned merge_count,
cudaStream_t stream);
template <typename T>
void launch_gptj_residual_add(T* input,
T* output,
T* attn,
T* bias,
T* attn_bias,
int batch,
int head_size,
int mp_size,
cudaStream_t stream);
template <typename T>
void launch_apply_rotary_pos_emb(T* mixed_query,
T* key_layer,
unsigned head_size,
unsigned seq_len,
unsigned rotary_dim,
unsigned offset,
unsigned num_heads,
unsigned batch,
bool rotate_half,
bool rotate_every_two,
cudaStream_t stream);
template <typename T>
void launch_moe_res_matmul(T* residual,
T* coef,
T* mlp_out,
int seq_len,
int hidden_dim,
cudaStream_t stream);
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#pragma once
......@@ -45,17 +46,20 @@ inline int DS_GET_BLOCKS(const int N)
1);
}
class Context {
class InferenceContext {
public:
Context()
InferenceContext()
: _workspace(nullptr),
_seed(42),
_curr_offset(0),
_stream(0),
_free_memory_size(0),
_num_tokens(1),
_attention_unfused_workspace_offset(0)
_attention_unfused_workspace_offset(0),
_workSpaceSize(0)
{
_workSpaceSize = 0;
_workspace = 0;
if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
auto message = std::string("Fail to create cublas handle.");
std::cerr << message << std::endl;
......@@ -70,7 +74,7 @@ public:
cudaEventCreate(&_comm_event);
}
virtual ~Context()
virtual ~InferenceContext()
{
cublasDestroy(_cublasHandle);
cudaFree(_workspace);
......@@ -80,9 +84,9 @@ public:
cudaEventDestroy(_comm_event);
}
static Context& Instance()
static InferenceContext& Instance()
{
static Context _ctx;
static InferenceContext _ctx;
return _ctx;
}
......@@ -95,7 +99,8 @@ public:
const bool& external_cache,
const size_t& elem_size,
const unsigned& rank,
unsigned max_out_tokens)
unsigned max_out_tokens,
unsigned min_out_tokens)
{
size_t total_size;
if (!_free_memory_size) { cudaMemGetInfo(&_free_memory_size, &total_size); }
......@@ -106,9 +111,9 @@ public:
const int padded_head_size = head_size <= 32 ? 32 : (head_size <= 64 ? 64 : 128);
const int effective_head_size = (head_size > 128) ? head_size : padded_head_size;
size_t activation_size = 16 * (num_heads * effective_head_size) * batch_size;
size_t activation_size = 10 * (num_heads * effective_head_size) * batch_size;
// Other sequence length dimension is added when the final workSpaceSize is calculated
size_t temp_size = batch_size * num_heads * max_out_tokens * 2;
size_t temp_size = batch_size * (num_heads / mp_size) * max_out_tokens;
size_t cache_size =
num_layers * batch_size * ((num_heads * effective_head_size) / mp_size) * 2;
size_t minimal_requirements =
......@@ -128,25 +133,37 @@ public:
: (activation_size + temp_size + cache_size))) *
_max_seq_len * elem_size;
temp_size *= _max_seq_len * elem_size;
if (rank == 0 && !_workspace)
if (_max_seq_len < min_out_tokens) {
printf(
"Allocatable workspace available (%d tokens) is less than minimum requested "
"workspace (%d tokens)\n",
_max_seq_len,
min_out_tokens);
throw std::runtime_error("Workspace can't be allocated, not enough memory");
}
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, workSpaceSize);
} else if (_workSpaceSize < workSpaceSize) {
cudaFree(_workspace);
cudaMalloc(&_workspace, workSpaceSize);
}
if (rank == 0 && (!_workspace || _workSpaceSize < workSpaceSize))
printf(
"------------------------------------------------------\n"
"Free memory : %f (GigaBytes) \n"
"Total memory: %f (GigaBytes) \n"
"Requested memory: %f (GigaBytes) \n"
"Setting maximum total tokens (input + output) to %lu \n"
"WorkSpace: %p \n"
"------------------------------------------------------\n",
(float)_free_memory_size / GIGABYTE,
(float)total_size / GIGABYTE,
(float)workSpaceSize / GIGABYTE,
_max_seq_len);
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, workSpaceSize);
} else if (_workSpaceSize < workSpaceSize) {
cudaFree(_workspace);
cudaMalloc(&_workspace, workSpaceSize);
}
_max_seq_len,
_workspace);
if (!_workspace) {
printf("Requested:\t%lu\nFree:\t%lu\nTotal:\t%lu\n",
......@@ -202,6 +219,17 @@ public:
return stream;
}
void release_workspace()
{
cudaFree(_workspace);
_workspace = nullptr;
}
bool retake_workspace()
{
if (_workspace != nullptr || _workSpaceSize == 0) return true;
cudaMalloc(&_workspace, _workSpaceSize);
return _workspace != nullptr;
}
cublasHandle_t GetCublasHandle() { return _cublasHandle; }
std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
......
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#ifndef __HIP_PLATFORM_HCC__
......@@ -105,6 +107,7 @@ int cublas_gemm_ex(cublasHandle_t handle,
return 0;
}
template <typename T>
#ifdef __HIP_PLATFORM_HCC__
int cublas_gemm_ex(rocblas_handle handle,
rocblas_operation transa,
......@@ -114,9 +117,9 @@ int cublas_gemm_ex(rocblas_handle handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
rocblas_gemm_algo algo)
#else
int cublas_gemm_ex(cublasHandle_t handle,
......@@ -127,13 +130,15 @@ int cublas_gemm_ex(cublasHandle_t handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
cublasGemmAlgo_t algo)
#endif
{
#ifdef __HIP_PLATFORM_HCC__
constexpr auto rocblas_dtype_16 = std::is_same<T, __half>::value ? rocblas_datatype_f16_r
: rocblas_datatype_bf16_r;
rocblas_status status = rocblas_gemm_ex(handle,
transa,
transb,
......@@ -142,23 +147,24 @@ int cublas_gemm_ex(cublasHandle_t handle,
k,
(const void*)alpha,
(const void*)A,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(transa == rocblas_operation_none) ? m : k,
(const void*)B,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(transb == rocblas_operation_none) ? k : n,
(const void*)beta,
(void*)C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
(void*)C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
rocblas_datatype_f32_r,
algo,
0,
0);
#else
constexpr auto cublas_dtype_16 = std::is_same<T, __half>::value ? CUDA_R_16F : CUDA_R_16BF;
cublasStatus_t status = cublasGemmEx(handle,
transa,
transb,
......@@ -167,14 +173,14 @@ int cublas_gemm_ex(cublasHandle_t handle,
k,
(const void*)alpha,
(const void*)A,
CUDA_R_16F,
cublas_dtype_16,
(transa == CUBLAS_OP_N) ? m : k,
(const void*)B,
CUDA_R_16F,
cublas_dtype_16,
(transb == CUBLAS_OP_N) ? k : n,
(const void*)beta,
(void*)C,
CUDA_R_16F,
cublas_dtype_16,
m,
CUDA_R_32F,
algo);
......@@ -306,6 +312,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
return 0;
}
template <typename T>
#ifdef __HIP_PLATFORM_HCC__
int cublas_strided_batched_gemm(rocblas_handle handle,
int m,
......@@ -313,9 +320,9 @@ int cublas_strided_batched_gemm(rocblas_handle handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
rocblas_operation op_A,
rocblas_operation op_B,
int stride_A,
......@@ -330,9 +337,9 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
int k,
const float* alpha,
const float* beta,
const __half* A,
const __half* B,
__half* C,
const T* A,
const T* B,
T* C,
cublasOperation_t op_A,
cublasOperation_t op_B,
int stride_A,
......@@ -343,6 +350,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
#endif
{
#ifdef __HIP_PLATFORM_HCC__
constexpr auto rocblas_dtype_16 = std::is_same<T, __half>::value ? rocblas_datatype_f16_r
: rocblas_datatype_bf16_r;
rocblas_status status =
rocblas_gemm_strided_batched_ex(handle,
op_A,
......@@ -352,20 +361,20 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
k,
alpha,
A,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(op_A == rocblas_operation_none) ? m : k,
stride_A,
B,
rocblas_datatype_f16_r,
rocblas_dtype_16,
(op_B == rocblas_operation_none) ? k : n,
stride_B,
beta,
C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
stride_C,
C,
rocblas_datatype_f16_r,
rocblas_dtype_16,
m,
stride_C,
batch,
......@@ -374,6 +383,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
0,
0);
#else
constexpr auto cublas_dtype_16 = std::is_same<T, __half>::value ? CUDA_R_16F : CUDA_R_16BF;
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
op_A,
op_B,
......@@ -382,16 +392,16 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
k,
alpha,
A,
CUDA_R_16F,
cublas_dtype_16,
(op_A == CUBLAS_OP_N) ? m : k,
stride_A,
B,
CUDA_R_16F,
cublas_dtype_16,
(op_B == CUBLAS_OP_N) ? k : n,
stride_B,
beta,
C,
CUDA_R_16F,
cublas_dtype_16,
m,
stride_C,
batch,
......
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#pragma once
#include "ds_kernel_utils.h"
#include <cuda.h>
#ifdef BF16_AVAILABLE
#include <cuda_bf16.h>
#endif
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
......
/*
Copyright The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include "custom_cuda_layers.h"
......
/*
Copyright The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include <math.h>
#include "custom_cuda_layers.h"
......
/*
Copyright The Microsoft DeepSpeed Team
*/
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
#include "custom_cuda_layers.h"
......
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0
// DeepSpeed Team
/*
Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
Copyright NVIDIA/apex
This file is adapted from fused adam in NVIDIA/apex, commit a109f85
*/
#include <torch/csrc/utils/tensor_flatten.h>
......
ref: refs/heads/ds-v0.8.2-rocm
[core]
repositoryformatversion = 0
filemode = true
bare = true
[remote "origin"]
url = http://developer.hpccube.com/codes/aicomponent/deepspeed.git
fetch = +refs/*:refs/*
mirror = true
Unnamed repository; edit this file 'description' to name the repository.
#!/bin/sh
#
# An example hook script to check the commit log message taken by
# applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit. The hook is
# allowed to edit the commit message file.
#
# To enable this hook, rename this file to "applypatch-msg".
. git-sh-setup
test -x "$GIT_DIR/hooks/commit-msg" &&
exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"}
:
#!/bin/sh
#
# An example hook script to check the commit log message.
# Called by "git commit" with one argument, the name of the file
# that has the commit message. The hook should exit with non-zero
# status after issuing an appropriate message if it wants to stop the
# commit. The hook is allowed to edit the commit message file.
#
# To enable this hook, rename this file to "commit-msg".
# Uncomment the below to add a Signed-off-by line to the message.
# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
# hook is more suited to it.
#
# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
# This example catches duplicate Signed-off-by lines.
test "" = "$(grep '^Signed-off-by: ' "$1" |
sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || {
echo >&2 Duplicate Signed-off-by lines.
exit 1
}
#!/bin/sh
#
# An example hook script to prepare a packed repository for use over
# dumb transports.
#
# To enable this hook, rename this file to "post-update".
exec git update-server-info
#!/bin/sh
#
# An example hook script to verify what is about to be committed
# by applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-applypatch".
. git-sh-setup
test -x "$GIT_DIR/hooks/pre-commit" &&
exec "$GIT_DIR/hooks/pre-commit" ${1+"$@"}
:
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git commit" with no arguments. The hook should
# exit with non-zero status after issuing an appropriate message if
# it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-commit".
if git rev-parse --verify HEAD >/dev/null 2>&1
then
against=HEAD
else
# Initial commit: diff against an empty tree object
against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
fi
# If you want to allow non-ascii filenames set this variable to true.
allownonascii=$(git config hooks.allownonascii)
# Redirect output to stderr.
exec 1>&2
# Cross platform projects tend to avoid non-ascii filenames; prevent
# them from being added to the repository. We exploit the fact that the
# printable range starts at the space character and ends with tilde.
if [ "$allownonascii" != "true" ] &&
# Note that the use of brackets around a tr range is ok here, (it's
# even required, for portability to Solaris 10's /usr/bin/tr), since
# the square bracket bytes happen to fall in the designated range.
test $(git diff --cached --name-only --diff-filter=A -z $against |
LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
then
echo "Error: Attempt to add a non-ascii file name."
echo
echo "This can cause problems if you want to work"
echo "with people on other platforms."
echo
echo "To be portable it is advisable to rename the file ..."
echo
echo "If you know what you are doing you can disable this"
echo "check using:"
echo
echo " git config hooks.allownonascii true"
echo
exit 1
fi
# If there are whitespace errors, print the offending file names and fail.
exec git diff-index --check --cached $against --
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment