push rocm deepspeed v0.3.13

eadbbe09 · 401qingkong · ab5534fc · eadbbe09 · eadbbe09 · eadbbe09
Commit eadbbe09 authored Apr 25, 2021 by 401qingkong
20 changed files
--- a/csrc/includes/hip/ds_transformer_hip.h
+++ b/csrc/includes/hip/ds_transformer_hip.h
+#pragma once
+#include <hip/hip_runtime_api.h>
+#include <hiprand.h>
+#include <memory>
+#include <vector>
+#include "rocblas.h"
+#include "hip/hip_runtime.h"
+#include "hip/dropout.h"
+#include "hip/feed_forward.h"
+#include "hip/gelu.h"
+#include "hip/general_kernels.h"
+#include "hip/normalize_layer.h"
+#include "hip/softmax.h"
+#include "hip/strided_batch_gemm.h"
+// Bundle of five GEMM algorithm ids (qkv, inter, output, batch1, batch2).
+// Every id starts out as -1 on default construction.
+struct BertGemmAlgos {
+    int m_gemm_qkv_algo = -1;
+    int m_gemm_inter_algo = -1;
+    int m_gemm_output_algo = -1;
+    int m_gemm_batch1_algo = -1;
+    int m_gemm_batch2_algo = -1;
+    // Kept user-provided so the type remains a non-aggregate; the member
+    // initializers above supply the -1 defaults.
+    BertGemmAlgos() {}
+};
+// Transformer layer declaration, templated on the element type T.
+// Only the interface and the data members are declared in this header; every
+// non-trivial member function is defined elsewhere.
+// NOTE(review): the raw pointer arguments are presumably device buffers --
+// confirm against the implementation file.
+template <typename T>
+class BertTransformerLayer {
+public:
+    // Builds a layer from its dimensions, dropout ratios, layer-norm epsilon,
+    // the per-GEMM algorithm triples and the checkpoint / stochastic flags.
+    BertTransformerLayer(int layer_id,
+                         int batch_size,
+                         int hidden_size,
+                         int num_heads,
+                         int intermediate_size,
+                         int seq_length,
+                         float attn_dropout_ratio,
+                         float hidden_output_dropout_ratio,
+                         float layer_norm_eps,
+                         bool pre_or_postLayerNorm,
+                         const std::vector<std::array<int, 3>>& gemm_algos,
+                         bool attn_dropout_checkpoint,
+                         bool normalize_invertible,
+                         bool gelu_checkpoint,
+                         bool stochastic_mode);
+    virtual ~BertTransformerLayer();
+    // Forward pass. The const pointers are inputs (activations, mask, weights
+    // and biases); the non-const pointers receive the output and the
+    // intermediate buffers that Backward takes back as const inputs.
+    void Forward(int bsz,
+                 const T* input_ptr,
+                 const T* input_mask_ptr,
+                 const T* attn_qkvw_ptr,
+                 const T* attn_qkvb_ptr,
+                 const T* attn_ow_ptr,
+                 const T* attn_ob_ptr,
+                 const T* attn_nw_ptr,
+                 const T* attn_nb_ptr,
+                 const T* inter_w_ptr,
+                 const T* inter_b_ptr,
+                 const T* output_w_ptr,
+                 const T* output_b_ptr,
+                 const T* norm_w_ptr,
+                 const T* norm_b_ptr,
+                 T* out_ptr,
+                 T* inp_norm_ptr,
+                 T* q_tf_ptr,
+                 T* k_tf_ptr,
+                 T* v_tf_ptr,
+                 T* softmax_output_ptr,
+                 T* ctx_bufB_ptr,
+                 T* attn_o_inp_ptr,
+                 T* add_res_ptr,
+                 T* ff1_inp_ptr,
+                 T* gelu_inp_ptr,
+                 T* ff2_inp_ptr);
+    // Backward pass. Takes the output gradient, the buffers saved by Forward
+    // and the parameters as const inputs; writes the input gradient and one
+    // gradient per parameter through the grad_* pointers.
+    void Backward(int bsz,
+                  const T* grad_output_ptr,
+                  const T* input_ptr,
+                  const T* output_ptr,
+                  const T* inp_norm_ptr,
+                  const T* q_tf_ptr,
+                  const T* k_tf_ptr,
+                  const T* v_tf_ptr,
+                  const T* softmax_output_ptr,
+                  const T* ctx_bufB_ptr,
+                  const T* attn_o_inp_ptr,
+                  const T* add_res_ptr,
+                  const T* ff1_inp_ptr,
+                  const T* gelu_inp_ptr,
+                  const T* ff2_inp_ptr,
+                  const T* input_mask_ptr,
+                  const T* attn_qkvw_ptr,
+                  const T* attn_ow_ptr,
+                  const T* attn_nw_ptr,
+                  const T* attn_nb_ptr,
+                  const T* inter_w_ptr,
+                  const T* inter_b_ptr,
+                  const T* output_w_ptr,
+                  const T* norm_w_ptr,
+                  const T* norm_b_ptr,
+                  T* grad_input_ptr,
+                  T* grad_attn_qkvw_ptr,
+                  T* grad_attn_qkvb_ptr,
+                  T* grad_attn_ow_ptr,
+                  T* grad_attn_ob_ptr,
+                  T* grad_attn_nw_ptr,
+                  T* grad_attn_nb_ptr,
+                  T* grad_inter_w_ptr,
+                  T* grad_inter_b_ptr,
+                  T* grad_output_w_ptr,
+                  T* grad_output_b_ptr,
+                  T* grad_norm_w_ptr,
+                  T* grad_norm_b_ptr);
+    // Hands the layer its dropout-mask buffers and the layer-norm
+    // variance / mean buffers.
+    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
+                                uint8_t* attn_output_dropout_mask_ptr,
+                                uint8_t* layer_output_dropout_mask_ptr,
+                                T* layer_norm_var,
+                                T* layer_norm_mean,
+                                T* attn_layer_norm_var,
+                                T* attn_layer_norm_mean);
+    // Read-only accessors for the stored configuration.
+    int GetBatchSize() const { return _batch_size; }
+    int GetNumHeads() const { return _heads; }
+    int GetSeqLength() const { return _seq_length; }
+    int GetIntermediateSize() const { return _intermediate_size; }
+    int GetHiddenSize() const { return _hidden_size; }
+    bool IsTrainingMode() const { return _training; }
+    bool GeluCheckpoint() const { return _gelu_checkpoint; }
+    // Mutators; both are defined outside this header.
+    void SetSeqLength(int seq_len);
+    void SetTrainingMode(bool training);
+private:
+    void Initialize();
+    size_t getWorkspaceSize(int maxBatchSize) const;
+    // Params
+    int _layer_id;
+    int _batch_size;
+    int _hidden_size;
+    int _heads;
+    int _size_per_head;
+    int _intermediate_size;
+    int _seq_length;
+    bool _pre_or_postLayerNorm;
+    rocblas_handle _cublasHandle;
+    hipStream_t _stream;
+    // Sub-layers. Declaration order is the construction order, so it must stay
+    // in step with the constructor defined elsewhere.
+    FeedForward<T> _qkv_linear;
+    FeedForward<T> _attn_out_linear;
+    Normalize_Layer<T> _attn_layer_norm;
+    Normalize_Layer<T> _layer_norm;
+    Normalize_Layer<T>* _last_normalize;
+    FeedForward<T> _ff1;
+    FeedForward<T> _ff2;
+    Softmax<T> _softmax;
+    Gelu<T> _gelu;
+    Dropout<T> _attn_prob_dropout;
+    Dropout<T> _attn_output_dropout;
+    Dropout<T> _layer_output_dropout;
+    StridedBatchGemm<T> _attn_scores;
+    StridedBatchGemm<T> _attn_context;
+    bool _training;
+    // Memory saving flags
+    bool _attn_dropout_checkpoint;
+    bool _normalize_invertible;
+    bool _gelu_checkpoint;
+    // High performance flags
+    bool _stochastic_mode;
+};
--- a/csrc/includes/hip/feed_forward.h
+++ b/csrc/includes/hip/feed_forward.h
+#ifndef __FEEDFORWARD_H__
+#define __FEEDFORWARD_H__
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <stdio.h>
+#include "hip/custom_hip_layers.h"
+// Linear layer built on the project's cublas_gemm_ex wrapper.
+// Forward issues a single GEMM (no bias is added here); Backward issues the
+// weight-gradient GEMM, the optional input-gradient GEMM and the bias-gradient
+// kernel launch.
+template <typename T>
+class FeedForward {
+public:
+    // Problem sizes plus three GEMM algorithm ids: gemm_algos[0] is used by
+    // Forward, [1] by the weight-gradient GEMM and [2] by the input-gradient GEMM.
+    struct Config {
+        int batchSize, outputSize;
+        int inputSize;
+        std::array<int, 3> gemm_algos;
+        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
+            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
+        {
+        }
+    };
+    FeedForward(Config config) : config_(config) {}
+    ~FeedForward() {}
+    // Writes `out` from `weights` (transposed operand) and `input_ptr` with a
+    // GEMM of size m=outputSize, n=bsz, k=inputSize (alpha=1, beta=0).
+    void Forward(int bsz,
+                 const T* input_ptr,
+                 const T* weights,
+                 T* out,
+                 rocblas_handle& _cublasHandle)
+    {
+        float alpha = T(1.);
+        float beta = T(0.);
+        cublas_gemm_ex(_cublasHandle,
+                       rocblas_operation_transpose,
+                       rocblas_operation_none,
+                       config_.outputSize,
+                       bsz,
+                       config_.inputSize,
+                       &alpha,
+                       &beta,
+                       weights,
+                       input_ptr,
+                       out,
+                       rocblas_gemm_algo(config_.gemm_algos[0]));
+    }
+    // Computes the gradients for one Forward call.
+    //   weights_grad  - written by a GEMM over input_ptr and out_grad.
+    //   inp_grad_out  - written by a GEMM over weights and out_grad; the GEMM is
+    //                   skipped when this pointer is nullptr (its default).
+    //   bias_grad     - written by launch_fuse_transpose_bias_kernel on `stream`.
+    //   out_grad_trans_out - accepted for interface compatibility; unused here.
+    void Backward(int bsz,
+                  const T* out_grad,
+                  const T* input_ptr,
+                  const T* weights,
+                  T* weights_grad,
+                  T* bias_grad,
+                  rocblas_handle& _cublasHandle,
+                  hipStream_t& stream,
+                  T* inp_grad_out = nullptr,
+                  T* out_grad_trans_out = nullptr)
+    {
+        float alpha = (T)1.0, beta = (T)0.0;
+        cublas_gemm_ex(_cublasHandle,
+                       rocblas_operation_none,
+                       rocblas_operation_transpose,
+                       config_.inputSize,
+                       config_.outputSize,
+                       bsz,
+                       &alpha,
+                       &beta,
+                       input_ptr,
+                       out_grad,
+                       weights_grad,
+                       rocblas_gemm_algo(config_.gemm_algos[1]));
+        // The parameter defaults to nullptr, so never hand a null output
+        // buffer to the GEMM: treat "no buffer" as "input gradient not wanted".
+        if (inp_grad_out != nullptr) {
+            cublas_gemm_ex(_cublasHandle,
+                           rocblas_operation_none,
+                           rocblas_operation_none,
+                           config_.inputSize,
+                           bsz,
+                           config_.outputSize,
+                           &alpha,
+                           &beta,
+                           weights,
+                           out_grad,
+                           inp_grad_out,
+                           rocblas_gemm_algo(config_.gemm_algos[2]));
+        }
+        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
+    }
+private:
+    Config config_;
+};
+#endif
--- a/csrc/includes/hip/gelu.h
+++ b/csrc/includes/hip/gelu.h
+#pragma once
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <stdio.h>
+#include "hip/custom_hip_layers.h"
+// Thin wrapper that forwards to the launch_bias_gelu / launch_d_gelu
+// launchers, supplying the intermediate size stored in its Config.
+template <typename T>
+class Gelu {
+public:
+    // Holds the single size parameter passed on to both launchers.
+    struct Config {
+        uint32_t intermediate_size;
+        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
+    };
+    Gelu(const Config& config) : _config(config) {}
+    virtual ~Gelu() = default;
+    // Forwards input_buf, bias and output to launch_bias_gelu on `stream`.
+    void ForwardWithBiasAdd(int bsz,
+                            const T* input_buf,
+                            const T* bias,
+                            T* output,
+                            hipStream_t stream)
+    {
+        launch_bias_gelu<T>(
+            input_buf, bias, output, _config.intermediate_size, bsz, stream);
+    }
+    // Forwards d_output, input_buf and bias to launch_d_gelu on `stream`.
+    void Backward(int bsz,
+                  T* d_output,
+                  const T* input_buf,
+                  const T* bias,
+                  hipStream_t stream)
+    {
+        launch_d_gelu<T>(
+            d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
+    }
+private:
+    Config _config;
+};
--- a/csrc/includes/hip/gemm_test.h
+++ b/csrc/includes/hip/gemm_test.h
+#pragma once
+#include <hip/hip_fp16.h>
+#ifndef __HIP_PLATFORM_HCC__
+#include <cuda_profiler_api.h>
+#endif
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <limits>
+#include <memory>
+#include "StopWatch.h"
+#include "cublas_wrappers.h"
+// Reports a failed runtime call on std::cout. Any `result` that converts to
+// true counts as a failure. The message carries the file:line of the call
+// site and the source text of the failing expression (`func`). Nothing is
+// thrown and execution continues.
+template <typename T>
+void check(T result, char const* const func, const char* const file, int const line)
+{
+    if (result) {
+        std::cout << (std::string("CUDA runtime error: ") + file + ":" + std::to_string(line) +
+                      " '" + func + "' \n");
+    }
+}
+// Wraps a call so that check() receives its source text, file and line.
+#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
+// Times three GEMM calls (one "forward" and two "backward" operand layouts)
+// for every algorithm id in the sweep range and reports the fastest id of each.
+// Owns three scratch buffers obtained from hipMalloc: A (M*K), B (K*N) and
+// C (M*N) elements of T. They are released in the destructor.
+template <typename T>
+class GemmTest {
+public:
+    // Stores the sizes, operations and handle, then allocates A, B and C.
+    // An allocation failure is only reported by check_cuda_error; the matching
+    // pointer then stays nullptr.
+    // NOTE(review): transa/transb are stored but not read by TestAlgo here.
+    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
+        : M(m), N(n), K(k), handle(h), transa(ta), transb(tb)
+    {
+        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
+        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
+        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
+    }
+    // The buffers are uniquely owned; a copy would free them twice.
+    GemmTest(const GemmTest&) = delete;
+    GemmTest& operator=(const GemmTest&) = delete;
+    ~GemmTest()
+    {
+        // Pointers are nullptr-initialized, so a failed allocation never
+        // results in freeing an indeterminate address.
+        check_cuda_error(hipFree(A));
+        check_cuda_error(hipFree(B));
+        check_cuda_error(hipFree(C));
+    }
+    // Runs the sweep for the three GEMM layouts, `loops` timed calls each.
+    // Returns {fastest forward id, fastest backward-1 id, fastest backward-2 id}.
+    std::array<int, 3> TestAlgo(int loops)
+    {
+        float alpha = (T)1.0f;
+        float beta = (T)0.0f;
+        int algo_fw = Run(loops, [this, alpha, beta](int algo) {
+            cublas_gemm_ex(handle,
+                           rocblas_operation_transpose,
+                           rocblas_operation_none,
+                           N,
+                           M,
+                           K,
+                           &alpha,
+                           &beta,
+                           B,
+                           A,
+                           C,
+                           static_cast<GemmAlgo>(algo));
+        });
+        int algo_bw1 = Run(loops, [this, alpha, beta](int algo) {
+            cublas_gemm_ex(handle,
+                           rocblas_operation_none,
+                           rocblas_operation_transpose,
+                           K,
+                           N,
+                           M,
+                           &alpha,
+                           &beta,
+                           A,
+                           C,
+                           B,
+                           static_cast<GemmAlgo>(algo));
+        });
+        int algo_bw2 = Run(loops, [this, alpha, beta](int algo) {
+            cublas_gemm_ex(handle,
+                           rocblas_operation_none,
+                           rocblas_operation_none,
+                           K,
+                           M,
+                           N,
+                           &alpha,
+                           &beta,
+                           B,
+                           C,
+                           A,
+                           static_cast<GemmAlgo>(algo));
+        });
+        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
+    }
+    // Calls f(algo) for every id in the sweep range: 5 warm-up calls, a device
+    // synchronize, then `loops` timed calls and another synchronize. Prints the
+    // average latency per call in milliseconds and returns the fastest id
+    // (0 if no id beat the initial maximum).
+    template <typename Func>
+    int Run(int loops, Func f)
+    {
+#ifdef __HIP_PLATFORM_HCC__
+        // On this platform only rocblas_gemm_algo_standard is swept.
+        const int first_algo = (int)rocblas_gemm_algo_standard;
+        const int last_algo = (int)rocblas_gemm_algo_standard;
+#else
+        const int first_algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+        const int last_algo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+#endif
+        // Parenthesized so a function-like `max` macro cannot expand here.
+        float fast_latency = (std::numeric_limits<float>::max)();
+        int fast_algo = 0;
+        for (int algo = first_algo; algo <= last_algo; algo++) {
+            int warm_up = 5;
+            for (int i = 0; i < warm_up; ++i) f(algo);
+            hipDeviceSynchronize();
+            Stopwatch timer;
+            timer.Restart();
+            for (int i = 0; i < loops; ++i) f(algo);
+            hipDeviceSynchronize();
+            timer.Stop();
+            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
+            printf("algo-%d: %.3fms\n", algo, avg_latency);
+            if (avg_latency < fast_latency) {
+                fast_latency = avg_latency;
+                fast_algo = algo;
+            }
+        }
+        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
+        return fast_algo;
+    }
+private:
+    // Algorithm enum of the active BLAS backend.
+#ifdef __HIP_PLATFORM_HCC__
+    using GemmAlgo = rocblas_gemm_algo;
+#else
+    using GemmAlgo = cublasGemmAlgo_t;
+#endif
+    int M, N, K;
+    rocblas_handle handle;
+    rocblas_operation transa, transb;
+    T* A = nullptr;
+    T* B = nullptr;
+    T* C = nullptr;
+};
+// Strided-batched counterpart of GemmTest: times three
+// cublas_strided_batched_gemm calls (forward, d_A, d_B) for every algorithm id
+// in the sweep range and reports the fastest id of each.
+// Owns three scratch buffers obtained from hipMalloc: A (M*K*bsz), B (K*N*bsz)
+// and C (M*N*bsz) elements of T. They are released in the destructor.
+template <typename T>
+class StridedGemmTest {
+public:
+    // Stores the batch count, sizes, operations and handle, then allocates
+    // A, B and C. An allocation failure is only reported by check_cuda_error;
+    // the matching pointer then stays nullptr.
+    StridedGemmTest(int b,
+                    int m,
+                    int n,
+                    int k,
+                    rocblas_operation ta,
+                    rocblas_operation tb,
+                    rocblas_handle h)
+        : bsz(b), M(m), N(n), K(k), handle(h), transa(ta), transb(tb)
+    {
+        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
+        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
+        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
+    }
+    // The buffers are uniquely owned; a copy would free them twice.
+    StridedGemmTest(const StridedGemmTest&) = delete;
+    StridedGemmTest& operator=(const StridedGemmTest&) = delete;
+    ~StridedGemmTest()
+    {
+        // Pointers are nullptr-initialized, so a failed allocation never
+        // results in freeing an indeterminate address.
+        check_cuda_error(hipFree(A));
+        check_cuda_error(hipFree(B));
+        check_cuda_error(hipFree(C));
+    }
+    // Runs the sweep for the three batched GEMM layouts, `loops` timed calls
+    // each. Returns {fastest forward id, fastest d_A id, fastest d_B id}.
+    std::array<int, 3> TestAlgo(int loops)
+    {
+        float alpha = (T)1.0f;
+        float beta = (T)0.0f;
+        int algo_fw = Run(loops, [this, alpha, beta](int algo) {
+            int stride_a = M * K;
+            int stride_b = N * K;
+            int stride_c = M * N;
+            cublas_strided_batched_gemm(handle,
+                                        M,
+                                        N,
+                                        K,
+                                        &alpha,
+                                        &beta,
+                                        A,
+                                        B,
+                                        C,
+                                        transa,
+                                        transb,
+                                        stride_a,
+                                        stride_b,
+                                        stride_c,
+                                        bsz,
+                                        static_cast<GemmAlgo>(algo));
+        });
+        int algo_bw1 = Run(loops, [this, alpha, beta](int algo) {
+            const bool a_transposed = (transa == rocblas_operation_transpose);
+            int mb = (a_transposed ? K : M);
+            int kb = (a_transposed ? M : K);
+            int stride_a = mb * N;
+            int stride_b = N * kb;
+            int stride_c = M * K;
+            // B needs the opposite transpose setting.
+            rocblas_operation op_b =
+                (transb == rocblas_operation_transpose ? rocblas_operation_none
+                                                       : rocblas_operation_transpose);
+            // Calculate d_A.
+            cublas_strided_batched_gemm(handle,
+                                        mb,
+                                        kb,
+                                        N,
+                                        &alpha,
+                                        &beta,
+                                        (a_transposed ? B : C),
+                                        (a_transposed ? C : B),
+                                        A,
+                                        rocblas_operation_none,
+                                        op_b,
+                                        stride_a,
+                                        stride_b,
+                                        stride_c,
+                                        bsz,
+                                        static_cast<GemmAlgo>(algo));
+        });
+        int algo_bw2 = Run(loops, [this, alpha, beta](int algo) {
+            // A needs the opposite transpose setting.
+            rocblas_operation op_a =
+                (transa == rocblas_operation_transpose ? rocblas_operation_none
+                                                       : rocblas_operation_transpose);
+            int stride_a = M * K;
+            int stride_b = M * N;
+            int stride_c = N * K;
+            // Calculate d_B.
+            cublas_strided_batched_gemm(handle,
+                                        K,
+                                        N,
+                                        M,
+                                        &alpha,
+                                        &beta,
+                                        A,
+                                        C,
+                                        B,
+                                        op_a,
+                                        rocblas_operation_none,
+                                        stride_a,
+                                        stride_b,
+                                        stride_c,
+                                        bsz,
+                                        static_cast<GemmAlgo>(algo));
+        });
+        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
+    }
+    // Calls f(algo) for every id in the sweep range: 5 warm-up calls, a device
+    // synchronize, then `loops` timed calls and another synchronize. Prints the
+    // average latency per call in milliseconds and returns the fastest id
+    // (0 if no id beat the initial maximum).
+    template <typename Func>
+    int Run(int loops, Func f)
+    {
+#ifdef __HIP_PLATFORM_HCC__
+        // On this platform only rocblas_gemm_algo_standard is swept.
+        const int first_algo = (int)rocblas_gemm_algo_standard;
+        const int last_algo = (int)rocblas_gemm_algo_standard;
+#else
+        const int first_algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+        const int last_algo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+#endif
+        // Parenthesized so a function-like `max` macro cannot expand here.
+        float fast_latency = (std::numeric_limits<float>::max)();
+        int fast_algo = 0;
+        for (int algo = first_algo; algo <= last_algo; algo++) {
+            int warm_up = 5;
+            for (int i = 0; i < warm_up; ++i) f(algo);
+            hipDeviceSynchronize();
+            Stopwatch timer;
+            timer.Restart();
+            for (int i = 0; i < loops; ++i) f(algo);
+            hipDeviceSynchronize();
+            timer.Stop();
+            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
+            printf("algo-%d: %.3fms\n", algo, avg_latency);
+            if (avg_latency < fast_latency) {
+                fast_latency = avg_latency;
+                fast_algo = algo;
+            }
+        }
+        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
+        return fast_algo;
+    }
+private:
+    // Algorithm enum of the active BLAS backend.
+#ifdef __HIP_PLATFORM_HCC__
+    using GemmAlgo = rocblas_gemm_algo;
+#else
+    using GemmAlgo = cublasGemmAlgo_t;
+#endif
+    int bsz, M, N, K;
+    rocblas_handle handle;
+    rocblas_operation transa, transb;
+    T* A = nullptr;
+    T* B = nullptr;
+    T* C = nullptr;
+};
--- a/csrc/includes/hip/gemm_test.h.bak
+++ b/csrc/includes/hip/gemm_test.h.bak
+#pragma once
+#include <hip/hip_fp16.h>
+#ifndef __HIP_PLATFORM_HCC__
+#include <cuda_profiler_api.h>
+#endif
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <limits>
+#include <memory>
+#include "StopWatch.h"
+#include "hip/cublas_wrappers.h"
+// Reports a failed runtime call on std::cout. Any `result` that converts to
+// true counts as a failure. The message carries the file:line of the call
+// site and the source text of the failing expression (`func`). Nothing is
+// thrown and execution continues.
+template <typename T>
+void check(T result, char const* const func, const char* const file, int const line)
+{
+    if (result) {
+        std::cout << (std::string("CUDA runtime error: ") + file + ":" + std::to_string(line) +
+                      " '" + func + "' \n");
+    }
+}
+// Wraps a call so that check() receives its source text, file and line.
+#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
+// Times three GEMM calls (one "forward" and two "backward" operand layouts)
+// for every algorithm id in the sweep range and reports the fastest id of each.
+// Owns three scratch buffers obtained from hipMalloc: A (M*K), B (K*N) and
+// C (M*N) elements of T. They are released in the destructor.
+// NOTE(review): this copy uses cublasGemmAlgo_t / CUBLAS_GEMM_* without a
+// platform guard, presumably unavailable in a ROCm build -- confirm whether
+// this backup file is still needed.
+template <typename T>
+class GemmTest {
+public:
+    // Stores the sizes, operations and handle, then allocates A, B and C.
+    // An allocation failure is only reported by check_cuda_error; the matching
+    // pointer then stays nullptr.
+    // NOTE(review): transa/transb are stored but not read by TestAlgo here.
+    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
+        : M(m), N(n), K(k), handle(h), transa(ta), transb(tb)
+    {
+        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
+        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
+        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
+    }
+    // The buffers are uniquely owned; a copy would free them twice.
+    GemmTest(const GemmTest&) = delete;
+    GemmTest& operator=(const GemmTest&) = delete;
+    ~GemmTest()
+    {
+        // Pointers are nullptr-initialized, so a failed allocation never
+        // results in freeing an indeterminate address.
+        check_cuda_error(hipFree(A));
+        check_cuda_error(hipFree(B));
+        check_cuda_error(hipFree(C));
+    }
+    // Runs the sweep for the three GEMM layouts, `loops` timed calls each.
+    // Returns {fastest forward id, fastest backward-1 id, fastest backward-2 id}.
+    std::array<int, 3> TestAlgo(int loops)
+    {
+        float alpha = (T)1.0f;
+        float beta = (T)0.0f;
+        int algo_fw = Run(loops, [this, alpha, beta](int algo) {
+            cublas_gemm_ex(handle,
+                           rocblas_operation_transpose,
+                           rocblas_operation_none,
+                           N,
+                           M,
+                           K,
+                           &alpha,
+                           &beta,
+                           B,
+                           A,
+                           C,
+                           static_cast<cublasGemmAlgo_t>(algo));
+        });
+        int algo_bw1 = Run(loops, [this, alpha, beta](int algo) {
+            cublas_gemm_ex(handle,
+                           rocblas_operation_none,
+                           rocblas_operation_transpose,
+                           K,
+                           N,
+                           M,
+                           &alpha,
+                           &beta,
+                           A,
+                           C,
+                           B,
+                           static_cast<cublasGemmAlgo_t>(algo));
+        });
+        int algo_bw2 = Run(loops, [this, alpha, beta](int algo) {
+            cublas_gemm_ex(handle,
+                           rocblas_operation_none,
+                           rocblas_operation_none,
+                           K,
+                           M,
+                           N,
+                           &alpha,
+                           &beta,
+                           B,
+                           C,
+                           A,
+                           static_cast<cublasGemmAlgo_t>(algo));
+        });
+        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
+    }
+    // Calls f(algo) for every id in the sweep range: 5 warm-up calls, a device
+    // synchronize, then `loops` timed calls and another synchronize. Prints the
+    // average latency per call in milliseconds and returns the fastest id
+    // (0 if no id beat the initial maximum).
+    template <typename Func>
+    int Run(int loops, Func f)
+    {
+        const int first_algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+        const int last_algo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+        // Parenthesized so a function-like `max` macro cannot expand here.
+        float fast_latency = (std::numeric_limits<float>::max)();
+        int fast_algo = 0;
+        for (int algo = first_algo; algo <= last_algo; algo++) {
+            int warm_up = 5;
+            for (int i = 0; i < warm_up; ++i) f(algo);
+            hipDeviceSynchronize();
+            Stopwatch timer;
+            timer.Restart();
+            for (int i = 0; i < loops; ++i) f(algo);
+            hipDeviceSynchronize();
+            timer.Stop();
+            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
+            printf("algo-%d: %.3fms\n", algo, avg_latency);
+            if (avg_latency < fast_latency) {
+                fast_latency = avg_latency;
+                fast_algo = algo;
+            }
+        }
+        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
+        return fast_algo;
+    }
+private:
+    int M, N, K;
+    rocblas_handle handle;
+    rocblas_operation transa, transb;
+    T* A = nullptr;
+    T* B = nullptr;
+    T* C = nullptr;
+};
+// Strided-batched counterpart of GemmTest: times three
+// cublas_strided_batched_gemm calls (forward, d_A, d_B) for every algorithm id
+// in the sweep range and reports the fastest id of each.
+// Owns three scratch buffers obtained from hipMalloc: A (M*K*bsz), B (K*N*bsz)
+// and C (M*N*bsz) elements of T. They are released in the destructor.
+// NOTE(review): this copy uses cublasGemmAlgo_t / CUBLAS_GEMM_* without a
+// platform guard, presumably unavailable in a ROCm build -- confirm whether
+// this backup file is still needed.
+template <typename T>
+class StridedGemmTest {
+public:
+    // Stores the batch count, sizes, operations and handle, then allocates
+    // A, B and C. An allocation failure is only reported by check_cuda_error;
+    // the matching pointer then stays nullptr.
+    StridedGemmTest(int b,
+                    int m,
+                    int n,
+                    int k,
+                    rocblas_operation ta,
+                    rocblas_operation tb,
+                    rocblas_handle h)
+        : bsz(b), M(m), N(n), K(k), handle(h), transa(ta), transb(tb)
+    {
+        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
+        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
+        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
+    }
+    // The buffers are uniquely owned; a copy would free them twice.
+    StridedGemmTest(const StridedGemmTest&) = delete;
+    StridedGemmTest& operator=(const StridedGemmTest&) = delete;
+    ~StridedGemmTest()
+    {
+        // Pointers are nullptr-initialized, so a failed allocation never
+        // results in freeing an indeterminate address.
+        check_cuda_error(hipFree(A));
+        check_cuda_error(hipFree(B));
+        check_cuda_error(hipFree(C));
+    }
+    // Runs the sweep for the three batched GEMM layouts, `loops` timed calls
+    // each. Returns {fastest forward id, fastest d_A id, fastest d_B id}.
+    std::array<int, 3> TestAlgo(int loops)
+    {
+        float alpha = (T)1.0f;
+        float beta = (T)0.0f;
+        int algo_fw = Run(loops, [this, alpha, beta](int algo) {
+            int stride_a = M * K;
+            int stride_b = N * K;
+            int stride_c = M * N;
+            cublas_strided_batched_gemm(handle,
+                                        M,
+                                        N,
+                                        K,
+                                        &alpha,
+                                        &beta,
+                                        A,
+                                        B,
+                                        C,
+                                        transa,
+                                        transb,
+                                        stride_a,
+                                        stride_b,
+                                        stride_c,
+                                        bsz,
+                                        static_cast<cublasGemmAlgo_t>(algo));
+        });
+        int algo_bw1 = Run(loops, [this, alpha, beta](int algo) {
+            const bool a_transposed = (transa == rocblas_operation_transpose);
+            int mb = (a_transposed ? K : M);
+            int kb = (a_transposed ? M : K);
+            int stride_a = mb * N;
+            int stride_b = N * kb;
+            int stride_c = M * K;
+            // B needs the opposite transpose setting.
+            rocblas_operation op_b =
+                (transb == rocblas_operation_transpose ? rocblas_operation_none
+                                                       : rocblas_operation_transpose);
+            // Calculate d_A.
+            cublas_strided_batched_gemm(handle,
+                                        mb,
+                                        kb,
+                                        N,
+                                        &alpha,
+                                        &beta,
+                                        (a_transposed ? B : C),
+                                        (a_transposed ? C : B),
+                                        A,
+                                        rocblas_operation_none,
+                                        op_b,
+                                        stride_a,
+                                        stride_b,
+                                        stride_c,
+                                        bsz,
+                                        static_cast<cublasGemmAlgo_t>(algo));
+        });
+        int algo_bw2 = Run(loops, [this, alpha, beta](int algo) {
+            // A needs the opposite transpose setting.
+            rocblas_operation op_a =
+                (transa == rocblas_operation_transpose ? rocblas_operation_none
+                                                       : rocblas_operation_transpose);
+            int stride_a = M * K;
+            int stride_b = M * N;
+            int stride_c = N * K;
+            // Calculate d_B.
+            cublas_strided_batched_gemm(handle,
+                                        K,
+                                        N,
+                                        M,
+                                        &alpha,
+                                        &beta,
+                                        A,
+                                        C,
+                                        B,
+                                        op_a,
+                                        rocblas_operation_none,
+                                        stride_a,
+                                        stride_b,
+                                        stride_c,
+                                        bsz,
+                                        static_cast<cublasGemmAlgo_t>(algo));
+        });
+        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
+    }
+    // Calls f(algo) for every id in the sweep range: 5 warm-up calls, a device
+    // synchronize, then `loops` timed calls and another synchronize. Prints the
+    // average latency per call in milliseconds and returns the fastest id
+    // (0 if no id beat the initial maximum).
+    template <typename Func>
+    int Run(int loops, Func f)
+    {
+        const int first_algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
+        const int last_algo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+        // Parenthesized so a function-like `max` macro cannot expand here.
+        float fast_latency = (std::numeric_limits<float>::max)();
+        int fast_algo = 0;
+        for (int algo = first_algo; algo <= last_algo; algo++) {
+            int warm_up = 5;
+            for (int i = 0; i < warm_up; ++i) f(algo);
+            hipDeviceSynchronize();
+            Stopwatch timer;
+            timer.Restart();
+            for (int i = 0; i < loops; ++i) f(algo);
+            hipDeviceSynchronize();
+            timer.Stop();
+            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
+            printf("algo-%d: %.3fms\n", algo, avg_latency);
+            if (avg_latency < fast_latency) {
+                fast_latency = avg_latency;
+                fast_algo = algo;
+            }
+        }
+        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
+        return fast_algo;
+    }
+private:
+    int bsz, M, N, K;
+    rocblas_handle handle;
+    rocblas_operation transa, transb;
+    T* A = nullptr;
+    T* B = nullptr;
+    T* C = nullptr;
+};
--- a/csrc/includes/hip/general_kernels.h
+++ b/csrc/includes/hip/general_kernels.h
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <hip/hip_cooperative_groups.h>
+#include <hiprand_kernel.h>
+#include "hip/context.h"
+#include "hip/cublas_wrappers.h"
+// Shared kernel constants.
+// NOTE(review): THREADS / TILE_DIM are presumably the thread-block size and
+// tile edge used by the kernels -- confirm in the kernel sources.
+#define THREADS 256
+#define TILE_DIM 32
+// Negative float infinity. Parenthesized so the expansion keeps its value
+// inside larger expressions such as `x / minus_infinity` or `-minus_infinity`.
+#define minus_infinity (-1 * std::numeric_limits<float>::infinity())
+#define FINAL_MASK 0xffffffff
+// Declaration only; the definition is not part of this header.
+// Launcher taking one output buffer and two input buffers plus three extents
+// (batch_size, seq_length, hidden_size) and the stream to use.
+// NOTE(review): presumably computes the element-wise sum out = inp1 + inp2 over
+// batch_size * seq_length * hidden_size elements — confirm against the kernel.
+template <typename T>
+void launch_fused_add2(T* out,
+                       const T* inp1,
+                       const T* inp2,
+                       int batch_size,
+                       int seq_length,
+                       int hidden_size,
+                       hipStream_t& stream);
+// Declaration only; the definition is not part of this header.
+// Launcher taking one output buffer and four input buffers plus three extents
+// (batch_size, seq_length, hidden_size) and the stream to use.
+// NOTE(review): presumably computes the element-wise sum of the four inputs into
+// `out` — confirm against the kernel.
+template <typename T>
+void launch_fused_add4(T* out,
+                       const T* inp1,
+                       const T* inp2,
+                       const T* inp3,
+                       const T* inp4,
+                       int batch_size,
+                       int seq_length,
+                       int hidden_size,
+                       hipStream_t& stream);
+// Declaration only; the definition is not part of this header.
+// Launcher taking one output buffer and three input buffers plus three extents
+// (batch_size, seq_length, hidden_size) and the stream to use.
+// NOTE(review): presumably computes the element-wise sum of the three inputs into
+// `out` — confirm against the kernel.
+template <typename T>
+void launch_fused_add3(T* out,
+                       const T* inp1,
+                       const T* inp2,
+                       const T* inp3,
+                       int batch_size,
+                       int seq_length,
+                       int hidden_size,
+                       hipStream_t& stream);
--- a/csrc/includes/hip/normalize_layer.h
+++ b/csrc/includes/hip/normalize_layer.h
+#pragma once
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <stdio.h>
+#include <fstream>
+#include "hip/custom_hip_layers.h"
+using namespace std;
+// Thin host-side front end for the layer-norm launchers
+// (launch_bias_residual_layer_norm, launch_layerNorm_backward and
+// launch_layerNorm_backward_fused_add). The object stores a Config plus raw pointers
+// to the variance / mean buffers handed in through SetVar() / SetMean(); it neither
+// allocates nor frees anything itself.
+template <typename T>
+class Normalize_Layer {
+public:
+    // Settings fixed at construction time.
+    struct Config {
+        uint32_t batchSize;
+        uint32_t seqLength;
+        uint32_t hiddenDim;
+        float epsilon;
+        bool training;
+        bool useMean;
+
+        Config(uint32_t batch,
+               uint32_t seq,
+               uint32_t h,
+               float epsilon = 1e-12,
+               bool training = true,
+               bool useMean = true)
+            : batchSize{batch},
+              seqLength{seq},
+              hiddenDim{h},
+              epsilon{epsilon},
+              training{training},
+              useMean{useMean}
+        {
+        }
+    };
+
+    // All buffer pointers start out null; SetVar() / SetMean() install them later.
+    Normalize_Layer(Config config)
+        : config_{config}, vars{nullptr}, means{nullptr}, vals_hat{nullptr}
+    {
+    }
+
+    ~Normalize_Layer() {}
+
+    // Forward pass that hands both the variance and the mean buffer to the launcher.
+    void ForwardCheckpoint(int bsz,  // batch * seq
+                           T* vals,
+                           const T* residual,
+                           const T* gamma,
+                           const T* betta,
+                           hipStream_t& stream,
+                           bool preLayerNorm = false)
+    {
+        launch_bias_residual_layer_norm(
+            vals, residual, gamma, betta,
+            config_.epsilon, bsz, config_.hiddenDim,
+            stream, preLayerNorm, config_.training,
+            vars, means);
+    }
+
+    // Forward pass that hands only the variance buffer to the launcher.
+    void Forward(int bsz,
+                 T* vals,
+                 const T* residual,
+                 const T* gamma,
+                 const T* betta,
+                 hipStream_t& stream,
+                 bool preLayerNorm = false)
+    {
+        launch_bias_residual_layer_norm(
+            vals, residual, gamma, betta,
+            config_.epsilon, bsz, config_.hiddenDim,
+            stream, preLayerNorm, config_.training,
+            vars);
+    }
+
+    // Backward pass driven by `norm_in` together with the stored variance and mean
+    // buffers.
+    void Backward(int bsz,
+                  const T* out_grad,
+                  const T* gamma,
+                  T* gamma_grad,
+                  T* betta_grad,
+                  hipStream_t stream[2],
+                  T* inp_grad_out,
+                  const T* norm_in = nullptr)
+    {
+        launch_layerNorm_backward(
+            out_grad, norm_in, vars, means,
+            gamma, gamma_grad, betta_grad, inp_grad_out,
+            bsz, config_.hiddenDim, stream);
+    }
+
+    // Backward pass driven by `norm_out`, the stored variance buffer and `betta`;
+    // the negated useMean setting is forwarded as a flag to the launcher.
+    void Backward(int bsz,
+                  const T* out_grad,
+                  const T* gamma,
+                  const T* betta,
+                  T* gamma_grad,
+                  T* betta_grad,
+                  hipStream_t stream[2],
+                  T* inp_grad_out,
+                  const T* norm_out)
+    {
+        const bool no_mean = !config_.useMean;
+        launch_layerNorm_backward(
+            out_grad, norm_out, vars,
+            gamma, gamma_grad, betta_grad, inp_grad_out,
+            bsz, config_.hiddenDim, stream,
+            no_mean, betta);
+    }
+
+    // Same as the `norm_in` Backward overload, but with two incoming gradient
+    // buffers that are passed to the fused-add launcher.
+    void BackwardFusedAdd(int bsz,
+                          const T* out_grad1,
+                          const T* out_grad2,
+                          const T* gamma,
+                          T* gamma_grad,
+                          T* betta_grad,
+                          hipStream_t stream[2],
+                          T* inp_grad_out,
+                          const T* norm_in = nullptr)
+    {
+        launch_layerNorm_backward_fused_add(
+            out_grad1, out_grad2, norm_in, vars, means,
+            gamma, gamma_grad, betta_grad, inp_grad_out,
+            bsz, config_.hiddenDim, stream);
+    }
+
+    // Same as the `norm_out` Backward overload, but with two incoming gradient
+    // buffers that are passed to the fused-add launcher.
+    void BackwardFusedAdd(int bsz,
+                          const T* out_grad1,
+                          const T* out_grad2,
+                          const T* gamma,
+                          const T* betta,
+                          T* gamma_grad,
+                          T* betta_grad,
+                          hipStream_t stream[2],
+                          T* inp_grad_out,
+                          const T* norm_out)
+    {
+        const bool no_mean = !config_.useMean;
+        launch_layerNorm_backward_fused_add(
+            out_grad1, out_grad2, norm_out, vars,
+            gamma, gamma_grad, betta_grad, inp_grad_out,
+            bsz, config_.hiddenDim, stream,
+            no_mean, betta);
+    }
+
+    inline bool UseMean() const { return config_.useMean; }
+
+    // Installs the variance buffer; throws std::runtime_error on a null pointer.
+    inline void SetVar(T* variance)
+    {
+        if (variance == nullptr) { throw std::runtime_error("Normalize variance is null."); }
+        vars = variance;
+    }
+
+    // Installs the mean buffer; throws std::runtime_error on a null pointer.
+    inline void SetMean(T* mean)
+    {
+        if (mean == nullptr) { throw std::runtime_error("Normalize mean is null."); }
+        means = mean;
+    }
+
+private:
+    Config config_;
+    T* vars;      // caller-owned, set by SetVar()
+    T* means;     // caller-owned, set by SetMean()
+    T* vals_hat;  // only ever set to nullptr inside this class
+};
--- a/csrc/includes/hip/softmax.h
+++ b/csrc/includes/hip/softmax.h
+#pragma once
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <stdio.h>
+#include "hip/custom_hip_layers.h"
+#include <fstream>
+using namespace std;
+// Host-side front end for the attention-softmax launchers
+// (launch_attn_softmax and launch_attn_softmax_backward_v2). It only stores a
+// Config and forwards its head count and sequence length to the launchers.
+template <typename T>
+class Softmax {
+public:
+    // Settings captured at construction; only seq_length can be changed afterwards
+    // (through SetSeqLength()).
+    struct Config {
+        size_t batchSize;
+        size_t heads;
+        size_t seq_length;
+        size_t prob_depth;
+        float temprature;
+        bool mem_alloc;
+
+        // `temprature` is always initialized to 1.0; there is no parameter for it.
+        Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
+            : batchSize(batch),
+              heads(h),
+              seq_length(seq),
+              prob_depth(prob_size),
+              temprature(1.0),
+              mem_alloc(mem_alloc)
+        {
+        }
+    };
+
+    Softmax(Config config) : config_(config) {}
+
+    ~Softmax() {}
+
+    // Runs the forward launcher on `vals` with the given attention mask.
+    void Forward(int bsz, T* vals, const T* attn_mask, hipStream_t& stream)
+    {
+        const size_t head_count = config_.heads;
+        const size_t seq_len = config_.seq_length;
+        launch_attn_softmax<T>(vals, attn_mask, bsz, head_count, seq_len, stream);
+    }
+
+    // Runs the backward launcher on `out_grad` using the saved forward output.
+    void Backward(int bsz, T* out_grad, const T* soft_out, hipStream_t stream)
+    {
+        const size_t head_count = config_.heads;
+        const size_t seq_len = config_.seq_length;
+        launch_attn_softmax_backward_v2<T>(out_grad, soft_out, bsz, head_count, seq_len, stream);
+    }
+
+    // Plain accessors for the stored configuration.
+    inline size_t GetProbDepth() const { return config_.prob_depth; }
+    inline size_t GetBatchSize() const { return config_.batchSize; }
+    inline size_t GetNumHeads() const { return config_.heads; }
+    inline size_t GetSeqLength() const { return config_.seq_length; }
+
+    // Replaces the sequence length used by later Forward / Backward calls.
+    inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }
+
+private:
+    Config config_;
+};
--- a/csrc/includes/hip/strided_batch_gemm.h
+++ b/csrc/includes/hip/strided_batch_gemm.h
+#pragma once
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <stdio.h>
+#include "hip/context.h"
+// Wrapper around cublas_strided_batched_gemm for a batch of equally sized GEMMs.
+// Forward / ForwardPlusSave issue one batched GEMM with the configured m, n, k and
+// operations; Backward issues two batched GEMMs that write inpGradA and inpGradB.
+// The class owns no memory: every buffer is supplied by the caller.
+template <typename T>
+class StridedBatchGemm {
+public:
+    // GEMM sizes, scaling factors, operand operations and the three algorithm ids
+    // (index 0: forward, index 1: first backward GEMM, index 2: second backward GEMM).
+    struct Config {
+        int batch_size;
+        int m;
+        int n;
+        int k;
+        float alpha;
+        float beta;
+        rocblas_operation op_A;
+        rocblas_operation op_B;
+        std::array<int, 3> gemm_algos;
+        Config(int batch,
+               int mm,
+               int nn,
+               int kk,
+               float param_alpha,
+               float param_beta,
+               rocblas_operation opA,
+               rocblas_operation opB,
+               const std::array<int, 3>& algos)
+            : batch_size(batch),
+              m(mm),
+              n(nn),
+              k(kk),
+              alpha(param_alpha),
+              beta(param_beta),
+              op_A(opA),
+              op_B(opB),
+              gemm_algos(algos)
+        {
+        }
+        // Replaces the three GEMM dimensions; all other settings are kept.
+        void SetConfig(int mm, int nn, int kk)
+        {
+            m = mm;
+            n = nn;
+            k = kk;
+        }
+    };
+
+    // The saved-buffer pointers start out null so that GetBufferA() / GetBufferB()
+    // return a well-defined value before the first ForwardPlusSave() call.
+    StridedBatchGemm(const Config& config) : _config(config), q_buf(nullptr), k_buf(nullptr) {}
+
+    virtual ~StridedBatchGemm() {}
+
+    // Runs the forward batched GEMM for `bsz` batches using algorithm id gemm_algos[0].
+    void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle)
+    {
+        int stride_a = _config.m * _config.k;
+        int stride_b = _config.n * _config.k;
+        int stride_c = _config.m * _config.n;
+        cublas_strided_batched_gemm(handle,
+                                    _config.m,
+                                    _config.n,
+                                    _config.k,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    _buffer_a,
+                                    _buffer_b,
+                                    output,
+                                    _config.op_A,
+                                    _config.op_B,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    bsz,
+                                    rocblas_gemm_algo(_config.gemm_algos[0]));
+    }
+
+    // Same GEMM as Forward(), but uses the configured batch_size as the batch count
+    // and remembers the two input pointers for GetBufferA() / GetBufferB().
+    void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle)
+    {
+        int stride_a = _config.m * _config.k;
+        int stride_b = _config.n * _config.k;
+        int stride_c = _config.m * _config.n;
+        cublas_strided_batched_gemm(handle,
+                                    _config.m,
+                                    _config.n,
+                                    _config.k,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    _buffer_a,
+                                    _buffer_b,
+                                    output,
+                                    _config.op_A,
+                                    _config.op_B,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    _config.batch_size,
+                                    rocblas_gemm_algo(_config.gemm_algos[0]));
+        k_buf = _buffer_a;
+        q_buf = _buffer_b;
+    }
+
+    // Issues the two backward GEMMs: the first writes inpGradA (algorithm id
+    // gemm_algos[1]), the second writes inpGradB (algorithm id gemm_algos[2]).
+    // NOTE(review): inpGradA / inpGradB default to nullptr but are passed to the
+    // GEMM unconditionally — presumably callers always supply both; confirm.
+    void Backward(int bsz,
+                  const T* d_output,
+                  const T* _buffer_a,
+                  const T* _buffer_b,
+                  rocblas_handle handle,
+                  T* inpGradA = nullptr,
+                  T* inpGradB = nullptr)
+    {
+        const bool a_transposed = (_config.op_A == rocblas_operation_transpose);
+        int mb = (a_transposed ? _config.k : _config.m);
+        int kb = (a_transposed ? _config.m : _config.k);
+        int stride_a = mb * _config.n;
+        int stride_b = _config.n * kb;
+        int stride_c = _config.m * _config.k;
+        // B needs the opposite operation of the forward pass.
+        rocblas_operation op_b = (_config.op_B == rocblas_operation_transpose
+                                      ? rocblas_operation_none
+                                      : rocblas_operation_transpose);
+        // Calculate d_A. The operand order is swapped when A was transposed in the
+        // forward pass.
+        cublas_strided_batched_gemm(handle,
+                                    mb,
+                                    kb,
+                                    _config.n,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    (a_transposed ? _buffer_b : d_output),
+                                    (a_transposed ? d_output : _buffer_b),
+                                    inpGradA,
+                                    rocblas_operation_none,
+                                    op_b,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    bsz,
+                                    rocblas_gemm_algo(_config.gemm_algos[1]));
+        // A needs the opposite operation of the forward pass.
+        rocblas_operation op_a =
+            (a_transposed ? rocblas_operation_none : rocblas_operation_transpose);
+        stride_a = _config.m * _config.k;
+        stride_b = _config.m * _config.n;
+        stride_c = _config.n * _config.k;
+        // Calculate d_B.
+        cublas_strided_batched_gemm(handle,
+                                    _config.k,
+                                    _config.n,
+                                    _config.m,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    _buffer_a,
+                                    d_output,
+                                    inpGradB,
+                                    op_a,
+                                    rocblas_operation_none,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    bsz,
+                                    rocblas_gemm_algo(_config.gemm_algos[2]));
+    }
+
+    // NOTE(review): despite its name this returns the configured k, not n. Kept as
+    // is because callers may depend on it.
+    inline int GetN() const { return _config.k; }
+
+    // Pointers recorded by the last ForwardPlusSave() call (nullptr before that).
+    inline const T* GetBufferA() const { return k_buf; }
+    inline const T* GetBufferB() const { return q_buf; }
+
+    inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
+
+private:
+    Config _config;
+    const T* q_buf;  // second input of the last ForwardPlusSave() call
+    const T* k_buf;  // first input of the last ForwardPlusSave() call
+};
--- a/csrc/includes/hip/strided_batch_gemm.h.bak
+++ b/csrc/includes/hip/strided_batch_gemm.h.bak
+#pragma once
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <stdio.h>
+#include "hip/context.h"
+// Backup copy of the StridedBatchGemm wrapper around cublas_strided_batched_gemm.
+// Forward / ForwardPlusSave issue one batched GEMM with the configured m, n, k and
+// operations; Backward issues two batched GEMMs that write inpGradA and inpGradB.
+// The class owns no memory: every buffer is supplied by the caller.
+template <typename T>
+class StridedBatchGemm {
+public:
+    // GEMM sizes, scaling factors, operand operations and the three algorithm ids
+    // (index 0: forward, index 1: first backward GEMM, index 2: second backward GEMM).
+    struct Config {
+        int batch_size;
+        int m;
+        int n;
+        int k;
+        float alpha;
+        float beta;
+        rocblas_operation op_A;
+        rocblas_operation op_B;
+        std::array<int, 3> gemm_algos;
+        Config(int batch,
+               int mm,
+               int nn,
+               int kk,
+               float param_alpha,
+               float param_beta,
+               rocblas_operation opA,
+               rocblas_operation opB,
+               const std::array<int, 3>& algos)
+            : batch_size(batch),
+              m(mm),
+              n(nn),
+              k(kk),
+              alpha(param_alpha),
+              beta(param_beta),
+              op_A(opA),
+              op_B(opB),
+              gemm_algos(algos)
+        {
+        }
+        // Replaces the three GEMM dimensions; all other settings are kept.
+        void SetConfig(int mm, int nn, int kk)
+        {
+            m = mm;
+            n = nn;
+            k = kk;
+        }
+    };
+
+    // The saved-buffer pointers start out null so that GetBufferA() / GetBufferB()
+    // return a well-defined value before the first ForwardPlusSave() call.
+    StridedBatchGemm(const Config& config) : _config(config), q_buf(nullptr), k_buf(nullptr) {}
+
+    virtual ~StridedBatchGemm() {}
+
+    // Runs the forward batched GEMM for `bsz` batches using algorithm id gemm_algos[0].
+    void Forward(int bsz, T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle)
+    {
+        int stride_a = _config.m * _config.k;
+        int stride_b = _config.n * _config.k;
+        int stride_c = _config.m * _config.n;
+        cublas_strided_batched_gemm(handle,
+                                    _config.m,
+                                    _config.n,
+                                    _config.k,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    _buffer_a,
+                                    _buffer_b,
+                                    output,
+                                    _config.op_A,
+                                    _config.op_B,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    bsz,
+                                    cublasGemmAlgo_t(_config.gemm_algos[0]));
+    }
+
+    // Same GEMM as Forward(), but uses the configured batch_size as the batch count
+    // and remembers the two input pointers for GetBufferA() / GetBufferB().
+    void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, rocblas_handle handle)
+    {
+        int stride_a = _config.m * _config.k;
+        int stride_b = _config.n * _config.k;
+        int stride_c = _config.m * _config.n;
+        cublas_strided_batched_gemm(handle,
+                                    _config.m,
+                                    _config.n,
+                                    _config.k,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    _buffer_a,
+                                    _buffer_b,
+                                    output,
+                                    _config.op_A,
+                                    _config.op_B,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    _config.batch_size,
+                                    cublasGemmAlgo_t(_config.gemm_algos[0]));
+        k_buf = _buffer_a;
+        q_buf = _buffer_b;
+    }
+
+    // Issues the two backward GEMMs: the first writes inpGradA (algorithm id
+    // gemm_algos[1]), the second writes inpGradB (algorithm id gemm_algos[2]).
+    // NOTE(review): inpGradA / inpGradB default to nullptr but are passed to the
+    // GEMM unconditionally — presumably callers always supply both; confirm.
+    void Backward(int bsz,
+                  const T* d_output,
+                  const T* _buffer_a,
+                  const T* _buffer_b,
+                  rocblas_handle handle,
+                  T* inpGradA = nullptr,
+                  T* inpGradB = nullptr)
+    {
+        const bool a_transposed = (_config.op_A == rocblas_operation_transpose);
+        int mb = (a_transposed ? _config.k : _config.m);
+        int kb = (a_transposed ? _config.m : _config.k);
+        int stride_a = mb * _config.n;
+        int stride_b = _config.n * kb;
+        int stride_c = _config.m * _config.k;
+        // B needs the opposite operation of the forward pass.
+        rocblas_operation op_b = (_config.op_B == rocblas_operation_transpose
+                                      ? rocblas_operation_none
+                                      : rocblas_operation_transpose);
+        // Calculate d_A. The operand order is swapped when A was transposed in the
+        // forward pass.
+        cublas_strided_batched_gemm(handle,
+                                    mb,
+                                    kb,
+                                    _config.n,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    (a_transposed ? _buffer_b : d_output),
+                                    (a_transposed ? d_output : _buffer_b),
+                                    inpGradA,
+                                    rocblas_operation_none,
+                                    op_b,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    bsz,
+                                    cublasGemmAlgo_t(_config.gemm_algos[1]));
+        // A needs the opposite operation of the forward pass.
+        rocblas_operation op_a =
+            (a_transposed ? rocblas_operation_none : rocblas_operation_transpose);
+        stride_a = _config.m * _config.k;
+        stride_b = _config.m * _config.n;
+        stride_c = _config.n * _config.k;
+        // Calculate d_B.
+        cublas_strided_batched_gemm(handle,
+                                    _config.k,
+                                    _config.n,
+                                    _config.m,
+                                    &_config.alpha,
+                                    &_config.beta,
+                                    _buffer_a,
+                                    d_output,
+                                    inpGradB,
+                                    op_a,
+                                    rocblas_operation_none,
+                                    stride_a,
+                                    stride_b,
+                                    stride_c,
+                                    bsz,
+                                    cublasGemmAlgo_t(_config.gemm_algos[2]));
+    }
+
+    // NOTE(review): despite its name this returns the configured k, not n. Kept as
+    // is because callers may depend on it.
+    inline int GetN() const { return _config.k; }
+
+    // Pointers recorded by the last ForwardPlusSave() call (nullptr before that).
+    inline const T* GetBufferA() const { return k_buf; }
+    inline const T* GetBufferB() const { return q_buf; }
+
+    inline void SetConfig(int m, int n, int k) { _config.SetConfig(m, n, k); }
+
+private:
+    Config _config;
+    const T* q_buf;  // second input of the last ForwardPlusSave() call
+    const T* k_buf;  // first input of the last ForwardPlusSave() call
+};
--- a/csrc/includes/hip/type_shim.h
+++ b/csrc/includes/hip/type_shim.h
+#include "hip/hip_runtime.h"
+/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */
+#include <ATen/ATen.h>
+// Forward/backward compatibility hack around
+// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
+// pending more future-proof guidance from upstream.
+// struct TypeShim
+// {
+//   const at::Type& payload;
+//   TypeShim(const at::Type& type) : payload(type) {}
+//   // Enable trivial conversion to a const at::Type& for pre-3aeb78
+//   operator const at::Type&(){ return payload; };
+//   // Enable dispatch switch statements to take *this directly for  post-3aeb78
+//   //operator at::ScalarType(){ return payload.; };
+// };
+// Switches on TYPE and runs __VA_ARGS__ once, in a scope where the alias
+// scalar_t_<LEVEL> names the matching C++ type:
+//   at::ScalarType::Float -> float
+//   at::ScalarType::Half  -> at::Half
+// Any other TYPE reaches AT_ERROR with a message built from the stringized NAME and
+// toString(TYPE).
+#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)                          \
+    switch (TYPE) {                                                              \
+        case at::ScalarType::Float: {                                            \
+            using scalar_t_##LEVEL = float;                                      \
+            __VA_ARGS__;                                                         \
+            break;                                                               \
+        }                                                                        \
+        case at::ScalarType::Half: {                                             \
+            using scalar_t_##LEVEL = at::Half;                                   \
+            __VA_ARGS__;                                                         \
+            break;                                                               \
+        }                                                                        \
+        default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+    }
+// Switches on TYPE and runs __VA_ARGS__ once, in a scope where the alias
+// scalar_t_<LEVEL> names the matching C++ type:
+//   at::ScalarType::Double -> double
+//   at::ScalarType::Float  -> float
+//   at::ScalarType::Half   -> at::Half
+// Any other TYPE reaches AT_ERROR with a message built from the stringized NAME and
+// toString(TYPE).
+#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)                   \
+    switch (TYPE) {                                                              \
+        case at::ScalarType::Double: {                                           \
+            using scalar_t_##LEVEL = double;                                     \
+            __VA_ARGS__;                                                         \
+            break;                                                               \
+        }                                                                        \
+        case at::ScalarType::Float: {                                            \
+            using scalar_t_##LEVEL = float;                                      \
+            __VA_ARGS__;                                                         \
+            break;                                                               \
+        }                                                                        \
+        case at::ScalarType::Half: {                                             \
+            using scalar_t_##LEVEL = at::Half;                                   \
+            __VA_ARGS__;                                                         \
+            break;                                                               \
+        }                                                                        \
+        default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+    }
+// Switches on TYPE and runs __VA_ARGS__ once, in a scope where the alias
+// scalar_t_<LEVEL> names the matching C++ type:
+//   at::ScalarType::Double -> double
+//   at::ScalarType::Float  -> float
+// Any other TYPE reaches AT_ERROR with a message built from the stringized NAME and
+// toString(TYPE).
+#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...)                        \
+    switch (TYPE) {                                                              \
+        case at::ScalarType::Double: {                                           \
+            using scalar_t_##LEVEL = double;                                     \
+            __VA_ARGS__;                                                         \
+            break;                                                               \
+        }                                                                        \
+        case at::ScalarType::Float: {                                            \
+            using scalar_t_##LEVEL = float;                                      \
+            __VA_ARGS__;                                                         \
+            break;                                                               \
+        }                                                                        \
+        default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+    }
+// Sums `val` across all threads of the block.
+//
+// x:            scratch buffer indexed by the flattened thread id; it is written when
+//               the block has at least 64 threads, and by the share_result epilogue.
+// val:          this thread's contribution.
+// lanes:        the shuffle stage halves the stride from 16 down to `lanes`, so the
+//               totals end up in the first `lanes` threads (intended to be <= 32).
+// share_result: when true, threads with tid < lanes store their total to x[tid] and
+//               the whole block synchronizes afterwards.
+//
+// Returns the reduced value in the first `lanes` threads. Threads with tid >= 32
+// never take part in the final stage and return a value-initialized T.
+template <typename T>
+__device__ __forceinline__ T
+reduce_block_into_lanes(T* x,
+                        T val,
+                        int lanes = 1,
+                        bool share_result = false)  // lanes is intended to be <= 32.
+{
+    int tid = threadIdx.x + threadIdx.y * blockDim.x;
+    int blockSize = blockDim.x * blockDim.y;  // blockSize is intended to be a multiple of 32.
+    if (blockSize >= 64) {
+        x[tid] = val;
+        __syncthreads();
+    }
+    // Tree reduction through the scratch buffer until 64 partial sums remain.
+#pragma unroll
+    for (int i = (blockSize >> 1); i >= 64; i >>= 1) {
+        if (tid < i) x[tid] = x[tid] + x[tid + i];
+        __syncthreads();
+    }
+    // Value-initialized so that threads outside the final stage do not return an
+    // indeterminate value.
+    T final = T();
+    if (tid < 32) {
+        if (blockSize >= 64)
+            final = x[tid] + x[tid + 32];
+        else
+            final = val;
+            // __SYNCWARP();
+#pragma unroll
+        for (int i = 16; i >= lanes; i >>= 1) {
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)
+            // HIP on AMD: use the mask-less shuffle. The *_sync variant is missing
+            // from older ROCm releases and expects a 64-bit mask on newer ones.
+            final = final + __shfl_down(final, i);
+#else
+            final = final + __shfl_down_sync(0xffffffff, final, i);
+#endif
+        }
+    }
+    if (share_result) {
+        if (tid < lanes) x[tid] = final;  // EpilogueOp
+        // Make sure the smem result is visible to all warps.
+        __syncthreads();
+    }
+    return final;
+}
--- a/csrc/lamb/hip/fused_lamb_hip.cpp
+++ b/csrc/lamb/hip/fused_lamb_hip.cpp
+/* Copyright 2019 The Microsoft DeepSpeed Team */
+#include <torch/extension.h>
+// Forward declaration of the device-side entry point; the definition is not in this
+// file. lamb() below validates its inputs, allocates w_l2_i, u_l2_i and
+// lamb_coeff_val, and then calls this function with all arguments passed through
+// unchanged.
+void fused_lamb_cuda(at::Tensor& p,
+                     at::Tensor& p_copy,
+                     at::Tensor& m,
+                     at::Tensor& v,
+                     at::Tensor& g,
+                     float lr,
+                     float beta1,
+                     float beta2,
+                     float max_coeff,
+                     float min_coeff,
+                     float eps,
+                     float grad_scale,
+                     int step,
+                     int mode,
+                     int bias_correction,
+                     float decay,
+                     at::Tensor& w_l2_i,
+                     at::Tensor& u_l2_i,
+                     at::Tensor& lamb_coeff_val);
+// Tensor validation helpers: each assertion fails with a message naming the
+// offending argument.
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+// Runs both checks. Wrapped in do { } while (0) so the macro expands to exactly one
+// statement: written as two bare statements, an unbraced `if (cond) CHECK_INPUT(t);`
+// would guard only the first check and always run the second.
+#define CHECK_INPUT(x)       \
+    do {                     \
+        CHECK_CUDA(x);       \
+        CHECK_CONTIGUOUS(x); \
+    } while (0)
+// C++ interface
+// Validates the tensors, allocates the scratch tensors the kernel needs and forwards
+// everything to fused_lamb_cuda.
+//
+// p, m, v, g must be contiguous CUDA tensors with the same number of elements.
+// p_copy must either be empty or satisfy the same requirements as p.
+// The remaining scalar arguments are passed to fused_lamb_cuda unchanged.
+//
+// Returns the one-element tensor lamb_coeff_val that is handed to fused_lamb_cuda.
+at::Tensor lamb(at::Tensor& p,
+                at::Tensor& p_copy,
+                at::Tensor& m,
+                at::Tensor& v,
+                at::Tensor& g,
+                float lr,
+                float beta1,
+                float beta2,
+                float max_coeff,
+                float min_coeff,
+                float eps,
+                float grad_scale,
+                int step,
+                int mode,
+                int bias_correction,
+                float decay)
+{
+    CHECK_INPUT(p);
+    // p_copy is optional. The braces guarantee that every check the macro expands
+    // to stays under the condition, however the macro is written.
+    if (p_copy.numel() > 0) { CHECK_INPUT(p_copy); }
+    CHECK_INPUT(m);
+    CHECK_INPUT(v);
+    CHECK_INPUT(g);
+    int64_t num_elem = p.numel();
+    AT_ASSERTM(m.numel() == num_elem, "number of elements in m and p tensors should be equal");
+    AT_ASSERTM(v.numel() == num_elem, "number of elements in v and p tensors should be equal");
+    AT_ASSERTM(g.numel() == num_elem, "number of elements in g and p tensors should be equal");
+    AT_ASSERTM(
+        p_copy.numel() == num_elem || p_copy.numel() == 0,
+        "number of elements in p_copy and p tensors should be equal, or p_copy should be empty");
+
+    // The scratch tensors use p's options, except that half precision is widened to
+    // float.
+    const auto p_dtype = p.type().scalarType();
+    const auto scratch_options =
+        p.options().dtype(p_dtype == at::ScalarType::Half ? at::ScalarType::Float : p_dtype);
+
+    // Intermediate for the weight L2 reduction.
+    // Make sure that the threads per block is at least 512 during the kernel launch,
+    // otherwise the behaviour is unexpected.
+    at::Tensor w_l2_i = at::empty({512}, scratch_options);
+    // Intermediate for the update L2 reduction.
+    // Make sure that the threads per block is at least 512 during the kernel launch,
+    // otherwise the behaviour is unexpected.
+    at::Tensor u_l2_i = at::empty({512}, scratch_options);
+    at::Tensor lamb_coeff_val = at::empty({1}, scratch_options);
+
+    fused_lamb_cuda(p,
+                    p_copy,
+                    m,
+                    v,
+                    g,
+                    lr,
+                    beta1,
+                    beta2,
+                    max_coeff,
+                    min_coeff,
+                    eps,
+                    grad_scale,
+                    step,
+                    mode,
+                    bias_correction,
+                    decay,
+                    w_l2_i,
+                    u_l2_i,
+                    lamb_coeff_val);
+    return lamb_coeff_val;
+}
+// Python bindings: registers lamb() under the name "lamb" in the extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("lamb", &lamb, "Adam optimized CUDA implementation with LAMB.");
+}
--- a/csrc/lamb/hip/fused_lamb_hip_kernel.hip
+++ b/csrc/lamb/hip/fused_lamb_hip_kernel.hip
+/* Copyright 2019 The Microsoft DeepSpeed Team */
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
+#include <stdio.h>
+#include <cmath>
+#include "ATen/ATen.h"
+#include "ATen/TensorUtils.h"
+#include "ATen/hip/HIPContext.h"
+#include "ATen/hip/detail/IndexUtils.cuh"
+//#include "ATen/Type.h"
+#include <THH/THHGeneral.h>
+#include "ATen/AccumulateType.h"
+#include <iostream>
+//#include <helper_functions.h>
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#include <hip/hip_cooperative_groups.h>
+#else
+#include <cooperative_groups.h>
+#endif
+#include <hip/hip_runtime_api.h>
+#include <stdio.h>
+namespace cg = cooperative_groups;
+// Utility class used to avoid linker errors with extern
+// unsized shared memory arrays with templated type.
+// Usage in this file: `T* ptr = SharedMemory<T>();` relies on the conversion operator below.
+namespace {
+// This is the un-specialized struct.  Note that we prevent instantiation of this
+// struct by putting an undefined symbol in the function body so it won't compile.
+template <typename T>
+struct SharedMemory {
+    // Ensure that we won't compile any un-specialized types
+    __device__ inline operator T*()
+    {
+        // error() is declared but never defined, so any use of the generic version fails to build.
+        extern __device__ void error(void);
+        error();
+        return NULL;
+    }
+};
+// float specialization: converts to the pointer declared through HIP_DYNAMIC_SHARED.
+template <>
+struct SharedMemory<float> {
+    __device__ inline operator float*()
+    {
+        HIP_DYNAMIC_SHARED( float, s_float)
+        return s_float;
+    }
+};
+// double specialization: same as above for double.
+template <>
+struct SharedMemory<double> {
+    __device__ inline operator double*()
+    {
+        HIP_DYNAMIC_SHARED( double, s_double)
+        return s_double;
+    }
+};
+}  // namespace
+#include "hip/type_shim.h"
+//#include "type_shim.h"
+// Selects how eps enters the denominator computed by the LAMB kernels in this file:
+// sqrtf(v + eps) for mode 0, sqrtf(v) + eps otherwise.
+typedef enum {
+    ADAM_MODE_0 = 0,  // eps under square root
+    ADAM_MODE_1 = 1   // eps outside square root
+} adamMode_t;
+// Block-wide sum of two vectors held in shared memory.
+// s_a and s_b are in shared memory (one entry per thread of the block)
+// g_a and g_b are in global memory; thread 0 stores the block's two sums at index blockIdx.x
+// blockSize is the compile-time block size; the halving steps below start at 512 threads.
+template <typename T, int blockSize>
+__device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    // perform block reduction in shared memory,
+    unsigned int tid = cta.thread_rank();
+    // Each thread first picks up its own slot, then the block synchronizes before any
+    // thread reads a slot written by another thread.
+    T a_sum = s_a[tid];
+    T b_sum = s_b[tid];
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    // do reduction in shared mem
+    // Every step folds the upper half of the active range onto the lower half and is
+    // followed by a block-wide barrier.
+    if ((blockSize >= 512) && (tid < 256)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 256];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 256];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    if ((blockSize >= 256) && (tid < 128)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 128];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 128];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    if ((blockSize >= 128) && (tid < 64)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 64];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 64];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    // NOTE(review): __CUDA_ARCH__ is presumably undefined when building for HIP, in which
+    // case the shared-memory #else path below is the one that gets compiled -- confirm.
+#if (__CUDA_ARCH__ >= 300)
+    if (tid < 32) {
+        cg::coalesced_group active = cg::coalesced_threads();
+        // Fetch final intermediate sum from 2nd warp
+        if (blockSize >= 64) {
+            a_sum = a_sum + s_a[tid + 32];
+            b_sum = b_sum + s_b[tid + 32];
+        }
+        // Reduce final warp using shuffle
+        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+            a_sum += active.shfl_down(a_sum, offset);
+            b_sum += active.shfl_down(b_sum, offset);
+        }
+    }
+#else
+    // Shared-memory tail: keep halving 64 -> 32 -> ... -> 1 with a barrier after every step.
+    if ((blockSize >= 64) && (tid < 32)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 32];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 32];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    if ((blockSize >= 32) && (tid < 16)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 16];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 16];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    if ((blockSize >= 16) && (tid < 8)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 8];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 8];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    if ((blockSize >= 8) && (tid < 4)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 4];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 4];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    if ((blockSize >= 4) && (tid < 2)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 2];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 2];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+    if ((blockSize >= 2) && (tid < 1)) {
+        s_a[tid] = a_sum = a_sum + s_a[tid + 1];
+        s_b[tid] = b_sum = b_sum + s_b[tid + 1];
+    }
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
+    cg::sync(cta);
+#endif
+#endif
+    // write result for this block to global mem
+    // (thread 0's registers hold the block totals at this point; only blockIdx.x is used
+    // as the output index)
+    if (tid == 0) {
+        g_a[blockIdx.x] = (T)a_sum;
+        g_b[blockIdx.x] = (T)b_sum;
+    }
+}
+// Stages each thread's two register values in dynamic shared memory and hands them to the
+// block-wide reduction. The first block-size entries of the shared array hold the `a` values
+// and the following block-size entries hold the `b` values, so the launch has to provide
+// room for 2 * (threads per block) elements of T.
+template <typename T, int blockSize>
+__device__ void reduce_two_vectors_in_register(T a, T b, T* g_a, T* g_b)
+{
+    cg::thread_block block = cg::this_thread_block();
+    const int lane = block.thread_rank();
+    T* const lower_half = SharedMemory<T>();
+    T* const upper_half = lower_half + block.size();
+    lower_half[lane] = a;
+    upper_half[lane] = b;
+    reduce_block_in_shared_memory<T, blockSize>(lower_half, upper_half, g_a, g_b);
+}
+// First LAMB pass. For every element it refreshes the moments m and v in place, forms the
+// un-scaled update (m / denom + decay * p) and accumulates p^2 and update^2 per thread.
+// The block then reduces both accumulators and stores one partial sum per block into
+// w_l2_i (parameters) and u_l2_i (updates). p itself is not modified here.
+// p_copy and step_size are part of the signature but are not used by this pass.
+template <typename T, typename GRAD_T, int blockSize>
+__global__ void lamb_cuda_kernel_part1(
+    T* __restrict__ p,
+    GRAD_T* __restrict__ p_copy,  // For mixed precision training, pass NULL if not needed
+    T* __restrict__ m,
+    T* __restrict__ v,
+    const GRAD_T* __restrict__ g,
+    const float b1,
+    const float b2,
+    const float eps,
+    const float grad_scale,
+    const float step_size,
+    const size_t tsize,
+    adamMode_t mode,
+    const float decay,
+    T* __restrict__ w_l2_i,
+    T* __restrict__ u_l2_i)
+{
+    // Assuming 2D grids and 2D blocks
+    const int threads_in_block = blockDim.x * blockDim.y;
+    const int block_linear = gridDim.x * blockIdx.y + blockIdx.x;
+    const int lane = cg::this_thread_block().thread_rank();
+    const int first = block_linear * threads_in_block + lane;
+    const int stride = gridDim.x * gridDim.y * threads_in_block;
+    T w_sq_sum = 0;
+    T u_sq_sum = 0;
+    for (int j = first; j < tsize; j += stride) {
+        const T grad = g[j] / grad_scale;
+        const T param = p[j];
+        // Compute the new moments once, store them, and reuse the stored values below.
+        const T m_new = b1 * m[j] + (1 - b1) * grad;
+        const T v_new = b2 * v[j] + (1 - b2) * grad * grad;
+        m[j] = m_new;
+        v[j] = v_new;
+        const float denom = (mode == ADAM_MODE_0) ? sqrtf(v_new + eps)   // eps under the root
+                                                  : sqrtf(v_new) + eps;  // eps outside the root
+        const T update = (m_new / denom) + (decay * param);
+        u_sq_sum += update * update;
+        w_sq_sum += param * param;
+    }
+    reduce_two_vectors_in_register<T, blockSize>(w_sq_sum, u_sq_sum, w_l2_i, u_l2_i);
+}
+// Second LAMB pass: folds the first `tsize` per-block partial sums stored in g_a / g_b into a
+// single value each. The result is written back by reduce_block_in_shared_memory at index
+// blockIdx.x of g_a / g_b. Needs dynamic shared memory for 2 * (threads per block) elements.
+template <typename T, typename GRAD_T, int blockSize>
+__global__ void lamb_cuda_kernel_part2(const size_t tsize, T* __restrict__ g_a, T* __restrict__ g_b)
+{
+    cg::thread_block block = cg::this_thread_block();
+    const int lane = block.thread_rank();
+    T* s_a = SharedMemory<T>();
+    T* s_b = s_a + block.size();
+    // Only lanes that own a partial sum touch global memory; the rest contribute zero
+    // instead of loading slots that no block of the first pass has written.
+    const bool has_partial = lane < tsize;
+    s_a[lane] = has_partial ? g_a[lane] : T(0);
+    s_b[lane] = has_partial ? g_b[lane] : T(0);
+    reduce_block_in_shared_memory<T, blockSize>(s_a, s_b, g_a, g_b);
+}
+// Final LAMB pass. Takes the square roots of w_l2_i[0] and u_l2_i[0], forms their ratio
+// (left at 1.0 when either is zero), limits it with max_coeff and then min_coeff, stores it
+// in lamb_coeff_val[0] (block 0 / thread 0 only) and applies
+//     p -= step_size * lamb_coeff * (m / denom + decay * p)
+// to every element. When p_copy is non-NULL the new value is also written there as GRAD_T.
+// g, b1, b2 and grad_scale are part of the signature but are not used by this pass.
+template <typename T, typename GRAD_T>
+__global__ void lamb_cuda_kernel_part3(
+    T* __restrict__ p,
+    GRAD_T* __restrict__ p_copy,  // For mixed precision training, pass NULL if not needed
+    T* __restrict__ m,
+    T* __restrict__ v,
+    const GRAD_T* __restrict__ g,
+    const float b1,
+    const float b2,
+    const float max_coeff,
+    const float min_coeff,
+    const float eps,
+    const float grad_scale,
+    const float step_size,
+    const size_t tsize,
+    adamMode_t mode,
+    const float decay,
+    T* __restrict__ w_l2_i,
+    T* __restrict__ u_l2_i,
+    T* __restrict__ lamb_coeff_val)
+{
+    // Assuming 2D grids and 2D blocks
+    const int blockId = gridDim.x * blockIdx.y + blockIdx.x;
+    const int threadsPerBlock = blockDim.x * blockDim.y;
+    const int threadIdInBlock = cg::this_thread_block().thread_rank();
+    const int i = (blockId * threadsPerBlock + threadIdInBlock);
+    const int totThreads = gridDim.x * gridDim.y * threadsPerBlock;
+    T reg_w = sqrtf(w_l2_i[0]);
+    T reg_u = sqrtf(u_l2_i[0]);
+    float lamb_coeff = 1.0;
+    if (reg_w != 0 and reg_u != 0) {
+        lamb_coeff = reg_w / reg_u;
+        if (lamb_coeff > max_coeff) { lamb_coeff = max_coeff; }
+        if (lamb_coeff < min_coeff) { lamb_coeff = min_coeff; }
+    }
+    if (blockId == 0 and threadIdInBlock == 0) {
+        lamb_coeff_val[0] = lamb_coeff;
+        // printf("Cuda Lamb Coeff is %.6f \n",lamb_coeff);
+    }
+    for (int j = i; j < tsize; j += totThreads) {
+        // Keep the parameter in T: routing it through float would throw away precision
+        // on every step when T is double.
+        T pj = p[j];
+        T mj = m[j];
+        T vj = v[j];
+        float denom;
+        if (mode == ADAM_MODE_0)
+            denom = sqrtf(vj + eps);
+        else  // Mode 1
+            denom = sqrtf(vj) + eps;
+        T update = (mj / denom) + (decay * pj);
+        pj = pj - (step_size * lamb_coeff * update);
+        p[j] = pj;
+        if (p_copy != NULL) p_copy[j] = (GRAD_T)pj;
+    }
+}
+// Host-side launcher for the three-pass fused LAMB step on the current HIP stream.
+//   pass 1: refresh m / v and write per-block partial sums of p^2 and update^2 to
+//           w_l2_i / u_l2_i,
+//   pass 2: fold those partial sums into w_l2_i[0] / u_l2_i[0] with a single block,
+//   pass 3: derive the LAMB coefficient, store it in lamb_coeff[0] and update p.
+// When g is half precision, p is asserted to be fp32 and a non-empty p_copy receives the
+// updated parameters in the gradient dtype; otherwise p_copy is not written.
+// step_size is lr, optionally multiplied by sqrt(1 - beta2^step) / (1 - beta1^step) when
+// bias_correction == 1.
+void fused_lamb_cuda(at::Tensor& p,
+                     at::Tensor& p_copy,
+                     at::Tensor& m,
+                     at::Tensor& v,
+                     at::Tensor& g,
+                     float lr,
+                     float beta1,
+                     float beta2,
+                     float max_coeff,
+                     float min_coeff,
+                     float eps,
+                     float grad_scale,
+                     int step,
+                     int mode,
+                     int bias_correction,
+                     float decay,
+                     at::Tensor& w_l2_i,
+                     at::Tensor& u_l2_i,
+                     at::Tensor& lamb_coeff)
+{
+    // Check 32-bit indexability before the element count is narrowed to int.
+    AT_ASSERTM(at::cuda::detail::canUse32BitIndexMath(p),
+               "parameter tensor is too large to be indexed with int32");
+    // Get tensor size
+    int tsize = p.numel();
+    // Determine #threads and #blocks
+    const int threadsPerBlock = 512;
+    int num_blocks = (tsize + threadsPerBlock - 1) / threadsPerBlock;
+    // Pass 2 folds the per-block partial sums with one 512-thread block, so at most 512
+    // blocks may produce a partial sum.
+    if (num_blocks > 512) num_blocks = 512;
+    // An empty tensor would otherwise ask for a zero-sized grid, which is not a valid launch
+    // configuration. One block still runs cleanly: its loops are skipped and the
+    // coefficient comes out as 1.
+    if (num_blocks < 1) num_blocks = 1;
+    // Passes 1 and 2 keep two values per thread in dynamic shared memory.
+    int smemsize = 0;
+    if (p.type().scalarType() == at::ScalarType::Double)
+        smemsize = 2 * threadsPerBlock * sizeof(double);
+    else
+        smemsize = 2 * threadsPerBlock * sizeof(float);
+    const dim3 blocks(num_blocks);
+    const dim3 threads(threadsPerBlock);
+    // Constants
+    float step_size = 0;
+    if (bias_correction == 1) {
+        const float bias_correction1 = 1 - ::pow(beta1, step);
+        const float bias_correction2 = 1 - ::pow(beta2, step);
+        step_size = lr * std::sqrt(bias_correction2) / bias_correction1;
+    } else {
+        step_size = lr;
+    }
+    hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+    if (g.type().scalarType() == at::ScalarType::Half) {
+        // all other values should be fp32 for half gradients
+        AT_ASSERTM(p.type().scalarType() == at::ScalarType::Float,
+                   "expected parameter to be of float type");
+        // dispatch is done on the gradient type
+        using namespace at;  // prevents "toString is undefined" errors
+        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+            g.scalar_type(), "lamb_cuda_kernel", ([&] {
+                using accscalar_t = at::acc_type<scalar_t, true>;
+                hipLaunchKernelGGL((lamb_cuda_kernel_part1<accscalar_t, scalar_t, threadsPerBlock>),
+                                   dim3(blocks), dim3(threads), smemsize, stream,
+                                   p.data<accscalar_t>(),
+                                   p_copy.numel() ? p_copy.data<scalar_t>() : NULL,
+                                   m.data<accscalar_t>(),
+                                   v.data<accscalar_t>(),
+                                   g.data<scalar_t>(),
+                                   beta1,
+                                   beta2,
+                                   eps,
+                                   grad_scale,
+                                   step_size,
+                                   tsize,
+                                   (adamMode_t)mode,
+                                   decay,
+                                   w_l2_i.data<accscalar_t>(),
+                                   u_l2_i.data<accscalar_t>());
+                hipLaunchKernelGGL((lamb_cuda_kernel_part2<accscalar_t, scalar_t, threadsPerBlock>),
+                                   dim3(1), dim3(threads), smemsize, stream,
+                                   num_blocks,
+                                   w_l2_i.data<accscalar_t>(),
+                                   u_l2_i.data<accscalar_t>());
+                // Pass 3 never touches shared memory, so it requests none.
+                hipLaunchKernelGGL((lamb_cuda_kernel_part3<accscalar_t, scalar_t>),
+                                   dim3(blocks), dim3(threads), 0, stream,
+                                   p.data<accscalar_t>(),
+                                   p_copy.numel() ? p_copy.data<scalar_t>() : NULL,
+                                   m.data<accscalar_t>(),
+                                   v.data<accscalar_t>(),
+                                   g.data<scalar_t>(),
+                                   beta1,
+                                   beta2,
+                                   max_coeff,
+                                   min_coeff,
+                                   eps,
+                                   grad_scale,
+                                   step_size,
+                                   tsize,
+                                   (adamMode_t)mode,
+                                   decay,
+                                   w_l2_i.data<accscalar_t>(),
+                                   u_l2_i.data<accscalar_t>(),
+                                   lamb_coeff.data<accscalar_t>());
+            }));
+    } else {
+        using namespace at;
+        AT_DISPATCH_FLOATING_TYPES(
+            g.scalar_type(), "lamb_cuda_kernel", ([&] {
+                hipLaunchKernelGGL((lamb_cuda_kernel_part1<scalar_t, scalar_t, threadsPerBlock>),
+                                   dim3(blocks), dim3(threads), smemsize, stream,
+                                   p.data<scalar_t>(),
+                                   NULL,  // don't output p_copy for fp32, it's wasted write
+                                   m.data<scalar_t>(),
+                                   v.data<scalar_t>(),
+                                   g.data<scalar_t>(),
+                                   beta1,
+                                   beta2,
+                                   eps,
+                                   grad_scale,
+                                   step_size,
+                                   tsize,
+                                   (adamMode_t)mode,
+                                   decay,
+                                   w_l2_i.data<scalar_t>(),
+                                   u_l2_i.data<scalar_t>());
+                hipLaunchKernelGGL((lamb_cuda_kernel_part2<scalar_t, scalar_t, threadsPerBlock>),
+                                   dim3(1), dim3(threads), smemsize, stream,
+                                   num_blocks,
+                                   w_l2_i.data<scalar_t>(),
+                                   u_l2_i.data<scalar_t>());
+                // Pass 3 never touches shared memory, so it requests none.
+                hipLaunchKernelGGL((lamb_cuda_kernel_part3<scalar_t, scalar_t>),
+                                   dim3(blocks), dim3(threads), 0, stream,
+                                   p.data<scalar_t>(),
+                                   NULL,  // don't output p_copy for fp32, it's wasted write
+                                   m.data<scalar_t>(),
+                                   v.data<scalar_t>(),
+                                   g.data<scalar_t>(),
+                                   beta1,
+                                   beta2,
+                                   max_coeff,
+                                   min_coeff,
+                                   eps,
+                                   grad_scale,
+                                   step_size,
+                                   tsize,
+                                   (adamMode_t)mode,
+                                   decay,
+                                   w_l2_i.data<scalar_t>(),
+                                   u_l2_i.data<scalar_t>(),
+                                   lamb_coeff.data<scalar_t>());
+            }));
+    }
+    // Surface any launch error from the three kernels above.
+    THCudaCheck(hipGetLastError());
+}
+// template __device__ void reduce_two_vectors_in_register<float,512>(float a, float b, float* g_a,
+// float* g_b, cg::grid_group &cgg);
--- a/csrc/sparse_attention/hip/utils.cpp
+++ b/csrc/sparse_attention/hip/utils.cpp
+// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a
+// https://github.com/ptillet/torch-blocksparse/blob/master/csrc/utils.cpp
+#include <torch/extension.h>
+#include <string>
+#include <tuple>
+#include <vector>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+// Result list: one (width, tensor) pair per segment_blocks call that found at least one block.
+typedef std::vector<std::tuple<int, torch::Tensor>> ret_t;
+// Scans every slice layout[h] in row-major order and, for each nonzero entry, derives a
+// square "width" from its left / top / top-left nonzero neighbours (kept in `tmp`).
+// Whenever the width reaches max_width, the entries of that square are cleared in `layout`
+// (and `tmp`) and one row {h, m, n, idx[h][m][n]} per entry is appended to scratch[h].
+// Finally the filled parts of scratch are concatenated and pushed to `ret` together with
+// max_width; nothing is pushed when no square was found.
+// layout, idx and scratch are read through accessor<int, 3>; scratch needs at least 4
+// entries along its last dimension and enough rows per h for every recorded entry.
+// NOTE(review): `layout` is written through its accessor, so presumably the caller's tensor
+// is modified as well (tensor handles share storage) -- confirm this is intended.
+void segment_blocks(torch::Tensor layout,
+                    torch::Tensor idx,
+                    torch::Tensor scratch,
+                    int max_width,
+                    ret_t& ret)
+{
+    size_t H = layout.size(0);
+    size_t M = layout.size(1);
+    size_t N = layout.size(2);
+    torch::Tensor tmp = torch::zeros_like(layout);
+    auto _tmp = tmp.accessor<int, 3>();
+    auto _layout = layout.accessor<int, 3>();
+    auto _idx = idx.accessor<int, 3>();
+    auto _scratch = scratch.accessor<int, 3>();
+    // current[h] counts the rows already written to scratch[h]
+    std::vector<int> current(H, 0);
+    // Each iteration over h only touches index h of tmp, layout, scratch and current.
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (size_t h = 0; h < H; h++) {
+        // surrounding indices
+        // ii_left: the last max_width nonzero columns visited (most recent at the back)
+        // ii_top[k][n]: the last max_width nonzero rows visited in column n
+        // NOTE(review): ii_left is not reset when m advances, so it can still hold columns
+        // from the previous row -- confirm this matches the upstream algorithm.
+        std::vector<int> ii_left(max_width, -1);
+        std::vector<std::vector<int>> ii_top(max_width, std::vector<int>(N, -1));
+        for (size_t m = 0; m < M; m++) {
+            for (size_t n = 0; n < N; n++) {
+                int v = _layout[h][m][n];
+                if (v == 0) continue;
+                int n_left = ii_left[max_width - 1];
+                int m_top = ii_top[max_width - 1][n];
+                // widths already recorded for the neighbours; 0 when there is no neighbour
+                int top = (m_top >= 0) ? _tmp[h][m_top][n] : 0;
+                int left = (n_left >= 0) ? _tmp[h][m][n_left] : 0;
+                int topleft = (m_top >= 0 && n_left >= 0) ? _tmp[h][m_top][n_left] : 0;
+                int width = std::min(left, std::min(top, topleft)) + 1;
+                // reset width if blocks cannot be
+                // packed together (i.e., there's a 1 "in the middle")
+                for (int nn = n_left + 1; nn < n; nn++)
+                    if (ii_top[max_width - 1][nn] > ii_top[max_width - 1][n]) width = 1;
+                _tmp[h][m][n] = width;
+                // update n_left ring buffer
+                for (int k = 0; k < max_width - 1; k++) ii_left[k] = ii_left[k + 1];
+                ii_left[max_width - 1] = n;
+                // update ii_top ring buffer
+                for (int k = 0; k < max_width - 1; k++) ii_top[k][n] = ii_top[k + 1][n];
+                ii_top[max_width - 1][n] = m;
+                // block is too small -- skip
+                if (width != max_width) continue;
+                // retained blocks are set to zeros
+                // (and recorded in scratch as {h, row, column, block index})
+                for (size_t km = 0; km < max_width; km++)
+                    for (size_t kn = 0; kn < max_width; kn++) {
+                        int mm = ii_top[km][n];
+                        int nn = ii_left[kn];
+                        if (mm < 0 || nn < 0) continue;
+                        _layout[h][mm][nn] = 0;
+                        _tmp[h][mm][nn] = 0;
+                        _scratch[h][current[h]][0] = (int)h;
+                        _scratch[h][current[h]][1] = (int)mm;
+                        _scratch[h][current[h]][2] = (int)nn;
+                        _scratch[h][current[h]][3] = _idx[h][mm][nn];
+                        current[h]++;
+                    }
+            }
+        }
+    }
+    // gather the filled prefix of every scratch[h] and publish it under this width
+    std::vector<torch::Tensor> to_cat;
+    for (size_t h = 0; h < H; h++)
+        if (current[h] > 0) to_cat.push_back(scratch[h].slice(0, 0, current[h]));
+    if (!to_cat.empty()) ret.push_back({max_width, torch::cat(to_cat)});
+}
+// Numbers every nonzero entry of `layout` in scan order (dim 0, then dim 1, then dim 2) and
+// then runs segment_blocks for max_width = start_width, start_width / 2, ... while the width
+// stays positive. Returns whatever those calls appended to the result list.
+ret_t sdd_segment(torch::Tensor layout, int start_width)
+{
+    ret_t segments;
+    const int64_t dim_h = layout.size(0);
+    const int64_t dim_m = layout.size(1);
+    const int64_t dim_n = layout.size(2);
+    // block index
+    torch::Tensor idx = torch::zeros_like(layout);
+    auto layout_view = layout.accessor<int, 3>();
+    auto idx_view = idx.accessor<int, 3>();
+    int next_index = 0;
+    for (int64_t h = 0; h < dim_h; ++h) {
+        for (int64_t m = 0; m < dim_m; ++m) {
+            for (int64_t n = 0; n < dim_n; ++n) {
+                if (layout_view[h][m][n] != 0) { idx_view[h][m][n] = next_index++; }
+            }
+        }
+    }
+    // scratch memory: per slice along dim 0, (sum of layout's entries) rows of 4 ints
+    const int layout_sum = layout.sum().item<int>();
+    torch::Tensor scratch = torch::empty({dim_h, layout_sum, 4}, layout.dtype());
+    int max_width = start_width;
+    while (max_width > 0) {
+        segment_blocks(layout, idx, scratch, max_width, segments);
+        max_width /= 2;
+    }
+    return segments;
+}
+// Python bindings: registers sdd_segment() under the name "sdd_segment".
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("sdd_segment", &sdd_segment, "SDD segmentation handler");
+}
--- a/csrc/transformer/hip/cublas_wrappers.hip
+++ b/csrc/transformer/hip/cublas_wrappers.hip
+#include "hip/cublas_wrappers.h"
+// fp32 GEMM through rocblas_gemm_ex. C is passed both as the input and as the output matrix.
+// Leading dimensions are derived from m, n, k and the transpose flags (ldc is m).
+// Returns 0 on success; otherwise prints the rocBLAS status to stderr and returns
+// EXIT_FAILURE.
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const float* A,
+                   const float* B,
+                   float* C,
+                   rocblas_gemm_algo algo)
+{
+    const int lda = (transa == rocblas_operation_none) ? m : k;
+    const int ldb = (transb == rocblas_operation_none) ? k : n;
+    const int ldc = m;
+    const rocblas_datatype fp32 = rocblas_datatype_f32_r;
+    const rocblas_status status = rocblas_gemm_ex(handle, transa, transb, m, n, k,
+                                                  alpha,
+                                                  A, fp32, lda,
+                                                  B, fp32, ldb,
+                                                  beta,
+                                                  C, fp32, ldc,  // input C
+                                                  C, fp32, ldc,  // output, same buffer
+                                                  fp32,          // compute type
+                                                  algo, 0, 0);
+    if (status == rocblas_status_success) { return 0; }
+    fprintf(stderr,
+            "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
+            m,
+            n,
+            k,
+            (int)status);
+    return EXIT_FAILURE;
+}
+// fp16 GEMM through rocblas_gemm_ex. A, B and C hold half-precision values and C is passed
+// both as the input and as the output matrix. The compute type is fp32: rocBLAS reads
+// alpha and beta in the compute type, and both are supplied here as float pointers.
+// Returns 0 on success; otherwise prints the rocBLAS status to stderr and returns
+// EXIT_FAILURE.
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const __half* A,
+                   const __half* B,
+                   __half* C,
+                   //cublasGemmAlgo_t algo)
+                   rocblas_gemm_algo algo)
+{
+    rocblas_status status = rocblas_gemm_ex(handle,
+                                         transa,
+                                         transb,
+                                         m,
+                                         n,
+                                         k,
+                                         (const void*)alpha,
+                                         (const void*)A,
+                                         rocblas_datatype_f16_r,
+                                         (transa == rocblas_operation_none) ? m : k,
+                                         (const void*)B,
+                                         rocblas_datatype_f16_r,
+                                         (transb == rocblas_operation_none) ? k : n,
+                                         (const void*)beta,
+                                         (void*)C,
+                                         rocblas_datatype_f16_r,
+                                         m,
+                                         (void*)C,
+                                         rocblas_datatype_f16_r,
+                                         m,
+                                         // must match the float alpha / beta scalars
+                                         rocblas_datatype_f32_r,
+                                         algo,
+                                         0,
+                                         0);
+    if (status != rocblas_status_success) {
+        fprintf(stderr,
+                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
+                m,
+                n,
+                k,
+                (int)status);
+        return EXIT_FAILURE;
+    }
+    return 0;
+}
+// fp32 strided-batched GEMM through rocblas_gemm_strided_batched_ex. C (with stride_C) is
+// passed both as the input and as the output matrix; leading dimensions are derived from
+// m, n, k and the transpose flags (ldc is m).
+// Returns 0 on success; otherwise prints the rocBLAS status to stderr and returns
+// EXIT_FAILURE.
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const float* A,
+                                const float* B,
+                                float* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                rocblas_gemm_algo algo)
+{
+    const int lda = (op_A == rocblas_operation_none) ? m : k;
+    const int ldb = (op_B == rocblas_operation_none) ? k : n;
+    const int ldc = m;
+    const rocblas_datatype fp32 = rocblas_datatype_f32_r;
+    const rocblas_status status =
+        rocblas_gemm_strided_batched_ex(handle, op_A, op_B, m, n, k,
+                                        alpha,
+                                        A, fp32, lda, stride_A,
+                                        B, fp32, ldb, stride_B,
+                                        beta,
+                                        C, fp32, ldc, stride_C,  // input C
+                                        C, fp32, ldc, stride_C,  // output, same buffer
+                                        batch,
+                                        fp32,  // compute type
+                                        algo, 0, 0);
+    if (status == rocblas_status_success) { return 0; }
+    fprintf(stderr,
+            "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
+            batch,
+            m,
+            n,
+            k,
+            (int)status);
+    return EXIT_FAILURE;
+}
+// fp16 strided-batched GEMM through rocblas_gemm_strided_batched_ex. A, B and C hold
+// half-precision values and C (with stride_C) is passed both as the input and as the output
+// matrix. The compute type is fp32: rocBLAS reads alpha and beta in the compute type, and
+// both are supplied here as float pointers.
+// Returns 0 on success; otherwise prints the rocBLAS status to stderr and returns
+// EXIT_FAILURE.
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const __half* A,
+                                const __half* B,
+                                __half* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                rocblas_gemm_algo algo)
+{
+    rocblas_status status = rocblas_gemm_strided_batched_ex(handle,
+                                                       op_A,
+                                                       op_B,
+                                                       m,
+                                                       n,
+                                                       k,
+                                                       alpha,
+                                                       A,
+                                                       rocblas_datatype_f16_r,
+                                                       (op_A == rocblas_operation_none) ? m : k,
+                                                       stride_A,
+                                                       B,
+                                                       rocblas_datatype_f16_r,
+                                                       (op_B == rocblas_operation_none) ? k : n,
+                                                       stride_B,
+                                                       beta,
+                                                       C,
+                                                       rocblas_datatype_f16_r,
+                                                       m,
+                                                       stride_C,
+                                                       C,
+                                                       rocblas_datatype_f16_r,
+                                                       m,
+                                                       stride_C,
+                                                       batch,
+                                                       // must match the float alpha / beta scalars
+                                                       rocblas_datatype_f32_r,
+                                                       algo,
+                                                       0,
+                                                       0);
+    if (status != rocblas_status_success) {
+        // Reports the batch count as well, like the fp32 overload does.
+        fprintf(stderr,
+                "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
+                batch,
+                m,
+                n,
+                k,
+                (int)status);
+        return EXIT_FAILURE;
+    }
+    return 0;
+}
--- a/csrc/transformer/hip/cublas_wrappers.hip.bak
+++ b/csrc/transformer/hip/cublas_wrappers.hip.bak
--- a/csrc/transformer/hip/dropout_kernels.hip
+++ b/csrc/transformer/hip/dropout_kernels.hip
--- a/csrc/transformer/hip/ds_transformer_hip.cpp
+++ b/csrc/transformer/hip/ds_transformer_hip.cpp
--- a/csrc/transformer/hip/gelu_kernels.hip
+++ b/csrc/transformer/hip/gelu_kernels.hip
--- a/csrc/transformer/hip/general_kernels.hip
+++ b/csrc/transformer/hip/general_kernels.hip